# Data preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the mapped sentiment CSV; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/sentimentdataset_mapped.csv')

### Looking through the data

In [3]:
# Print column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0.1     732 non-null    int64  
 1   Unnamed: 0       732 non-null    int64  
 2   Text             732 non-null    object 
 3   Sentiment        732 non-null    object 
 4   Timestamp        732 non-null    object 
 5   User             732 non-null    object 
 6   Platform         732 non-null    object 
 7   Hashtags         732 non-null    object 
 8   Retweets         732 non-null    float64
 9   Likes            732 non-null    float64
 10  Country          732 non-null    object 
 11  Year             732 non-null    int64  
 12  Month            732 non-null    int64  
 13  Day              732 non-null    int64  
 14  Hour             732 non-null    int64  
 15  Sentiment_clean  732 non-null    object 
dtypes: float64(2), int64(6), object(8)
memory usage: 91.6+ KB


In [4]:
# Keep only the raw text and its mapped sentiment label - the two columns the following cells work with.
data = data.loc[:, ['Text', 'Sentiment_clean']]

In [5]:
# Count missing values per column.
data.isna().sum()

Text               0
Sentiment_clean    0
dtype: int64

#### Getting rid of duplicates

In [6]:
# Number of rows whose Text repeats an earlier row.
data['Text'].duplicated().sum()


26

In [7]:
# Drop rows with a repeated Text and renumber the index from 0 in the same call.
data = data.drop_duplicates(subset=['Text'], ignore_index=True)
# Sanity check: print how many duplicated texts are left.
print(data['Text'].duplicated().sum())

0


In [8]:
# Class distribution of the sentiment labels after de-duplication.
data['Sentiment_clean'].value_counts()

Sentiment_clean
Positive    413
Negative    162
Neutral     131
Name: count, dtype: int64

### Cleaning the text (lowercasing, punctuation and stop-word removal)

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# (3.9 and later) load it from the 'punkt_tab' resource instead of the pickled
# 'punkt' package, so fetch both to keep the notebook runnable on a fresh
# environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

In [10]:
def preprocess_text(text):
    """Normalise a raw post into a space-separated string of content words.

    Steps: lowercase, turn apostrophes into spaces, delete every other
    character that is neither a word character nor whitespace, tokenize with
    NLTK's ``word_tokenize`` and discard tokens found in the notebook-level
    ``stop_words`` set.

    Args:
        text (str): Raw text of a single post.

    Returns:
        str: The remaining tokens joined by single spaces; an empty string
        when no token survives.
    """
    text = text.lower()
    # Split contractions/possessives at the apostrophe instead of gluing the
    # halves together: "don't" -> "don t", not "dont". The NLTK English stop
    # list contains the fragments ("don", "t", "s", "ll", "m", ...) but not the
    # glued forms, so simply deleting the apostrophe let those stop words slip
    # through the filter below.
    text = re.sub(r"['’]", ' ', text)
    # Remaining punctuation and symbols (including emoji) are removed outright.
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [11]:
# Build the cleaned-text column by running every post through preprocess_text.
data['Clean_Text'] = data['Text'].astype(str).map(preprocess_text)

In [12]:
# Display the frame to compare Clean_Text with the original Text.
data

Unnamed: 0,Text,Sentiment_clean,Clean_Text
0,Enjoying a beautiful day at the park!,Positive,enjoying beautiful day park
1,Traffic was terrible this morning.,Negative,traffic terrible morning
2,Just finished an amazing workout! 💪,Positive,finished amazing workout
3,Excited about the upcoming weekend getaway!,Positive,excited upcoming weekend getaway
4,Trying out a new recipe for dinner tonight.,Neutral,trying new recipe dinner tonight
...,...,...,...
701,Collaborating on a science project that receiv...,Neutral,collaborating science project received recogni...
702,Attending a surprise birthday party organized ...,Neutral,attending surprise birthday party organized fr...
703,Successfully fundraising for a school charity ...,Neutral,successfully fundraising school charity initia...
704,"Participating in a multicultural festival, cel...",Neutral,participating multicultural festival celebrati...


### Saving the data

In [13]:
# Write only the cleaned text and its label, without the index column.
data.to_csv(
    '../data/preprocessed_data.csv',
    columns=['Clean_Text', 'Sentiment_clean'],
    index=False,
)