# Data preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../data/sentimentdataset_mapped.csv')

### Looking through the data

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0.1     732 non-null    int64  
 1   Unnamed: 0       732 non-null    int64  
 2   Text             732 non-null    object 
 3   Sentiment        732 non-null    object 
 4   Timestamp        732 non-null    object 
 5   User             732 non-null    object 
 6   Platform         732 non-null    object 
 7   Hashtags         732 non-null    object 
 8   Retweets         732 non-null    float64
 9   Likes            732 non-null    float64
 10  Country          732 non-null    object 
 11  Year             732 non-null    int64  
 12  Month            732 non-null    int64  
 13  Day              732 non-null    int64  
 14  Hour             732 non-null    int64  
 15  Sentiment_clean  732 non-null    object 
dtypes: float64(2), int64(6), object(8)
memory usage: 91.6+ KB


In [4]:
data = data[['Text', 'Sentiment_clean']]

In [6]:
data = data.rename(columns={'Sentiment_clean': 'Sentiment'})

### Adding new data

##### Tweets dataset

Source: https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset

In [12]:
tweets = pd.read_csv('../data/tweets.csv')

In [13]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [14]:
tweets = tweets[['text','sentiment']]
tweets = tweets.rename(columns={'text':'Text', 'sentiment':'Sentiment'})
tweets['Sentiment'] = tweets['Sentiment'].str.capitalize() 
tweets

Unnamed: 0,Text,Sentiment
0,"I`d have responded, if I were going",Neutral
1,Sooo SAD I will miss you here in San Diego!!!,Negative
2,my boss is bullying me...,Negative
3,what interview! leave me alone,Negative
4,"Sons of ****, why couldn`t they put them on t...",Negative
...,...,...
27476,wish we could come see u on Denver husband l...,Negative
27477,I`ve wondered about rake to. The client has ...,Negative
27478,Yay good for both of you. Enjoy the break - y...,Positive
27479,But it was worth it ****.,Positive


##### Social media dataset

Source: https://www.kaggle.com/datasets/mdismielhossenabir/sentiment-analysis

In [16]:
dataset = pd.read_csv('../data/sentiment.csv')

In [17]:
dataset = dataset[['text', 'sentiment']]
dataset['sentiment'] = dataset['sentiment'].str.capitalize() 
dataset = dataset.rename(columns={'text': 'Text', 'sentiment': 'Sentiment'})
dataset

Unnamed: 0,Text,Sentiment
0,What a great day!!! Looks like dream.,Positive
1,"I feel sorry, I miss you here in the sea beach",Positive
2,Don't angry me,Negative
3,We attend in the class just for listening teac...,Negative
4,"Those who want to go, let them go",Negative
...,...,...
494,"According to , a quarter of families under six...",Negative
495,the plan to not spend money is not going well,Negative
496,uploading all my bamboozle pictures of facebook,Neutral
497,congratulations ! you guys finish a month ear...,Positive


##### Merging datasets together 

In [18]:
data = pd.concat([data, dataset, tweets], ignore_index=True)

In [19]:
data

Unnamed: 0,Text,Sentiment
0,Enjoying a beautiful day at the park!,Positive
1,Traffic was terrible this morning.,Negative
2,Just finished an amazing workout! 💪,Positive
3,Excited about the upcoming weekend getaway!,Positive
4,Trying out a new recipe for dinner tonight.,Neutral
...,...,...
28707,wish we could come see u on Denver husband l...,Negative
28708,I`ve wondered about rake to. The client has ...,Negative
28709,Yay good for both of you. Enjoy the break - y...,Positive
28710,But it was worth it ****.,Positive


In [20]:
data['Sentiment'].value_counts()

Sentiment
Neutral     11450
Positive     9183
Negative     8079
Name: count, dtype: int64

In [21]:
data = pd.concat([
    data[data['Sentiment'] == 'Neutral'].sample(8000, random_state=42),
    data[data['Sentiment'] == 'Positive'].sample(8000, random_state=42),
    data[data['Sentiment'] == 'Negative'].sample(8000, random_state=42)
]).sample(frac=1, random_state=42).reset_index(drop=True)

In [23]:
data['Sentiment'].value_counts()

Sentiment
Neutral     8000
Negative    8000
Positive    8000
Name: count, dtype: int64

### Getting rid of duplicates and missing values

In [24]:
data.duplicated(subset=['Text']).sum()


315

In [25]:
data = data.drop_duplicates(subset=['Text']).reset_index(drop=True)
print(data.duplicated(subset=['Text']).sum())

0


In [27]:
data.isnull().sum()

Text         0
Sentiment    0
dtype: int64

In [28]:
data['Sentiment'].value_counts()

Sentiment
Neutral     7911
Positive    7888
Negative    7886
Name: count, dtype: int64

### Tokenizing the data

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [30]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www.\S+", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)
    

In [31]:
data['Text'] = data['Text'].astype(str).apply(preprocess_text)

In [32]:
data

Unnamed: 0,Text,Sentiment
0,still news roomhowever audio turn around grabb...,Neutral
1,saw fleetwood mac im insanely jealous dont eno...,Negative
2,theres one thing hate friends dats move skl fo...,Negative
3,think cheap sunglasses falling apart oh well,Negative
4,please read blog im best day,Negative
...,...,...
23680,noooo ill miss much went stag prom high school...,Negative
23681,sign tarte news intro newsletter 20 first purc...,Neutral
23682,,Neutral
23683,prayin dad wellington fl via live welly may pr...,Positive


### Saving the data

In [34]:
data.to_csv('../data/preprocessed_data_added.csv', index=False)