# Data preprocessing 

In [133]:
import re
import numpy as np
import pandas as pd

In [166]:
# load the English-language tweet dataset,
# reading only the tweet text and its sentiment label columns

df = pd.read_csv('datasets/sentiment.csv', usecols=['tweets','labels'])

In [167]:
df.sample(5)

Unnamed: 0,tweets,labels
174246,This is 5 Script Idea for webseries by #ChatGP...,good
13084,#ChatGPT describes the interior life of a corg...,bad
197369,Why check #crypto prices on other websites whe...,bad
35632,"I asked #ChatGPT this question: \n\n""Will AI ...",bad
53288,Ripple CTO shuts down ChatGPT's XRP conspiracy...,bad


In [168]:
df.labels.value_counts() 

bad        107796
good        56011
neutral     55487
Name: labels, dtype: int64

In [169]:
# encode the sentiment labels as integers:
# negative ('bad') = -1, neutral = 0, positive ('good') = 1

label_codes = {'bad': -1, 'good': 1, 'neutral': 0}
df.labels = df.labels.replace(label_codes)

In [170]:
df.sample(5)

Unnamed: 0,tweets,labels
90147,ChatGPT is out here serving up real human dram...,-1
115503,Even a robot knows this one 🤖 #chatGPT #person...,-1
24658,OpenAI ChatGPT is anti national..\nI asked it ...,-1
191774,"Delighted to hear this news, but it is frustra...",0
82536,I asked #chatGPT to automate editorial decisio...,0


In [171]:
df.labels.value_counts()

-1    107796
 1     56011
 0     55487
Name: labels, dtype: int64

In [172]:
# there are far more negative tweets than tweets of the other classes,
# so half of the negative tweets are dropped.
# The rows removed are the first half of the negative tweets in row order,
# not a random sample. Re-running this cell halves the negative class again.

mask = df.labels.eq(-1)
idx = np.flatnonzero(mask)          # positional indices of the negative rows
first_half = idx[: idx.size // 2]
df = df.drop(index=df.index[first_half])

In [173]:
df.labels.value_counts() 

 1    56011
 0    55487
-1    53898
Name: labels, dtype: int64

In [174]:
# discard the neutral class of tweets

#df = df.loc[df.labels != 0]  # neutral is encoded as 0 by the relabeling cell above
#df.labels.value_counts() 

In [175]:
# convert every tweet to lowercase
df.tweets = df.tweets.str.lower()

In [176]:
df.tweets.head()

0    chatgpt: optimizing language models for dialog...
1    try talking with chatgpt, our new ai system wh...
2    chatgpt: optimizing language models for dialog...
3    thrilled to share that chatgpt, our new model ...
5    just launched chatgpt, our new ai system which...
Name: tweets, dtype: object

In [177]:
# remove english stopwords with the help of the gensim library 
from gensim.parsing.preprocessing import remove_stopwords

df.tweets = df.tweets.apply(remove_stopwords)


In [178]:
df.tweets.head()

0    chatgpt: optimizing language models dialogue h...
1    try talking chatgpt, new ai optimized dialogue...
2    chatgpt: optimizing language models dialogue h...
3    thrilled share chatgpt, new model optimized di...
5    launched chatgpt, new ai optimized dialogue: h...
Name: tweets, dtype: object

In [179]:
#cleaning and removing punctuation
import string

english_punctuations = string.punctuation
punctuations_list = english_punctuations

# Translation table that deletes every character of punctuations_list.
# Built once here rather than once per tweet inside the function.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def cleaning_punctuations(text):
    """Return `text` with every ASCII punctuation character removed.

    Characters are deleted, not replaced by a space, so the text on either
    side of a punctuation mark is joined together.

    NOTE(review): string.punctuation contains '@', '#', ':', '/' and '.'.
    This step runs before the URL and mention cleaning cells, so by the time
    those run the '://' and '@' markers they look for are already gone.
    Consider moving this step after them.
    """
    return text.translate(_PUNCTUATION_TABLE)

df.tweets = df.tweets.apply(cleaning_punctuations)
df.tweets.head()

0    chatgpt optimizing language models dialogue ht...
1    try talking chatgpt new ai optimized dialogue ...
2    chatgpt optimizing language models dialogue ht...
3    thrilled share chatgpt new model optimized dia...
5    launched chatgpt new ai optimized dialogue htt...
Name: tweets, dtype: object

In [180]:
# cleaning and removing URLs

def cleaning_URLs(data):
    """Replace every URL in `data` with a single space.

    A URL here is a run of non-whitespace characters starting with
    ``www.``, ``http://`` or ``https://``.
    """
    # \S+ ends the match at the first whitespace after the URL. The earlier
    # character class [^s] meant "any character except the letter s", so a
    # match ran on through the following words. The dot after "www" is
    # escaped so that it only matches a literal dot.
    return re.sub(r'(www\.\S+)|(https?://\S+)', ' ', data)

df.tweets = df.tweets.apply(cleaning_URLs)

def remove_hyperlink(word):
    """Delete from `word` every run of non-whitespace characters that starts with ``http``.

    At least one character must follow ``http`` for a run to be removed.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", word)

df.tweets = df.tweets.apply(remove_hyperlink)

df.tweets.head()

0    chatgpt optimizing language models dialogue  o...
1    try talking chatgpt new ai optimized dialogue ...
2    chatgpt optimizing language models dialogue  a...
3    thrilled share chatgpt new model optimized dia...
5    launched chatgpt new ai optimized dialogue  here 
Name: tweets, dtype: object

In [181]:
# cleaning and removing mentions 

def remove_mentions(word):
    """Delete from `word` every ``@`` followed by one or more non-whitespace characters.

    NOTE(review): the punctuation-cleaning cell runs earlier and
    string.punctuation includes '@', so there may be nothing left for this
    to match at this point in the notebook.
    """
    mention_pattern = re.compile(r"@\S+")
    return mention_pattern.sub("", word)

df.tweets = df.tweets.apply(remove_mentions)
df.tweets.head()

0    chatgpt optimizing language models dialogue  o...
1    try talking chatgpt new ai optimized dialogue ...
2    chatgpt optimizing language models dialogue  a...
3    thrilled share chatgpt new model optimized dia...
5    launched chatgpt new ai optimized dialogue  here 
Name: tweets, dtype: object

In [182]:
# removing emojis 

# Unicode ranges treated as emoji, as (first, last) code points.
_emoji_ranges = (
    ("\U0001F600", "\U0001F64F"),  # emoticons
    ("\U0001F300", "\U0001F5FF"),  # symbols & pictographs
    ("\U0001F680", "\U0001F6FF"),  # transport & map symbols
    ("\U0001F1E0", "\U0001F1FF"),  # flags (iOS)
)

# One character class matching a run of one or more characters from the ranges above.
emoji_pattern = re.compile(
    "[" + "".join(first + "-" + last for first, last in _emoji_ranges) + "]+",
    flags=re.UNICODE,
)

df.tweets = df.tweets.apply(lambda tweet: emoji_pattern.sub('', tweet))

df.tweets.head()

0    chatgpt optimizing language models dialogue  o...
1    try talking chatgpt new ai optimized dialogue ...
2    chatgpt optimizing language models dialogue  a...
3    thrilled share chatgpt new model optimized dia...
5    launched chatgpt new ai optimized dialogue  here 
Name: tweets, dtype: object

In [183]:
# keep ASCII letters only: every other character (digits, leftover symbols,
# non-ASCII characters, and whitespace such as newlines) is replaced by a space

non_letter_pattern = re.compile("[^a-zA-Z]")
df.tweets = df.tweets.apply(lambda tweet: non_letter_pattern.sub(" ", tweet))

df.tweets.head()

0    chatgpt optimizing language models dialogue  o...
1    try talking chatgpt new ai optimized dialogue ...
2    chatgpt optimizing language models dialogue  a...
3    thrilled share chatgpt new model optimized dia...
5    launched chatgpt new ai optimized dialogue  here 
Name: tweets, dtype: object

In [184]:
# cleaning and removing repeating characters

def cleaning_repeating_char(text):
    """Shorten every run of three or more identical characters to two.

    Elongated words such as "soooo" become "soo", while ordinary double
    letters ("good", "all") are left untouched.
    """
    # \1 is a backreference to the captured character. Without the backslash
    # the pattern matched "any character followed by literal 1s" and replaced
    # it with the digit 1, so repeated characters were never collapsed.
    return re.sub(r'(.)\1{2,}', r'\1\1', text)

df.tweets = df.tweets.apply(cleaning_repeating_char)
df.tweets.head()

0    chatgpt optimizing language models dialogue  o...
1    try talking chatgpt new ai optimized dialogue ...
2    chatgpt optimizing language models dialogue  a...
3    thrilled share chatgpt new model optimized dia...
5    launched chatgpt new ai optimized dialogue  here 
Name: tweets, dtype: object

In [185]:
# drop words of one or two characters, which are of no use

def _drop_short_words(tweet):
    """Rejoin, with single spaces, the whitespace-separated words of `tweet` longer than two characters."""
    return ' '.join(word for word in tweet.split() if len(word) > 2)

df.tweets = df.tweets.apply(_drop_short_words)
df.tweets.head()

0    chatgpt optimizing language models dialogue op...
1    try talking chatgpt new optimized dialogue fee...
2    chatgpt optimizing language models dialogue ma...
3    thrilled share chatgpt new model optimized dia...
5         launched chatgpt new optimized dialogue here
Name: tweets, dtype: object

In [186]:
# Text normalization: tokenize the tweets,
# i.e. split each tweet on whitespace into a list of tokens (individual words or terms)

tokenize_tweets = df.tweets.apply(str.split)
tokenize_tweets.head()

0    [chatgpt, optimizing, language, models, dialog...
1    [try, talking, chatgpt, new, optimized, dialog...
2    [chatgpt, optimizing, language, models, dialog...
3    [thrilled, share, chatgpt, new, model, optimiz...
5    [launched, chatgpt, new, optimized, dialogue, ...
Name: tweets, dtype: object

In [187]:
# Normalization: use of PorterStemmer()

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    """Stem each token of `text` with the module-level `stemmer` and join the stems with single spaces.

    `text` is an iterable of word tokens; the result is one string.
    """
    stems = (stemmer.stem(token) for token in text)
    return " ".join(stems)


tokenize_tweets = tokenize_tweets.apply(stem_words)

tokenize_tweets.head()

0           chatgpt optim languag model dialogu openai
1    tri talk chatgpt new optim dialogu feedback he...
2    chatgpt optim languag model dialogu machinelea...
3    thrill share chatgpt new model optim dialog pu...
5                launch chatgpt new optim dialogu here
Name: tweets, dtype: object

In [191]:
# store the stemmed text back on the frame, then count missing (NaN) tweets
# NOTE(review): a tweet reduced to an empty string is not NaN here, but it is
# presumably read back as NaN from the saved CSV - confirm whether such rows
# should be dropped before writing.
df.tweets = tokenize_tweets

df.tweets.isna().sum()

0

In [192]:
# write the preprocessed dataset to a new CSV file
# NOTE(review): no index= argument is passed, so the row index is presumably
# written as an extra leading column - confirm downstream readers expect it.
df.to_csv('datasets/preprocessed_sentiment.csv')