# Data preprocessing 

In [23]:
import re
import numpy as np
import pandas as pd

# remove english stopwords with the help of the gensim library 
from gensim.parsing.preprocessing import remove_stopwords

import string

from nltk.stem.porter import PorterStemmer

## All the methods that will be used to preprocess

In [24]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations

def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

In [25]:
# cleaning and removing URLs

def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)

def remove_hyperlink(word):
    return re.sub(r"http\S+", "", word)

In [26]:
# cleaning and removing mentions 

def remove_mentions(word):
    return re.sub(r"@\S+", "", word)

In [27]:
# removing emojis 
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [28]:
# cleaning and removing repeating characters

def cleaning_repeating_char(text):
    return re.sub(r'(.)1+', r'1', text)

In [29]:
stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text])

## Example of data preprocessing

In [166]:
# loading the dataset 
# english language dataset 

df = pd.read_csv('datasets/sentiment.csv', usecols=['tweets','labels'])

In [167]:
df.sample(5)

Unnamed: 0,tweets,labels
174246,This is 5 Script Idea for webseries by #ChatGP...,good
13084,#ChatGPT describes the interior life of a corg...,bad
197369,Why check #crypto prices on other websites whe...,bad
35632,"I asked #ChatGPT this question: \n\n""Will AI ...",bad
53288,Ripple CTO shuts down ChatGPT's XRP conspiracy...,bad


In [168]:
df.labels.value_counts() 

bad        107796
good        56011
neutral     55487
Name: labels, dtype: int64

In [169]:
# change the labeling: neutral = 0, positive = 1, negative = -1

df.labels = df.labels.replace('bad', -1)
df.labels = df.labels.replace('good', 1)
df.labels = df.labels.replace('neutral', 0)

In [170]:
df.sample(5)

Unnamed: 0,tweets,labels
90147,ChatGPT is out here serving up real human dram...,-1
115503,Even a robot knows this one ðŸ¤– #chatGPT #person...,-1
24658,OpenAI ChatGPT is anti national..\nI asked it ...,-1
191774,"Delighted to hear this news, but it is frustra...",0
82536,I asked #chatGPT to automate editorial decisio...,0


In [171]:
df.labels.value_counts()

-1    107796
 1     56011
 0     55487
Name: labels, dtype: int64

In [172]:
# we have too much negative tweets compared to the other classes
# we divide the set of negative tweets by 2

mask = (df.labels == -1)
idx, = np.where(mask)
df = df.drop(df.index[idx[:len(idx)//2]])

In [173]:
df.labels.value_counts() 

 1    56011
 0    55487
-1    53898
Name: labels, dtype: int64

In [5]:
# lower cases 
df.tweets = df.tweets.str.lower()
df.tweets.head()

AttributeError: 'DataFrame' object has no attribute 'str'

In [None]:
df.tweets = df.tweets.apply(lambda text: remove_stopwords(text))
df.tweets.head()

In [None]:
df.tweets = df.tweets.apply(lambda x: cleaning_punctuations(x))
df.tweets.head()

In [None]:
df.tweets = df.tweets.apply(lambda x: cleaning_URLs(x))
df.tweets = df.tweets.apply(lambda x: remove_hyperlink(x))
df.tweets.head()

In [None]:
df.tweets = df.tweets.apply(lambda x: remove_mentions(x))
df.tweets.head()

In [None]:
df.tweets = df.tweets.apply(lambda x: emoji_pattern.sub(r'', x))
df.tweets.head()

In [183]:
# remove numbers 
df.tweets = df.tweets.apply(lambda x: re.sub("[^a-zA-Z]", " ", x))
df.tweets.head()

0    chatgpt optimizing language models dialogue  o...
1    try talking chatgpt new ai optimized dialogue ...
2    chatgpt optimizing language models dialogue  a...
3    thrilled share chatgpt new model optimized dia...
5    launched chatgpt new ai optimized dialogue  here 
Name: tweets, dtype: object

In [None]:
df.tweets = df.tweets.apply(lambda x: cleaning_repeating_char(x))
df.tweets.head()

In [185]:
# remove short words that are of no use 
df.tweets = df.tweets.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
df.tweets.head()

0    chatgpt optimizing language models dialogue op...
1    try talking chatgpt new optimized dialogue fee...
2    chatgpt optimizing language models dialogue ma...
3    thrilled share chatgpt new model optimized dia...
5         launched chatgpt new optimized dialogue here
Name: tweets, dtype: object

In [186]:
# Text normalization : tokenize the tweets 
# split tweets of text into tokens (individual words of terms)
tokenize_tweets = df.tweets.apply(lambda x: x.split())
tokenize_tweets.head()

0    [chatgpt, optimizing, language, models, dialog...
1    [try, talking, chatgpt, new, optimized, dialog...
2    [chatgpt, optimizing, language, models, dialog...
3    [thrilled, share, chatgpt, new, model, optimiz...
5    [launched, chatgpt, new, optimized, dialogue, ...
Name: tweets, dtype: object

In [None]:
tokenize_tweets = tokenize_tweets.apply(lambda text: stem_words(text))
tokenize_tweets.head()

In [191]:
# check if there is any NaN values 
df.tweets = tokenize_tweets 

df.tweets.isnull().values.any()
df.tweets.isnull().sum()

0

In [192]:
# write new csv file containing the preprocessed dataset
df.to_csv('datasets/preprocessed_sentiment.csv')

## Defining the function for preprocessing the data

In [30]:
import string

def preprocessing(df, path):
    # lower cases 
    df = df.str.lower()
    print("Text of tweets transformed to lower cases.")
    
    # remove english stopwords with the help of the gensim library 
    df = df.apply(lambda text: remove_stopwords(text))
    print("Removed stopwords from the text of tweets.")
    
    #cleaning and removing punctuation
    df = df.apply(lambda x: cleaning_punctuations(x))
    print("Cleaned and removed the punctuations.")
    
    # cleaning and removing URLs
    df = df.apply(lambda x: cleaning_URLs(x))
    df = df.apply(lambda x: remove_hyperlink(x))
    print("Cleaned and removed the URLs and hyperlinks.")
    
    # cleaning and removing mentions 
    df = df.apply(lambda x: remove_mentions(x))
    print("Cleaned and removed the mentions.")
    
    # removing emojis 
    df = df.apply(lambda x: emoji_pattern.sub(r'', x))
    print("Removed the emojis.")
    
    # remove numbers 
    df = df.apply(lambda x: re.sub("[^a-zA-Z]", " ", x))
    print("Removed the numbers.")
    
    # cleaning and removing repeating characters
    df = df.apply(lambda x: cleaning_repeating_char(x))
    print("Cleaned and removed the repeating characters.")
    
    # remove short words that are of no use 
    df = df.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
    print("Removed the short words (length less than 3).")
    
    # Text normalization : tokenize the tweets 
    # split tweets of text into tokens (individual words of terms)
    tokenize_tweets = df.apply(lambda x: x.split())
    print("Tokenized the text.")
    
    # Normalization: use of PorterStemmer()
    tokenize_tweets = tokenize_tweets.apply(lambda text: stem_words(text))
    print("Normalized the text.")
    
    # write new csv file containing the preprocessed dataset
    df.to_csv(path)

### Preprocessing the dataset of the first month of launch

In [31]:
path = 'datasets/preprocessed_chatgptfirst.csv'

df = pd.read_csv('datasets/chatgptfirst.csv')

In [32]:
df.head()

Unnamed: 0,tweet_id,created_at,like_count,quote_count,reply_count,retweet_count,tweet,country,photo_url,city,country_code
0,1598014056790622225,2022-11-30 18:00:15+00:00,2,0,0,0,ChatGPT: Optimizing Language Models for Dialog...,,,,
1,1598014522098208769,2022-11-30 18:02:06+00:00,12179,889,1130,3252,"Try talking with ChatGPT, our new AI system wh...",,,,
2,1598014741527527435,2022-11-30 18:02:58+00:00,2,0,0,1,ChatGPT: Optimizing Language Models for Dialog...,,https://pbs.twimg.com/media/Fi1J8HbWAAMv_yi.jpg,,
3,1598015493666766849,2022-11-30 18:05:58+00:00,561,8,25,66,"THRILLED to share that ChatGPT, our new model ...",,https://pbs.twimg.com/media/Fi1Km3WUYAAfzHS.jpg,,
4,1598015509420994561,2022-11-30 18:06:01+00:00,1,0,0,0,"As of 2 minutes ago, @OpenAI released their ne...",,,,


In [33]:
df = df['tweet'] # we only retrieve the text of the tweet column

In [34]:
# check if there is any NaN value
df.isnull().values.any()
df.isnull().sum()

0

In [35]:
preprocessing(df, path)