In [718]:
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import seaborn as sns

In [719]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

<h1>Look at the data and see if we have any missing values</h1>

In [720]:
display(df_train)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [721]:
df_train.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [722]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


<h3>Only one row has missing values (in text and selected_text), so we can simply drop it</h3>

In [723]:
df_train.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [724]:
df_test.isna().sum()

textID       0
text         0
sentiment    0
dtype: int64

In [725]:
# Rebind instead of mutating in place so re-running this cell is harmless, and
# renumber the rows so the labels 0..n-1 stay aligned with the positional token
# lists that are built from this frame further down.
df_train = df_train.dropna().reset_index(drop=True)

In [726]:
df_train.isna().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

<h3>We can look more in detail into what these tweets contain</h3>

We can see cases in which asterisks (****) are used to mask an offensive word. This is common with curse words, for example f***; note that the word does not have to be entirely censored.

In [727]:
df_train['text'].head()

0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
Name: text, dtype: object

We download the stopwords that we will use to preprocess the data

In [728]:
# 'stopwords' feeds the stop-word filter; 'wordnet' is the corpus that
# WordNetLemmatizer loads on first use (a LookupError is raised without it).
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daguila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [729]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

<h1>Preprocess</h1>

<h3>Remove hyperlinks, retweet marks and hashtag symbols</h3>

In [730]:
def removeHyperMarksStyles(df, text_feature):
    """Strip retweet markers, hyperlinks and '#' symbols from a text column.

    Three substitutions are applied, in order, to every value of
    ``df[text_feature]``:

    1. a leading ``RT`` followed by whitespace (old-style retweet marker),
    2. every ``http://`` / ``https://`` URL, up to the next whitespace,
    3. every ``#`` character (the hashtag word itself is kept).

    The work is done column-wise, so it does not depend on the index being
    unique, and missing values stay missing instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tweets. The column is overwritten on this same
        object (no copy is made).
    text_feature : str
        Name of the column containing the tweet text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text_feature`` cleaned.
    """
    cleaned = (
        df[text_feature]
        .str.replace(r'^RT[\s]+', '', regex=True)             # retweet marker
        .str.replace(r'https?://[^\s\n\r]+', '', regex=True)  # hyperlinks
        .str.replace('#', '', regex=False)                    # hashtag symbol only
    )
    df[text_feature] = cleaned
    return df

In [731]:
df_train = removeHyperMarksStyles(df_train,'text')

Since we are working with tweets, we use NLTK's TweetTokenizer, which can lowercase the text, strip @handles and shorten long runs of repeated characters.

In [732]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) 

<h3>Tokenize</h3>

In [733]:
def getTokenizeList(df, text_feature, tokenizer):
    """Tokenize every value of a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    text_feature : str
        Name of the column to tokenize.
    tokenizer : object
        Anything exposing ``tokenize(text)``; it is called once per row.

    Returns
    -------
    list
        One entry per row, in row order, each being whatever
        ``tokenizer.tokenize`` returned for that row's text.
    """
    # Walk the column itself: iterrows() would build a full row Series for
    # every tweet only to read a single field from it.
    return [tokenizer.tokenize(text) for text in df[text_feature]]

All the tweets have been tokenized

In [734]:
tweet_tokens = getTokenizeList(df_train,'text',tokenizer)
len(tweet_tokens)

27480

Remove Stop Words with NLTK

In [735]:
stopwords_english = stopwords.words('english')

In [736]:
def cleanTweetsOfStopWords(tweet_tokens_list, stop_words=None):
    """Drop stop words and single punctuation characters from tokenized tweets.

    A new list of new token lists is returned; ``tweet_tokens_list`` and the
    lists inside it are left untouched, so the raw tokens remain available
    for comparison after the call.

    Parameters
    ----------
    tweet_tokens_list : list of list of str
        One token list per tweet.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level
        ``stopwords_english`` list is used.

    Returns
    -------
    list of list of str
        Same length and order as the input, each inner list holding only the
        tokens that are neither a stop word nor a single character from
        ``string.punctuation``.
    """
    if stop_words is None:
        stop_words = stopwords_english
    # A set of individual characters: testing `token in string.punctuation`
    # directly is a substring search on a str, which also discards
    # multi-character tokens such as '<=' by accident. The set additionally
    # makes every lookup constant-time instead of a scan of the word list.
    excluded = set(stop_words) | set(string.punctuation)
    return [
        [token for token in tweet_tokens if token not in excluded]
        for tweet_tokens in tweet_tokens_list
    ]

In [737]:
tweet_tokens[0]

['i', '`', 'd', 'have', 'responded', ',', 'if', 'i', 'were', 'going']

We can see that the stop words and punctuation have been removed from the list

In [738]:
tweet_clean_tokens = cleanTweetsOfStopWords(tweet_tokens)
tweet_clean_tokens[0]

['responded', 'going']

Instead of stemming we use lemmatization, since it is more accurate; as a trade-off, it requires more computing power.

In [739]:
from nltk.stem import WordNetLemmatizer

In [740]:
lemmatizer = WordNetLemmatizer()

In [741]:
def getLemmatizeList(tweet_clean_tokens_list, lemmatizer):
    """Lemmatize every token of every tweet.

    A new list of new token lists is returned; ``tweet_clean_tokens_list``
    and the lists inside it are left untouched, so the pre-lemmatization
    tokens remain available after the call.

    Parameters
    ----------
    tweet_clean_tokens_list : list of list of str
        One token list per tweet.
    lemmatizer : object
        Anything exposing ``lemmatize(word)``; it is called once per token.

    Returns
    -------
    list of list
        Same length and order as the input, each token replaced by the value
        ``lemmatizer.lemmatize`` returned for it.
    """
    return [
        [lemmatizer.lemmatize(token) for token in tweet_clean_tokens]
        for tweet_clean_tokens in tweet_clean_tokens_list
    ]

In [742]:
tweets_lemm = getLemmatizeList(tweet_clean_tokens,lemmatizer)