## Sentiment Analysis EDA - Twitter

## Import Libraries

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from nltk.tokenize import sent_tokenize
from sklearn.metrics import plot_confusion_matrix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

## EDA

In [2]:
twitter_training= pd.read_csv('twitter_training.csv', sep=',', names=["Tweet ID", "Entity", "Sentiment", "Tweet Content"])

In [3]:
twitter_training.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
twitter_training.tail()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [5]:
twitter_validation=pd.read_csv('twitter_training.csv', sep=',', names=["Tweet ID", "Entity", "Sentiment", "Tweet Content"])

In [6]:
twitter_validation.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [7]:
twitter_validation.head(10)

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...


In [8]:
twitter_validation.tail()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [9]:
twitter_training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet ID       74682 non-null  int64 
 1   Entity         74682 non-null  object
 2   Sentiment      74682 non-null  object
 3   Tweet Content  73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [10]:
len(twitter_training)

74682

In [11]:
twitter_training["Tweet ID"] = range(1, len(twitter_training) + 1)
twitter_training.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
0,1,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2,Borderlands,Positive,I am coming to the borders and I will kill you...
2,3,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,4,Borderlands,Positive,im coming on borderlands and i will murder you...
4,5,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [12]:
type(twitter_training)

pandas.core.frame.DataFrame

In [13]:
twitter_training["Tweet Content"]

0        im getting on borderlands and i will murder yo...
1        I am coming to the borders and I will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands 2 and i will murder ...
                               ...                        
74677    Just realized that the Windows partition of my...
74678    Just realized that my Mac window partition is ...
74679    Just realized the windows partition of my Mac ...
74680    Just realized between the windows partition of...
74681    Just like the windows partition of my Mac is l...
Name: Tweet Content, Length: 74682, dtype: object

In [14]:
twitter_training.dropna(inplace=True)
twitter_training.drop_duplicates(inplace=True)

In [15]:
twitter_training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73996 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet ID       73996 non-null  int64 
 1   Entity         73996 non-null  object
 2   Sentiment      73996 non-null  object
 3   Tweet Content  73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.8+ MB


In [16]:
len(twitter_training)

73996

In [17]:
twitter_training.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
0,1,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2,Borderlands,Positive,I am coming to the borders and I will kill you...
2,3,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,4,Borderlands,Positive,im coming on borderlands and i will murder you...
4,5,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


## Make every tweet lowercase

In [19]:
# making the data type into a string from an Object
twitter_training["Tweet Content"].astype('string')

0        im getting on borderlands and i will murder yo...
1        I am coming to the borders and I will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands 2 and i will murder ...
                               ...                        
74677    Just realized that the Windows partition of my...
74678    Just realized that my Mac window partition is ...
74679    Just realized the windows partition of my Mac ...
74680    Just realized between the windows partition of...
74681    Just like the windows partition of my Mac is l...
Name: Tweet Content, Length: 73996, dtype: string

In [20]:
twitter_training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73996 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet ID       73996 non-null  int64 
 1   Entity         73996 non-null  object
 2   Sentiment      73996 non-null  object
 3   Tweet Content  73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.8+ MB


In [21]:
Lower_tweet=twitter_training["Tweet Content"].str.lower()

In [22]:
Lower_tweet.head()

0    im getting on borderlands and i will murder yo...
1    i am coming to the borders and i will kill you...
2    im getting on borderlands and i will kill you ...
3    im coming on borderlands and i will murder you...
4    im getting on borderlands 2 and i will murder ...
Name: Tweet Content, dtype: object

In [23]:
#test if the string is now lowercase
Lower_tweet.iloc[900]

'the atleast i have borderlands to come cheer me up : ('

## Tokenize the data 

In [24]:
Token_tweet_content=Lower_tweet

In [25]:
Token_tweet_content.head()

0    im getting on borderlands and i will murder yo...
1    i am coming to the borders and i will kill you...
2    im getting on borderlands and i will kill you ...
3    im coming on borderlands and i will murder you...
4    im getting on borderlands 2 and i will murder ...
Name: Tweet Content, dtype: object

In [26]:
Token_tweet_content[1].split()

['i',
 'am',
 'coming',
 'to',
 'the',
 'borders',
 'and',
 'i',
 'will',
 'kill',
 'you',
 'all,']

In [27]:
token_pattern = r"(?u)\b\w\w+\b" #default token pattern
tokenizer = RegexpTokenizer(token_pattern)

In [28]:
#Token_tweet_content2=Token_tweet_content[Token_tweet_content.columns[cols]]

In [29]:
tokenizer.tokenize(Token_tweet_content[900]) #test if the tokenizer removes emojis, bye bye 

['atleast', 'have', 'borderlands', 'to', 'cheer', 'me', 'up']

In [30]:
twitter_training["Tweet Content"]=twitter_training["Tweet Content"].str.lower()

In [31]:
twitter_training.iloc[900] #test to see if the lowercase is the case for the data set

Tweet ID                                                       905
Entity                                                 Borderlands
Sentiment                                                 Positive
Tweet Content    the atleast i have borderlands to come cheer m...
Name: 904, dtype: object

In [32]:
twitter_training.tail(7) #lowercase has been removed

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
74675,74676,Nvidia,Positive,<unk> my elim be no.... nvidia highlights pict...
74676,74677,Nvidia,Positive,just realized the windows partition of my mac ...
74677,74678,Nvidia,Positive,just realized that the windows partition of my...
74678,74679,Nvidia,Positive,just realized that my mac window partition is ...
74679,74680,Nvidia,Positive,just realized the windows partition of my mac ...
74680,74681,Nvidia,Positive,just realized between the windows partition of...
74681,74682,Nvidia,Positive,just like the windows partition of my mac is l...


In [33]:
twitter_training["Tokenized Tweets"]= twitter_training["Tweet Content"].apply(tokenizer.tokenize)
twitter_training["Tokenized Tweets"].head()

0    [im, getting, on, borderlands, and, will, murd...
1    [am, coming, to, the, borders, and, will, kill...
2    [im, getting, on, borderlands, and, will, kill...
3    [im, coming, on, borderlands, and, will, murde...
4    [im, getting, on, borderlands, and, will, murd...
Name: Tokenized Tweets, dtype: object

In [34]:
twitter_training.tail(20) #here we have the new column entry

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content,Tokenized Tweets
74662,74663,Nvidia,Neutral,nvidia therefore doesn ’ t want to give up its...,"[nvidia, therefore, doesn, want, to, give, up,..."
74663,74664,Nvidia,Neutral,is doesn’t should i give up its password ‘cryp...,"[is, doesn, should, give, up, its, password, c..."
74664,74665,Nvidia,Negative,nvidia really delayed the 3070 2 weeks .,"[nvidia, really, delayed, the, 3070, weeks]"
74665,74666,Nvidia,Negative,nvidia really delayed the 3070 by 2 weeks.,"[nvidia, really, delayed, the, 3070, by, weeks]"
74666,74667,Nvidia,Negative,nvidia did delay by 3070 2 weeks.,"[nvidia, did, delay, by, 3070, weeks]"
74667,74668,Nvidia,Negative,nvidia really delayed the 3070 several weeks.,"[nvidia, really, delayed, the, 3070, several, ..."
74668,74669,Nvidia,Negative,nvidia really only delayed the 3070 2 flight w...,"[nvidia, really, only, delayed, the, 3070, fli..."
74669,74670,Nvidia,Negative,nvidia really delayed the next 2 weeks.,"[nvidia, really, delayed, the, next, weeks]"
74670,74671,Nvidia,Positive,let no elim go unnoticed. . . . nvidia highlig...,"[let, no, elim, go, unnoticed, nvidia, highlig..."
74671,74672,Nvidia,Positive,t let elim go unnoticed.... nvidia highlights ...,"[let, elim, go, unnoticed, nvidia, highlights,..."


From the sample above, there are words like 'the, and of'. Emojis and punctuation have been removed and it is lowercase for now. 