In [1]:
#https://www.kaggle.com/youben/twitter-sentiment-analysis/notebook?select=train.csv

In [2]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


#This is for making some large tweets to be displyed
pd.options.display.max_colwidth=100

In [3]:
train_data=pd.read_csv(r"train.csv",encoding='ISO-8859-1')

In [4]:
train_data

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL friend.............
1,2,0,I missed the New Moon trailer...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get ...
4,5,0,i think mi bf is cheating on me!!! T_T
...,...,...,...
99984,99996,0,@Cupcake seems like a repeating problem hope you're able to find something.
99985,99997,1,"@cupcake__ arrrr we both replied to each other over different tweets at the same time , i'll se..."
99986,99998,0,@CuPcAkE_2120 ya i thought so
99987,99999,1,@Cupcake_Dollie Yes. Yes. I'm glad you had more fun with me.


# Visualize the tweets

In [5]:
#we will now look take a random tweets to gain more insights

rand_indexes=np.random.randint(1,len(train_data),50).tolist()
train_data['SentimentText'][rand_indexes]

15052    (contd...) so good last night?? Along with other crappy songs that we're of a 6th class standard...
19662                                                @_Enigma__ looking forward to your return... take care 
94041                                                @coreyellerbe  I know.  How far the Cubs have fallen.  
78378                   @archonline time u went to the tweetups ... BTW am guilty of not attending them too 
23079    @11twenty LOL. I need some fulltime clients or a fulltime agency. if you got the hookup let me k...
16071                                                .. have a look at the activist cow  http://bit.ly/tRlcl
12014    (symphonysoldier.com) OMG. So shocked. I mean seriously Ian? Out of all the people in the world,...
53547    @aubreyoday... I love that show...weeds! But i dont get it here in london  i miss my American tv...
71866    @calamur  yeah... i guess my password was also got changed... i was not able to login my FB  cha...
37275              

 For me, after some execution, I noticed this:

1. There is tweets with a url (like tweet 35546): we must think about a way to handle URLs, I thought about deleting them because a domain name or the protocol used will not make someone happy or sad unless the domain name is 'food.com'.
2. The use of hashtags: we should keep only the words without '#' so words like python and the hashtag '#python' can be seen as the same word, and of course they are.
3. Words like 'as', 'to' and 'so' should be deleted, because they only serve as a way to link phrases and words

# Emoticons

The internet language includes so many emoticons, people also tend to create their own, so we will first analyze the emoticons included in our dataset, try to classify them as happy and said, and make sure that our model know about them.

In [6]:
#we are gonna find what emoticons are used in our dataset
import re
tweet_text=train_data.SentimentText.str.cat()
emos=set(re.findall(r" ([xX:;][-']?.) ",tweet_text))
emos_count=[]

for emo in emos:
    emos_count.append((tweet_text.count(emo),emo))

sorted(emos_count,reverse=True)

[(3281, ':/'),
 (2874, 'x '),
 (2626, ': '),
 (1339, 'x@'),
 (1214, 'xx'),
 (1162, 'xa'),
 (984, ';3'),
 (887, 'xp'),
 (842, 'xo'),
 (713, ';)'),
 (483, 'xe'),
 (431, ';I'),
 (353, ';.'),
 (254, 'xD'),
 (251, 'x.'),
 (245, '::'),
 (234, 'X '),
 (217, ';t'),
 (209, ';s'),
 (185, ':O'),
 (176, ':3'),
 (166, ';D'),
 (159, ":'"),
 (157, 'XD'),
 (146, 'x3'),
 (142, ':p'),
 (126, ":'("),
 (118, ':@'),
 (117, 'xh'),
 (117, ':S'),
 (109, 'xm'),
 (104, ';p'),
 (104, ';-)'),
 (92, ':|'),
 (91, 'x,'),
 (89, ';P'),
 (76, 'xd'),
 (75, ';o'),
 (75, ';d'),
 (71, ':o'),
 (65, 'XX'),
 (63, ':L'),
 (59, 'Xx'),
 (59, ':1'),
 (58, ':]'),
 (57, ':s'),
 (56, ':0'),
 (54, 'XO'),
 (44, ';;'),
 (43, ';('),
 (38, ':-D'),
 (37, 'xk'),
 (36, 'XT'),
 (35, 'x?'),
 (35, 'x)'),
 (34, 'x2'),
 (33, ';/'),
 (32, 'x:'),
 (32, ':\\'),
 (31, 'x-'),
 (27, 'Xo'),
 (27, 'XP'),
 (27, ':-/'),
 (26, ':-P'),
 (25, ':*'),
 (23, 'xX'),
 (22, ":')"),
 (17, 'xP'),
 (16, ':['),
 (16, ':-p'),
 (14, 'x]'),
 (14, 'XM'),
 (13, ':-O'),
 (1

We should by now know which emoticons are used (and its frequency) to build two regex, one for the happy ones and another for the sad ones. We will then use them in the preprocessing process to mark them as using happy emoticons or sad ones.

In [7]:
HAPPY_EMO=r" ([xX;:]-?[dD)]|:-?[\)]|[:;][pP]) "
SAD_EMO=r" (:'?[/|\(]) "
print("Happy Emoticons:",set(re.findall(HAPPY_EMO,tweet_text)))
print("Sad emoticons:",set(re.findall(SAD_EMO,tweet_text)))

Happy Emoticons: {';p', ';)', ';-D', ':-D', ';D', ':p', ';P', 'xD', ':D', ':d', ';d', 'x)', 'XD', 'xd', ';-)'}
Sad emoticons: {':/', ':(', ':|', ":'("}


# MOST USED WORDS

What we are going to do next is to define a function that will show us top words, so we may fix things before running our learning algorithm. This function takes as input a text and output words sorted according to their frequency, starting with the most used word.

In [8]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\INDIA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
def most_used_words(text):
    tokens=word_tokenize(text)
    frequency_dist=nltk.FreqDist(tokens)
    print("There is %d different words" %len(str(tokens)))
    
    return sorted(frequency_dist,key=frequency_dist.__getitem__,reverse=True)

In [10]:
most_used_words(train_data.SentimentText.str.cat())[:100]

There is 13294450 different words


['@',
 '!',
 '.',
 'I',
 ',',
 'to',
 'the',
 'you',
 '?',
 'a',
 'it',
 'i',
 ';',
 'and',
 '&',
 '...',
 'my',
 'for',
 'is',
 'that',
 "'s",
 "n't",
 'in',
 'me',
 'of',
 'have',
 'on',
 'quot',
 "'m",
 'so',
 ':',
 'but',
 '#',
 'do',
 'was',
 'be',
 '..',
 'not',
 'your',
 'are',
 'just',
 'with',
 'like',
 '-',
 'at',
 '*',
 'too',
 'get',
 'good',
 'u',
 'up',
 'know',
 'all',
 'this',
 'now',
 'no',
 'we',
 'out',
 ')',
 'love',
 'lol',
 'can',
 'what',
 'one',
 '(',
 'will',
 'go',
 'about',
 'did',
 'got',
 "'ll",
 'there',
 'amp',
 'day',
 'http',
 'see',
 "'re",
 'if',
 'time',
 'they',
 'think',
 'as',
 'when',
 'from',
 'You',
 'It',
 'going',
 'really',
 'well',
 'am',
 'work',
 'had',
 'would',
 'how',
 'he',
 'here',
 'thanks',
 'some',
 '....',
 'haha']

# Stop Words

What we can see is that stop words are the most used, but in fact they don't help us determine if a tweet is happy/sad, however, they are consuming memory and they are making the learning process slower, so we really need to get rid of them

In [11]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\INDIA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
mw=most_used_words(train_data.SentimentText.str.cat())
most_words=[]
for w in mw:
    if len(most_words)==100:
        break
    if w in stopwords.words('english'):
        continue
    else:
        most_words.append(w)
    

There is 13294450 different words


In [13]:
#Wgar we did is to filter only non stop words
#We will now get a look to the top 1000 words
sorted(most_words)

['!',
 '#',
 '&',
 "'",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '..',
 '...',
 '....',
 '2',
 '3',
 ':',
 ';',
 '?',
 '@',
 'And',
 'But',
 'I',
 'It',
 'LOL',
 'Oh',
 'Thanks',
 'That',
 'The',
 'You',
 'amp',
 'back',
 'bad',
 'better',
 'ca',
 'come',
 'could',
 'day',
 'dont',
 'even',
 'feel',
 'fun',
 'get',
 'go',
 'going',
 'good',
 'got',
 'great',
 'haha',
 'home',
 'hope',
 'http',
 'im',
 'know',
 'last',
 'like',
 'lol',
 'love',
 'lt',
 'make',
 'miss',
 'much',
 "n't",
 'na',
 'need',
 'never',
 'new',
 'nice',
 'night',
 'oh',
 'one',
 'quot',
 'really',
 'right',
 'sad',
 'say',
 'see',
 'sorry',
 'still',
 'sure',
 'thanks',
 'think',
 'though',
 'time',
 'today',
 'u',
 'ur',
 'wait',
 'want',
 'way',
 'well',
 'wish',
 'work',
 'would',
 'yeah',
 'yes']

# Stemming

You should have noticed something, right? There are words that have the same meaning, but written in a different manner, sometimes in the plural and sometimes with a suffix (ing, es ...), this will make our model think that they are different words and also make our vocabulary bigger (waste of memory and time for the learning process). The solution is to reduce those words with the same root, this is called stemming

In [14]:
#I'm defining this function to use it in the
#DAta Preparation phase
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\INDIA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
def stem_tokenize(text):
    stemmer=SnowballStemmer('english')
    stemmer=WordNetLemmatizer()
    return [stemmer.lemmatize(token) for token in word_tokenize(text)]

def lemmatize_tokenize(text):
    lemmatizer=WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

# Prepare the Data

In [16]:
#In this phase,we will transform our tweets into a more usable data 
#by our ML models

# Bag of Words

e are going to use the Bag of Words algorithm, which basically takes a text as input, extract words from it (this is our vocabulary) to use them in the vectorization process. When a tweet comes in, it will vectorize it by counting the number of occurrences of each word in our vocabulary.

For example, we have this two tweets: "I learned a lot today" and "hahaha I got you".

We first extract the words present in the two tweets, then for each tweet we count the occurrences of each word in our vocabulary.

This is the simplest form of the Bag of Words algorithm, however, there is other variants, we are gonna use the TF-IDF (Term Frequency - Inverse Document Frequency) variant

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
#BUILDING THE PIPELINE

It's always a good practice to make a pipeline of transformation for your data, it will make the process of data transformation really easy and reusable. We will implement a pipeline for transforming our tweets to something that our ML models can digest (vectors).

In [19]:
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline

In [20]:
#we need to do some preprocessing of the tweerts
#we will delete strings(like @,#,....) because we think that they will not help
#in determing if the person is Happy/Sad

class TextPreProc(BaseEstimator,TransformerMixin):
    
    def __init__(self,use_mention=False):
        self.use_mention=use_mention
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        #we can choose between keeping the mentions or deleting them
        if self.use_mention:
            X=X.str.replace(r"@[a-zA-Z0-9_]* "," @tags")
        else:
            X=X.str.replace(r"@[a-zA-Z0_9]* ","")
            
        #Keeping only the word after the #
        X=X.str.replace("#","")
        X=X.str.replace(r"[-\.\n]", "")
        #Removing HTML garbage
        X=X.str.replace(r"https?://\S*","")
        
        #replace repeated letters with only two occurences
        #heeeeeeeeelllllooooo =>  heelloo
        X=X.str.replace(r"(.)\1+",r"\1\1")
        #mark emoticionsa as happy or sad
        X=X.str.replace(HAPPY_EMO," happyemoticons ")
        X=X.str.replace(SAD_EMO," sademoticons ")
        X=X.str.lower()
        return X

In [38]:
#This is the pipeline taht will ttransform our tweets to something eatable
#You can see that we are using our previously defined stemmer, it will 
#take care of the stemming process
#For stop words,we let the IDF do the job

from sklearn.model_selection import train_test_split

sentiments=train_data['Sentiment']
tweets=train_data['SentimentText']

#I get those parameters from the  'Fine tune the model'  part
vectorizer=TfidfVectorizer(tokenizer=lemmatize_tokenize,ngram_range=(1,2))

pipeline=Pipeline([
    ('text_pre_processing',TextPreProc(use_mention=True)),
    ('vectorizer',vectorizer),
])

#Let's split our data into learning set and testing set
#This process is done to test the efficiency of our model at the end
#You shouldn't look at the test data after choosing the final model
learn_data,test_data,sentiments_learning,sentiments_test=train_test_split(
                            tweets,sentiments,test_size=0.3
)

#This will transform our learning data from simple text to vector
#by going through the proprocessing transformer
learning_data=pipeline.fit_transform(learn_data)


# SELECT a MODEL

When we have our data ready to be processed by ML models, the question we should ask is which model to use?

The answer varies depending on the problem and data, for example, it's known that Naive Bias has proven good efficacy against Text Based Problems.

A good way to choose a model is to try different candidate, evaluate them using cross validation, then chose the best one which will be later tested against our test data.

In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB,MultinomialNB

In [40]:
lr=LogisticRegression()
bnb=BernoulliNB()
mnb=MultinomialNB()
import warnings
warnings.filterwarnings("ignore")

In [41]:
models={
    'logistic regression':lr,
    'bernoulliNB':bnb,
    'multinomialNB':mnb,
}

for model in models.keys():
    scores=cross_val_score(models[model],learning_data,sentiments_learning,
                          scoring='f1',cv=10)
    print("=====",model,"======")
    print("scores = ",scores)
    print("mean = ",scores.mean())
    print("variance = ",scores.var())
    print("standard deviation = ",scores.std())
    
    models[model].fit(learning_data,sentiments_learning)
    
    print("score on learning data(accuracy) = ",accuracy_score(
        models[model].predict(learning_data),sentiments_learning))
    print("classification report = ")
    print(classification_report(models[model].predict(learning_data),sentiments_learning))
    print("")

scores =  [0.80620531 0.81534366 0.81067375 0.80755722 0.81576734 0.80811042
 0.81213592 0.81444673 0.82026538 0.81086301]
mean =  0.812136873824804
variance =  1.71004260460798e-05
standard deviation =  0.004135266139691592
score on learning data(accuracy) =  0.8718567836324151
classification report = 
              precision    recall  f1-score   support

           0       0.81      0.88      0.85     28143
           1       0.92      0.86      0.89     41849

    accuracy                           0.87     69992
   macro avg       0.87      0.87      0.87     69992
weighted avg       0.88      0.87      0.87     69992


scores =  [0.78230337 0.79682947 0.78532927 0.78383459 0.79459081 0.78225047
 0.7899474  0.79395572 0.79238674 0.78272067]
mean =  0.788414851104457
variance =  2.9604715168555785e-05
standard deviation =  0.00544102151884697
score on learning data(accuracy) =  0.9053606126414447
classification report = 
              precision    recall  f1-score   support

      

None of those models is likely to be overfitting, I will choose the multinomialNB

# Fine tune the Model

I'm going to use the GridSearchCV to choose the best parameters to use.

What the GridSearchCV does is trying different set of parameters, and for each one, it runs a cross validation and estimate the score. At the end we can see what are the best parameter and use them to build a better classifier.

In [42]:
from sklearn.model_selection import GridSearchCV

In [45]:
grid_search_pipeline=Pipeline([
    ('text_pre_processing',TextPreProc()),
    ('vectorizer',TfidfVectorizer()),
    ('model',MultinomialNB()),
])

params=[
    {
        'text_pre_processing__use_mention':[True,False],
        'vectorizer__max_features':[1000,2000,5000,10000,20000,None],
        'vectorizer__ngram_range':[(1,1),(1,2)],
    },
]

grid_search=GridSearchCV(grid_search_pipeline,params,cv=5,scoring='f1')
grid_search.fit(learn_data,sentiments_learning)


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('text_pre_processing', TextPreProc()),
                                       ('vectorizer', TfidfVectorizer()),
                                       ('model', MultinomialNB())]),
             param_grid=[{'text_pre_processing__use_mention': [True, False],
                          'vectorizer__max_features': [1000, 2000, 5000, 10000,
                                                       20000, None],
                          'vectorizer__ngram_range': [(1, 1), (1, 2)]}],
             scoring='f1')

In [46]:
grid_search.best_params_

{'text_pre_processing__use_mention': True,
 'vectorizer__max_features': None,
 'vectorizer__ngram_range': (1, 2)}

# TEST

Testing our model against data other than the data used for training our model will show how well the model is generalising on new data.

Note
We shouldn't test to choose the model, this will only let us confirm that the choosen model is doing well.

In [47]:
mnb.fit(learning_data,sentiments_learning)

MultinomialNB()

In [48]:
testing_data=pipeline.transform(test_data)

In [49]:
mnb.score(testing_data,sentiments_test)

0.7536086942027536

In [50]:
#not badd but will try to improve it futher this result in the future