In [1]:
import gzip
import zipfile
from html import unescape

import dill
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from spacy.lang.en import STOP_WORDS

## Importing the dataset

In [2]:
# Read the CSV straight out of the zip archive; both handles are closed as soon
# as the frame has been loaded.
# Malformed rows are skipped with a warning: `on_bad_lines='warn'` is the
# replacement for `error_bad_lines=False`, which was removed in pandas 2.0.
with zipfile.ZipFile('data/Sentiment-Analysis-Dataset.zip') as zf:
    with zf.open('Sentiment Analysis Dataset.csv') as csv_file:
        df = pd.read_csv(csv_file, on_bad_lines='warn')

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [3]:
# Peek at the first five rows to check how the columns were parsed
df.head(n=5)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [4]:
# Look at the raw text of the first 20 rows
df.loc[:, 'SentimentText'].head(n=20)

0                          is so sad for my APL frie...
1                        I missed the New Moon trail...
2                               omg its already 7:30 :O
3               .. Omgaga. Im sooo  im gunna CRy. I'...
4              i think mi bf is cheating on me!!!   ...
5                     or i just worry too much?        
6                    Juuuuuuuuuuuuuuuuussssst Chillin!!
7            Sunny Again        Work Tomorrow  :-|  ...
8           handed in my uniform today . i miss you ...
9              hmmmm.... i wonder how she my number @-)
10                        I must think about positive..
11          thanks to all the haters up in my face a...
12                       this weekend has sucked so far
13               jb isnt showing in australia any more!
14                                 ok thats it you win.
15        &lt;-------- This is the way i feel right ...
16        awhhe man.... I'm completely useless rt no...
17        Feeling strangely fine. Now I'm gonna 

#### Load the spaCy model

In [48]:
# Load the small English model with the NER and parser components disabled
nlp = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'parser'],
)

#### Lemmatize the stop words

In [68]:
# Lemmatize the stop words in a fixed (sorted) order. Iterating the STOP_WORDS
# set directly gives a different word order on each interpreter run, and the
# lemma spaCy assigns to a token may depend on the tokens around it, so an
# unsorted join can yield a different stop-word set from run to run.
STOP_WORDS_lemma = {token.lemma_ for token in nlp(" ".join(sorted(STOP_WORDS)))}
# Also treat common punctuation as stop words.
STOP_WORDS_lemma |= {',', '.', ';'}

#### Split the dataset into training and testing set

In [72]:
# Features are the raw texts, targets are the sentiment labels
X, y = df['SentimentText'], df['Sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#### Create pipeline

In [78]:
def preprocessor(doc):
    """Return ``doc`` with HTML entities unescaped and all text lowercased.

    Defined in this cell so that the vectorizer below can be built on a fresh
    kernel (Restart & Run All) without a NameError.
    """
    return unescape(doc).lower()


# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list or None, so the set is converted to a list (sorted to keep
# the estimator repr stable).
vectorizer = TfidfVectorizer(preprocessor=preprocessor,
                             stop_words=sorted(STOP_WORDS_lemma))

clf = MultinomialNB()
pipe = Pipeline([('vectorizer', vectorizer),
                 ('classifier', clf)
                ])

In [80]:
# Fit the vectorizer and the classifier on the training split
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(preprocessor=<function preprocessor at 0x000001C2EA2AD790>,
                                 stop_words={"'", "'d", "'ll", ',', '.', ';',
                                             'I', 'a', 'about', 'above',
                                             'across', 'after', 'afterwards',
                                             'again', 'against', 'all',
                                             'almost', 'alone', 'along',
                                             'already', 'also', 'although',
                                             'always', 'among', 'amongst',
                                             'amount', 'an', 'and', 'another',
                                             'any', ...})),
                ('classifier', MultinomialNB())])

In [81]:
# Score of the fitted pipeline on the data it was trained on
pipe.score(X=X_train, y=y_train)

0.8353236111803967

In [82]:
# Score of the fitted pipeline on the held-out test split
pipe.score(X=X_test, y=y_test)

0.7615758117083646

#### Save the model using dill and gzip

In [87]:
# Serialize the fitted pipeline with dill into a gzip-compressed file
with gzip.open('sentimental_model.dill.gz', mode='wb') as model_file:
    dill.dump(pipe, model_file, recurse=True)

In [None]:
# Text helpers.
# `preprocessor` is the function passed to TfidfVectorizer when the pipeline is
# built. `lemmatizer` was written to lemmatize the dataset before feeding it
# into the machine learning model, but it is not used because it increased the
# training time of the model dramatically.
# NOTE: this cell sits below the pipeline cell; run it first if `preprocessor`
# is not defined yet.
from html import unescape


def preprocessor(doc):
    """Return ``doc`` with HTML entities unescaped and all text lowercased."""
    return unescape(doc).lower()


def lemmatizer(doc):
    """Return the spaCy lemma of every token in ``doc`` as a list of strings."""
    return [word.lemma_ for word in nlp(doc)]