# CSML1010 Project Working Copy
# Sentiment Analysis with the Sentiment140 dataset
## Pete Gray

---------------------------------------------------------

# Import libraries

In [1]:
import pandas as pd
import numpy as np
np.set_printoptions(precision=2, linewidth=80)
import warnings
warnings.filterwarnings("ignore")
import model_evaluation_utils as meu


# Adjust pandas display

In [2]:
pd.options.display.max_columns = 30
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.precision = 2
pd.options.display.max_colwidth = -1

# Import matplotlib and seaborn and adjust defaults

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from matplotlib import pyplot as plt
plt.rcParams['figure.dpi'] = 100

import seaborn as sns
sns.set_style("whitegrid")

## Read data from local filesystem and csv source 

In [4]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1")

Check data with quick visual inspection

In [5]:
df

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


## Give dataframe columns

In [6]:
df.columns = ['sentiment', 'ID', 'Time', 'none', 'username', 'Text']

In [7]:
df.count()

sentiment    1599999
ID           1599999
Time         1599999
none         1599999
username     1599999
Text         1599999
dtype: int64

## Now it has columns, this seems better.
#
## We have to cut this down to size, for iterative development.
##
## Don't forget to get rid of this!!! When crunching whole huge dataset.

# Set temporary dataset size, for quicker processing

In [8]:
dev_data_size = 4000

In [9]:
start_row = int(800000-(dev_data_size/2))-1
finish_row = int(800000+(dev_data_size/2))-1
df_sm = df[start_row:finish_row]
df_sm.count()

sentiment    4000
ID           4000
Time         4000
none         4000
username     4000
Text         4000
dtype: int64

In [10]:
columns = [col for col in df.columns if not col.startswith('self')]
columns

['sentiment', 'ID', 'Time', 'none', 'username', 'Text']

In [11]:
raw_text = np.array(df_sm['Text'])
sentiments = np.array(df_sm['sentiment'])
raw_text[5:15]

array(['@StewartWade Yeah, I know--pigs for sure...which is a great visual on my end among all the akimbo-ness. ',
       "ouh @Babe_Franzi was hast du hun'? hoffentlich nichts schlimmes. yes, i miss you rlly much, mary too. ",
       'Woke up with the worst headache ',
       "@MacekMakeupArt I can't remember the last movie I saw in a theatre!  Hope you guys have fun! What are you going to see?",
       'last day of classes   im going to miss chichi !',
       'Damn, time for another pedicure, just chipped my toenail on an open cabinet  Shit happens!',
       "@mikegentile i've never been in a walmart  no joke",
       "@amedelrivero Start putting up $100 every paycheck! We have to prepare ourselves for the future -_-. ONLY $300 is what i'm getting ",
       '@patrickeatworld takboleh. i am so in loveeeeeeeeeeee  life sucks. FMMFL',
       'I have church thur and am always forgetting I can watch fbc on line until Thurs. '],
      dtype=object)

In [12]:
sentiments[4995:5005]

array([], dtype=int64)

-----------------------

# Data Cleaning
-----------------------

## Cleaning function

In [13]:
import re
def clean(s):
    s = s.replace(r'<lb>', "\n")
    s = s.replace(r'<tab>', "\i")
    
    # As a sanity check - s = s.replace(r'W', "Q")
    
    s = re.sub(r'<br */*>', "\n", s)
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    s = s.replace("&amp;", "&")
    # markdown urls
    s = re.sub(r'\(https*://[^\)]*\)', "", s)
    # normal urls
    s = re.sub(r'https*://[^\s]*', "", s)
    s = re.sub(r'_+', ' ', s)
    s = re.sub(r'"+', '"', s)
    return str(s)

## Create new column in dataframe

In [14]:
df_sm["text_clean"] = ''

# Iterate and clean

In [15]:
for i, row in df_sm.iterrows():
    if i % 1000 == 0:
            print('processed:'.format(i), i)
    df_sm.at[i, "text_clean"] = clean(row.Text)

processed: 798000
processed: 799000
processed: 800000
processed: 801000


### Check results

In [16]:
df_sm.head()

Unnamed: 0,sentiment,ID,Time,none,username,Text,text_clean
797999,0,2328378861,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,skelekitty,"Work is so slow, I'm seriously considering quitting my job this week","Work is so slow, I'm seriously considering quitting my job this week"
798000,0,2328379014,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,DjinniGenie,@davidvancamp That's awful. I wish mine would stop making fat jokes.,@davidvancamp That's awful. I wish mine would stop making fat jokes.
798001,0,2328379041,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,Unrated7String,"Well, i guess i need to start a new chapter in professional my life","Well, i guess i need to start a new chapter in professional my life"
798002,0,2328379271,Thu Jun 25 09:30:34 PDT 2009,NO_QUERY,jamesebradford,"@SandraBernhard Miss Lady, since you brought up your web store - it is notoriously known that it takes AGES to rec'v your merch.","@SandraBernhard Miss Lady, since you brought up your web store - it is notoriously known that it takes AGES to rec'v your merch."
798003,0,2328379299,Thu Jun 25 09:30:34 PDT 2009,NO_QUERY,njandecrox,@CarterTwinsZach Im sorry I hope u feel better cuz I love u and it makes feel horrible when ur sick or sad or mad or hurt,@CarterTwinsZach Im sorry I hope u feel better cuz I love u and it makes feel horrible when ur sick or sad or mad or hurt


## Additional pre-processing: tokenization, removing extra whitespaces, lower casing and more advanced operations like spelling corrections, grammatical error corrections, removing repeated characters.

In [17]:
import nltk
wpt = nltk.WordPunctTokenizer()
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Define normalization function

In [18]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [19]:
normalize_corpus = np.vectorize(normalize_document)

In [20]:
df_sm["text_normalized"] = ''

In [21]:
for i, row in df_sm.iterrows():
    if i % 1000 == 0:
            print('processed:'.format(i), i)
    df_sm.at[i, "text_normalized"] = normalize_corpus(row.text_clean)

processed: 798000
processed: 799000
processed: 800000
processed: 801000


### check results

In [22]:
df_sm

Unnamed: 0,sentiment,ID,Time,none,username,Text,text_clean,text_normalized
797999,0,2328378861,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,skelekitty,"Work is so slow, I'm seriously considering quitting my job this week","Work is so slow, I'm seriously considering quitting my job this week",work slow im seriously considering quitting job week
798000,0,2328379014,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,DjinniGenie,@davidvancamp That's awful. I wish mine would stop making fat jokes.,@davidvancamp That's awful. I wish mine would stop making fat jokes.,davidvancamp thats awful . wish mine would stop making fat jokes .
798001,0,2328379041,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,Unrated7String,"Well, i guess i need to start a new chapter in professional my life","Well, i guess i need to start a new chapter in professional my life",well guess need start new chapter professional life
798002,0,2328379271,Thu Jun 25 09:30:34 PDT 2009,NO_QUERY,jamesebradford,"@SandraBernhard Miss Lady, since you brought up your web store - it is notoriously known that it takes AGES to rec'v your merch.","@SandraBernhard Miss Lady, since you brought up your web store - it is notoriously known that it takes AGES to rec'v your merch.",sandrabernhard miss lady since brought web store - notoriously known takes ages rec ' v merch .
798003,0,2328379299,Thu Jun 25 09:30:34 PDT 2009,NO_QUERY,njandecrox,@CarterTwinsZach Im sorry I hope u feel better cuz I love u and it makes feel horrible when ur sick or sad or mad or hurt,@CarterTwinsZach Im sorry I hope u feel better cuz I love u and it makes feel horrible when ur sick or sad or mad or hurt,cartertwinszach im sorry hope u feel better cuz love u makes feel horrible ur sick sad mad hurt
...,...,...,...,...,...,...,...,...
801994,4,1468163268,Tue Apr 07 00:03:40 PDT 2009,NO_QUERY,jerichoK,@FizzyDuck Five? Seems a little bit too late in the morning but what the hell !,@FizzyDuck Five? Seems a little bit too late in the morning but what the hell !,fizzyduck five seems little bit late morning hell !
801995,4,1468163291,Tue Apr 07 00:03:40 PDT 2009,NO_QUERY,ex1up,ryanodonnell: @AttractMode Thanks for putting on such a great event. Can't wait for the inevitable sequels! [.. http://tinyurl.com/c3e3ub,ryanodonnell: @AttractMode Thanks for putting on such a great event. Can't wait for the inevitable sequels! [..,ryanodonnell attractmode thanks putting great event . ' wait inevitable sequels ! [..
801996,4,1468163300,Tue Apr 07 00:03:39 PDT 2009,NO_QUERY,Mmmbaileys,@damygeebo Carli's my friend,@damygeebo Carli's my friend,damygeebo carlis friend
801997,4,1468163315,Tue Apr 07 00:03:39 PDT 2009,NO_QUERY,jasminejoejonas,I feel so great for starting twitter at suzanne but still hardly anyone has it.,I feel so great for starting twitter at suzanne but still hardly anyone has it.,feel great starting twitter suzanne still hardly anyone


In [24]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [25]:
for i, row in df_sm.iterrows():
    if i % 1000 == 0:
        print(i)
    if(row["text_normalized"] and len(str(row["text_normalized"])) < 1000000):
        doc = nlp(str(row["text_normalized"]))
        adjectives = []
        nouns = []
        verbs = []
        lemmas = []

        for token in doc:
            lemmas.append(token.lemma_)
            if token.pos_ == "ADJ":
                adjectives.append(token.lemma_)
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                nouns.append(token.lemma_)
            if token.pos_ == "VERB":
                verbs.append(token.lemma_)
                
        df_sm.at[i, "text_lemma"] = " ".join(lemmas)                
        df_sm.at[i, "text_nouns"] = " ".join(nouns)
        df_sm.at[i, "text_adjectives"] = " ".join(adjectives)
        df_sm.at[i, "text_verbs"] = " ".join(verbs)
        df_sm.at[i, "text_nav"] = " ".join(nouns+adjectives+verbs)
        df_sm.at[i, "no_tokens"] = len(lemmas)

798000
799000
800000
801000


In [26]:
df_sm

Unnamed: 0,sentiment,ID,Time,none,username,Text,text_clean,text_normalized,text_lemma,text_nouns,text_adjectives,text_verbs,text_nav,no_tokens
797999,0,2328378861,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,skelekitty,"Work is so slow, I'm seriously considering quitting my job this week","Work is so slow, I'm seriously considering quitting my job this week",work slow im seriously considering quitting job week,work slow -PRON- be seriously consider quit job week,work job week,slow,be consider quit,work job week slow be consider quit,9.00
798000,0,2328379014,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,DjinniGenie,@davidvancamp That's awful. I wish mine would stop making fat jokes.,@davidvancamp That's awful. I wish mine would stop making fat jokes.,davidvancamp thats awful . wish mine would stop making fat jokes .,davidvancamp that s awful . wish mine would stop make fat joke .,davidvancamp wish mine joke,awful fat,s would stop make,davidvancamp wish mine joke awful fat s would stop make,13.00
798001,0,2328379041,Thu Jun 25 09:30:33 PDT 2009,NO_QUERY,Unrated7String,"Well, i guess i need to start a new chapter in professional my life","Well, i guess i need to start a new chapter in professional my life",well guess need start new chapter professional life,well guess nee start new chapter professional life,chapter professional life,new,guess nee start,chapter professional life new guess nee start,8.00
798002,0,2328379271,Thu Jun 25 09:30:34 PDT 2009,NO_QUERY,jamesebradford,"@SandraBernhard Miss Lady, since you brought up your web store - it is notoriously known that it takes AGES to rec'v your merch.","@SandraBernhard Miss Lady, since you brought up your web store - it is notoriously known that it takes AGES to rec'v your merch.",sandrabernhard miss lady since brought web store - notoriously known takes ages rec ' v merch .,sandrabernhard miss lady since bring web store - notoriously know take age rec ' v merch .,sandrabernhard miss lady web store age rec v merch,,bring know take,sandrabernhard miss lady web store age rec v merch bring know take,17.00
798003,0,2328379299,Thu Jun 25 09:30:34 PDT 2009,NO_QUERY,njandecrox,@CarterTwinsZach Im sorry I hope u feel better cuz I love u and it makes feel horrible when ur sick or sad or mad or hurt,@CarterTwinsZach Im sorry I hope u feel better cuz I love u and it makes feel horrible when ur sick or sad or mad or hurt,cartertwinszach im sorry hope u feel better cuz love u makes feel horrible ur sick sad mad hurt,cartertwinszach -PRON- be sorry hope u feel better cuz love u make feel horrible ur sick sad mad hurt,cartertwinszach hope love hurt,sorry horrible sick sad mad,be feel make feel,cartertwinszach hope love hurt sorry horrible sick sad mad be feel make feel,19.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
801994,4,1468163268,Tue Apr 07 00:03:40 PDT 2009,NO_QUERY,jerichoK,@FizzyDuck Five? Seems a little bit too late in the morning but what the hell !,@FizzyDuck Five? Seems a little bit too late in the morning but what the hell !,fizzyduck five seems little bit late morning hell !,fizzyduck five seem little bit late morning hell !,fizzyduck bit morning hell,little late,seem,fizzyduck bit morning hell little late seem,9.00
801995,4,1468163291,Tue Apr 07 00:03:40 PDT 2009,NO_QUERY,ex1up,ryanodonnell: @AttractMode Thanks for putting on such a great event. Can't wait for the inevitable sequels! [.. http://tinyurl.com/c3e3ub,ryanodonnell: @AttractMode Thanks for putting on such a great event. Can't wait for the inevitable sequels! [..,ryanodonnell attractmode thanks putting great event . ' wait inevitable sequels ! [..,ryanodonnell attractmode thank put great event . ' wait inevitable sequel ! [ ..,ryanodonnell thank event sequel,great inevitable,attractmode put wait,ryanodonnell thank event sequel great inevitable attractmode put wait,14.00
801996,4,1468163300,Tue Apr 07 00:03:39 PDT 2009,NO_QUERY,Mmmbaileys,@damygeebo Carli's my friend,@damygeebo Carli's my friend,damygeebo carlis friend,damygeebo carlis friend,damygeebo carlis friend,,,damygeebo carlis friend,3.00
801997,4,1468163315,Tue Apr 07 00:03:39 PDT 2009,NO_QUERY,jasminejoejonas,I feel so great for starting twitter at suzanne but still hardly anyone has it.,I feel so great for starting twitter at suzanne but still hardly anyone has it.,feel great starting twitter suzanne still hardly anyone,feel great start twitt suzanne still hardly anyone,suzanne,great twitt,feel start,suzanne great twitt feel start,8.00


----------------------

# Explore data

---------------------

In [None]:
from nltk.probability import FreqDist

explore_text = np.array(df_sm['text_normalized'])
fdist = FreqDist(explore_text)
print(fdist)

## Sentiment Analysis with Afinn

As a quick and dirty sanity check, I've set up Afinn in the early stages of data cleaning, and intend to keep a little record of Afinn's performance, as I increase the rigour of the data cleaning.

In [None]:
from afinn import Afinn

afn = Afinn(emoticons=True)

In [None]:
texts = np.array(df_sm['text_clean'])
sentiments = np.array(df_sm['sentiment'])

# extract data for model evaluation
#train_texts = texts[:10000]
#train_sentiments = sentiments[:10000]

#test_texts = texts[40000:60000]
#test_sentiments = sentiments[40000:60000]
sample_ids = [626, 533, 310, 123, 654, 400]

In [None]:
for text_clean, sentiment in zip(texts[sample_ids], sentiments[sample_ids]):
    print('TEXT:', texts)
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', afn.score(texts))
    print('-'*60)

In [None]:
# Predict sentiment with Afinn

sentiment_polarity = [afn.score(Text) for Text in normalized_texts]
#predicted_sentiments = ['positive' if score >= 1.0 else 'negative' for score in sentiment_polarity]
predicted_sentiments = [4 if score >= 1.0 else 0 for score in sentiment_polarity]

In [None]:
#meu.display_model_performance_metrics(true_labels=test_texts, predicted_labels=predicted_sentiments, 
#                                  classes=['positive', 'negative'])
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments, 
                                  classes=[4, 0])

## Checking cleaning with Afinn

I'm curious about how deeper cleaning affects predicitive models. So I set up Afinn after the very first round of data cleaning, and am going to track results here in the markdown. For simplicity, I will monitor the effects of different levels of cleaning on "weighted avg f1-score"

Round 1, most basic cleaning, 20000 rows:  0.63

Round 2, include normalization, 20000 rows: 0.63

## Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

texts = np.array(df_sm['text_normalized'])

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(texts)
cv_matrix = cv_matrix.toarray()
cv_matrix


In [None]:
# get all unique words in the corpus
vocab = cv.get_feature_names()
# show document feature vectors
pd.DataFrame(cv_matrix, columns=vocab)

#### NLP

## Load spaCy

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

## Iterate over all rows and perform NLP

In [None]:
for i, row in df_sm.iterrows():
    if i % 1000 == 0:
        print(i)
    if(row["text_clean"] and len(str(row["text_clean"])) < 1000000):
        doc = nlp(str(row["text_clean"]))
        adjectives = []
        nouns = []
        verbs = []
        lemmas = []

        for token in doc:
            lemmas.append(token.lemma_)
            if token.pos_ == "ADJ":
                adjectives.append(token.lemma_)
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                nouns.append(token.lemma_)
            if token.pos_ == "VERB":
                verbs.append(token.lemma_)
                
        df_sm.at[i, "selftext_lemma"] = " ".join(lemmas)                
        df_sm.at[i, "selftext_nouns"] = " ".join(nouns)
        df_sm.at[i, "selftext_adjectives"] = " ".join(adjectives)
        df_sm.at[i, "selftext_verbs"] = " ".join(verbs)
        df_sm.at[i, "selftext_nav"] = " ".join(nouns+adjectives+verbs)
        df_sm.at[i, "no_tokens"] = len(lemmas)

## Check results

In [None]:
df_sm.head()

## Save to database

In [None]:
df.to_sql('posts_nlp', con)