# CSML1010 Project Working Copy
# Sentiment Analysis with the Sentiment140 dataset
## Pete Gray

---------------------------------------------------------

# Import libraries

In [3]:
import pandas as pd
import numpy as np
np.set_printoptions(precision=2, linewidth=80)
import warnings
warnings.filterwarnings("ignore")
import model_evaluation_utils as meu


# Adjust pandas display

In [4]:
pd.options.display.max_columns = 30
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.precision = 2
pd.options.display.max_colwidth = -1

# Import matplotlib and seaborn and adjust defaults

In [5]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from matplotlib import pyplot as plt
plt.rcParams['figure.dpi'] = 100

import seaborn as sns
sns.set_style("whitegrid")

## Read data from local filesystem and csv source 

In [66]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1")

Check data with quick visual inspection

In [67]:
df

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


## Give dataframe columns

In [68]:
df.columns = ['sentiment', 'ID', 'Time', 'none', 'username', 'Text']

In [69]:
df.count()

sentiment    1599999
ID           1599999
Time         1599999
none         1599999
username     1599999
Text         1599999
dtype: int64

## Now it has columns, this seems better.
#
## We have to cut this down to size, for iterative development.
##
## Don't forget to get rid of this!!! When crunching whole huge dataset.

In [82]:
dev_data_size = 10000
start_row = int(800000-(dev_data_size/2))-1
finish_row = int(800000+(dev_data_size/2))-1
df_sm = df[start_row:finish_row]
df_sm.count()

sentiment    10000
ID           10000
Time         10000
none         10000
username     10000
Text         10000
dtype: int64

In [83]:
columns = [col for col in df.columns if not col.startswith('self')]
columns

['sentiment', 'ID', 'Time', 'none', 'username', 'Text']

In [84]:
raw_text = np.array(df_sm['Text'])
sentiments = np.array(df_sm['sentiment'])
raw_text[5:15]

array(['im sitting alone at TTE myself without my two michigan sisters for the first time in a decade.  ',
       "@Julie90210 It took me three attempts but I got it in the end. I'm sorry for your loss  Did you try the DRU thing and iTunes restore?",
       'awake for summer school. ',
       "@AJBombers Just noticed it is an afternoon game, won't be able to make that game sorry, thanks for the offer but I will have to decline.. ",
       "Sorry for the long listening, but I'm too lazy for twitter  and I had to work hard in the last week",
       'Bleurgh...feeling rough ',
       'Hey @pcwoessner have to leave   back to work.   (Summer PD 09 live &gt; http://ustre.am/3mgf)',
       '@margxwanders awwwwwwwww! busy much?? miss you gaux! ',
       'Just got to work ', "@claireliz81 ....he caught Sanchez's disease "],
      dtype=object)

In [85]:
sentiments[4995:5005]

array([0, 0, 0, 0, 0, 4, 4, 4, 4, 4], dtype=int64)

# Data Cleaning

## Cleaning function

In [100]:
import re
def clean(s):
    s = str(s).replace(r'<lb>', "\n")
    s = s.replace(r'<tab>', "\i")
    
    # As a sanity check - s = s.replace(r'W', "Q")
    
    s = re.sub(r'<br */*>', "\n", s)
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    s = s.replace("&amp;", "&")
    # markdown urls
    s = re.sub(r'\(https*://[^\)]*\)', "", s)
    # normal urls
    s = re.sub(r'https*://[^\s]*', "", s)
    s = re.sub(r'_+', ' ', s)
    s = re.sub(r'"+', '"', s)
    return str(s)

## Create new column in dataframe

In [101]:
df_sm["text_clean"] = ''

# Iterate and clean

In [102]:
for i, row in df_sm.iterrows():
    if i % 1000 == 0:
            print('processed:'.format(i), i)
    df_sm.at[i, "text_clean"] = clean(row.Text)

processed: 800000


Check results

In [103]:
df_sm.head()

Unnamed: 0,sentiment,ID,Time,none,username,Text,text_clean
794999,0,2327192646,Thu Jun 25 08:02:13 PDT 2009,NO_QUERY,quiz_master,"Was having dinner with parents downstairs in Dining Room, they started watching 'Baba Ramdev Yoga' thingy, so I'm back to my room .","Was having dinner with parents downstairs in Dining Room, they started watching 'Baba Ramdev Yoga' thingy, so I'm back to my room ."
795000,0,2327193206,Thu Jun 25 08:02:16 PDT 2009,NO_QUERY,djcampos,Blah 5am still up daang I got deep problems,Blah 5am still up daang I got deep problems
795001,0,2327193455,Thu Jun 25 08:02:17 PDT 2009,NO_QUERY,RKF,@jenspeedy I would suggest avoiding 360 Living. Not goodness Try contacting Scott at MKCC mkccrenovations@rogers.com 905-303-9009,@jenspeedy I would suggest avoiding 360 Living. Not goodness Try contacting Scott at MKCC mkccrenovations@rogers.com 905-303-9009
795002,0,2327193641,Thu Jun 25 08:02:18 PDT 2009,NO_QUERY,AnaHertz,@alexbroun I didn't convince myself I was fat and ugly someone else did a pretty good job of that. Its a long story sorry,@alexbroun I didn't convince myself I was fat and ugly someone else did a pretty good job of that. Its a long story sorry
795003,0,2327193806,Thu Jun 25 08:02:18 PDT 2009,NO_QUERY,yenafer,"@spotzle @jstarrh check on sunscreen, snacks, towels, suits, kids drinks, bags and tape for boys cast, camera. Need toys, chairs ugh :-/","@spotzle @jstarrh check on sunscreen, snacks, towels, suits, kids drinks, bags and tape for boys cast, camera. Need toys, chairs ugh :-/"


## Additional pre-processing: tokenization, removing extra whitespaces, lower casing and more advanced operations like spelling corrections, grammatical error corrections, removing repeated characters.

In [104]:
import nltk
wpt = nltk.WordPunctTokenizer()
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [105]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [106]:
normalize_corpus = np.vectorize(normalize_document)

In [107]:
df_sm["text_normalized"] = ''

In [111]:
for i, row in df_sm.iterrows():
    if i % 1000 == 0:
            print('processed:'.format(i), i)
    df_sm.at[i, "text_normalized"] = normalize_corpus(row.text_clean)

processed: 795000
processed: 796000
processed: 797000
processed: 798000
processed: 799000
processed: 800000
processed: 801000
processed: 802000
processed: 803000
processed: 804000


In [112]:
df_sm

Unnamed: 0,sentiment,ID,Time,none,username,Text,text_clean,text_normalized
794999,0,2327192646,Thu Jun 25 08:02:13 PDT 2009,NO_QUERY,quiz_master,"Was having dinner with parents downstairs in Dining Room, they started watching 'Baba Ramdev Yoga' thingy, so I'm back to my room .","Was having dinner with parents downstairs in Dining Room, they started watching 'Baba Ramdev Yoga' thingy, so I'm back to my room .","dinner parents downstairs dining room started watching baba ramdev yoga ' thingy , ' back room ."
795000,0,2327193206,Thu Jun 25 08:02:16 PDT 2009,NO_QUERY,djcampos,Blah 5am still up daang I got deep problems,Blah 5am still up daang I got deep problems,blah 5am still daang got deep problems
795001,0,2327193455,Thu Jun 25 08:02:17 PDT 2009,NO_QUERY,RKF,@jenspeedy I would suggest avoiding 360 Living. Not goodness Try contacting Scott at MKCC mkccrenovations@rogers.com 905-303-9009,@jenspeedy I would suggest avoiding 360 Living. Not goodness Try contacting Scott at MKCC mkccrenovations@rogers.com 905-303-9009,jenspeedy would suggest avoiding 360 living goodness try contacting scott mkcc mkccrenovations @ rogers . com 905 - 303 - 9009
795002,0,2327193641,Thu Jun 25 08:02:18 PDT 2009,NO_QUERY,AnaHertz,@alexbroun I didn't convince myself I was fat and ugly someone else did a pretty good job of that. Its a long story sorry,@alexbroun I didn't convince myself I was fat and ugly someone else did a pretty good job of that. Its a long story sorry,alexbroun didnt convince fat ugly someone else pretty good job . long story sorry
795003,0,2327193806,Thu Jun 25 08:02:18 PDT 2009,NO_QUERY,yenafer,"@spotzle @jstarrh check on sunscreen, snacks, towels, suits, kids drinks, bags and tape for boys cast, camera. Need toys, chairs ugh :-/","@spotzle @jstarrh check on sunscreen, snacks, towels, suits, kids drinks, bags and tape for boys cast, camera. Need toys, chairs ugh :-/","spotzle jstarrh check sunscreen , snacks , towels , suits , kids drinks , bags tape boys cast , camera . need toys , chairs ugh :-/"
...,...,...,...,...,...,...,...,...
804994,4,1468599653,Tue Apr 07 02:39:03 PDT 2009,NO_QUERY,danalynbyers,"@lbran, thanks for sending us the package - got it this morning!","@lbran, thanks for sending us the package - got it this morning!",lbran thanks sending us package - got morning !
804995,4,1468599688,Tue Apr 07 02:39:04 PDT 2009,NO_QUERY,joscelinyeo,@ickleoriental hahahha.. U obviously don't hv one!! But maybe u can give me advice? Fdw.. Foreign domestic worker,@ickleoriental hahahha.. U obviously don't hv one!! But maybe u can give me advice? Fdw.. Foreign domestic worker,ickleoriental hahahha . u obviously ' hv one !! maybe u give advice ? fdw .. foreign domestic worker
804996,4,1468599702,Tue Apr 07 02:39:04 PDT 2009,NO_QUERY,serengetisunset,"@juliekoh It's an internet term, but it's spilled over into common use, in real life","@juliekoh It's an internet term, but it's spilled over into common use, in real life","juliekoh internet term , ' spilled common use , real life"
804997,4,1468599765,Tue Apr 07 02:39:06 PDT 2009,NO_QUERY,broombeck,new day.... NEW TRACK!!!!,new day.... NEW TRACK!!!!,new day .. new track !!!!


In [52]:
texts = np.array(df['selftext_clean'])
norm_texts = texts[40000:60000]
normalized_texts = normalize_corpus(norm_texts)
normalized_texts

array(['sherilynmoon unfortunate signal tweet photos phone whilst wales',
       'dropped car get brakes looked hope doesnt cost insane amounts money',
       'colin kelly clyde1 get one republic neyo im school unwell', ...,
       'zenaweist could also tweet beccaroberts',
       "good lord still 125 work emails catch actually read ' teach go vacation .",
       'gig northampton racehorse tmw night'], dtype='<U191')

## Sentiment Analysis with Afinn

As a quick and dirty sanity check, I've set up Afinn in the early stages of data cleaning, and intend to keep a little record of Afinn's performance, as I increase the rigour of the data cleaning.

In [43]:
from afinn import Afinn

afn = Afinn(emoticons=True)

In [44]:
texts = np.array(df['selftext_clean'])
sentiments = np.array(df['sentiment'])

# extract data for model evaluation
train_texts = texts[:10000]
train_sentiments = sentiments[:10000]

test_texts = texts[40000:60000]
test_sentiments = sentiments[40000:60000]
sample_review_ids = [5626, 3533, 6010, 123, 2654, 4000]

In [48]:
for selftext_clean, sentiment in zip(normalized_texts[sample_review_ids], test_sentiments[sample_review_ids]):
    print('TEXT:', selftext_clean)
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', afn.score(Text))
    print('-'*60)

TEXT: hot
Actual Sentiment: 0
Predicted Sentiment polarity: 3.0
------------------------------------------------------------
TEXT: 5dollardinners georgegmithjr pics must change - everytime refresh new pic ! ( always shoes )
Actual Sentiment: 0
Predicted Sentiment polarity: 3.0
------------------------------------------------------------
TEXT: want twitter fone
Actual Sentiment: 0
Predicted Sentiment polarity: 3.0
------------------------------------------------------------
TEXT: watching tennis sam stosur playing well today
Actual Sentiment: 0
Predicted Sentiment polarity: 3.0
------------------------------------------------------------
TEXT: spilled cup coffee embarassing
Actual Sentiment: 0
Predicted Sentiment polarity: 3.0
------------------------------------------------------------
TEXT: work studying / want enjoy radiant sun
Actual Sentiment: 0
Predicted Sentiment polarity: 3.0
------------------------------------------------------------


In [53]:
# Predict sentiment with Afinn

sentiment_polarity = [afn.score(Text) for Text in normalized_texts]
#predicted_sentiments = ['positive' if score >= 1.0 else 'negative' for score in sentiment_polarity]
predicted_sentiments = [4 if score >= 1.0 else 0 for score in sentiment_polarity]

In [54]:
#meu.display_model_performance_metrics(true_labels=test_texts, predicted_labels=predicted_sentiments, 
#                                  classes=['positive', 'negative'])
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments, 
                                  classes=[4, 0])

Model Performance metrics:
------------------------------
Accuracy: 0.632
Precision: 0.635
Recall: 0.632
F1 Score: 0.6299

Model Classification report:
------------------------------
              precision    recall  f1-score   support

           4       0.66      0.56      0.60     10001
           0       0.61      0.71      0.66      9999

    accuracy                           0.63     20000
   macro avg       0.64      0.63      0.63     20000
weighted avg       0.64      0.63      0.63     20000


Prediction Confusion Matrix:
------------------------------
          Predicted:      
                   4     0
Actual: 4  5568       4433
        0  2928       7071


## Checking cleaning with Afinn

I'm curious about how deeper cleaning affects predicitive models. So I set up Afinn after the very first round of data cleaning, and am going to track results here in the markdown. For simplicity, I will monitor the effects of different levels of cleaning on "weighted avg f1-score"

Round 1, most basic cleaning, 20000 rows:  0.63

Round 2, include normalization, 20000 rows: 0.63

## Bag of Words model

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(test_texts)
cv_matrix = cv_matrix.toarray()
cv_matrix


MemoryError: 

In [41]:
# get all unique words in the corpus
vocab = cv.get_feature_names()
# show document feature vectors
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,00,000,0000r0cx,0007,000th,00am,01,01000101,01614948343,018,01yt,02,03,04,05,...,½o,½quiï,½r,½re,½rmï,½s,½se,½stand,½t,½tï,½ve,½vel,½y,½ï,ã¼ã
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### NLP

## Load spaCy

In [18]:
import spacy
nlp = spacy.load('en_core_web_sm')

## Iterate over all rows and perform NLP

In [19]:
for i, row in df.iterrows():
    if i % 1000 == 0:
        print(i)
    if(row["selftext_clean"] and len(str(row["selftext_clean"])) < 1000000):
        doc = nlp(str(row["selftext_clean"]))
        adjectives = []
        nouns = []
        verbs = []
        lemmas = []

        for token in doc:
            lemmas.append(token.lemma_)
            if token.pos_ == "ADJ":
                adjectives.append(token.lemma_)
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                nouns.append(token.lemma_)
            if token.pos_ == "VERB":
                verbs.append(token.lemma_)
                
        df.at[i, "selftext_lemma"] = " ".join(lemmas)                
        df.at[i, "selftext_nouns"] = " ".join(nouns)
        df.at[i, "selftext_adjectives"] = " ".join(adjectives)
        df.at[i, "selftext_verbs"] = " ".join(verbs)
        df.at[i, "selftext_nav"] = " ".join(nouns+adjectives+verbs)
        df.at[i, "no_tokens"] = len(lemmas)

750000
751000
752000
753000
754000
755000
756000
757000
758000
759000
760000
761000
762000
763000


KeyboardInterrupt: 

## Check results

In [20]:
df.head()

Unnamed: 0,sentiment,ID,Time,none,username,Text,selftext_clean,selftext_lemma,selftext_nouns,selftext_adjectives,selftext_verbs,selftext_nav,no_tokens
750000,0,2285370823,Mon Jun 22 15:02:49 PDT 2009,NO_QUERY,xbeautifulmessx,@Idristwilight You can post HAN when you want. It's great! I am still working on TLD though. I got a little distracted so sorry.,@Idristwilight You can post HAN when you want. It's great! I am still working on TLD though. I got a little distracted so sorry.,@Idristwilight -PRON- can post HAN when -PRON- want . -PRON- be great ! -PRON- be still work on TLD though . -PRON- get a little distracted so sorry .,@Idristwilight HAN TLD,great distracted sorry,can post want work get,@Idristwilight HAN TLD great distracted sorry can post want work get,29.0
750001,0,2285371185,Mon Jun 22 15:02:51 PDT 2009,NO_QUERY,thefirstsight,"@rose_7 Ohh poor jan please tell her that if she cans, send us an email!!","@rose 7 Ohh poor jan please tell her that if she cans, send us an email!!","@rose 7 Ohh poor jan please tell -PRON- that if -PRON- can , send -PRON- an email ! !",Ohh jan email,poor,tell can send,Ohh jan email poor tell can send,20.0
750002,0,2285371495,Mon Jun 22 15:02:52 PDT 2009,NO_QUERY,Sarah2713,Finally home from work...It was a looong day!! And it's only Monday,Finally home from work...It was a looong day!! And it's only Monday,finally home from work ... -PRON- be a looong day ! ! and -PRON- be only Monday,work day Monday,looong,,work day Monday looong,17.0
750003,0,2285371762,Mon Jun 22 15:02:54 PDT 2009,NO_QUERY,dierockerfrau,im very sad 4 chantelle and tom,im very sad 4 chantelle and tom,-PRON- be very sad 4 chantelle and tom,chantelle tom,sad,be,chantelle tom sad be,8.0
750004,0,2285372377,Mon Jun 22 15:02:57 PDT 2009,NO_QUERY,alexbates,I chatted with someone on the online Apple store and they said it would be better to buy a new one. I don't have $200 to waste,I chatted with someone on the online Apple store and they said it would be better to buy a new one. I don't have $200 to waste,-PRON- chat with someone on the online Apple store and -PRON- say -PRON- would be well to buy a new one . -PRON- do not have $ 200 to waste,Apple store,online well new,chat say would buy waste,Apple store online well new chat say would buy waste,31.0


## Save to database

In [None]:
df.to_sql('posts_nlp', con)