## Twitter Sentiment Analysis 

---

### Pre-process cleaned data for machine learning 

Cleanup simply reformatted each Tweet's text by standardizing it and reducing the feature space (less punctuation, lower casing, tokenizing, lemmatizing). Pre-processing for machine learning is more involved: it mainly consists of further data cleanup steps such as imputing NAs, but also some feature engineering and, most importantly, creating a document-frequency matrix (DFM) for our tokens, since most machine-learning algorithms do not accept text input. (TODO: include link to the DFM notebook.)


---

### Load cleaned data


In [89]:
import os 
import time
import numpy as np

import loading_module as lm

# Load the cleaned training features and report how long the load took,
# split into whole minutes and remaining seconds.
start_time = time.time()
df = lm.load_clean_data('X_train')
elapsed = time.time() - start_time
mins, secs = divmod(elapsed, 60)
print(f'Elapsed Time: {mins:0.0f} minute(s) and {secs:0.0f} second(s)')

Elapsed Time: 0 minute(s) and 5 second(s)


In [90]:
# Sanity check: (rows, columns) of the frame that was just loaded.
df.shape

(1199999, 3)

In [91]:
# Peek at the first rows of the loaded frame.
df.head() 

Unnamed: 0,username,text,lemmatized
0,irishsamom,@kris38 I don't need any encouragement. It is...,USERNAME i dont need any encouragement my favo...
0,wiseleo,@kongchang Thanks. As long as you were enterta...,USERNAME thanks long you entertained thats all...
0,C_alvino,Broke a pair of sandals I bought at Old Navy. ...,broke pair sandal i bought old navy i guess th...
0,willysandi,i am so sleepy but there's still lots of assig...,i am so sleepy but there still lot assignment ...
0,Pace,Wish I was in Bournemouth today - how's it lo...,wish i bournemouth today hows looking down the...


In [92]:
# Load the saved train indices and use them as df's row labels, so rows can
# later be matched by label against other data that carries the same indices.
raw_path = os.path.join("..", "data", "1_raw", "sentiment140")
train_ix_file = os.path.join(raw_path, "train_ix.npy")
train_ix = np.load(train_ix_file)
df.index = list(train_ix)

In [93]:
# Confirm the first rows now carry the loaded train indices as row labels.
df.head()

Unnamed: 0,username,text,lemmatized
66270,irishsamom,@kris38 I don't need any encouragement. It is...,USERNAME i dont need any encouragement my favo...
428045,wiseleo,@kongchang Thanks. As long as you were enterta...,USERNAME thanks long you entertained thats all...
1307927,C_alvino,Broke a pair of sandals I bought at Old Navy. ...,broke pair sandal i bought old navy i guess th...
1112400,willysandi,i am so sleepy but there's still lots of assig...,i am so sleepy but there still lot assignment ...
840793,Pace,Wish I was in Bournemouth today - how's it lo...,wish i bournemouth today hows looking down the...


In [94]:
# Same check on the last rows of the frame.
df.tail()

Unnamed: 0,username,text,lemmatized
259178,AngusGibson,"@annspade I be struggling to sleep, myself, 'c...",USERNAME i struggling sleep myself cept 430am ...
1414414,braidead,@yerhuber try to listen to @monkmusic songs i ...,USERNAME try listen USERNAME song i think youl...
131932,nabsworth,@AkashaTheKitty but your &quot;today&quot; is ...,USERNAME but your today different mine my toda...
671155,Altrntvgurl,Good morning twitter,good morning twitter
121958,t_kawai,@antonea Absolutely! I gave ya 5 stars on it t...,USERNAME absolutely i gave ya 5 star too


### Import ML pre-processing modules

In [95]:
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

### Impute NAs created during cleanup

We do not want to drop these rows, since the fact that they ended up as empty strings is possibly informative, but we do want to impute them with a placeholder value.

In [96]:
# Count missing values per column.
df.isnull().sum()

username       0
text           0
lemmatized    56
dtype: int64

In [97]:
# Remember the row labels whose lemmatized text is missing (reused by later
# cells), then show the first six of those rows.
lemm_missing = df['lemmatized'].isnull()
NA_ix = df[lemm_missing].index
df.loc[list(NA_ix)].head(6)

Unnamed: 0,username,text,lemmatized
602804,SSNOB,ÃÂÃÂ° ÃÂÃÂ¶! ÃÂ£ tut.by Ã?ÃÂµÃÂÃÂ...,
1519405,Nathan_Irvine,www.incrediblyhungover.com/me,
299658,cassiebland,thebasementlive.com,
38437,samanthablews,facebook.com/samantha.hatch,
771099,kate4samh,is sad......www.whatkatedidnext.wordpress.com,
1421447,mgpyone,faceyourmanga.com Ã¡ÂÂÃ¡ÂÂ±Ã¡ÂÂÃ¡ÂÂ¬Ã¡Â...,
296741,ChickWithAName,. . . . . and it's on!,
1374484,SkydiveSummer,myspace.com/skydivesummer,
725197,sangofsorrow,He is...,
788998,Galiiit,youtube.com/user/galitfob,


In [98]:
import pandas as pd
# Read the training labels from y_train.csv in the raw sentiment140 directory.
# raw_path is rebuilt here with the same components used when loading train_ix.
raw_path = os.path.join("..","data","1_raw","sentiment140")
filename = "y_train.csv"
filepath = os.path.join(raw_path, filename)
y_train = pd.read_csv(filepath)

In [101]:
# Give the labels the same row labels as df, then count how many of the
# rows with missing lemmatized text fall into each target class.
y_train.index = list(train_ix)
# Constant helper column: after the groupby it becomes the per-class row count.
y_train["count"] = 'ct'

na_labels = y_train.loc[list(NA_ix)]
na_labels.groupby("target").count()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,29
1,27


In [102]:
# Impute the missing lemmatized texts with the placeholder token 'EMPTY'.
# The rows and the column are selected in ONE .loc call on the frame itself:
# the chained form df['lemmatized'].loc[...] = ... assigns through an
# intermediate Series, which pandas does not guarantee to write back to df
# (and which is a silent no-op when copy-on-write is enabled).
df.loc[NA_ix, 'lemmatized'] = 'EMPTY'

In [103]:
# Re-count missing values per column to verify the imputation.
df.isnull().sum()

username      0
text          0
lemmatized    0
dtype: int64

In [105]:
# Inspect the last few rows selected by the saved NA_ix labels.
df.loc[list(NA_ix), ].tail()

Unnamed: 0,username,text,lemmatized
464093,egiltve,www.quality-rx.com/?fid=3498,EMPTY
1247878,ukactivism,www.twitteractivism.com,EMPTY
766913,willyouatme,www.WillYouAtMe.com?,EMPTY
23386,iloveyouduh98,...............myspace.com/laceylynnwilliams98,EMPTY
446636,radjah,juick.com: ?????????? ?????? - ????? ????? ht...,EMPTY


### Deduplicate based on lemmatized text


In [114]:
# Flag every repeat of a lemmatized text after its first occurrence and
# report what share of all rows those repeats make up.
is_repeat = df['lemmatized'].duplicated(keep='first')
dupes = df[is_repeat]
dec = len(dupes) / len(df)
print(f'{100*dec:0.2f}% duplicated Tweets')

4.86% duplicated Tweets


In [115]:
# Remove the repeated tweets (the first occurrence of each text is kept).
df = df.drop(dupes.index)

# Dropping rows from df alone leaves the labels longer than the features.
# Build a label frame restricted to the surviving row labels so features and
# targets stay paired; y_train itself is left untouched.
# Relies on y_train carrying the same train-index row labels as df.
y_train_dedup = y_train.loc[df.index]
assert y_train_dedup.index.equals(df.index), "labels are not aligned with de-duplicated df"

In [116]:
df.shape  # rows were dropped from df above, so any label vector (e.g. y_train) must be restricted to the same row labels before it is paired with df

(1141626, 3)

### Vectorize with Bag of Words (BoW) and Term Frequency - Inverse Document Frequency (TF-IDF) methods


In [9]:
# target
# Takes the first column (position 0) of `dfm` as a flat 1-D array.
# NOTE(review): `dfm` is not defined in any cell visible above (those build `df`
# and `y_train`); this cell presumably relies on state from an earlier session —
# confirm where `dfm` comes from and that it reflects the de-duplicated data.
y = np.array(dfm.iloc[:, 0]).ravel() 

In [10]:
# Number of target values extracted.
len(y)

1565203

In [19]:
# Show the first ten and the last ten target values.
y[0:10], y[len(y)-10:len(y)]

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64))

In [23]:
# lemmatized column (as array)
# Selected by position (column 5) and flattened to 1-D.
# NOTE(review): assumes position 5 of `dfm` holds the lemmatized text, and `dfm`
# itself is not defined in any visible cell — confirm against its schema.
X_lemm_array = np.array(dfm.iloc[:, 5]).ravel()

In [25]:
# instantiate vectorizers
bow_vectorizer = CountVectorizer() # simple BoW (all default settings)
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True) # log(tf) version: sublinear tf scaling, documented by scikit-learn as replacing tf with 1 + log(tf)

In [27]:
# create a document-frequency matrix (dfm), aka Bag of Words
# fit_transform both fits the vectorizer on the lemmatized tweets and returns
# their matrix in a single call.
# NOTE(review): `end_timer` is not defined or imported in any visible cell —
# presumably a helper that prints the time elapsed since `start`; confirm it is
# defined before this cell runs.
start = time.perf_counter()
X_lemm_bow_dfm = bow_vectorizer.fit_transform(X_lemm_array)
end_timer(start)

Finished in 41.98 second(s)


In [37]:
# Densify and display a small slice (rows 1-9, columns 1-19) of the BoW matrix.
X_lemm_bow_dfm[1:10,1:20].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [14]:
# Fit the sublinear TF-IDF vectorizer on the lemmatized tweets and time it.
# The input is X_lemm_array — the array built from the lemmatized column and
# also fed to the BoW vectorizer. The name `X_lemm` used here previously is not
# defined in any visible cell and would raise NameError on a fresh kernel.
# NOTE(review): `end_timer` is likewise not defined in any visible cell —
# confirm it is available before this cell runs.
start = time.perf_counter()
X_lemm_tfidf_dfm = tfidf_vectorizer.fit_transform(X_lemm_array)
end_timer(start)

Finished in 100.43 second(s)


### Save pre-processed data

In [16]:
import os
import scipy.sparse as sp

In [17]:
# Output locations for the processed matrices: one '.npz' file per combination
# of text prefix and vectorizer suffix, all under the processed-data directory.
dirpath = os.path.join("..", "data", "3_processed", "sentiment140")

_text_prefixes = ('text', 'tokn', 'filt', 'stem', 'lemm')
_vectorizer_suffixes = ('bow', 'tfidf', 'log_tfidf')

# Suffix varies slowest, so names come out grouped by vectorizer:
# X_text_bow ... X_lemm_bow, X_text_tfidf ... X_lemm_log_tfidf.
filenames = [
    f'X_{prefix}_{suffix}'
    for suffix in _vectorizer_suffixes
    for prefix in _text_prefixes
]

filepaths = [os.path.join(dirpath, f'{stem}.npz') for stem in filenames]

In [18]:
# Write each matrix to its '.npz' file; the position in `filepaths` follows the
# order of `filenames` (0-4 bow, 5-9 tfidf, 10-14 log_tfidf).
# NOTE(review): none of the 15 matrix names below is assigned in a visible
# cell — the vectorizing cells above only create X_lemm_bow_dfm and
# X_lemm_tfidf_dfm — so this cell presumably depends on state from an earlier
# session. Confirm which matrices actually exist before running it.
# NOTE(review): `end_timer` is not defined in any visible cell either.
start = time.perf_counter()
sp.save_npz(filepaths[0], X_text_bow)
sp.save_npz(filepaths[1], X_tokn_bow)
sp.save_npz(filepaths[2], X_filt_bow)
sp.save_npz(filepaths[3], X_stem_bow)
sp.save_npz(filepaths[4], X_lemm_bow)
sp.save_npz(filepaths[5], X_text_tfidf)
sp.save_npz(filepaths[6], X_tokn_tfidf)
sp.save_npz(filepaths[7], X_filt_tfidf)
sp.save_npz(filepaths[8], X_stem_tfidf)
sp.save_npz(filepaths[9], X_lemm_tfidf)
sp.save_npz(filepaths[10], X_text_log_tfidf)
sp.save_npz(filepaths[11], X_tokn_log_tfidf)
sp.save_npz(filepaths[12], X_filt_log_tfidf)
sp.save_npz(filepaths[13], X_stem_log_tfidf)
sp.save_npz(filepaths[14], X_lemm_log_tfidf)
end_timer(start)

Finished in 117.94 second(s)


In [19]:
# save y target vector == we already have this!
# Written next to the matrices in `dirpath`; np.save appends the '.npy'
# extension because the given name has none.
np.save(os.path.join(dirpath, 'y'), y)

---