## Twitter Sentiment Analysis 

### Part 2: Pre-process cleaned data for machine learning (ML)




### Load Cleaned Data

In [1]:
import time
import load_data as ld

start = time.perf_counter()

df = ld.run_processes()

def end_timer(start):
    end = time.perf_counter()
    print(f'Finished in {round(end-start, 2)} second(s)')

end_timer(start)

Finished in 11.39 second(s)


In [2]:
df.head()

Unnamed: 0,target,text,tokenized,filtered,stemmed,lemmatized
0,0,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...,upset cant update his facebook texting might c...,upset cant updat hi facebook text might cri re...,upset cant update his facebook texting might c...
1,0,@Kenichan I dived many times for the ball. Man...,kenichan i dived many times for the ball manag...,kenichan i dived many times ball managed save ...,kenichan i dive mani time ball manag save 50 r...,kenichan i dived many time ball managed save 5...
2,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy like fire,my whole bodi feel itchi like fire,my whole body feel itchy like fire
3,0,"@nationwideclass no, it's not behaving at all....",nationwideclass no its not behaving at all im ...,nationwideclass no not behaving all im mad why...,nationwideclass no not behav all im mad whi am...,nationwideclass no not behaving all im mad why...
4,0,@Kwesidei not the whole crew,kwesidei not the whole crew,kwesidei not whole crew,kwesidei not whole crew,kwesidei not whole crew


### Import ML pre-processing modules

In [3]:
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

### Impute NAs created during cleanup

We do not want to drop since the fact they ended up as empty strings is possibly informative - for ex. possibly more positive Tweets? (see TODO at the end)


In [4]:
df.isnull().sum()

target         0
text           0
tokenized     55
filtered      74
stemmed       74
lemmatized    74
dtype: int64

In [5]:
dfm = df.dropna() # drop for now
dfm.index = range(1,len(dfm) + 1)

In [6]:
f'{round((len(df)-len(dfm))/len(df), 6):.6f}' # prop is very low

'0.000046'

### Deduplicate based on tokenized text


In [7]:
dupes = dfm[dfm['tokenized'].duplicated(keep='first')]

round(len(dupes)/len(dfm), 3) # 2.2 % duplicated after tokenizing

0.022

In [8]:
dfm = dfm.drop(dupes.index)

# check target distribution
dfm[['target','text']].groupby('target').count()

Unnamed: 0_level_0,text
target,Unnamed: 1_level_1
0,781332
1,783871


### Vectorize with Bag of Words (BoW) and Term Frequency - Inverse Document Frequency (TF-IDF) methods


In [9]:
# target
y = np.array(dfm.iloc[:, 0]).ravel() 

In [10]:
len(y)

1565203

In [19]:
y[0:10], y[len(y)-10:len(y)]

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64))

In [23]:
# lemmatized column (as array)
X_lemm_array = np.array(dfm.iloc[:, 5]).ravel()

In [25]:
# instantiate vectorizers
bow_vectorizer = CountVectorizer() # simple BoW 
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True) # log(tf) version

In [27]:
# create a document-frequency matrix (dfm), aka Bag of Words
start = time.perf_counter()
X_lemm_bow_dfm = bow_vectorizer.fit_transform(X_lemm_array)
end_timer(start)

Finished in 41.98 second(s)


In [37]:
X_lemm_bow_dfm[1:10,1:20].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [14]:
start = time.perf_counter()
X_lemm_tfidf_dfm = tfidf_vectorizer.fit_transform(X_lemm)
end_timer(start)

Finished in 100.43 second(s)


### Set test data aside

### Save

In [16]:
import os
import scipy.sparse as sp

In [17]:
dirpath = os.path.join("..","data","3_processed","sentiment140")
filenames = ['X_text_bow'
            ,'X_tokn_bow'
            ,'X_filt_bow'
            ,'X_stem_bow'
            ,'X_lemm_bow'
            ,'X_text_tfidf'
            ,'X_tokn_tfidf'
            ,'X_filt_tfidf'
            ,'X_stem_tfidf'
            ,'X_lemm_tfidf'
            ,'X_text_log_tfidf'
            ,'X_tokn_log_tfidf'
            ,'X_filt_log_tfidf'
            ,'X_stem_log_tfidf'
            ,'X_lemm_log_tfidf']

filepaths = [os.path.join(dirpath, ''.join([filename, '.npz'])) for filename in filenames]

In [18]:
start = time.perf_counter()
sp.save_npz(filepaths[0], X_text_bow)
sp.save_npz(filepaths[1], X_tokn_bow)
sp.save_npz(filepaths[2], X_filt_bow)
sp.save_npz(filepaths[3], X_stem_bow)
sp.save_npz(filepaths[4], X_lemm_bow)
sp.save_npz(filepaths[5], X_text_tfidf)
sp.save_npz(filepaths[6], X_tokn_tfidf)
sp.save_npz(filepaths[7], X_filt_tfidf)
sp.save_npz(filepaths[8], X_stem_tfidf)
sp.save_npz(filepaths[9], X_lemm_tfidf)
sp.save_npz(filepaths[10], X_text_log_tfidf)
sp.save_npz(filepaths[11], X_tokn_log_tfidf)
sp.save_npz(filepaths[12], X_filt_log_tfidf)
sp.save_npz(filepaths[13], X_stem_log_tfidf)
sp.save_npz(filepaths[14], X_lemm_log_tfidf)
end_timer(start)

Finished in 117.94 second(s)


In [19]:
# save y target vector
np.save(os.path.join(dirpath, 'y'), y)

---