# Twitter Sentiment Analysis - POC
---


In [1]:
import re
import os
import time
import json

import numpy as np
import pandas as pd
import scipy.sparse as sp

import cleanup_module_POC as Cmod

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
# time notebook
start_notebook = time.time()

# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

### Sample down to $m\approx250k$

In [38]:
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

In [39]:
# check
print(f'Dataset size: {len(X):0.0f}')
print(f'Target distribution: {sum(y["target"])/len(y):0.3f}')

Dataset size: 119747
Target distribution: 0.498


In [40]:
# transform into arrays
X_array = np.array(X.iloc[:, 2]).ravel()
y_array = y.iloc[:,0].ravel()

In [44]:
 proc_dir = os.path.join("..","data","3_processed","sentiment140")
try:
    # load X_train_transformed 
    X_train_transformed = sp.load_npz(os.path.join(proc_dir, "X_train_transformed_BoW_120k_30k.npz"))
    print('Loading X_train_transformed...')
except:
    print('Preprocessing X_array...')
    preprocess_pipeline = Pipeline([
        ("document_to_wordcount", Cmod.DocumentToWordCounterTransformer()),
        ("wordcount_to_vector", Cmod.WordCounterToVectorTransformer(vocabulary_size=30000)),
    ])
    start_pipeline = time.time()
    X_train_transformed = preprocess_pipeline.fit_transform(X_array) 
    # save preprocessed data
    sp.save_npz(os.path.join(proc_dir, 'X_train_transformed_BoW_120k_30k.npz'), X_train_transformed)
    # print processing time
    mins, secs = divmod(time.time() - start_pipeline, 60)
    print(f'Preprocessing time: {mins:0.0f} minute(s) and {secs:0.0f} second(s).')
    # Preprocessing time: 2 minute(s) and 41 second(s).

Preprocessing X_array...
Preprocessing time: 1 minute(s) and 16 second(s).


In [45]:
X_train_transformed

<119747x30001 sparse matrix of type '<class 'numpy.int32'>'
	with 1220493 stored elements in Compressed Sparse Row format>

### Cross Validation 

In [46]:
NB_clf = MultinomialNB()
log_clf = LogisticRegression(solver="liblinear", random_state=42)

In [47]:
score = cross_val_score(NB_clf, X_train_transformed, y_array, cv=5, verbose=0, scoring='accuracy')
print(f'Accuracy: {round(score.mean(),4):0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.7702 (+/- 0.0014)


In [48]:
score = cross_val_score(log_clf, X_train_transformed, y_array, cv=5, verbose=0, scoring='accuracy')
print(f'Accuracy: {round(score.mean(),4):0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.7819 (+/- 0.0016)


### Peek at some predictions

In [84]:
new_array1 = np.array([
                      "loving this feeling amazing happy",
                      "yay we're going bowling - so excited",
                      "what a wonderful day!",
                       "they can go to hell, idiots",
                      "I hate you and I hate everyone",
                      "what a poor excuse - sorry but they're terrible liars"
                      ])

In [85]:
new_array2 = np.array([
                       "they can go to hell, idiots",
                      "I hate you and I hate everyone",
                      "what a poor excuse - sorry but they're terrible liars",
                      "loving this feeling amazing happy",
                      "yay we're going bowling - so excited",
                      "what a wonderful day!"
                      ])

In [86]:
narray1_transformed = preprocess_pipeline.fit_transform(new_array1)
narray2_transformed = preprocess_pipeline.fit_transform(new_array2)

In [88]:
narray1_transformed

<6x30001 sparse matrix of type '<class 'numpy.int32'>'
	with 31 stored elements in Compressed Sparse Row format>

In [80]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_train_transformed, y_array)

LogisticRegression(random_state=42, solver='liblinear')

In [89]:
log_clf.predict_proba(narray1_transformed)

array([[0.70974634, 0.29025366],
       [0.49105686, 0.50894314],
       [0.53170177, 0.46829823],
       [0.23791629, 0.76208371],
       [0.42847116, 0.57152884],
       [0.62319776, 0.37680224]])

In [90]:
log_clf.predict(narray1_transformed)

array([0, 1, 0, 1, 1, 0], dtype=int64)

In [91]:
log_clf.predict_proba(narray2_transformed)

array([[0.68408587, 0.31591413],
       [0.33733676, 0.66266324],
       [0.58661048, 0.41338952],
       [0.33339196, 0.66660804],
       [0.74986453, 0.25013547],
       [0.30658423, 0.69341577]])

In [92]:
log_clf.predict(narray2_transformed)

array([0, 1, 0, 1, 0, 1], dtype=int64)

---