# Twitter Sentiment Analysis - POC
---

## TextBlob

Using TextBlob as a classifier. Questions remain:

- why are predictions unstable with Logistic Regression and stable with TextBlob?
- why is TextBlob's accuracy lower then - could it be that we're not training positive/negative but training to recognize whatever set of emoji's were used to create the sentiment140 dataset?

In [2]:
import re
import os
import time
import json

import numpy as np
import pandas as pd
import scipy.sparse as sp

import cleanup_module_POC as Cmod

from textblob import TextBlob
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [3]:
# time notebook
start_notebook = time.time()

# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

### Sample down to $m\approx120k$

In [4]:
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

In [5]:
# check
print(f'Dataset size: {len(X):0.0f}')
print(f'Target distribution: {sum(y["target"])/len(y):0.3f}')

Dataset size: 119747
Target distribution: 0.498


In [6]:
# transform into arrays
X_array = np.array(X.iloc[:, 2]).ravel()
y_array = y.iloc[:,0].ravel()

In [7]:
preprocess_pipeline = Pipeline([
    ("document_to_wordcount", Cmod.DocumentToWordCounterTransformer()),
    ("wordcount_to_vector", Cmod.WordCounterToVectorTransformer(vocabulary_size=30000)),
])

In [8]:
proc_dir = os.path.join("..","data","3_processed","sentiment140")

try:
    # load X_train_transformed 
    X_train_transformed = sp.load_npz(os.path.join(proc_dir, "X_train_transformed_BoW_120k_30k.npz"))
    print('Loading X_train_transformed...')
except:
    print('Preprocessing X_array...')
    start_pipeline = time.time()
    X_train_transformed = preprocess_pipeline.fit_transform(X_array) 
    # save preprocessed data
    sp.save_npz(os.path.join(proc_dir, 'X_train_transformed_BoW_120k_30k.npz'), X_train_transformed)
    # print processing time
    mins, secs = divmod(time.time() - start_pipeline, 60)
    print(f'Preprocessing time: {mins:0.0f} minute(s) and {secs:0.0f} second(s).')
    # Preprocessing time: 2 minute(s) and 41 second(s).

Loading X_train_transformed...


In [9]:
X_train_transformed

<119747x30001 sparse matrix of type '<class 'numpy.int32'>'
	with 1220493 stored elements in Compressed Sparse Row format>

### Get TextBlob.sentiment.polarity

In [10]:
def make_binary(val):
    if val <= 0: # better accuracy than val < 0
        return 0
    else:
        return 1
        
def extract_polarity(array):
    bin_vals = []
    for entry in array:
        blob = TextBlob(entry)
        polarity = blob.sentiment.polarity
        bin_vals.append(make_binary(polarity))
    return bin_vals

In [11]:
y_preds = extract_polarity(X_array)

In [12]:
from sklearn.metrics import accuracy_score
print(f'TexBlob\'s accuracy: {accuracy_score(y_array, y_preds):0.4f}')

TexBlob's accuracy: 0.6233


In [13]:
def calc_accuracy(y_true, y_pred):
    sum_ones = []
    for yt, yp in zip(y_array, y_pred):
        if yt == yp:
            sum_ones.append(1)
        else:
            sum_ones.append(0)
    return sum(sum_ones)/len(sum_ones)

In [14]:
print(f'TexBlob\'s accuracy: {calc_accuracy(y_array, y_preds):0.4f}') 

TexBlob's accuracy: 0.6233


That's quite good without any preprocessing...

### Naive Bayes and LR for comparison

In [15]:
NB_clf = MultinomialNB()
log_clf = LogisticRegression(solver="liblinear", random_state=42)

In [16]:
score = cross_val_score(NB_clf, X_train_transformed, y_array, cv=5, verbose=0, scoring='accuracy')
print(f'Accuracy: {round(score.mean(),4):0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.7702 (+/- 0.0014)


In [17]:
score = cross_val_score(log_clf, X_train_transformed, y_array, cv=5, verbose=0, scoring='accuracy')
print(f'Accuracy: {round(score.mean(),4):0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.7819 (+/- 0.0016)


### Peek at some predictions

In [18]:
Xtr, Xts, ytr, yts = train_test_split(X_train_transformed, y_array, test_size=0.25, random_state=42)

In [19]:
new_array = np.array([
                      "loving this feeling amazing happy",
                      "yay we're going bowling - so excited",
                      "what a wonderful day!",
                      "they can go to hell, idiots",
                      "I hate you and I hate everyone",
                      "what a poor excuse - sorry but they're terrible liars"
                      ])

In [20]:
new_array_transformed = preprocess_pipeline.fit_transform(new_array)

In [21]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(Xtr, ytr)

LogisticRegression(random_state=42, solver='liblinear')

In [22]:
y_pred = log_clf.predict(Xts)
accuracy_score(yts, y_pred)

0.7780338711293717

In [24]:
new_yts = np.hstack([yts, np.array([1,1,1,0,0,0])])

In [25]:
from scipy.sparse import vstack
new_Xts = vstack((Xts, new_array_transformed))

In [26]:
y_pred = log_clf.predict(new_Xts)

In [27]:
round(accuracy_score(y_pred, new_yts), 4)

0.7779

In [28]:
for ix, val in enumerate(y_pred):
    if ix > 29936:
        print(ix, val)

29937 0
29938 1
29939 0
29940 1
29941 1
29942 0


In [29]:
log_clf.predict(new_array_transformed)

array([0, 1, 0, 1, 1, 0], dtype=int64)

In [30]:
for string in new_array:
    blob = TextBlob(string)
    print(string + f' {blob.sentiment.polarity:0.3f}')

loving this feeling amazing happy 0.667
yay we're going bowling - so excited 0.375
what a wonderful day! 1.000
they can go to hell, idiots -0.800
I hate you and I hate everyone -0.800
what a poor excuse - sorry but they're terrible liars -0.487


In [31]:
log_clf.predict_proba(new_array_transformed)

array([[0.72122637, 0.27877363],
       [0.4937703 , 0.5062297 ],
       [0.52578153, 0.47421847],
       [0.23334983, 0.76665017],
       [0.43009253, 0.56990747],
       [0.61629388, 0.38370612]])

---