# Twitter Sentiment Analysis - POC
---

## TextBlob

Using TextBlob as a classifier. Questions remain:

- why are predictions unstable with Logistic Regression and stable with TextBlob?
- why is TextBlob's accuracy lower then - could it be that we're not training positive/negative but training to recognize whatever set of emoji's were used to create the sentiment140 dataset?

In [1]:
import re
import os
import time
import json

import numpy as np
import pandas as pd
import scipy.sparse as sp

import cleanup_module_POC as Cmod

from textblob import TextBlob
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
# time notebook
start_notebook = time.time()

# load minimally prepared X, y train subsets
deduped_path = os.path.join("..","data","1.2_deduped","tweets","prepared")
X_train = pd.read_csv(os.path.join(deduped_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(deduped_path, "y_train.csv"))

### Sample down to $m\approx100k$

In [13]:
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.87, random_state=42)

In [14]:
# check
print(f'Dataset size: {len(X):0.0f}')
print(f'Target distribution: {sum(y["target"])/len(y):0.3f}')

Dataset size: 101287
Target distribution: 0.533


In [15]:
# transform into arrays
X_array = np.array(X.iloc[:, 2]).ravel()
y_array = y.iloc[:,0].ravel()

In [16]:
preprocess_pipeline = Pipeline([
    ("document_to_wordcount", Cmod.DocumentToWordCounterTransformer()),
    ("wordcount_to_vector", Cmod.WordCounterToVectorTransformer(vocabulary_size=30000)),
])

In [17]:
proc_dir = os.path.join("..","data","3_processed","tweets")

try:
    # load X_train_transformed 
    X_train_transformed = sp.load_npz(os.path.join(proc_dir, "X_train_transformed_BoW_100k_30k.npz"))
    print('Loading X_train_transformed...')
except:
    print('Preprocessing X_array...')
    start_pipeline = time.time()
    X_train_transformed = preprocess_pipeline.fit_transform(X_array) 
    # save preprocessed data
    sp.save_npz(os.path.join(proc_dir, 'X_train_transformed_BoW_100k_30k.npz'), X_train_transformed)
    # print processing time
    mins, secs = divmod(time.time() - start_pipeline, 60)
    print(f'Preprocessing time: {mins:0.0f} minute(s) and {secs:0.0f} second(s).')
    # Preprocessing time: 2 minute(s) and 41 second(s).

Preprocessing X_array...
Preprocessing time: 1 minute(s) and 9 second(s).


In [18]:
X_train_transformed

<101287x30001 sparse matrix of type '<class 'numpy.int32'>'
	with 989028 stored elements in Compressed Sparse Row format>

### Get TextBlob.sentiment.polarity

In [19]:
def make_binary(val):
    if val <= 0: # better accuracy than val < 0
        return 0
    else:
        return 1
        
def extract_polarity(array):
    bin_vals = []
    for entry in array:
        blob = TextBlob(entry)
        polarity = blob.sentiment.polarity
        bin_vals.append(make_binary(polarity))
    return bin_vals

In [20]:
y_preds = extract_polarity(X_array)

In [21]:
from sklearn.metrics import accuracy_score
print(f'TexBlob\'s accuracy: {accuracy_score(y_array, y_preds):0.4f}')

TexBlob's accuracy: 0.5240


### Naive Bayes and LR for comparison

In [28]:
NB_clf = MultinomialNB()
log_clf = LogisticRegression(solver="liblinear", random_state=42)

In [29]:
score = cross_val_score(NB_clf, X_train_transformed, y_array, cv=5, verbose=0, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {round(score.mean(),4):0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.6355 (+/- 0.0036)


In [30]:
score = cross_val_score(log_clf, X_train_transformed, y_array, cv=5, verbose=0, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {round(score.mean(),4):0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.6407 (+/- 0.0050)


### Peek at some predictions

In [31]:
new_array = np.array([
                      "loving this feeling amazing happy",
                      "yay we're going bowling - so excited",
                      "what a wonderful day!",
                      "they can go to hell, idiots",
                      "I hate you and I hate everyone",
                      "what a poor excuse - sorry but they're terrible liars"
                      ])

In [32]:
new_array_transformed = preprocess_pipeline.fit_transform(new_array)

In [33]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_train_transformed, y_array)

LogisticRegression(random_state=42, solver='liblinear')

In [36]:
log_clf.predict(new_array_transformed)

array([0, 0, 0, 0, 1, 0], dtype=int64)

---