# Text Classifiers

## Imports

In [339]:
import numpy as np
import pandas as pd
import sklearn
import imblearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, classification_report, mean_squared_error, r2_score, multilabel_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

## Data preparation

### Retrieving data

In [340]:
data = pd.read_csv('https://query.data.world/s/ucw6adw4twegolswpwcjwipvlke2y6?dws=00000', encoding = "ISO-8859-1")
data = data[data['sentiment'] != 'not_relevant']
data.reset_index(drop=True, inplace=True)
data["sentiment"] = pd.to_numeric(data["sentiment"])
data.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,sentiment,sentiment:confidence,date,id,query,sentiment_gold,text
0,623495513,True,golden,10,,3,0.6264,Mon Dec 01 19:30:03 +0000 2014,5.4e+17,#AAPL OR @Apple,3\nnot_relevant,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,623495514,True,golden,12,,3,0.8129,Mon Dec 01 19:43:51 +0000 2014,5.4e+17,#AAPL OR @Apple,3\n1,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,623495515,True,golden,10,,3,1.0,Mon Dec 01 19:50:28 +0000 2014,5.4e+17,#AAPL OR @Apple,3,My cat only chews @apple cords. Such an #Apple...
3,623495516,True,golden,17,,3,0.5848,Mon Dec 01 20:26:34 +0000 2014,5.4e+17,#AAPL OR @Apple,3\n1,I agree with @jimcramer that the #IndividualIn...
4,623495517,False,finalized,3,12/12/14 12:14,3,0.6474,Mon Dec 01 20:29:33 +0000 2014,5.4e+17,#AAPL OR @Apple,,Nobody expects the Spanish Inquisition #AAPL


In [341]:
data['sentiment'].value_counts()

sentiment
3    2162
1    1219
5     423
Name: count, dtype: int64

In [342]:
X = data['text'].copy()
y = data['sentiment'].copy()

### Splitting data

In [343]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [344]:
print("Samples per class: {}".format(np.bincount(y_train)))
print("Samples per class: {}".format(np.bincount(y_test)))

Samples per class: [   0  970    0 1717    0  356]
Samples per class: [  0 249   0 445   0  67]


## Model building

### Pipelines

In [345]:
class_weights = {3: 1, 1: 2, 5: 10}

pipeLR = Pipeline([('tfidf', TfidfVectorizer()),( 'clf', LogisticRegression())])
pipeNB = Pipeline([('tfidf', TfidfVectorizer()),( 'clf', MultinomialNB())])
pipeRF = Pipeline([('tfidf', TfidfVectorizer()),( 'clf', RandomForestClassifier())])
pipeSVC = Pipeline([('tfidf', TfidfVectorizer()),( 'clf', LinearSVC(class_weight=class_weights, dual=False))])

#from imblearn.pipeline import Pipeline as ImbPipeline
#from imblearn.over_sampling import RandomOverSampler
#pipelines with preprocessing smote
#pipeLR = ImbPipeline([('tfidf', TfidfVectorizer()), ('sampler', RandomOverSampler(sampling_strategy="not majority")), ('clf', LogisticRegression())])
#pipeNB = ImbPipeline([('tfidf', TfidfVectorizer()), ('sampler', RandomOverSampler(sampling_strategy="not majority")), ('clf', MultinomialNB())])
#pipeRF = ImbPipeline([('tfidf', TfidfVectorizer()), ('sampler', RandomOverSampler(sampling_strategy="not majority")), ('clf', RandomForestClassifier())])
#pipeSVC = ImbPipeline([('tfidf', TfidfVectorizer()), ('sampler', RandomOverSampler(sampling_strategy="not majority")),( 'clf', LinearSVC(class_weight=class_weights, dual=False))])

### Training models

In [346]:
pipeLR.fit(X_train, y_train)
predict_test_LR = pipeLR.predict(X_test)

pipeNB.fit(X_train, y_train)
predict_test_NB = pipeNB.predict(X_test)

pipeRF.fit(X_train, y_train)
predict_test_RF = pipeRF.predict(X_test)

pipeSVC.fit(X_train, y_train)
predict_test_SVC = pipeSVC.predict(X_test)

### Testing predictions

In [355]:
lr_test_accuracy = accuracy_score(y_test, predict_test_LR)
lr_test_precision = precision_score(y_test, predict_test_LR,average='macro', labels=np.unique(predict_test_LR))
lr_test_mse = mean_squared_error(y_test, predict_test_LR)
lr_test_rmse = np.sqrt(lr_test_mse)
lr_test_r2 = r2_score(y_test, predict_test_LR)

nb_test_accuracy = accuracy_score(y_test, predict_test_NB)
nb_test_precision = precision_score(y_test, predict_test_NB,average='macro', labels=np.unique(predict_test_NB))
nb_test_mse = mean_squared_error(y_test, predict_test_NB)
nb_test_rmse = np.sqrt(nb_test_mse)
nb_test_r2 = r2_score(y_test, predict_test_NB)

rf_test_accuracy = accuracy_score(y_test, predict_test_RF)
rf_test_precision = precision_score(y_test, predict_test_RF,average='macro', labels=np.unique(predict_test_RF))
rf_test_mse = mean_squared_error(y_test, predict_test_RF)
rf_test_rmse = np.sqrt(rf_test_mse)
rf_test_r2 = r2_score(y_test, predict_test_RF)

svc_test_accuracy = accuracy_score(y_test, predict_test_SVC)
svc_test_precision = precision_score(y_test, predict_test_SVC,average='macro', labels=np.unique(predict_test_SVC))
svc_test_mse = mean_squared_error(y_test, predict_test_SVC)
svc_test_rmse = np.sqrt(svc_test_mse)
svc_test_r2 = r2_score(y_test, predict_test_SVC)

## Results

## Comparing results

### Testing results

In [356]:
lr_results = pd.DataFrame(['Linear regression', lr_test_accuracy, lr_test_precision, lr_test_mse, lr_test_rmse, lr_test_r2]).transpose()
nb_results = pd.DataFrame(['Naive Bayes', nb_test_accuracy, nb_test_precision, nb_test_mse, nb_test_rmse, nb_test_r2]).transpose()
rf_results = pd.DataFrame(['Random Forest', rf_test_accuracy, rf_test_precision, rf_test_mse, rf_test_rmse, rf_test_r2]).transpose()
svc_results = pd.DataFrame(['SVC', svc_test_accuracy, svc_test_precision, svc_test_mse, svc_test_rmse, svc_test_r2]).transpose()

models_results = pd.concat([lr_results, nb_results, rf_results, svc_results], axis=0)
models_results.columns = ['Method', 'Accuracy', 'Precision', 'MSE', 'RMSE', 'R2']
models_results

Unnamed: 0,Method,Accuracy,Precision,MSE,RMSE,R2
0,Linear regression,0.759527,0.763285,1.135348,1.065527,0.207261
0,Naive Bayes,0.755585,0.770451,1.151117,1.072901,0.196251
0,Random Forest,0.760841,0.751236,1.240473,1.113765,0.133859
0,SVC,0.750329,0.668733,1.219448,1.104286,0.14854


### Confusion  matrices

In [357]:
print(f"LR-confusion-matrix:\n accuracy_score:\n {multilabel_confusion_matrix(y_test, predict_test_LR)}\n ")
print(f"NB-confusion-matrix:\n accuracy_score:\n {multilabel_confusion_matrix(y_test, predict_test_NB)}\n ")
print(f"RF-confusion-matrix:\n accuracy_score:\n {multilabel_confusion_matrix(y_test, predict_test_RF)}\n ")
print(f"SVC-confusion-matrix:\n accuracy_score:\n {multilabel_confusion_matrix(y_test, predict_test_SVC)}\n ")

LR-confusion-matrix:
 accuracy_score:
 [[[444  68]
  [ 73 176]]

 [[206 110]
  [ 62 383]]

 [[689   5]
  [ 48  19]]]
 
NB-confusion-matrix:
 accuracy_score:
 [[[457  55]
  [ 78 171]]

 [[186 130]
  [ 45 400]]

 [[693   1]
  [ 63   4]]]
 
RF-confusion-matrix:
 accuracy_score:
 [[[435  77]
  [ 69 180]]

 [[217  99]
  [ 65 380]]

 [[688   6]
  [ 48  19]]]
 
SVC-confusion-matrix:
 accuracy_score:
 [[[445  67]
  [ 66 183]]

 [[229  87]
  [ 89 356]]

 [[658  36]
  [ 35  32]]]
 


## Examples

In [358]:
sentiment_mapping = { 1: "negative", 3: "neutral", 5: "positive"}

review1 = "I hate this!"
review2 = "Not bad, but I wish there was more."
review3 = "Latest news regarding the stock market."
review4 = "I used to love it but no more..."
review5 = "Great! Wonderful!"

prediction_LR_r1 = pipeLR.predict([review1])
prediction_LR_r2 = pipeLR.predict([review2])
prediction_LR_r3 = pipeLR.predict([review3])
prediction_LR_r4 = pipeLR.predict([review4])
prediction_LR_r5 = pipeLR.predict([review5])


sent_label_LR_r1 = sentiment_mapping.get(prediction_LR_r1[0], "unknown")
sent_label_LR_r2 = sentiment_mapping.get(prediction_LR_r2[0], "unknown")
sent_label_LR_r3 = sentiment_mapping.get(prediction_LR_r3[0], "unknown")
sent_label_LR_r4 = sentiment_mapping.get(prediction_LR_r4[0], "unknown")
sent_label_LR_r5 = sentiment_mapping.get(prediction_LR_r5[0], "unknown")

 
print(f"Linear Regression: review \"{review1}\" is {sent_label_LR_r1}")
print(f"Linear Regression: review \"{review2}\" is {sent_label_LR_r2}")
print(f"Linear Regression: review \"{review3}\" is {sent_label_LR_r3}")
print(f"Linear Regression: review \"{review4}\" is {sent_label_LR_r4}")
print(f"Linear Regression: review \"{review5}\" is {sent_label_LR_r5}")

Linear Regression: review "I hate this!" is negative
Linear Regression: review "Not bad, but I wish there was more." is negative
Linear Regression: review "Latest news regarding the stock market." is neutral
Linear Regression: review "I used to love it but no more..." is negative
Linear Regression: review "Great! Wonderful!" is positive
