# Logistic Regression

In [None]:
import time
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_curve, roc_auc_score, log_loss, accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
def classif_reports(y_true, y_pred):
    """Print accuracy and a per-class report, and return the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as -1 (Neg), 0 (Neut) and 1 (Pos).
    y_pred : array-like
        Predicted labels using the same encoding.

    Returns
    -------
    pandas.DataFrame
        3x3 confusion matrix computed with ``normalize = 'true'``; rows are the
        true classes and columns the predicted classes, both ordered
        Neg, Neut, Pos.
    """
    # Class codes in the order used for the confusion-matrix rows and columns.
    matrix_labels = [-1, 0, 1]

    print('Accuracy Score:', accuracy_score(y_true, y_pred))
    swn_report = classification_report(y_true, y_pred, labels = [0, 1, -1], target_names = ['Neut', 'Pos', 'Neg'])
    print('\nClassification Report',swn_report)

    # Pin the label order explicitly. Without `labels`, the matrix only covers
    # the classes that actually occur, so the fixed 3x3 row/column names below
    # would not fit whenever one class is absent from both inputs.
    matrix = confusion_matrix(y_true, y_pred, labels = matrix_labels, normalize = 'true')
    conf_matrix = pd.DataFrame(matrix,
                               index = ['True Neg', 'True Neut', 'True Pos'],
                               columns = ['Pred Neg', 'Pred Neut', 'Pred Pos'])
    return conf_matrix

In [None]:
import os

# Folder (relative to the current directory) that holds one sub-folder per run.
root_logdir = os.path.join(os.curdir, "my_logs")


def get_run_logdir(model_name):
    """Build a timestamped log-directory path for ``model_name``.

    The folder name has the form ``run_<YYYY_MM_DD-HH_MM_SS>_<model_name>``,
    stamped at call time, and is joined onto ``root_logdir``. The path is only
    computed here; nothing is created on disk.
    """
    stamp_prefix = time.strftime('run_%Y_%m_%d-%H_%M_%S_')
    folder_name = ''.join((stamp_prefix, str(model_name)))
    return os.path.join(root_logdir, folder_name)

In [None]:
# Import Response Variable
# One header-less CSV per lexicon (swn / afn) and per split (train / val).
target_paths = (
    './data/train_swn_target.csv',
    './data/train_afn_target.csv',
    './data/val_swn_target.csv',
    './data/val_afn_target.csv',
)
train_swn_target, train_afn_target, val_swn_target, val_afn_target = [
    pd.read_csv(target_path, header = None) for target_path in target_paths
]

## SentiWordNet

In [None]:
# Base estimator handed to the first hyper-parameter search: multinomial
# logistic regression fitted with the 'saga' solver. Later sections re-create
# it before running their own search.
logit_model = LogisticRegression(
    solver = 'saga',
    multi_class = 'multinomial',
    max_iter = 1500,
    random_state = 42,
)

### Doc2Vec DM Model

In [None]:
# Import Data
# Header-less CSVs holding the Doc2Vec DM feature vectors for each split.
dm_train_vecs, dm_val_vecs = (
    pd.read_csv(csv_path, header = None)
    for csv_path in ('./data/dm_train_vecs.csv', './data/dm_val_vecs.csv')
)

In [None]:
# Stack the training and validation rows into single frames with a fresh 0..n-1 index.
# NOTE(review): these were meant as input for RandomizedSearchCV, but the search cell
# that follows fits on dm_train_vecs / train_swn_target only -- confirm they are still needed.
dm_full_vecs = pd.concat([dm_train_vecs, dm_val_vecs], ignore_index = True)
swn_full_targets = pd.concat([train_swn_target, val_swn_target], ignore_index = True)

In [None]:
start_time = time.time()

# Search space: no penalty, l1 / l2 over C, and elastic net over C x l1_ratio.
c_values = [0.01, 0.1, 1, 5, 10]
param_grid = [
    {'penalty': ['none']},
    {'penalty': ['l1', 'l2'], 'C': c_values},
    {'penalty': ['elasticnet'], 'C': c_values, 'l1_ratio': [0.2, 0.5, 0.8]},
]

# Randomised search with 6-fold cross-validation, ranked by negative log-loss.
rand_search = RandomizedSearchCV(
    estimator = logit_model,
    param_distributions = param_grid,
    scoring = 'neg_log_loss',
    cv = 6,
    random_state = 42,
)
# The targets are loaded as a one-column frame; flatten them to 1-D for fitting.
rand_search.fit(dm_train_vecs, train_swn_target.values.ravel())

# Keep the best estimator found by the search for the evaluation cells.
logit_dm = rand_search.best_estimator_
print(rand_search.best_params_)
elapsed = time.time() - start_time
print('------- Time to execute: %f -------' % elapsed)

{'penalty': 'elasticnet', 'l1_ratio': 0.8, 'C': 0.1}
------- Time to execute: 375.372798 -------


In [None]:
# Raw per-candidate results of the search above (fit/score timings and the
# sampled parameter values, among others), shown as a plain dict.
rand_search.cv_results_

{'mean_fit_time': array([ 1.02502855,  1.29008826,  1.03509629, 18.89740364,  1.2665803 ,
         6.58263405,  1.1371208 ,  1.15411886, 28.4990803 ,  1.31206659]),
 'std_fit_time': array([0.02776818, 0.02654471, 0.01185894, 0.56628145, 0.07020506,
        0.11813427, 0.06394292, 0.06007851, 0.10920948, 0.02172603]),
 'mean_score_time': array([0.00970654, 0.01015985, 0.01114988, 0.00996459, 0.00978728,
        0.00998735, 0.00973531, 0.01030934, 0.00990895, 0.01009707]),
 'std_score_time': array([0.00061818, 0.00068942, 0.00247572, 0.00019662, 0.00014094,
        0.00022748, 0.00016918, 0.00071842, 0.00010279, 0.00045131]),
 'param_penalty': masked_array(data=['l2', 'elasticnet', 'none', 'elasticnet', 'elasticnet',
                    'l1', 'elasticnet', 'l1', 'elasticnet', 'l1'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array(data=[5, 0.1, --, 10

In [None]:
# Evaluate the tuned DM model on the same rows the search was fitted on.
dm_preds = logit_dm.predict(dm_train_vecs)
classif_reports(y_true = train_swn_target, y_pred = dm_preds)

Accuracy Score: 0.4759521517943077

Classification Report               precision    recall  f1-score   support

        Neut       0.40      0.15      0.22      9632
         Pos       0.47      0.70      0.57     14691
         Neg       0.51      0.46      0.48     12042

    accuracy                           0.48     36365
   macro avg       0.46      0.44      0.42     36365
weighted avg       0.46      0.48      0.45     36365



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.463627,0.083375,0.452998
True Neut,0.234738,0.15189,0.613372
True Pos,0.217344,0.084133,0.698523


In [None]:
# Evaluate the tuned DM model on the validation rows, which the search was not fitted on.
dm_val_preds = logit_dm.predict(dm_val_vecs)
classif_reports(y_true = val_swn_target, y_pred = dm_val_preds)

Accuracy Score: 0.4534

Classification Report               precision    recall  f1-score   support

        Neut       0.38      0.20      0.26      1313
         Pos       0.47      0.60      0.53      2005
         Neg       0.46      0.47      0.46      1682

    accuracy                           0.45      5000
   macro avg       0.43      0.43      0.42      5000
weighted avg       0.44      0.45      0.44      5000



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.473841,0.120095,0.406064
True Neut,0.296268,0.201828,0.501904
True Pos,0.2798,0.119202,0.600998


### Doc2Vec DM Mean Model

In [None]:
# Import Data
# Header-less CSVs holding the Doc2Vec DM-mean feature vectors for each split.
dm_mean_train_vecs, dm_mean_val_vecs = (
    pd.read_csv(csv_path, header = None)
    for csv_path in ('./data/dm_mean_train_vecs.csv', './data/dm_mean_val_vecs.csv')
)


In [None]:
# Fresh base estimator for this section: multinomial logistic regression with the 'saga' solver.
logit_model = LogisticRegression(random_state = 42, solver = 'saga',
                                 multi_class = 'multinomial', max_iter = 1500)

start_time = time.time()
# Search space: no penalty, l1 / l2 over C, and elastic net over C x l1_ratio.
param_grid = [
    {'penalty': ['none']},
    {'penalty': ['l1', 'l2'], 'C':[0.01, 0.1, 1, 5, 10]},
    {'penalty': ['elasticnet'], 'C':[0.01, 0.1, 1, 5, 10],
        'l1_ratio': [0.2, 0.5, 0.8]}
]

# Rank candidates by negative log-loss, the criterion the Doc2Vec DM and TF-IDF
# searches in this notebook use, so every representation is tuned the same way.
rand_search = RandomizedSearchCV(logit_model, param_grid, random_state = 42, cv = 6, scoring = 'neg_log_loss')
rand_search.fit(dm_mean_train_vecs, np.ravel(train_swn_target))
print(rand_search.best_params_)
# NOTE: this rebinds logit_dm, replacing the estimator kept by the previous section.
logit_dm = rand_search.best_estimator_
print('------- Time to execute: %f -------' %(time.time() - start_time))

{'penalty': 'none'}
------- Time to execute: 346.838643 -------


In [None]:
# Evaluate the tuned DM-mean model on the same rows the search was fitted on.
dmm_train_preds = logit_dm.predict(dm_mean_train_vecs)
classif_reports(y_true = train_swn_target, y_pred = dmm_train_preds)

Accuracy Score: 0.47537467344974565

Classification Report               precision    recall  f1-score   support

        Neut       0.39      0.15      0.22      9632
         Pos       0.48      0.69      0.56     14691
         Neg       0.50      0.47      0.49     12042

    accuracy                           0.48     36365
   macro avg       0.46      0.44      0.42     36365
weighted avg       0.46      0.48      0.45     36365



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.47243,0.085534,0.442036
True Neut,0.243148,0.153654,0.603198
True Pos,0.225716,0.085563,0.688721


In [None]:
# Evaluate the tuned DM-mean model on the validation rows, which the search was not fitted on.
dmm_val_preds = logit_dm.predict(dm_mean_val_vecs)
classif_reports(y_true = val_swn_target, y_pred = dmm_val_preds)

Accuracy Score: 0.464

Classification Report               precision    recall  f1-score   support

        Neut       0.38      0.20      0.26      1313
         Pos       0.48      0.62      0.54      2005
         Neg       0.47      0.49      0.48      1682

    accuracy                           0.46      5000
   macro avg       0.44      0.44      0.43      5000
weighted avg       0.45      0.46      0.45      5000



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.485731,0.121284,0.392985
True Neut,0.288652,0.202589,0.508759
True Pos,0.268329,0.114713,0.616958


### D2V DM Concat

In [None]:
# Import Data
# Header-less CSVs holding the Doc2Vec DM-concat feature vectors for each split.
dm_concat_train_vecs, dm_concat_val_vecs = (
    pd.read_csv(csv_path, header = None)
    for csv_path in ('./data/dm_concat_train_vecs.csv', './data/dm_concat_val_vecs.csv')
)


In [None]:
# Fresh base estimator for this section: multinomial logistic regression with the 'saga' solver.
logit_model = LogisticRegression(random_state = 42, solver = 'saga',
                                 multi_class = 'multinomial', max_iter = 1500)

start_time = time.time()
# Search space: no penalty, l1 / l2 over C, and elastic net over C x l1_ratio.
param_grid = [
    {'penalty': ['none']},
    {'penalty': ['l1', 'l2'], 'C':[0.01, 0.1, 1, 5, 10]},
    {'penalty': ['elasticnet'], 'C':[0.01, 0.1, 1, 5, 10],
        'l1_ratio': [0.2, 0.5, 0.8]}
]

# Rank candidates by negative log-loss, the criterion the Doc2Vec DM and TF-IDF
# searches in this notebook use, so every representation is tuned the same way.
rand_search = RandomizedSearchCV(logit_model, param_grid, random_state = 42, cv = 6, scoring = 'neg_log_loss')
rand_search.fit(dm_concat_train_vecs, np.ravel(train_swn_target))
print(rand_search.best_params_)
# NOTE: this rebinds logit_dm, replacing the estimator kept by the previous section.
logit_dm = rand_search.best_estimator_
print('------- Time to execute: %f -------' %(time.time() - start_time))

{'penalty': 'none'}
------- Time to execute: 370.043563 -------


In [None]:
# Evaluate the tuned DM-concat model on the same rows the search was fitted on.
dmc_train_preds = logit_dm.predict(dm_concat_train_vecs)
classif_reports(y_true = train_swn_target, y_pred = dmc_train_preds)

Accuracy Score: 0.4741922177918328

Classification Report               precision    recall  f1-score   support

        Neut       0.40      0.16      0.23      9632
         Pos       0.47      0.69      0.56     14691
         Neg       0.50      0.46      0.48     12042

    accuracy                           0.47     36365
   macro avg       0.46      0.44      0.42     36365
weighted avg       0.46      0.47      0.45     36365



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.46288,0.085534,0.451586
True Neut,0.236919,0.161337,0.601744
True Pos,0.225036,0.086379,0.688585


In [None]:
# Evaluate the tuned DM-concat model on the validation rows, which the search was not fitted on.
dmc_val_preds = logit_dm.predict(dm_concat_val_vecs)
classif_reports(y_true = val_swn_target, y_pred = dmc_val_preds)

Accuracy Score: 0.4572

Classification Report               precision    recall  f1-score   support

        Neut       0.39      0.21      0.27      1313
         Pos       0.48      0.60      0.53      2005
         Neg       0.45      0.48      0.47      1682

    accuracy                           0.46      5000
   macro avg       0.44      0.43      0.42      5000
weighted avg       0.45      0.46      0.44      5000



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.478597,0.117122,0.404281
True Neut,0.309977,0.207159,0.482864
True Pos,0.278803,0.118204,0.602993


### D2V DBow

In [None]:
# Import Data
# Header-less CSVs holding the Doc2Vec DBOW feature vectors for each split.
dbow_train_vecs, dbow_val_vecs = (
    pd.read_csv(csv_path, header = None)
    for csv_path in ('./data/dbow_train_vecs.csv', './data/dbow_val_vecs.csv')
)


In [None]:
# Fresh base estimator for this section: multinomial logistic regression with the 'saga' solver.
logit_model = LogisticRegression(random_state = 42, solver = 'saga',
                                 multi_class = 'multinomial', max_iter = 1500)

start_time = time.time()
# Search space: no penalty, l1 / l2 over C, and elastic net over C x l1_ratio.
param_grid = [
    {'penalty': ['none']},
    {'penalty': ['l1', 'l2'], 'C':[0.01, 0.1, 1, 5, 10]},
    {'penalty': ['elasticnet'], 'C':[0.01, 0.1, 1, 5, 10],
        'l1_ratio': [0.2, 0.5, 0.8]}
]

# Rank candidates by negative log-loss, the criterion the Doc2Vec DM and TF-IDF
# searches in this notebook use, so every representation is tuned the same way.
rand_search = RandomizedSearchCV(logit_model, param_grid, random_state = 42, cv = 6, scoring = 'neg_log_loss')
rand_search.fit(dbow_train_vecs, np.ravel(train_swn_target))
print(rand_search.best_params_)
# NOTE: this rebinds logit_dm, replacing the estimator kept by the previous section.
logit_dm = rand_search.best_estimator_
print('------- Time to execute: %f -------' %(time.time() - start_time))

{'penalty': 'l2', 'C': 5}
------- Time to execute: 367.399518 -------


In [None]:
# Evaluate the tuned DBOW model on the same rows the search was fitted on.
dbow_train_preds = logit_dm.predict(dbow_train_vecs)
classif_reports(y_true = train_swn_target, y_pred = dbow_train_preds)

Accuracy Score: 0.4737247353224254

Classification Report               precision    recall  f1-score   support

        Neut       0.40      0.16      0.23      9632
         Pos       0.47      0.68      0.56     14691
         Neg       0.50      0.47      0.49     12042

    accuracy                           0.47     36365
   macro avg       0.46      0.44      0.42     36365
weighted avg       0.46      0.47      0.45     36365



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.470271,0.084704,0.445026
True Neut,0.233596,0.157288,0.609115
True Pos,0.227554,0.088421,0.684024


In [None]:
# Evaluate the tuned DBOW model on the validation rows, which the search was not fitted on.
dbow_val_preds = logit_dm.predict(dbow_val_vecs)
classif_reports(y_true = val_swn_target, y_pred = dbow_val_preds)

Accuracy Score: 0.4534

Classification Report               precision    recall  f1-score   support

        Neut       0.37      0.21      0.27      1313
         Pos       0.47      0.60      0.53      2005
         Neg       0.46      0.46      0.46      1682

    accuracy                           0.45      5000
   macro avg       0.43      0.43      0.42      5000
weighted avg       0.44      0.45      0.44      5000



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.464328,0.124257,0.411415
True Neut,0.284082,0.210967,0.50495
True Pos,0.266334,0.130673,0.602993


### TF - IDF Unigram

In [None]:
# TODO: factor the repeated RandomizedSearchCV cells into a helper, e.g.
# def fit_rand_search(model, params, cv = 6, scoring = None):
    

In [None]:
# Import Data
# Header-less CSVs holding the TF-IDF unigram feature vectors for each split.
tf_uni_train_vecs, tf_uni_val_vecs = (
    pd.read_csv(csv_path, header = None)
    for csv_path in ('./data/tf_uni_train_vecs.csv', './data/tf_uni_val_vecs.csv')
)

In [None]:
# Fresh base estimator for this section: multinomial logistic regression with the
# 'saga' solver, capped at 500 iterations.
logit_model = LogisticRegression(random_state = 42, solver = 'saga',
                                 multi_class = 'multinomial', max_iter = 500)

start_time = time.time()
# Search space: no penalty, l1 / l2 over C, and elastic net over C x l1_ratio.
param_grid = [
    {'penalty': ['none']},
    {'penalty': ['l1', 'l2'],   'C': [0.01, 0.1, 1, 5, 10]},
    {'penalty': ['elasticnet'], 'C': [0.01, 0.1, 1, 5, 10],
        'l1_ratio': [0.1, 0.3, 0.5, 0.8]}
]

# random_state pins which candidates get sampled, so a re-run tries the same
# settings (the Doc2Vec searches in this notebook already pass random_state = 42).
rand_search = RandomizedSearchCV(logit_model, param_grid, random_state = 42, cv = 6, scoring = 'neg_log_loss')
rand_search.fit(tf_uni_train_vecs, np.ravel(train_swn_target))
print(rand_search.best_params_)
# Keep the best estimator found by the search for the evaluation cells.
logit_tfuni = rand_search.best_estimator_
print('------- Time to execute: %f -------' %(time.time() - start_time))

{'penalty': 'elasticnet', 'l1_ratio': 0.5, 'C': 1}
------- Time to execute: 1620.926753 -------


In [None]:
# Evaluate the tuned TF-IDF unigram model on the same rows the search was fitted on.
tfuni_preds = logit_tfuni.predict(tf_uni_train_vecs)
classif_reports(y_true = train_swn_target, y_pred = tfuni_preds)

Accuracy Score: 0.6504331087584215

Classification Report               precision    recall  f1-score   support

        Neut       0.57      0.62      0.59      9632
         Pos       0.67      0.71      0.69     14691
         Neg       0.70      0.60      0.65     12042

    accuracy                           0.65     36365
   macro avg       0.65      0.64      0.64     36365
weighted avg       0.65      0.65      0.65     36365



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.601063,0.1763,0.222637
True Neut,0.134032,0.619913,0.246055
True Pos,0.126268,0.162821,0.710911


In [None]:
# Evaluate the tuned TF-IDF unigram model on the validation rows, which the search was not fitted on.
tfuni_val_preds = logit_tfuni.predict(tf_uni_val_vecs)
classif_reports(y_true = val_swn_target, y_pred = tfuni_val_preds)

Accuracy Score: 0.6198

Classification Report               precision    recall  f1-score   support

        Neut       0.52      0.57      0.54      1313
         Pos       0.66      0.69      0.67      2005
         Neg       0.67      0.58      0.62      1682

    accuracy                           0.62      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.62      0.62      0.62      5000



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.579073,0.198573,0.222354
True Neut,0.162224,0.571973,0.265804
True Pos,0.13616,0.178554,0.685287


### TF - IDF Bigram

In [None]:
# Import Data
# Header-less CSVs holding the TF-IDF bigram feature vectors for each split.
tf_bi_train_vecs, tf_bi_val_vecs = (
    pd.read_csv(csv_path, header = None)
    for csv_path in ('./data/tf_bi_train_vecs.csv', './data/tf_bi_val_vecs.csv')
)

In [None]:
# Fresh base estimator for this section: multinomial logistic regression with the
# 'saga' solver, capped at 500 iterations.
logit_model = LogisticRegression(random_state = 42, solver = 'saga',
                                 multi_class = 'multinomial', max_iter = 500)

start_time = time.time()
# Search space: no penalty, l1 / l2 over C, and elastic net over C x l1_ratio.
param_grid = [
    {'penalty': ['none']},
    {'penalty': ['l1', 'l2'],   'C': [0.01, 0.1, 1, 5, 10]},
    {'penalty': ['elasticnet'], 'C': [0.01, 0.1, 1, 5, 10],
        'l1_ratio': [0.1, 0.3, 0.5, 0.8]}
]

# random_state pins which candidates get sampled, so a re-run tries the same
# settings (the Doc2Vec searches in this notebook already pass random_state = 42).
rand_search = RandomizedSearchCV(logit_model, param_grid, random_state = 42, cv = 6, scoring = 'neg_log_loss')
rand_search.fit(tf_bi_train_vecs, np.ravel(train_swn_target))
print(rand_search.best_params_)
# Keep the best estimator found by the search for the evaluation cells.
logit_tfbi = rand_search.best_estimator_
print('------- Time to execute: %f -------' %(time.time() - start_time))

{'penalty': 'elasticnet', 'l1_ratio': 0.1, 'C': 1}
------- Time to execute: 1026.702643 -------


In [None]:
# Evaluate the tuned TF-IDF bigram model on the same rows the search was fitted on.
tfbi_preds = logit_tfbi.predict(tf_bi_train_vecs)
classif_reports(y_true = train_swn_target, y_pred = tfbi_preds)

Accuracy Score: 0.6436683624364087

Classification Report               precision    recall  f1-score   support

        Neut       0.56      0.61      0.59      9632
         Pos       0.67      0.70      0.69     14691
         Neg       0.69      0.60      0.64     12042

    accuracy                           0.64     36365
   macro avg       0.64      0.64      0.64     36365
weighted avg       0.65      0.64      0.64     36365



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.595997,0.177794,0.226208
True Neut,0.1386,0.61005,0.25135
True Pos,0.131305,0.16391,0.704785


In [None]:
# Evaluate the tuned TF-IDF bigram model on the validation rows, which the search was not fitted on.
tfbi_val_preds = logit_tfbi.predict(tf_bi_val_vecs)
classif_reports(y_true = val_swn_target, y_pred = tfbi_val_preds)

Accuracy Score: 0.613

Classification Report               precision    recall  f1-score   support

        Neut       0.52      0.57      0.54      1313
         Pos       0.65      0.67      0.66      2005
         Neg       0.65      0.57      0.61      1682

    accuracy                           0.61      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.62      0.61      0.61      5000



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.574316,0.202735,0.222949
True Neut,0.16984,0.568165,0.261995
True Pos,0.145137,0.18005,0.674813


### TF - IDF Trigram

In [None]:
# Import Data
# Header-less CSVs holding the TF-IDF trigram feature vectors for each split.
tf_tri_train_vecs, tf_tri_val_vecs = (
    pd.read_csv(csv_path, header = None)
    for csv_path in ('./data/tf_tri_train_vecs.csv', './data/tf_tri_val_vecs.csv')
)

In [None]:
# Fresh base estimator for this section: multinomial logistic regression with the
# 'saga' solver, capped at 1000 iterations.
logit_model = LogisticRegression(random_state = 42, solver = 'saga',
                                 multi_class = 'multinomial', max_iter = 1000)

start_time = time.time()
# Search space: no penalty, l1 / l2 over C, and elastic net over C x l1_ratio.
param_grid = [
    {'penalty': ['none']},
    {'penalty': ['l1', 'l2'],   'C': [0.01, 0.1, 1, 5, 10]},
    {'penalty': ['elasticnet'], 'C': [0.01, 0.1, 1, 5, 10],
        'l1_ratio': [0.1, 0.3, 0.5, 0.8]}
]

# random_state pins which candidates get sampled, so a re-run tries the same
# settings (the Doc2Vec searches in this notebook already pass random_state = 42).
rand_search = RandomizedSearchCV(logit_model, param_grid, random_state = 42, cv = 6, scoring = 'neg_log_loss')
rand_search.fit(tf_tri_train_vecs, np.ravel(train_swn_target))
print(rand_search.best_params_)
# Keep the best estimator found by the search for the evaluation cells.
logit_tftri = rand_search.best_estimator_
print('------- Time to execute: %f -------' %(time.time() - start_time))

{'penalty': 'elasticnet', 'l1_ratio': 0.5, 'C': 1}
------- Time to execute: 1385.407607 -------


In [None]:
# Evaluate the tuned TF-IDF trigram model on the same rows the search was fitted on.
tftri_preds = logit_tftri.predict(tf_tri_train_vecs)
classif_reports(y_true = train_swn_target, y_pred = tftri_preds)

Accuracy Score: 0.6444383335624914

Classification Report               precision    recall  f1-score   support

        Neut       0.56      0.62      0.59      9632
         Pos       0.67      0.70      0.69     14691
         Neg       0.69      0.59      0.64     12042

    accuracy                           0.64     36365
   macro avg       0.64      0.64      0.64     36365
weighted avg       0.65      0.64      0.64     36365



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.594503,0.17987,0.225627
True Neut,0.135174,0.617629,0.247197
True Pos,0.130352,0.166701,0.702947


In [None]:
# Evaluate the tuned TF-IDF trigram model on the validation rows, which the search was not fitted on.
tftri_val_preds = logit_tftri.predict(tf_tri_val_vecs)
classif_reports(y_true = val_swn_target, y_pred = tftri_val_preds)

Accuracy Score: 0.6134

Classification Report               precision    recall  f1-score   support

        Neut       0.51      0.57      0.54      1313
         Pos       0.65      0.67      0.66      2005
         Neg       0.66      0.57      0.61      1682

    accuracy                           0.61      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.62      0.61      0.61      5000



Unnamed: 0,Pred Neg,Pred Neut,Pred Pos
True Neg,0.571344,0.206302,0.222354
True Neut,0.164509,0.573496,0.261995
True Pos,0.140648,0.184539,0.674813


## AFINN