In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Display up to 100 rows and never truncate columns when rendering frames.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

In [None]:
def calculate_plot_roc(y_test, y_score, classes):
    """Compute and plot per-class, micro- and macro-averaged ROC curves.

    Parameters
    ----------
    y_test : array-like of shape (n_samples, n_classes)
        Binary indicator matrix of the true labels. Column ``i`` belongs to
        the ``i``-th entry of ``classes``.
    y_score : array-like of shape (n_samples, n_classes)
        Score of every sample for every class, column-aligned with ``y_test``.
    classes : dict or sequence
        Class definitions in column order. For a dict the values are used as
        legend names; for any other sequence the items themselves are used.

    Returns
    -------
    dict
        ``{'fpr': ..., 'tpr': ..., 'roc_auc': ...}``. Each inner dict is keyed
        by the integer column index of a class plus ``"micro"`` and
        ``"macro"``.

    Raises
    ------
    ValueError
        If ``y_test`` is not 2-D, if ``y_score`` has a different shape, or if
        the number of columns does not match ``len(classes)``.
    """
    # Accept DataFrames / lists as well as ndarrays: positional slicing below
    # needs real arrays.
    y_test = np.asarray(y_test)
    y_score = np.asarray(y_score)
    n_classes = len(classes)

    if y_test.ndim != 2 or y_score.shape != y_test.shape:
        raise ValueError(
            "y_test and y_score must be 2-D arrays of identical shape, got "
            "{0} and {1}".format(y_test.shape, y_score.shape))
    if y_test.shape[1] != n_classes:
        raise ValueError(
            "expected {0} columns (one per class), got {1}".format(
                n_classes, y_test.shape[1]))

    # Legend names, resolved once instead of on every loop iteration.
    if isinstance(classes, dict):
        class_names = list(classes.values())
    else:
        class_names = list(classes)

    fpr = {}
    tpr = {}
    roc_auc = {}
    lw = 2

    # One-vs-rest ROC curve and area for each class column.
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one binary task.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    # y_test is an indicator matrix, so roc_auc_score averages over columns;
    # the multiclass-only `multi_class` / `labels` arguments do not apply.
    roc_auc["micro"] = roc_auc_score(y_test, y_score, average='micro')

    # Macro-average curve: interpolate every class curve onto the union of
    # all false-positive rates, then take the mean.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = roc_auc_score(y_test, y_score, average='macro')

    # Plot all ROC curves on a single axes.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(fpr["micro"], tpr["micro"],
            label='micro-average ROC curve (area = {0:0.2f})'
                  ''.format(roc_auc["micro"]),
            color='deeppink', linestyle=':', linewidth=4)

    ax.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
                  ''.format(roc_auc["macro"]),
            color='navy', linestyle=':', linewidth=4)

    for i in range(n_classes):
        ax.plot(fpr[i], tpr[i], lw=lw,
                label='ROC curve of class {0} (area = {1:0.2f})'
                      ''.format(class_names[i], roc_auc[i]))

    # Diagonal reference line (TPR == FPR).
    ax.plot([0, 1], [0, 1], 'k--', lw=lw)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC')
    ax.legend(loc="lower right")
    ax.grid()
    plt.show()
    return {'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc}

## Read and split data

In [None]:
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
reviews_path = 'reviews_w_topics_test.pkl'
df = pd.read_pickle(reviews_path)

In [None]:
# Round each sentiment value to the nearest integer to get a categorical label.
df['sentiment_cat'] = df['sentiment'].round()

In [None]:
# Name of the target column.
y_col = 'sentiment_cat'

# Feature columns: rating_1..rating_5 followed by topic_1..topic_8. The raw
# 'rating' column is not used as a feature.
# NOTE(review): if rating_1..rating_5 are one-hot dummies of `rating`, using
# all five together with an added constant is perfectly collinear - confirm.
X_cols = (
    ['rating_{}'.format(k) for k in range(1, 6)]
    + ['topic_{}'.format(k) for k in range(1, 9)]
)

# Label value -> display name. The key order fixes the column order of the
# binarized targets built from these keys.
classes = {
    1: 'positive',
    0: 'neutral',
    -1: 'negative',
}

In [None]:
# Feature matrix and binarized targets; the indicator columns are requested in
# the key order of `classes`.
X = df[X_cols]
y = label_binarize(df[y_col], classes=list(classes.keys()))

# Stratify on the same label column that y was built from (y_col) rather than
# a second hard-coded column name, so the two cannot drift apart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=df[y_col], random_state=3)

## Models

In [None]:
# Collects the ROC results of every model, keyed by model name.
evaluation = dict()

### Logistic Regression

In [None]:
# Fit a multinomial logit on the training split with a constant column added.
train_design = sm.add_constant(X_train)
logreg = sm.MNLogit(y_train, train_design).fit()

In [None]:
# Print the fitted model's summary as plain text.
logreg_summary = logreg.summary()
print(logreg_summary)

In [None]:
# Predicted values for the held-out split, as a plain array.
# NOTE(review): add_constant's default (has_constant='skip') leaves the frame
# unchanged if it already holds a constant column - confirm X_test never does,
# otherwise its column count would differ from the training design matrix.
test_design = sm.add_constant(X_test)
y_score = logreg.predict(test_design).values

In [None]:
# Plot the ROC curves for the logistic model and keep the metrics.
logistic_roc = calculate_plot_roc(y_test, y_score, classes=classes)
evaluation['Logistic'] = logistic_roc

### SVM

In [None]:
# Work on a reproducible subsample (presumably to keep the SVM grid search
# tractable - confirm). Cap the size at len(df): DataFrame.sample raises when
# asked for more rows than exist without replacement.
svm_sample_size = min(10000, len(df))
sample = df.sample(svm_sample_size, random_state=3)
X = sample[X_cols]
y = label_binarize(sample[y_col], classes=list(classes.keys()))

# NOTE: this rebinds X, y and the train/test splits created earlier from the
# full frame; cells after this one see the subsample. Stratify on y_col so the
# split uses the same label column that y was built from.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=sample[y_col], random_state=3)

In [None]:
# Hyper-parameter grid for the SVC wrapped in OneVsRestClassifier, hence the
# 'estimator__' prefix. The gamma values must be ordinary quoted strings;
# typographic quotes are a SyntaxError.
param_grid = [
    {'estimator__C': [1, 10, 100], 'estimator__kernel': ['linear']},
    {'estimator__C': [1, 10, 100],
     'estimator__gamma': ['scale', 'auto'],
     'estimator__kernel': ['rbf', 'poly']},
]

In [None]:
# One-vs-rest SVC, tuned by a 2-fold grid search over param_grid on all cores.
model = OneVsRestClassifier(svm.SVC())
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    verbose=10,
)
# fit() returns the search object itself, so `grid` is the fitted search.
grid = search.fit(X_train, y_train)

In [None]:
# ROC analysis needs a continuous score per class. `predict` returns hard 0/1
# labels, which collapses every ROC curve to a single operating point, so use
# the per-class decision values of the best estimator instead.
y_score = grid.decision_function(X_test)

In [None]:
# Plot the ROC curves for the SVM and keep the metrics.
svm_roc = calculate_plot_roc(y_test, y_score, classes=classes)
evaluation['SVM'] = svm_roc

### CART