# Second approach: Support Vector Machine

## Dataset preprocessing

In [29]:
import pandas as pd

df = pd.read_csv('../data/train_imdb_reviews.csv')

In [30]:
# Optional cell: run it only for the class-balancing experiments.
# The original (unbalanced) experiments skip this cell.

# Boolean masks for the two sentiment classes (1 = positive, 0 = negative)
is_positive = df["sentiment"] == 1
is_negative = df["sentiment"] == 0

positive_count = int(is_positive.sum())
negative_count = int(is_negative.sum())
print("Total reviews:", df.shape[0])
print("Positive reviews:", positive_count)
print("Negative reviews:", negative_count)
print("ratio:", positive_count/negative_count)

print("-------------------------")

# Downsample whichever class is larger to the size of the smaller one.
# Truncating only the positives would leave the data unbalanced whenever
# the negatives happen to be the majority class.
minority_count = min(positive_count, negative_count)
positive = df[is_positive][:minority_count]
negative = df[is_negative][:minority_count]
print("Positive reviews:", positive.shape[0])
print("Negative reviews:", negative.shape[0])
print("ratio:", positive.shape[0]/negative.shape[0])

# NOTE: this replaces `df`, so later cells train on the balanced subset
df = pd.concat([positive, negative])

Total reviews: 41669
Positive reviews: 27295
Negative reviews: 14374
ratio: 1.8989147071100598
-------------------------
Positive reviews: 14374
Negative reviews: 14374
ratio: 1.0


In [None]:
# Pull the review texts and their sentiment labels out of the frame
# as NumPy arrays; these are the training inputs and targets
X_train = df['review'].to_numpy()
y_train = df['sentiment'].to_numpy()

# sanity check: both arrays should report the same number of samples
for split in (X_train, y_train):
    print(len(split))

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer capped at 10000 features and fit it to the training set.
# The fitted `vectorizer` is kept so the test reviews can be transformed with it later.
# NOTE(review): X_train is overwritten with the vectorized matrix, so re-running this
# cell without re-running the previous one feeds the vectorized data back in.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(X_train)

## Define SVM Model

In [33]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix

# define a scorer that returns the numbers in a confusion matrix, and accuracy
def confusion_matrix_scorer(model, X, y):
    """Score a fitted binary classifier by its confusion-matrix counts.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix handed to ``model.predict``.
    y : true labels, encoded as 0 (negative) / 1 (positive).

    Returns
    -------
    dict
        Keys ``'tn'``, ``'fp'``, ``'fn'``, ``'tp'`` (counts) and
        ``'accuracy'`` (fraction of correct predictions).
    """
    y_pred = model.predict(X)
    # Pin the label order explicitly: without `labels`, a split in which the
    # truth and the predictions contain a single class produces a 1x1 matrix
    # and the four-way unpacking below raises.
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
    acc = (tp + tn) / (tp + tn + fp + fn)
    return {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp, 'accuracy': acc}

# linear-kernel support vector classifier with C=1
model = SVC(C=1, kernel='linear')
# evaluate with 5-fold cross validation using the custom scorer above,
# and keep the estimator fitted on each fold for later inspection
scores = cross_validate(
    model,
    X_train,
    y_train,
    scoring=confusion_matrix_scorer,
    cv=5,
    return_estimator=True,
)

In [34]:
# Pick the fold whose estimator reached the highest validation accuracy.
# `max` keeps the first index on ties, matching a strict ">" scan from fold 0.
fold_accuracies = scores['test_accuracy']
b = max(range(len(fold_accuracies)), key=lambda fold: fold_accuracies[fold])

# remember that estimator and its score; it is re-trained on the full train set later
best_estimator = scores['estimator'][b]
best_score = fold_accuracies[b]

In [37]:
scores.keys()

dict_keys(['fit_time', 'score_time', 'estimator', 'test_tn', 'test_fp', 'test_fn', 'test_tp', 'test_accuracy'])

In [None]:
# Confusion-matrix counts of the best fold estimator. They are read from the
# cross-validation 'test_*' entries, i.e. that estimator's held-out fold.
train_tp, train_fp, train_tn, train_fn = (
    scores[key][b] for key in ('test_tp', 'test_fp', 'test_tn', 'test_fn')
)

for label, count in (
    ('true positive:', train_tp),
    ('false positive:', train_fp),
    ('true negative:', train_tn),
    ('false negative:', train_fn),
):
    print(label, count)

# compute all metrics as below
def auc(tp, fp, tn, fn):
    """Return the mean of the true-positive rate and the true-negative rate.

    Computed from hard-label confusion counts: tp / (tp + fn) is the recall
    of the positive class and tn / (tn + fp) the recall of the negative class.
    """
    true_positive_rate = tp / (tp + fn)
    true_negative_rate = tn / (tn + fp)
    return (true_positive_rate + true_negative_rate) / 2

# metrics derived from the fold's confusion counts
fold_total = train_tp + train_tn + train_fp + train_fn
acc = (train_tp + train_tn) / fold_total
precision = train_tp / (train_tp + train_fp)
recall = train_tp / (train_tp + train_fn)
# harmonic mean of precision and recall
f1 = 2 * (precision * recall) / (precision + recall)

for label, value in (
    ("Accuracy: ", acc),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1: ", f1),
):
    print(label, value)
print("AUC: ", auc(train_tp, train_fp, train_tn, train_fn))

In [39]:
# train the best estimator again on the whole training set
# to use it for prediction on testing dataset
# (fit() replaces whatever the estimator learned on its single CV fold;
# X_train is the TF-IDF matrix, y_train the sentiment labels)
best_estimator = best_estimator.fit(X_train, y_train)

## Evaluate on test dataset

In [40]:
# Read the test reviews, then vectorise them with the TF-IDF vectorizer that
# was fitted on the training set (transform only, no re-fitting)

df = pd.read_csv('../data/test_imdb_reviews.csv')

y_test = df['sentiment'].to_numpy()
X_test = vectorizer.transform(df['review'].to_numpy())

In [41]:
# predict the testing dataset
# (uses the estimator re-fitted on the full training set; X_test is the
# TF-IDF-transformed test matrix)
y_pred = best_estimator.predict(X_test)

In [None]:
# print the confusion matrix of the prediction
# (label 1 is the positive class; any other label counts as negative)

# element-wise masks over the NumPy label arrays
actual_positive = y_test == 1
correct = y_test == y_pred

tp = int((correct & actual_positive).sum())
tn = int((correct & ~actual_positive).sum())
# A wrong prediction on a truly positive sample is a false NEGATIVE, and a
# wrong prediction on a truly negative sample is a false POSITIVE. Counting
# them the other way round swaps precision and recall downstream.
fn = int((~correct & actual_positive).sum())
fp = int((~correct & ~actual_positive).sum())

print('True Positives:', tp)
print('True Negatives:', tn)
print('False Positives:', fp)
print('False Negatives:', fn)

In [None]:
# compute all metrics for testing dataset

def auc(tp, fp, tn, fn):
    """Average the positive-class recall and the negative-class recall.

    Takes the four confusion-matrix counts and returns
    (tp / (tp + fn) + tn / (tn + fp)) / 2.
    """
    positive_recall = tp / (tp + fn)
    negative_recall = tn / (tn + fp)
    return (positive_recall + negative_recall) / 2

# metrics derived from the test-set confusion counts
n_samples = tp + tn + fp + fn
acc = (tp + tn) / n_samples
precision = tp / (tp + fp)
recall = tp / (tp + fn)
# harmonic mean of precision and recall
f1 = 2 * (precision * recall) / (precision + recall)

for label, value in (
    ("Accuracy: ", acc),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1: ", f1),
):
    print(label, value)
print("AUC: ", auc(tp, fp, tn, fn))
# total number of evaluated test samples
print(n_samples)