# Support Vector Machine (SVM) model

https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

For deeper intuition of SVM, see https://towardsdatascience.com/deep-dive-into-support-vector-machine-654c8d517103

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [2]:
reviews = pd.read_csv("../Data/comments_preproc.csv").sample(n=200000, random_state=0)
reviews.dropna(subset=["cleanedComment"], inplace=True)
reviews.reset_index(inplace=True, drop=True)

reviews["sentiment"] = reviews["clarityRating"].apply(lambda x: 1 if x > 2.5 else 0)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(reviews["cleanedComment"], reviews["sentiment"], test_size=0.2, random_state=1)

In [4]:
def runSVM(ngr=(1,1)):
    tfidf_vect = TfidfVectorizer(max_features=7000, ngram_range=ngr)
    tfidf_vect.fit(reviews["cleanedComment"])

    train_feat = tfidf_vect.transform(X_train)
    test_feat = tfidf_vect.transform(X_test)

    svm = SVC(C=1.0, kernel="linear", degree=3, gamma="auto")
    svm.fit(train_feat, y_train)

    y_pred = svm.predict(test_feat)

    print("Accuracy: {0}".format(accuracy_score(y_test, y_pred) * 100))
    print("F1 Score: {0}".format(f1_score(y_test, y_pred) * 100))

In [5]:
runSVM(ngr=(1,1))

Accuracy: 86.66683334166709
F1 Score: 91.13486377312698


In [6]:
runSVM(ngr=(2,2))

Accuracy: 84.87174358717937
F1 Score: 90.16017562403447


In [7]:
runSVM(ngr=(1,2))

Accuracy: 87.63938196909845
F1 Score: 91.73465293566937
