# Naive Bayes Classifier

Sticking to the multinomial variant for now, as I've seen it used much more often.

Here's a really in-depth resource explaining the intuition behind Naive Bayes sentiment analysis: https://web.stanford.edu/~jurafsky/slp3/4.pdf

In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

In [2]:
# Draw a reproducible 200k-row sample, discard rows without a cleaned comment,
# and renumber the index so it is contiguous again.
reviews = (
    pd.read_csv("comments_preproc.csv")
    .sample(n=200000, random_state=1)
    .dropna(subset=["cleanedComment"])
    .reset_index(drop=True)
)

In [3]:
# Binary sentiment label: 1 when clarityRating is above 2.5, otherwise 0.
# Three-class alternative, left disabled:
#reviews["sentiment"] = reviews["clarityRating"].apply(lambda x: 1 if x > 3 else 0 if x == 3 else -1)
is_positive = reviews["clarityRating"] > 2.5
reviews["sentiment"] = is_positive.astype("int64")
reviews["sentiment"].value_counts()

1    147956
0     52037
Name: sentiment, dtype: int64

In [4]:
# Plain Python list of the cleaned comment strings, in the same row order as `reviews`.
comments_proper = list(reviews["cleanedComment"])

In [5]:
def getNGrams(n=2, comments=None):
    """Return every word n-gram found in the comments as a space-joined string.

    N-grams are built one comment at a time, so no n-gram spans the boundary
    between two different comments (joining the whole corpus into one token
    stream would pair the last word of one comment with the first word of
    the next). This matches how CountVectorizer builds n-grams per document.

    Parameters
    ----------
    n : int, default 2
        Number of words per n-gram.
    comments : iterable of str, optional
        Comments to process. Defaults to the module-level ``comments_proper``.

    Returns
    -------
    list of str
        One entry per n-gram occurrence (duplicates are kept), in corpus order.
        Comments with fewer than ``n`` words contribute nothing.
    """
    if comments is None:
        comments = comments_proper

    ngram_vectors = []
    for comment in comments:
        words = comment.split()                 # whitespace-separated unigrams of this comment
        ngram_vectors.extend(" ".join(gram) for gram in nltk.ngrams(words, n))
    return ngram_vectors

For more info on the evaluation metrics used:
* Confusion Matrix: https://towardsdatascience.com/understanding-the-confusion-matrix-from-scikit-learn-c51d88929c79
* Precision + Recall: https://en.m.wikipedia.org/wiki/Precision_and_recall
* F1 Score: https://www.educative.io/answers/what-is-the-f1-score

In [6]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    """Print accuracy, confusion matrix, precision, recall and F1 for a set of predictions.

    y_pred -- predicted labels
    y_test -- true labels (handed to scikit-learn as the ground truth)
    mode   -- ``average`` argument forwarded to precision, recall and F1

    All scores are printed as percentages; nothing is returned.
    """
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    matrix = confusion_matrix(y_test, y_pred)
    print("Accuracy Score: " + str(accuracy_pct) + "\n")
    print("Confusion Matrix: ")
    print(matrix)
    print()

    precision_pct = precision_score(y_test, y_pred, average=mode) * 100
    recall_pct = recall_score(y_test, y_pred, average=mode) * 100
    print("Precision: {0}".format(precision_pct))
    print("Recall: {0}\n".format(recall_pct))

    f1_pct = f1_score(y_test, y_pred, average=mode) * 100
    print("F1 Score: {0}".format(f1_pct))

In [7]:
def optimizeFeatures():
    """Plot held-out accuracy of a unigram MultinomialNB against vocabulary size.

    For every ``max_features`` value in ``range(4750, 5250, 100)`` a binary
    unigram CountVectorizer is fitted on ``comments_proper``, the data is split
    80/20 (stratified on the label, ``random_state=1``), a MultinomialNB model
    is trained and its accuracy on the test split is recorded. The accuracies
    are then shown as a scatter plot. Returns nothing.
    """
    feat = range(4750, 5250, 100)
    y = reviews["sentiment"]        # labels do not depend on the vectorizer; compute once
    accs = []
    for i in feat:
        cv = CountVectorizer(max_features=i, ngram_range=(1,1), binary=True)
        # Keep the document-term matrix sparse: MultinomialNB and train_test_split
        # both accept scipy sparse input, and a dense copy would need several GB.
        # NOTE(review): the vocabulary is fitted on all comments before the split,
        # so test-set words influence feature selection -- confirm this is acceptable.
        X = cv.fit_transform(comments_proper)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)        # split into training and testing subsets
        model = MultinomialNB().fit(X_train, y_train)           # create and fit model, use it to predict outcomes on test set
        y_pred = model.predict(X_test)
        accs.append(accuracy_score(y_test, y_pred))

    # Create the sized figure BEFORE plotting; calling plt.figure() after
    # plt.scatter() would open a second, empty figure and leave the scatter
    # at the default size.
    fig, ax = plt.subplots(figsize=(25,5))
    ax.scatter(feat, accs)
    ax.set_xlabel("max_features")
    ax.set_ylabel("Test accuracy")
    ax.set_title("MultinomialNB accuracy vs. vocabulary size")
    plt.show()

In [8]:
def optimizeNgrams():
    """Plot held-out accuracy of MultinomialNB against the n-gram order.

    For every ``n`` in ``range(1, 5)`` a binary CountVectorizer restricted to
    exactly n-word n-grams (``ngram_range=(n, n)``, ``max_features=3000``) is
    fitted on ``comments_proper``, the data is split 80/20 (stratified on the
    label, ``random_state=1``), a MultinomialNB model is trained and its
    accuracy on the test split is recorded. The accuracies are then shown as a
    scatter plot. Returns nothing.
    """
    feat = range(1, 5)
    y = reviews["sentiment"]        # labels do not depend on the vectorizer; compute once
    accs = []
    for i in feat:
        cv = CountVectorizer(max_features=3000, ngram_range=(i,i), binary=True)
        # Keep the document-term matrix sparse: MultinomialNB and train_test_split
        # both accept scipy sparse input, and a dense copy would need several GB.
        # NOTE(review): the vocabulary is fitted on all comments before the split,
        # so test-set words influence feature selection -- confirm this is acceptable.
        X = cv.fit_transform(comments_proper)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)        # split into training and testing subsets
        model = MultinomialNB().fit(X_train, y_train)           # create and fit model, use it to predict outcomes on test set
        y_pred = model.predict(X_test)
        accs.append(accuracy_score(y_test, y_pred))

    # Create the sized figure BEFORE plotting; calling plt.figure() after
    # plt.scatter() would open a second, empty figure and leave the scatter
    # at the default size.
    fig, ax = plt.subplots(figsize=(25,5))
    ax.scatter(feat, accs)
    ax.set_xlabel("n-gram order (n)")
    ax.set_ylabel("Test accuracy")
    ax.set_title("MultinomialNB accuracy vs. n-gram order")
    plt.show()

In [9]:
#optimizeFeatures()

In [10]:
#optimizeNgrams()

#### General function

Takes the n-gram range as input, e.g. `(1, 1)` for unigrams only or `(1, 2)` for unigrams + bigrams.

In [15]:
def runNBModel(ngr=(1,1)):
    """Train and evaluate a MultinomialNB sentiment classifier for one n-gram range.

    Parameters
    ----------
    ngr : tuple of (int, int), default (1, 1)
        ``ngram_range`` handed to CountVectorizer (min and max n-gram length).

    The comments in ``comments_proper`` are turned into binary presence
    features (at most 4000), split 80/20 (stratified on the label,
    ``random_state=1``), and the metrics of the fitted model on the test split
    are printed via ``evalPerformance``. Returns nothing.
    """
    cv = CountVectorizer(max_features=4000, ngram_range=ngr, binary=True)
    # Keep the document-term matrix sparse: MultinomialNB and train_test_split
    # both accept scipy sparse input, and a dense copy would need several GB.
    # NOTE(review): the vocabulary is fitted on all comments before the split,
    # so test-set words influence feature selection -- confirm this is acceptable.
    X = cv.fit_transform(comments_proper)

    # The label column is already numeric, so it is used directly
    # (pd.get_dummies leaves numeric columns unchanged).
    y = reviews["sentiment"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)        # split into training and testing subsets

    model = MultinomialNB().fit(X_train, y_train)           # create and fit model, use it to predict outcomes on test set
    y_pred = model.predict(X_test)

    evalPerformance(y_pred, y_test)

## Start trying out Naive Bayes model

In [16]:
# Unigrams only (single words)
runNBModel(ngr=(1, 1))

Accuracy Score: 85.54213855346383

Confusion Matrix: 
[[ 7616  2792]
 [ 2991 26600]]

Precision: 85.63516928044189
Recall: 85.54213855346383

F1 Score: 85.58620302924409


In [17]:
# Bigrams only (two-word sequences)
runNBModel(ngr=(2, 2))

Accuracy Score: 83.73709342733568

Confusion Matrix: 
[[ 6865  3543]
 [ 2962 26629]]

Precision: 83.46984458709136
Recall: 83.73709342733568

F1 Score: 83.58266987248302


In [18]:
# Unigrams and bigrams combined
runNBModel(ngr=(1, 2))

Accuracy Score: 85.8046451161279

Confusion Matrix: 
[[ 8362  2046]
 [ 3632 25959]]

Precision: 86.71566507545359
Recall: 85.8046451161279

F1 Score: 86.11169501271068
