# Chi Squared Model

Reference: http://ethen8181.github.io/machine-learning/text_classification/chisquare.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_selection import chi2, SelectKBest, SelectPercentile
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Draw a reproducible 20,000-row sample of the preprocessed comments, drop
# rows with a missing cleaned comment, and renumber the index from 0.
reviews = (
    pd.read_csv("../Data/comments_preproc.csv")
    .sample(n=20000, random_state=0)
    .dropna(subset=["cleanedComment"])
    .reset_index(drop=True)
)

In [3]:
def graphPercentiles(X_new, y):
    """Plot Naive Bayes test accuracy against the percentile of features kept.

    For each percentile in 5, 10, ..., 30 the columns of ``X_new`` are ranked
    by their chi-squared score against ``y`` and only that top share is kept.
    The reduced matrix is split 80/20 (stratified on ``y``, fixed seed), a
    ``MultinomialNB`` model is fitted on the training part, and its accuracy
    on the held-out part is recorded. The accuracies are shown as a scatter
    plot; nothing is returned.

    Parameters
    ----------
    X_new : feature matrix passed to ``SelectPercentile.fit_transform``.
    y : class labels, one per row of ``X_new``.
    """
    percs = range(5, 31, 5)
    accs = []
    for p in percs:
        print("Starting loop iteration for top {0}%".format(p))
        # NOTE(review): the selector is fitted on every row *before* the
        # train/test split, so test-set labels influence which features are
        # kept. The accuracies plotted here may therefore be optimistic.
        percbest = SelectPercentile(score_func=chi2, percentile=p)
        X_percbest = percbest.fit_transform(X_new, y)
        print("\tDone selecting, fitting, and transforming percentile")

        # Stratified 80/20 split with a fixed seed so every percentile is
        # evaluated on the same rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X_percbest, y, test_size=0.2, stratify=y, random_state=1
        )

        model = MultinomialNB().fit(X_train, y_train)
        print("\tDone fitting Naive Bayes")
        y_pred = model.predict(X_test)
        print("\tDone predicting")

        accs.append(accuracy_score(y_test, y_pred))
        print("Finished loop iteration for top {0}%\n".format(p))

    # Create the wide figure *before* drawing. Calling plt.figure() after
    # plt.scatter() opens a second, empty figure and leaves the scatter on a
    # default-size one.
    fig, ax = plt.subplots(figsize=(25, 5))
    ax.scatter(list(percs), accs)
    ax.set_xlabel("Top percentile of features kept (%)")
    ax.set_ylabel("Test accuracy")
    plt.show()

In [4]:
def graphSelections(X_new, y):
    """Plot Naive Bayes test accuracy against the number of features kept.

    For each k in 1000, 1250, ..., 5000 the k columns of ``X_new`` with the
    highest chi-squared score against ``y`` are kept. The reduced matrix is
    split 80/20 (stratified on ``y``, fixed seed), a ``MultinomialNB`` model
    is fitted on the training part, and its accuracy on the held-out part is
    recorded. The accuracies are shown as a scatter plot; nothing is returned.

    Parameters
    ----------
    X_new : feature matrix passed to ``SelectKBest.fit_transform``.
    y : class labels, one per row of ``X_new``.
    """
    ks = range(1000, 5001, 250)
    accs = []
    for k in ks:
        print("Starting loop iteration for top {0} features".format(k))
        # NOTE(review): the selector is fitted on every row *before* the
        # train/test split, so test-set labels influence which features are
        # kept. The accuracies plotted here may therefore be optimistic.
        kbest = SelectKBest(score_func=chi2, k=k)
        X_kbest = kbest.fit_transform(X_new, y)

        # Stratified 80/20 split with a fixed seed so every k is evaluated
        # on the same rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X_kbest, y, test_size=0.2, stratify=y, random_state=1
        )

        model = MultinomialNB().fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accs.append(accuracy_score(y_test, y_pred))
        print("Finished loop iteration for top {0} features".format(k))

    # Create the wide figure *before* drawing. Calling plt.figure() after
    # plt.scatter() opens a second, empty figure and leaves the scatter on a
    # default-size one.
    fig, ax = plt.subplots(figsize=(25, 5))
    ax.scatter(list(ks), accs)
    ax.set_xlabel("Number of top features kept (k)")
    ax.set_ylabel("Test accuracy")
    plt.show()

### Graph performance

Accuracy with respect to the top percentile (%) or the top k features selected.

In [5]:
# Disabled call that would plot accuracy per feature percentile.
# NOTE(review): X_dtm and y are not defined in any visible cell of this
# notebook; they must be created before this line is uncommented.
#graphPercentiles(X_dtm, y)

In [6]:
# Disabled call that would plot accuracy per top-k feature count.
# NOTE(review): X_dtm and y are not defined in any visible cell of this
# notebook; they must be created before this line is uncommented.
#graphSelections(X_dtm, y)

In [7]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    """Print accuracy and F1 for a set of predictions, both scaled to percent.

    Parameters
    ----------
    y_pred : predicted labels.
    y_test : true labels for the same samples.
    mode : value forwarded to ``f1_score`` as its ``average`` argument.

    Prints two lines and returns nothing.
    """
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy Score: " + str(accuracy_pct))

    f1_pct = f1_score(y_test, y_pred, average=mode) * 100
    print("F1 Score: {0}".format(f1_pct))

In [8]:
def runChi2Model(ngr=(1,1)):
    """Train and evaluate a chi2-filtered Naive Bayes sentiment model.

    Builds a bag-of-n-grams count matrix from ``reviews["cleanedComment"]``,
    keeps the top 25% of features by chi-squared score against
    ``reviews["sentiment"]``, does a stratified 80/20 split with a fixed
    seed, fits ``MultinomialNB`` on the training part, and prints accuracy
    and weighted F1 on the held-out part via ``evalPerformance``.

    Parameters
    ----------
    ngr : tuple (min_n, max_n)
        Forwarded to ``CountVectorizer(ngram_range=...)``; e.g. ``(1, 1)``
        for unigrams, ``(2, 2)`` for bigrams, ``(1, 2)`` for both.

    Reads the module-level ``reviews`` DataFrame; returns nothing.
    """
    cv = CountVectorizer(ngram_range=ngr)
    # Keep the document-term matrix sparse. chi2/SelectPercentile,
    # train_test_split and MultinomialNB all accept scipy sparse input, while
    # densifying a (documents x vocabulary) count matrix needs memory
    # proportional to rows * vocabulary size, which grows quickly for bigrams.
    X = cv.fit_transform(reviews["cleanedComment"])
    y = reviews["sentiment"]

    # NOTE(review): the selector is fitted on every row *before* the
    # train/test split, so test-set labels influence which features are kept.
    # The scores printed below may therefore be optimistic.
    percbest = SelectPercentile(score_func=chi2, percentile=25)      # select 25% most important features
    X_percbest = percbest.fit_transform(X, y)

    # Stratified 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X_percbest, y, test_size=0.2, stratify=y, random_state=1
    )
    model = MultinomialNB().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    evalPerformance(y_pred, y_test)

### Use the chosen percentile (top 25% of features) with all three n-gram input types

In [9]:
runChi2Model(ngr=(1,1))     # unigrams only: ngram_range=(1, 1)

Accuracy Score: 88.55
F1 Score: 88.34534227118931


In [10]:
runChi2Model(ngr=(2,2))     # bigrams only: ngram_range=(2, 2)

Accuracy Score: 81.8
F1 Score: 78.78602787559751


In [11]:
runChi2Model(ngr=(1,2))     # unigrams + bigrams: ngram_range=(1, 2)

Accuracy Score: 90.17500000000001
F1 Score: 89.56781012495274
