# Seemingly pretty good attempt

In [1]:
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_selection import chi2, SelectPercentile

In [2]:
# Source file and sampling settings for the working subset of scraped comments.
COMMENTS_CSV = "../Data/scraped_comments.csv"
SAMPLE_SIZE = 15000
SAMPLE_SEED = 0

# Read the full CSV, then keep a fixed-seed random sample so reruns see the same rows.
reviews = pd.read_csv(COMMENTS_CSV).sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

### Barebones preprocessing

In [3]:
# Keep the first row for each comment_id and discard rows with no comment text.
reviews = (
    reviews
    .drop_duplicates(subset="comment_id", keep="first")
    .dropna(subset=["comment"])
)

# Build both row filters as boolean masks and apply them in one step, instead of
# writing None into a filtered slice (which triggers SettingWithCopyWarning) and
# dropping nulls a second time.
is_placeholder = reviews["comment"] == "No Comments"               # literal placeholder text
word_count = reviews["comment"].astype(str).str.split().str.len()  # whitespace-separated tokens
reviews = reviews[~is_placeholder & (word_count > 5)]              # keep comments of 6+ words

# Renumber rows 0..n-1 so later cells can rely on a default index.
reviews = reviews.reset_index(drop=True)

In [4]:
# Patterns used to scrub a comment. They are raw strings (no invalid-escape warnings)
# and are compiled once here rather than re-parsed for every row.
_HTML_ENTITY_RE = re.compile(r'&([a-zA-Z]+|#\d+);')          # named or numeric HTML entities
_QMARK_ENTITY_RE = re.compile(r'&#63;?')                     # question-mark entity, with or without ';'
_LINK_RE = re.compile(r'\s*https?://\S+(\s+|$)')             # http(s) links plus surrounding whitespace
_EMAIL_RE = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
# No ^...$ anchors: an anchored pattern only matches a comment that is *entirely*
# a phone number, so it could never fire on a real multi-word comment.
_PHONE_RE = re.compile(r'(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}')
_REPEATED_CHAR_RE = re.compile(r'(.)\1\1+')                  # same character three or more times in a row
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z]+')                    # any run of non-letters
_WHITESPACE_RE = re.compile(r'\s+')


def cleanComment(text):
    """Return a lowercase, letters-and-single-spaces version of one raw comment.

    Steps, in order: strip HTML entities, links, e-mail addresses and phone
    numbers; collapse any character repeated 3+ times to a single occurrence;
    replace every run of non-letters with one space; lowercase.

    Args:
        text: the raw comment string.

    Returns:
        The cleaned comment string.
    """
    text = _HTML_ENTITY_RE.sub('', text)
    text = _QMARK_ENTITY_RE.sub('', text)        # catches "&#63" left behind without its ';'
    text = _LINK_RE.sub(' ', text)
    # E-mails go before phone numbers so digits inside an address are removed
    # together with the address instead of splitting it.
    text = _EMAIL_RE.sub(' ', text)
    text = _PHONE_RE.sub(' ', text)

    text = _REPEATED_CHAR_RE.sub(r'\g<1>', text)

    text = _NON_ALPHA_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return text.lower()                          # lowercase for uniformity


# Iterate over the values directly so the result does not depend on the frame
# having a 0..n-1 integer index.
comments_proper = [cleanComment(comment) for comment in reviews["comment"]]

In [5]:
# Keep only the columns used downstream; .copy() makes the result an independent frame
# so the column assignments below never write into a slice of the old one.
reviews = reviews.loc[:, ["firstName", "lastName", "comment", "clarityRating"]].copy()

# Assign the list positionally. Wrapping it in pd.Series would align on index labels and
# silently produce NaN/misaligned text if the index were not 0..n-1; a plain list raises
# on a length mismatch instead.
reviews["cleanedComment"] = comments_proper

# Binary label: 1 when clarityRating is above 2.5, otherwise 0
# (a missing rating compares False and therefore also becomes 0).
reviews["sentiment"] = (reviews["clarityRating"] > 2.5).astype("int64")

reviews.head()

Unnamed: 0,firstName,lastName,comment,clarityRating,cleanedComment,sentiment
0,Robbin,Jeffries,an amazing caring teacher. please do the readi...,4,an amazing caring teacher please do the readin...,1
1,Judith,Rood,Rude! Did not care about students or class. Co...,1,rude did not care about students or class cons...,0
2,Ruben,Murillo,I LOVE YOU MURILLO! YOU ARE HILLARIOUS,5,i love you murillo you are hillarious,1
3,Hazel,Sanderson,Dr. Sanderson taught in the first class. She ...,1,dr sanderson taught in the first class she doe...,0
4,Annette,McGregor,"This class is hard for a gen ed, but not that ...",5,this class is hard for a gen ed but not that b...,1


### Run Naive Bayes model

Use the chi-squared test to select the top quarter of features.

In [6]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    """Print the accuracy and F1 score of a set of predictions, as percentages.

    Args:
        y_pred: predicted labels.
        y_test: true labels the predictions are scored against.
        mode: passed to ``f1_score`` as its ``average`` argument.
    """
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy Score: " + str(accuracy_pct))

    f1_pct = f1_score(y_test, y_pred, average=mode) * 100
    print("F1 Score: {0}".format(f1_pct))

In [7]:
def runNBModel(X, y):
    """Fit a multinomial Naive Bayes model on a train split and print its test scores.

    Args:
        X: feature matrix, one row per sample.
        y: labels for the rows of ``X``; also used to stratify the split.
    """
    # 80/20 split, stratified on the labels, with a fixed seed for repeatability.
    split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
    X_train, X_test, y_train, y_test = split

    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    # Score the held-out rows; evalPerformance prints accuracy and F1.
    predictions = classifier.predict(X_test)
    evalPerformance(predictions, y_test)

In [8]:
def featureSelectChi2(ngr=(1,1), percentile=25):
    """Vectorize the cleaned comments and keep the top chi-squared-scored n-gram features.

    Reads ``reviews["cleanedComment"]`` and ``reviews["sentiment"]`` from the
    notebook's global ``reviews`` frame.

    NOTE(review): both the vectorizer and the chi2 selector are fitted on every
    row of ``reviews``, using the labels, before any train/test split is made by
    the caller. Features are therefore chosen with knowledge of rows that later
    end up in the test split, which likely makes reported scores optimistic.
    Fitting them on the training rows only (e.g. inside a Pipeline) would avoid
    this, but requires changing how this function and its caller cooperate.

    Args:
        ngr: ``(min_n, max_n)`` n-gram range passed to ``CountVectorizer``.
        percentile: percentage of features to keep, passed to ``SelectPercentile``.

    Returns:
        A tuple ``(X_percbest, y)``: the dense count matrix restricted to the
        selected features, and the sentiment labels.
    """
    cv = CountVectorizer(ngram_range=ngr)           # prepare elements for general NB model
    # Leave the counts sparse here: chi2 and SelectPercentile accept sparse input,
    # and a dense copy of the full n-gram matrix is far larger than what is kept.
    X_counts = cv.fit_transform(reviews["cleanedComment"])
    y = reviews["sentiment"]

    percbest = SelectPercentile(score_func=chi2, percentile=percentile)      # select top features by chi2 score
    # Densify only the selected columns so callers still receive an ndarray.
    X_percbest = percbest.fit_transform(X_counts, y).toarray()

    return X_percbest, y

In [9]:
# Build unigram + bigram count features (chi2-selected), then train the
# Naive Bayes model on them and print its accuracy and F1 score.
X, y = featureSelectChi2(ngr=(1,2))
runNBModel(X, y)

Accuracy Score: 91.21245828698554
F1 Score: 90.74000564965314
