## Use `GridSearchCV` to determine optimal parameters

In [1]:
import pandas as pd
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter

from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.pipeline import Pipeline

In [2]:
reviews = pd.read_csv("Data/scraped_comments_with_professor.csv")#.sample(n=300000, random_state=1)

In [3]:
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

reviews.dropna(subset=["comment"], inplace=True)
reviews = reviews[reviews["comment"] != "No Comments"]

reviews['firstName'].fillna('', inplace=True)
reviews['lastName'].fillna('', inplace=True)

reviews["comment"] = reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
reviews.dropna(subset=["comment"], inplace=True)

In [4]:
reviews["starRating"] = (reviews["clarityRating"] + reviews["helpfulRating"]) / 2
reviews = reviews[reviews["starRating"] != 3.0]         # drop 3 star reviews
reviews.reset_index(drop=True, inplace=True)

reviews["starRating"].value_counts()

 5.0    176812
 4.5     58705
 4.0     52643
 1.0     45128
 2.0     26945
 3.5     22896
 1.5     19230
 2.5     17510
-1.0         1
Name: starRating, dtype: int64

In [5]:
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)

def remove_phones(text):
    return re.sub(r'\d{3}-\d{3}-\d{4}', ' ', text)

def remove_emails(text):
    return re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', ' ', text)

def remove_html_entities(text):
  text = re.sub('&[0-9a-zA-Z#]+;', ' ', text)
  return re.sub('&#63;?', '', text)

def remove_html_tags(text):
  return re.sub('<.{1,6}?>', ' ', text)

In [6]:
EMOTICONS = {
    u":‑)":"emopos",
    u":-))":"emopos",
    u":-)))":"emopos",
    u":)":"emopos",
    u":))":"emopos",
    u":)))":"emopos",
    u":-]":"emopos",
    u":]":"emopos",
    u":-3":"emopos",
    u":3":"emopos",
    u":->":"emopos",
    u":>":"emopos",
    u"8-)":"emopos",
    u":-}":"emopos",
    u":}":"emopos",
    u":-)":"emopos",
    u":c)":"emopos",
    u":^)":"emopos",
    u"=]":"emopos",
    u"=)":"emopos",
    u":‑D":"emopos",
    u":D":"emopos",
    u"8‑D":"emopos",
    u"8D":"emopos",
    u"X‑D":"emopos",
    u"XD":"emopos",
    u"=D":"emopos",
    u"=3":"emopos",
    u"B^D":"emopos",
    u":-))":"emopos",
    u":-(":"emoneg",
    u":‑(":"emoneg",
    u":(":"emoneg",
    u":‑c":"emoneg",
    u":c":"emoneg",
    u":‑<":"emoneg",
    u":<":"emoneg",
    u":‑[":"emoneg",
    u":[":"emoneg",
    u":-||":"emoneg",
    u">:[":"emoneg",
    u":{":"emoneg",
    u">:(":"emoneg",
    u":'‑(":"emoneg",
    u":'(":"emoneg",
    u":'‑)":"emopos",
    u":')":"emopos",
    u"D‑':":"emoneg",
    u"D:<":"emoneg",
    u"D:":"emoneg",
    u"D8":"emoneg",
    u"D;":"emoneg",
    u"D=":"emoneg",
    u"DX":"emoneg",
    u";‑)":"emopos",
    u";)":"emopos",
    u"*-)":"emopos",
    u"*)":"emopos",
    u";‑]":"emopos",
    u";]":"emopos",
    u";^)":"emopos",
    u":‑,":"emopos",
    u";D":"emopos",
    u":‑P":"emopos",
    u":P":"emopos",
    u"X‑P":"emopos",
    u"XP":"emopos",
    u":‑Þ":"emopos",
    u":Þ":"emopos",
    u"=p":"emopos",
    u":‑/":"emoneg",
    u":/":"emoneg",
    u":-[.]":"emoneg",
    u">:[(\)]":"emoneg",
    u">:/":"emoneg",
    u":[(\)]":"emoneg",
    u"=/":"emoneg",
    u"=[(\)]":"emoneg",
    u":L":"emoneg",
    u"=L":"emoneg",
    u":‑|":"emoneg",
    u":|":"emoneg",
    u"O:‑)":"emopos",
    u"O:)":"emopos",
    u"0:‑3":"emopos",
    u"0:3":"emopos",
    u"0:‑)":"emopos",
    u"0:)":"emopos",
    u":‑b":"emopos",
    u"(>_<)":"emoneg",
    u"(>_<)>":"emoneg",
    u"^_^":"emopos",
    u"(^_^)/":"emopos",
    u"(^O^)／":"emopos",
    u"(^o^)／":"emopos",
    u"('_')":"emoneg",
    u"(/_;)":"emoneg",
    u"(T_T) (;_;)":"emoneg",
    u"(;_;":"emoneg",
    u"(;_:)":"emoneg",
    u"(;O;)":"emoneg",
    u"(:_;)":"emoneg",
    u"(ToT)":"emoneg",
    u";_;":"emoneg",
    u";-;":"emoneg",
    u";n;":"emoneg",
    u";n;":"emoneg",
    u"Q.Q":"emoneg",
    u"T.T":"emoneg",
    u"Q_Q":"emoneg",
    u"(-.-)":"emopos",
    u"(-_-)":"emopos",
    u"(；一_一)":"emopos",
    u"(=_=)":"emoneg",
    u"^m^":"emopos",
    u">^_^<":"emopos",
    u"<^!^>":"emopos",
    u"^/^":"emopos",
    u"（*^_^*）" :"emopos",
    u"(^<^) (^.^)":"emopos",
    u"(^^)":"emopos",
    u"(^.^)":"emopos",
    u"(^_^.)":"emopos",
    u"(^_^)":"emopos",
    u"(^^)":"emopos",
    u"(^J^)":"emopos",
    u"(*^.^*)":"emopos",
    u"(^—^）":"emopos",
    u"(#^.^#)":"emopos",
    u"(*^0^*)":"emopos",
    u"(*^^)v":"emopos",
    u"(^_^)v":"emopos",
    u'(-"-)':"emoneg",
    u"(ーー;)":"emoneg",
    u"(＾ｖ＾)":"emopos",
    u"(＾ｕ＾)":"emopos",
    u"(^)o(^)":"emopos",
    u"(^O^)":"emopos",
    u"(^o^)":"emopos",
    u")^o^(":"emopos",
    u":O o_O":"emoneg",
    u"o_0":"emoneg",
    u"o.O":"emoneg",
    u"(o.o)":"emoneg",
    u"(*￣m￣)": "emoneg",
}

for emote, val in EMOTICONS.items():
    EMOTICONS[emote] = val.lower().replace(',', ' ').replace(' ', '_')

def convert_emoticons(text):
  return EMOTICONS.get(text, text)

In [7]:
stopwords = ["he", "her", "she", "him", "guy", "we", "professor", "professors", "prof", "profs", "teacher", "teachers", "mr", "ms", "mrs",
"dr", "doctor", "class", "classes", "course", "courses", "college", "colleges", "university", "universities", "lecture", "lectures", "lab", "labs"]

In [8]:
comments_proper = []
unseen = Counter()

for index, review in reviews.iterrows():
  comment = review['comment']
  fname = review['firstName'].lower().split(' ')
  lname = review['lastName'].lower().split(' ')
  names = set(fname + lname)

  comment = remove_urls(comment)
  comment = remove_phones(comment)
  comment = remove_emails(comment)
  comment = remove_html_entities(comment)
  comment = remove_html_tags(comment)

  comment_split = comment.split(' ')
  new_comment_split = []
  for i, word in enumerate(comment_split):
    word = convert_emoticons(word)
    word = word.lower()
    word = re.sub("[^a-zA-Z\s]+", ' ', word)   # replace characters that are not alphabetic, space, or underscore
    word = re.sub(r'(.)\1\1+', '\g<1>', word)  # replace any three character+ sequence with one
    word = re.sub('\s+', ' ', word)
    word = word.strip() # trailing whitespace because punctuation replaced by space
    new_comment_split.extend(word.split(' '))

  # Remove names from the comment
  for i, word in enumerate(new_comment_split):
    if word in names:
      new_comment_split[i] = ''

  comment = ' '.join(new_comment_split)
  comment = re.sub('\s+', ' ', comment)
  comment = comment.strip()

  comment = ' '.join(word for word in comment.split() if len(word) > 1 and word not in stopwords)

  comments_proper.append(comment)

In [9]:
reviews = reviews.loc[:, ["professor_id", "firstName", "lastName", "comment", "starRating"]]
reviews["cleanedComment"] = pd.Series(comments_proper)
reviews["sentiment"] = reviews["starRating"].apply(lambda x: 1 if x > 2.5 else 0)

reviews.head()

Unnamed: 0,professor_id,firstName,lastName,comment,starRating,cleanedComment,sentiment
0,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,"Good teacher, good lectures. Obviously cares a...",5.0,good good obviously cares about the subject ma...,1
1,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,"Good teacher, very lenient with grading and at...",5.0,good very lenient with grading and attendance ...,1
2,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,Very difficult class. His grading is hard to ...,1.5,very difficult his grading is hard to understa...,0
3,VGVhY2hlci0xMDExMDU2,Paula,Zobisch,Excellent mentor. Created valuable foundations...,5.0,excellent mentor created valuable foundations ...,1
4,VGVhY2hlci0xMDExMDU2,Paula,Zobisch,Awesome...Had her for two classes at baker col...,5.0,awesome had for two at baker reccommended,1


In [10]:
prof_counts = reviews["professor_id"].value_counts()
prof_train, prof_test, cnt_train, cnt_test = train_test_split(prof_counts.index, prof_counts.values, test_size=0.2, random_state=1)

In [11]:
comm_train = reviews[reviews["professor_id"].isin(prof_train)]["cleanedComment"]
comm_test = reviews[reviews["professor_id"].isin(prof_test)]["cleanedComment"]

sent_train = reviews[reviews["professor_id"].isin(prof_train)]["sentiment"]
sent_test = reviews[reviews["professor_id"].isin(prof_test)]["sentiment"]

In [12]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    acc_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score: " + str(acc_score * 100))
    
    f1 = f1_score(y_test, y_pred, average=mode)
    print("F1 Score: {0}".format(f1 * 100))

https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html

In [13]:
pipeline = Pipeline([
    ("vectorizer", CountVectorizer()), 
    ("selector"  , SelectPercentile(score_func=chi2)),
    ("classifier", MultinomialNB())
])

ngr: (1,1) - (1,4)     ::         4

sel: 2 - 40, 2         ::         20

nb alpha: 1, 10, 100   ::         3

max_df: .5 - .9, 0.05  ::         8

min_df: 0 - 0.25, 0.05 ::         5

**9600 models will be run** :: ran for 6 hours (360 min) without finishing

Gonna need to make changes to it and try again
* can probably get rid of alpha tuning
* maybe reduce amount of iterations in percentile

ALSO: maybe add verbose parameter to GSCV, though it printing thousands of times is not desirable so maybe not

In [14]:
param_list = {
    "vectorizer__ngram_range": [(1,1), (1,2), (1,3)],
    # "vectorizer__max_df": np.arange(0.5, 0.91, 0.05),
    # "vectorizer__min_df": np.arange(0, 0.26, 0.05),
    "classifier__alpha": range(0, 41, 5),
    "selector__percentile": range(10, 44, 3)
}

In [15]:
search = GridSearchCV(pipeline, param_list, n_jobs=2, verbose=1)
search.fit(reviews["cleanedComment"], reviews["sentiment"])

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [None]:
print("LET'S FUCKING GOOOOOOOO!!!!")