# Preprocessing Setup
Basic steps, because there are domain specific problems to account for later

In [None]:
import pandas as pd
import numpy as np
import re
import spacy
import pkg_resources
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words, wordnet, brown

In [None]:
# !pip install symspellpy

from spacy.cli import download
download('en_core_web_md')
nltk.download('words')
nltk.download('omw-1.4')
nltk.download('wordnet')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# from symspellpy import SymSpell, Verbosity

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
reviews = pd.read_csv("/content/drive/MyDrive/RMP/reviews_filtered.csv").sample(n=300000, random_state=1)
# reviews = pd.read_csv("/content/drive/MyDrive/RMP/scraped_comments_with_professor.csv").sample(n=300000, random_state=1)
# reviews[['comment_id', 'firstName', 'lastName', 'prof_class', 'comment', 'clarityRating', 'helpfulRating']].to_csv("/content/drive/MyDrive/RMP/scraped_comments_sample.csv")

In [None]:
reviews.head()

Unnamed: 0.1,Unnamed: 0,firstName,lastName,prof_class,comment,date,clarityRating,helpfulRating,professor_id,comment_id,qualityRating,sentiment
307424,652065,Daniel,Propson,PHIL1010,Amazing professor! He is so kind and friendly ...,2013-08-25 04:50:41+00:00,5,5,VGVhY2hlci0xNDEyOTA1,UmF0aW5nLTIyMDgxOTI2,5.0,True
265217,620499,Carol,Nauman,MTH94,"I took this class in Spring of 2015, this was ...",2016-06-14 21:08:43+00:00,5,5,VGVhY2hlci0xMzMxMDMy,UmF0aW5nLTI2NzI2MDc1,5.0,True
140855,351290,Christine,Mains,GNED1401,Condescending and does not respect students. T...,2015-12-02 17:01:35+00:00,1,1,VGVhY2hlci0xMzA0MTgy,UmF0aW5nLTI1NTI4MTc1,1.0,False
181201,818880,Laura,Rutledge,BI55,she didnt seem enthused to teach at all..she w...,2009-08-18 15:26:33+00:00,1,1,VGVhY2hlci05OTMzMDk=,UmF0aW5nLTE2MTMxMTQy,1.0,False
143393,238563,Caryl,Rahn,CGS1060,This professor has a very well organized class...,2014-03-19 11:20:29+00:00,5,5,VGVhY2hlci0xMDQwMzI5,UmF0aW5nLTIyOTI0NTcx,5.0,True


## Duplicates and Nulls

In [None]:
print("Shape before dropping:", reviews.shape)
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

reviews.drop_duplicates(subset="comment", keep="first", inplace=True)

# drop rows containing only "No Comments" (default value assigned by RMP to a review that didn't enter a comment)
reviews = reviews[reviews["comment"] != "No Comments"]

# drop rows containing NaN comment
reviews.dropna(subset=["comment"], inplace=True)

# fill null names with empty string
reviews['firstName'].fillna('', inplace=True)
reviews['lastName'].fillna('', inplace=True)

# Dropping reviews with qualityRating == 3.0
reviews['qualityRating'] = (reviews['helpfulRating']+reviews['clarityRating'])/2.0
reviews = reviews[reviews["qualityRating"] != 3.0]
reviews["sentiment"] = reviews["qualityRating"] > 3.0

print("Shape after dropping:", reviews.shape)
reviews.reset_index(drop=True, inplace=True)

Shape before dropping: (300000, 12)
Shape after dropping: (300000, 12)


## Removing Urls, Phone Numbers, and Emails

In [None]:
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)

def remove_phones(text):
    return re.sub(r'\d{3}-\d{3}-\d{4}', ' ', text)

def remove_emails(text):
    return re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', ' ', text)

print(remove_urls('Hey! Check out this link: www.somelink.com'))
print(remove_phones("Hey! Check out this phone number: 742-457-0417"))
print(remove_emails("Hey! Check out this email address: nooneuses@yahoo.com"))

Hey! Check out this link:  
Hey! Check out this phone number:  
Hey! Check out this email address:  


## Html Artifacts

In [None]:
# TODO: Convert html entites of quotes -> "'" to normalize
def remove_html_entities(text):
  text = re.sub('&[0-9a-zA-Z#]+;', ' ', text)
  return re.sub('&#63;?', '', text)

def remove_html_tags(text):
  return re.sub('<.{1,6}?>', ' ', text)

text = "This professor is such an easy &quot;A&quot;, why are y'all struggling &#63;&#63;&#63 </div>"
print(remove_html_entities(text))
print(remove_html_tags(text))

This professor is such an easy  A , why are y'all struggling    </div>
This professor is such an easy &quot;A&quot;, why are y'all struggling &#63;&#63;&#63  


## Emoticon Conversion to Words

In [None]:
#@title Emoticon Mapping
EMOTICONS = {
    u"<3": "emopos",
    u":‑)":"emopos",
    u":-))":"emopos",
    u":-)))":"emopos",
    u":)":"emopos",
    u":))":"emopos",
    u":)))":"emopos",
    u":-]":"emopos",
    u":]":"emopos",
    u":-3":"emopos",
    u":3":"emopos",
    u":->":"emopos",
    u":>":"emopos",
    u"8-)":"emopos",
    u":-}":"emopos",
    u":}":"emopos",
    u":-)":"emopos",
    u":c)":"emopos",
    u":^)":"emopos",
    u"=]":"emopos",
    u"=)":"emopos",
    u":‑D":"emopos",
    u":D":"emopos",
    u"8‑D":"emopos",
    u"8D":"emopos",
    u"X‑D":"emopos",
    u"XD":"emopos",
    u"=D":"emopos",
    u"=3":"emopos",
    u"B^D":"emopos",
    u":-))":"emopos",
    u":-(":"emoneg",
    u":‑(":"emoneg",
    u":(":"emoneg",
    u":‑c":"emoneg",
    u":c":"emoneg",
    u":‑<":"emoneg",
    u":<":"emoneg",
    u":‑[":"emoneg",
    u":[":"emoneg",
    u":-||":"emoneg",
    u">:[":"emoneg",
    u":{":"emoneg",
    u">:(":"emoneg",
    u":'‑(":"emoneg",
    u":'(":"emoneg",
    u":'‑)":"emopos",
    u":')":"emopos",
    u"D‑':":"emoneg",
    u"D:<":"emoneg",
    u"D:":"emoneg",
    u"D8":"emoneg",
    u"D;":"emoneg",
    u"D=":"emoneg",
    u"DX":"emoneg",
    u";‑)":"emopos",
    u";)":"emopos",
    u"*-)":"emopos",
    u"*)":"emopos",
    u";‑]":"emopos",
    u";]":"emopos",
    u";^)":"emopos",
    u":‑,":"emopos",
    u";D":"emopos",
    u":‑P":"emopos",
    u":P":"emopos",
    u"X‑P":"emopos",
    u"XP":"emopos",
    u":‑Þ":"emopos",
    u":Þ":"emopos",
    u"=p":"emopos",
    u":‑/":"emoneg",
    u":/":"emoneg",
    u":-[.]":"emoneg",
    u">:[(\)]":"emoneg",
    u">:/":"emoneg",
    u":[(\)]":"emoneg",
    u"=/":"emoneg",
    u"=[(\)]":"emoneg",
    u":L":"emoneg",
    u"=L":"emoneg",
    u":‑|":"emoneg",
    u":|":"emoneg",
    u"O:‑)":"emopos",
    u"O:)":"emopos",
    u"0:‑3":"emopos",
    u"0:3":"emopos",
    u"0:‑)":"emopos",
    u"0:)":"emopos",
    u":‑b":"emopos",
    u"(>_<)":"emoneg",
    u"(>_<)>":"emoneg",
    u"^_^":"emopos",
    u"(^_^)/":"emopos",
    u"(^O^)／":"emopos",
    u"(^o^)／":"emopos",
    u"('_')":"emoneg",
    u"(/_;)":"emoneg",
    u"(T_T) (;_;)":"emoneg",
    u"(;_;":"emoneg",
    u"(;_:)":"emoneg",
    u"(;O;)":"emoneg",
    u"(:_;)":"emoneg",
    u"(ToT)":"emoneg",
    u";_;":"emoneg",
    u";-;":"emoneg",
    u";n;":"emoneg",
    u"Q.Q":"emoneg",
    u"T.T":"emoneg",
    u"Q_Q":"emoneg",
    u"(-.-)":"emopos",
    u"(-_-)":"emopos",
    u"(；一_一)":"emopos",
    u"(=_=)":"emoneg",
    u"^m^":"emopos",
    u">^_^<":"emopos",
    u"<^!^>":"emopos",
    u"^/^":"emopos",
    u"（*^_^*）" :"emopos",
    u"(^<^) (^.^)":"emopos",
    u"(^^)":"emopos",
    u"(^.^)":"emopos",
    u"(^_^.)":"emopos",
    u"(^_^)":"emopos",
    u"(^^)":"emopos",
    u"(^J^)":"emopos",
    u"(*^.^*)":"emopos",
    u"(^—^）":"emopos",
    u"(#^.^#)":"emopos",
    u"(*^0^*)":"emopos",
    u"(*^^)v":"emopos",
    u"(^_^)v":"emopos",
    u'(-"-)':"emoneg",
    u"(ーー;)":"emoneg",
    u"(＾ｖ＾)":"emopos",
    u"(＾ｕ＾)":"emopos",
    u"(^)o(^)":"emopos",
    u"(^O^)":"emopos",
    u"(^o^)":"emopos",
    u")^o^(":"emopos",
    u":O o_O":"emoneg",
    u"o_0":"emoneg",
    u"o.O":"emoneg",
    u"(o.o)":"emoneg",
    u"(*￣m￣)": "emoneg",
}

for emote, val in EMOTICONS.items():
  EMOTICONS[emote] = val.lower().replace(',', ' ').replace(' ', '_')

In [None]:
def convert_emoticons(text):
  return EMOTICONS.get(text, text)
  
text = "Hello :-) :-)"
text_split = text.split()
for i, txt in enumerate(text_split):
  text_split[i] = convert_emoticons(txt)
print(' '.join(text_split))

Hello emopos emopos


## Contractions

In [None]:
#@title Contraction Mapping
contraction_mapping = {
    "dont": "do not",
    "doesnt": "does not",
    "arent": "are not",
    "cant": "can not",
    "couldve": "could have",
    "couldnt": "could not",
    "didnt": "did not",
    "aint": "is not",
    "arent": "are not",
    "hes": "he is",
    "shes": "she is",
    "havent": "have not",
    "hasnt": "has not",
    'youll': "you will",
    "ive": "i have",
    "youve": "you have",
    "shouldve": "should have",
    "im": "i am",
    "isnt": "is not",
    "ain't": "is not", 
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because", 
    "could've": "could have", 
    "couldn't": "could not", 
    "didn't": "did not",  
    "doesn't": "does not", 
    "don't": "do not", 
    "hadn't": "had not", 
    "hasn't": "has not", 
    "shouldnt": "should not",
    "haven't": "have not", 
    "he'd": "he would",
    "he'll": "he will", 
    "he's": "he is", 
    "how'd": "how did", 
    "how'd'y": "how do you", 
    "how'll": "how will", 
    "how's": "how is",
    "i'd": "i would", 
    "i'd've": "i would have", 
    "i'll": "i will",  
    "i'll've": "i will have",
    "i'm": "i am", 
    "i've": "i have", 
    "isn't": "is not", 
    "it'd": "it would", 
    "it'd've": "it would have", 
    "it'll": "it will", 
    "it'll've": "it will have",
    "it's": "it is", 
    "let's": "let us", 
    "ma'am": "madam", 
    "mayn't": "may not", 
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have", 
    "must've": "must have", 
    "mustn't": "must not", 
    "mustn't've": "must not have", 
    "needn't": "need not", 
    "needn't've": "need not have",
    "o'clock": "of the clock", 
    "oughtn't": "ought not", 
    "oughtn't've": "ought not have", 
    "shan't": "shall not", 
    "sha'n't": "shall not", 
    "shan't've": "shall not have", 
    "she'd": "she would", 
    "she'd've": "she would have", 
    "she'll": "she will", 
    "she'll've": "she will have", 
    "she's": "she is", 
    "should've": "should have", 
    "shouldn't": "should not", 
    "shouldn't've": "should not have", 
    "this's": "this is",
    "this'll": "this will",
    "thisll": "this will",
    "that'd": "that would", 
    "that'd've": "that would have", 
    "that's": "that is", 
    "thats": "that is",
    "there'd": "there would", 
    "there'd've": "there would have", 
    "there's": "there is", 
    "theres": "there is",
    "here's": "here is",
    "heres": "here is",
    "they'd": "they would", 
    "they'd've": "they would have", 
    "they'll": "they will", 
    "they'll've": "they will have", 
    "they're": "they are", 
    "they've": "they have", 
    "to've": "to have", 
    "wasn't": "was not", 
    "wasnt": "was not",
    "we'd": "we would", 
    "we'd've": "we would have", 
    "we'll": "we will", 
    "we'll've": "we will have", 
    "we're": "we are", 
    "we've": "we have", 
    "weve": "we have",
    "werent": "were not",
    "weren't": "were not", 
    "what'll": "what will",
    "whatll": "what will",
    "what'll've": "what will have", 
    "what're": "what are",  
    "what's": "what is", 
    "what's": "what is",
    "whatve": "what have",
    "what've": "what have", 
    "when's": "when is", 
    "when've": "when have", 
    "where'd": "where did", 
    "where's": "where is", 
    "wheres": "where is", 
    "where've": "where have",
    "who'll": "who will", 
    "who'll've": "who will have", 
    "who's": "who is", 
    "who've": "who have", 
    "whys": "why is",
    "why's": "why is", 
    "why've": "why have", 
    "will've": "will have", 
    "willve": "will have",
    "won't": "will not", 
    "wont": "will not",
    "won't've": "will not have", 
    "would've": "would have", 
    "wouldve": "would have",
    "wouldn't": "would not", 
    "wouldnt": "would not",
    "wouldn't've": "would not have", 
    "y'all": "you all", 
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would", 
    "you'd've": "you would have", 
    "you'll": "you will", 
    "you'll've": "you will have", 
    "you're": "you are", 
    "you've": "you have"
}

len(contraction_mapping)

147

In [None]:
len(contraction_mapping)

147

In [None]:
def expand_contraction(text): # Before expanding contraction, might want to clean of symbols that are not '
  return contraction_mapping.get(text, text)

text = "You're a pig and I should've slayed you, grrr"
text_split = text.split()
for i, txt in enumerate(text_split):
  text_split[i] = expand_contraction(txt.lower())
print(' '.join(text_split))

you are a pig and i should have slayed you, grrr


## Slang/Vocabulary

In [None]:
#@title Vocab Mapping
vocab_mapping = {
    'ta': 'teaching assistant',
    'tas', 'teaching assistants',
    'biz': 'business',
    'hw': 'homework',
    'hws': 'homeworks',
    'faq': 'frequently answered question',
    'faqs': 'frequently answered questions',
    'mcq': 'multiple choice question',
    'mcqs': 'multiple choice questions',
    'frq': 'free response question',
    'frqs': 'free response questions',
    'ppt': 'powerpoint',
    'ppts': 'powerpoints',
    'ques': 'question',
    'bs': 'bullshit',
    'bsing': 'bullshitting',
    'bsed': 'bullshitted',
    'lol': 'laugh out loud',
    'btw': 'by the way',
    'imo': 'in my opinion',
    'imho': 'in my honest opinion',
    'tbh': 'to be honest',
    'asap': 'as soon as possible',
    'idc': 'i do not care',
    'omg': 'oh my god',
    'ppl': 'people',
    'rip': 'rest in peace',
    'srsly': 'seriously',
    'thx': 'thanks',
    'txt': 'text',
    'ur': 'your',
    'tho': 'though',
    'wtf': 'what the fuck',
    'wth': 'what the heck',
    'bc': 'because',
    'b4': 'before',
    'h8': 'hate',
    'jk': 'just kidding',
    'cuz': 'because',
    'coz': 'because',
    'cos': 'because',
    'pts': 'points',
    'pg': 'page',
    'req': 'requirement',
    'reqs': 'requirements',
    'abt': 'about',
    'fav': 'favorite',
    'lil': 'little'
}

## Spellchecker

In [None]:
# sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# dictionary_path = pkg_resources.resource_filename(
#     "symspellpy", "frequency_dictionary_en_82_765.txt"
# )
# # term_index is the column of the term and count_index is the
# # column of the term frequency
# sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# # lookup suggestions for single-word input strings
# input_term = "memebers"  # misspelling of "members"

# # Verbosity.TOP gets the best suggestion
# suggestion = sym_spell.lookup(input_term, Verbosity.TOP, max_edit_distance=2)
# print(suggestion[0], len(suggestion))

## Stopwords

In [None]:
# amend list of stop words to keep whatever it is we want by removing words from list that we want to keep

# TODO: is the list of stopwords on git complete and accurate or does someone want to read through all 325 stopwords spacy gives and determine which ones to keep?
# stopwords = STOP_WORDS
# stopwords.remove("but")
# stopwords.remove("not")
# stopwords.remove("nor")
# stopwords.remove("never")

# stopwords = set()
gen_stops = set(["mr", "ms", "dr", "doctor", "s", "t", "i", "me", "myself", "is", "she", "he", "we", "him", "her", "it"])
domain_stops = set(["book", "books", "college", "colleges", "lecture", "lectures", "university", "universities", "lab", "labs", "hw", "hws", "quiz", "quizzes", "prof", "professor", "teacher", "class", "classes", "course", "courses"])
stopwords = gen_stops.union(domain_stops)


In [None]:
course_abbreviated_stopwords = {
    'eng', 'bio', 'engl', 'hist', 'chem', 'psy', 'soc', 'econ', 'psych', 'span', 'comm', 'acct',
    'phil', 'bus', 'enc', 'phys', 'anth', 'acc', 'eco', 'nurs', 'pols', 'spch', 'mgmt', 'geog', 'soci', 'geo',
    'phy', 'stat', 'mgt', 'govt', 'gov', 'spa', 'en', 'pol', 'edu', 'gen', 'mkt', 'music', 'alg', 'calc', 'fren',
    'poli', 'law', 'crim', 'engr', 'art', 'busn',
}

course_fluff_stopwords = {
    'intro', 'introduction', 'introductory', 'basic', 'concept', 'concepts', 'studies', 'discovery', 'background', 'knowledge',
    'materials', 'intermediate', 'advanced', 'managing', 'management', 'theory', 'thoeries', 'design', 'analysis',
    'science', 'sciences', 'overview', 'laboratory', 'principles', 'practices', 'contemporary', 'modern',
    'civilization', 
}

course_full_stopwords = {
    'math', 'english', 'biology', 'history', 'chemistry', 'psychology', 'sociology', 'spanish', 'communication', 'accounting',
    'philosophy', 'business', 'physics', 'anthropology', 'ecology', 'economics', 'political', 'management', 'geology',
    'statistics', 'government', 'marketing', 'music', 'algebra', 'calculus', 'french', 'german', 'law', 'criminology', 'engineering',
    'health', 'art', 'theater', 'astrology', 'science', 'sciences', 'writing', 'finance', 'japanese', 'arab', 'religion', 'ethnic',
    'speech', 'culture', 'literature'
}

stopwords.update(course_abbreviated_stopwords)
# stopwords.update(course_fluff_stopwords)
stopwords.update(course_full_stopwords)

stopwords_small = course_abbreviated_stopwords.union(course_full_stopwords)
stopwords_large = stopwords.union(course_fluff_stopwords)
stopwords_medium = stopwords

In [None]:
regex_course_stopwords = [
    'math', 'engl', 'bio', 'hist', 'chem', 'psy', 'soc', 'span', 'acct', 'phil',
    'phys', 'anth', 'eco', 'pols', 'geo', 'gov', 'alg', 'calc', 'fren', 'germ',
    'crim', 'astr', 'sci', 'jap', 'relig', 'ethn', 'russ', 'eur', 'asia', 'afri',
    'christ', 'isl', 'ital', 'arch', 'chin'
]

regex_course_stopwords = [r'\b' + word + r'\S+' for word in regex_course_stopwords]
regex_course_stopwords.append(r'\S+' + 'logy' + r'\b')
regex_course_stopwords.append(r'\S+' + 'phy' + r'\b')

## Spacy Setup

In [None]:
nlp = spacy.load('en_core_web_sm', exclude=['lemmatizer', 'parser', 'textcat', 'custom'])

# Undersampling
Currently the method used to undersample is messy.
Not only this, but with multinomial nb the stats are:

In [None]:
# reviews_pos = reviews[reviews['sentiment'] == 1]
# reviews_neg = reviews[reviews['sentiment'] == 0]
# print(len(reviews_pos), len(reviews_neg))
# reviews_pos = reviews[reviews['sentiment'] == 1].sample(n = int(len(reviews_neg)*(.7/.3)), random_state=1) # Messy way of undersampling

# print(len(reviews_pos))

# reviews_pos.reset_index(inplace=True, drop=True)
# reviews_neg.reset_index(inplace=True, drop=True)
# reviews = pd.concat([reviews_pos, reviews_neg], ignore_index=True)

# print(reviews.sentiment.value_counts())
# print(reviews.sentiment.value_counts()[True]/len(reviews))

# Preprocessing Pipeline

In [None]:
spellchecked_comments = []
lemm = WordNetLemmatizer()
grades = set(['a', 'b', 'c', 'd', 'e', 'f'])
unseen = Counter()

def preprocess_pipeline(df):
  cnt = 0 # to keep track of progress
  comments_proper = []
  for index, review in df.iterrows():
    comment = review['comment']
    fname = review['firstName'].lower().split(' ')
    lname = review['lastName'].lower().split(' ')
    names = set(fname + lname)

    cnt += 1
    if cnt % 25000 == 0:
      print(cnt)

    comment = remove_urls(comment)
    comment = remove_phones(comment)
    comment = remove_emails(comment)
    comment = remove_html_entities(comment)
    comment = remove_html_tags(comment)

    comment_split = comment.split(' ')
    new_comment_split = []
    for i, word in enumerate(comment_split):
      word = convert_emoticons(word)
      word = word.lower()
      word = expand_contraction(word)
      word = re.sub("[^a-z\s]+", ' ', word)   # replace characters that are not alphabetic, space, or underscore
      # word = word.replace("'", ' ') # replace apostrophe with space
      word = re.sub(r'(.)\1\1+', '\g<1>', word)  # replace any three character+ sequence with one
      word = re.sub('\s+', ' ', word)
      word = word.strip() # trailing whitespace because punctuation replaced by space
      # if word not in names:
      new_comment_split.extend(word.split(' '))

    # comment = comment.lower()
    # comment = re.sub("[^a-zA-Z\s]+", ' ', comment)   # replace characters that are not alphabetic, space, or underscore
    # comment = comment.replace("'", '') # remove apostrophes
    # comment = re.sub(r'(.)\1\1+', '\g<1>', comment)  # replace any three characters sequence with one
    # comment = re.sub('\s+', ' ', comment)
    # comment = comment.strip() # trailing whitespace because punctuation replaced by space

    
    # comment_split = comment.split(' ')
    # new_comment_split = []
    # for i, word in enumerate(comment_split):
    #   if word not in names:
    #     new_comment_split.append(word)

    # Remove partial course names from comments
    for course_name in regex_course_stopwords:
      comment = re.sub(course_name, ' ', comment)

    # Remove names from the comment
    for i, word in enumerate(new_comment_split):
      if word in names:
        new_comment_split[i] = ''
      else:
        new_comment_split[i] = vocab_mapping.get(word, word)

    comment = ' '.join(new_comment_split)
    comment = re.sub('\s+', ' ', comment)
    comment = comment.strip()

    # comment = [lemm.lemmatize(word) for word in comment.split()] # Lemmatize
    # comment = [word for word in comment.split() if word not in stopwords] # remove stopwords
    # comment = " ".join(comment)

    # comment = ' '.join(word for word in comment.split() if len(word) > 1)

    comments_proper.append(comment)
    # spellchecked_comments.append(' '.join(sym_spell.lookup(word, Verbosity.TOP, max_edit_distance=2, include_unknown=True)[0].term for word in comment.split()))
  return comments_proper

comments_proper = preprocess_pipeline(reviews)

25000
50000
75000
100000
125000
150000
175000
200000
225000
250000
275000
300000


In [None]:
unseen_words = Counter()
people = Counter()
def preprocess_pipe(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=200):
      # for word in doc:
      #   if word.pos_ == 'PROPN':
      #     unseen_words[word.text] += 1
      #     print(word.text, word.pos_)
      # print(doc.ents)
      for word in doc.ents:
        if word.label_ == 'PERSON':
          people[word.text] += 1
          # print(word.text,word.label_)

# preprocess_pipe(comments_proper)
# print(unseen_words, len(unseen_words))
# print(people, len(people))

In [None]:
print(people)

Counter()


In [None]:
# i = 0
# for comment, spellcheck_comment in zip(comments_proper, spellchecked_comments):
#   print(comment)
#   print(spellcheck_comment)
#   print('\n')
#   i += 1
#   if i == 10:
#     break

In [None]:
reviews["cleanedComment"] = pd.Series(comments_proper)
# reviews["cleanedCommentChecked"] = pd.Series(spellchecked_comments)
reviews['cleanedComment'].head(25)

0     amazing professor he is so kind and friendly a...
1     i took this class in spring of this was the to...
2     condescending and does not respect students tr...
3     she did not seem enthused to teach at all she ...
4     this professor has a very well organized class...
5     enjoy the class would take a class of his agai...
6     easy a when she is doing examples on how to wr...
7     this woman is plain terrible worst teacher i h...
8     dr is hilarious he acts like he wants the stud...
9     dr teaches well but you should be prepared to ...
10    i did not learn anything in this class i have ...
11    mrs is extremely friendly and willing to work ...
12                                         knowledgable
13    this teacher is super funny she is a little al...
14    professor teaches very well but no matter how ...
15    this was a very easy class it seems like as lo...
16    prof is a good professor her lectures feel lon...
17    this class was an honest ate of time and m

In [None]:
reviews['sentiment'].value_counts()

True     220836
False     79164
Name: sentiment, dtype: int64

In [None]:
for index, row in reviews.head(50).iterrows():
    print(row['comment'])
    print(row['cleanedComment'])
    print('\n')

Amazing professor! He is so kind and friendly and makes class very welcoming! He is definitely a fair grader and does give extra credit. There were about 5 response papers, 2 major essays and 1 exam (which he let us do take home). I would highly recommend this professor! Passed with a B+
amazing professor he is so kind and friendly and makes class very welcoming he is definitely a fair grader and does give extra credit there were about response papers major essays and exam which he let us do take home i would highly recommend this professor passed with a b


I took this class in Spring of 2015, this was the toughest Math class I had taken this far. Professor Nauman helped me in the math lab on numerous occasions, via email, and no matter what is going on she responds and lets me know what I am doing wrong. She really cares for her students and teaching is absolutely her forte.. Excellent teacher!!!
i took this class in spring of this was the toughest math class i had taken this far pro

In [None]:
# dropping rows <= 5
# reviews['wordCount'] = reviews["cleanedComment"].str.split().str.len()
# reviews[['wordCount', 'cleanedComment']].head(5)

# reviews = reviews[reviews['wordCount'] > 5]
# reviews.shape
reviews = reviews.loc[:, ["firstName", "lastName", "comment", "cleanedComment", "clarityRating", "sentiment", "professor_id"]]

# Document Sentiment Pipeline

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedGroupKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

from sklearn.feature_selection import chi2, SelectPercentile, SelectKBest
from sklearn.pipeline import Pipeline

In [None]:
def evalPerformance(yp, yt, mode="micro"):
    prec_score = precision_score(yt, yp)
    rec_score = recall_score(yt, yp)
    f1 = f1_score(yt, yp, average=mode)
    acc_score = accuracy_score(yt, yp)
    conf_m = confusion_matrix(yt, yp)


    print(f"Precision Score: {prec_score*100}")
    print(f"Recall Score: {rec_score*100}")
    print("F1 Score: {0}".format(f1 * 100))
    print("Accuracy Score: " + str(acc_score * 100))
    print(conf_m)


In [None]:
review_ids = reviews.groupby(['professor_id', 'cleanedComment']) 
review_ids.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,firstName,lastName,comment,clarityRating,sentiment
professor_id,cleanedComment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VGVhY2hlci05Mjc1NTI=,do not take this lady for speech much easier options and you will learn just as much she has to many useless rules and overall she just wants to be in control of everything,.,Ramage,Do Not Take this lady for speech. Much easier ...,1,False
VGVhY2hlci05Mjc1NTI=,i cannot say that her methods of grading were unfair however many times requirements for a speech dates and other major things about what was going on were changed in the middle of a lecture if you missed it then you may receive a changes like these are not emphasized if you do not have your outline in hand on speech day you get a,.,Ramage,I can't say that her methods of grading were u...,1,False
VGVhY2hlci05Mjc1NTI=,maybe the last commenter should have accepted a few lessons from mrs there are at least two misspelled words and at least one misused word in the comments grammar matters mrs is a very good teacher i learned much more because she taught the correct way of doing things,.,Ramage,Maybe the last commenter should have accepted ...,4,True
VGVhY2hlci05Mjc1NTI=,not relly a speech teacher more of just an outline teacher she does not grade you speech just your outline so be orginized i took her and i was not satisfied her style is more of grading the paper based on grammer more of and english class then a speech class,.,Ramage,Not relly a speech teacher more of just an out...,1,False
VGVhY2hlci05Mjc1NTI=,she does not put up with nonsense or tardiness at all she expects respect she will teach you the correct way of speaking whether you can adapt to it or not what you learn will definitely benefit you in life very organized prompt with returning grades organization is worth a lot in a busy schedule,.,Ramage,She does not put up with nonsense or tardines...,4,True
...,...,...,...,...,...,...
VGVhY2hlci0xNjIxNTU4,dr j aka cobra is amazing not the typical over glorified professors you encounter at walsh he actually has a real phd down to earth and actually selflessly works with students to over come obstacles after the last class i wondered if it be cheezy to shake his hand but apparently more than half the class did so and asked to take future class,Jeremy,Lim,Dr J (aka Cobra) is amazing!! Not the typical ...,5,True
VGVhY2hlci0xNjIxNTU4,dr j is awesome and funny knows his stuff real well and demonstrates how marketing theories tie into real world applications i am still blown away by the gorilla video very energetic although english may not be his first language,Jeremy,Lim,Dr J is awesome and funny... Knows his stuff r...,4,True
VGVhY2hlci0xNjIxNTU4,interesting and energetic too bad it is on a monday evening,Jeremy,Lim,Interesting and energetic. Too bad it is on a ...,4,True
VGVhY2hlci0xNjIxNTYx,awful professor his quizzes were worded to trick you to the th degree he does not care about how students are doing in the class he expects you to treat his class as the only one take this class at your own risk because it will drop your gpa no matter how good you do in other classes avoid and take another professor instead,Gregorio,Pantoja,Awful professor. His quizzes were worded to tr...,1,False


In [None]:
groups_professor_id_list = np.array(reviews['professor_id'].values)
print(groups_professor_id_list[:5])

y = reviews['sentiment']
print(y.head(5))

X = reviews['cleanedComment']
X.head(5)

['VGVhY2hlci0xNDEyOTA1' 'VGVhY2hlci0xMzMxMDMy' 'VGVhY2hlci0xMzA0MTgy'
 'VGVhY2hlci05OTMzMDk=' 'VGVhY2hlci0xMDQwMzI5']
0     True
1     True
2    False
3    False
4     True
Name: sentiment, dtype: bool


0    amazing professor he is so kind and friendly a...
1    i took this class in spring of this was the to...
2    condescending and does not respect students tr...
3    she did not seem enthused to teach at all she ...
4    this professor has a very well organized class...
Name: cleanedComment, dtype: object

## Pipeline

In [None]:
sent_pipeline = Pipeline([
    # ("vectorizer", CountVectorizer(ngram_range=(1,2), max_df=0.5)), 
    ("vectorizer", TfidfVectorizer(ngram_range=(1,3), min_df = 5)),
    ("selector"  , SelectPercentile(score_func=chi2, percentile=22)),
    ("classifier", LogisticRegression(solver='sag', C = 2.5))
    # ("classifier" , MultinomialNB(alpha=1.0))
    # ("classifer" , DecisionTreeClassifier(max_depth=5))
])

In [None]:
# param_list = {
#     'vectorizer__ngram_range': [(1, 3)],
#     'vectorizer__min_df': [5, 6, 7],
#     'selector__percentile': range(10, 33, 2),
#     'classifier__alpha': np.arange(0, .71, .02),
# }

param_list = {
    'vectorizer__ngram_range': [(1, 3)],
    'vectorizer__min_df': [5, 6, 7],
    'vectorizer__stop_words': [stopwords_small, stopwords_medium, stopwords_large],
    'selector__percentile': range(16, 33, 2),
    'classifier__solver': ['newton-cg', 'liblinear', 'sag', 'saga'],
    'classifier__C': [20, 15, 10, 7.5, 5, 2.5],
}

In [None]:
sgkf = StratifiedGroupKFold(n_splits = 5)
random_search = RandomizedSearchCV(sent_pipeline, param_list, scoring='f1_micro', cv=sgkf, n_iter=20, verbose=3)

## Cross Validation

In [None]:
random_search.fit(X, y, groups=groups_professor_id_list)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END classifier__C=5, classifier__solver=liblinear, selector__percentile=26, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3), vectorizer__stop_words={'fren', 'phys', 'management', 'econ', 'eco', 'geog', 'accounting', 'religion', 'calc', 'criminology', 'science', 'span', 'government', 'en', 'algebra', 'french', 'geo', 'engl', 'soci', 'statistics', 'geology', 'nurs', 'psych', 'soc', 'ecology', 'gen', 'writing', 'culture', 'pol', 'english', 'anthropology', 'health', 'speech', 'eng', 'comm', 'enc', 'astrology', 'stat', 'crim', 'business', 'phy', 'literature', 'ethnic', 'german', 'bus', 'marketing', 'calculus', 'finance', 'hist', 'acc', 'art', 'chemistry', 'music', 'anth', 'communication', 'mkt', 'sociology', 'pols', 'acct', 'edu', 'philosophy', 'physics', 'theater', 'economics', 'mgt', 'spanish', 'history', 'biology', 'alg', 'spch', 'mgmt', 'poli', 'law', 'psy', 'psychology', 'math', 'sciences', 'engineering', 'chem

RandomizedSearchCV(cv=StratifiedGroupKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('vectorizer',
                                              TfidfVectorizer(min_df=5,
                                                              ngram_range=(1,
                                                                           3))),
                                             ('selector',
                                              SelectPercentile(percentile=22,
                                                               score_func=<function chi2 at 0x7fe175e33050>)),
                                             ('classifier',
                                              LogisticRegression(C=2.5,
                                                                 solver='sag'))]),
                   n_iter=20,
                   param_distributions={'cla...
                                                                    'communicat

## Estimator Selection

In [43]:
print(random_search.best_params_)
print(random_search.best_estimator_)
best_estimator = random_search.best_estimator_

{'vectorizer__stop_words': {'fren', 'phys', 'management', 'econ', 'eco', 'geog', 'accounting', 'religion', 'calc', 'criminology', 'science', 'span', 'government', 'en', 'algebra', 'french', 'geo', 'engl', 'soci', 'statistics', 'geology', 'nurs', 'psych', 'soc', 'ecology', 'gen', 'writing', 'culture', 'pol', 'english', 'anthropology', 'health', 'speech', 'eng', 'comm', 'enc', 'astrology', 'stat', 'crim', 'business', 'phy', 'literature', 'ethnic', 'german', 'bus', 'marketing', 'calculus', 'finance', 'hist', 'acc', 'art', 'chemistry', 'music', 'anth', 'communication', 'mkt', 'sociology', 'pols', 'acct', 'edu', 'philosophy', 'physics', 'theater', 'economics', 'mgt', 'spanish', 'history', 'biology', 'alg', 'spch', 'mgmt', 'poli', 'law', 'psy', 'psychology', 'math', 'sciences', 'engineering', 'chem', 'engr', 'political', 'phil', 'busn', 'japanese', 'arab', 'gov', 'bio', 'spa', 'govt'}, 'vectorizer__ngram_range': (1, 3), 'vectorizer__min_df': 6, 'selector__percentile': 26, 'classifier__solver

In [None]:
# best_estimator = Pipeline([
#     ("vectorizer", TfidfVectorizer(ngram_range=(1,2), min_df=8, stop_words=stopwords)),
#     ("selector"  , SelectPercentile(score_func=chi2, percentile=30)),
#     ("classifer" , MultinomialNB())
# ])
# best_estimator.fit(X, y)

# Validating (or Testing???)

In [51]:
test_reviews = pd.read_csv("/content/drive/MyDrive/RMP/scraped_comments.csv").sample(n=120000, random_state=1)

In [52]:
print("Shape before dropping:", test_reviews.shape)
test_reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

test_reviews.drop_duplicates(subset="comment", keep="first", inplace=True)

# drop rows containing only "No Comments" (default value assigned by RMP to a review that didn't enter a comment)
test_reviews = test_reviews[test_reviews["comment"] != "No Comments"]

# drop rows containing NaN comment
test_reviews.dropna(subset=["comment"], inplace=True)

# fill null names with empty string
test_reviews['firstName'].fillna('', inplace=True)
test_reviews['lastName'].fillna('', inplace=True)

# Dropping test_reviews with qualityRating == 3
test_reviews['qualityRating'] = (test_reviews['helpfulRating']+test_reviews['clarityRating'])/2.0
test_reviews = test_reviews[test_reviews["qualityRating"] != 3.0]
test_reviews["sentiment"] = test_reviews["qualityRating"] > 3.0

print("Shape after dropping:", test_reviews.shape)
test_reviews.reset_index(drop=True, inplace=True)

Shape before dropping: (120000, 16)
Shape after dropping: (105800, 18)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [53]:
comments_proper = []

comments_proper = preprocess_pipeline(test_reviews)

25000
50000
75000
100000


In [54]:
sent_predA = best_estimator.predict(comments_proper)

In [55]:
test_reviews.reset_index(inplace=True, drop=True)

## Accuracy

In [56]:
evalPerformance(sent_predA, test_reviews['sentiment'])

Precision Score: 93.67122022041839
Recall Score: 95.61339692291111
F1 Score: 92.39035916824196
Accuracy Score: 92.39035916824196
[[26779  4795]
 [ 3256 70970]]


In [57]:
right = sum(a == b for a, b in zip(sent_predA, test_reviews['sentiment']))
right / len(sent_predA)

0.9239035916824196

In [63]:
docs = [
    "He has his own grading criteria, which may throw you off. Tests are divided into weekly quiz, which you can redo them for better grade. PAs are difficult and mimir grading provides limited info, but he do provide fast and helpful feedback via office hour or mail. I was too late when I realized that, so contact him quickly if PA is hurting you.",
    "Makes the course unnecessarily hard. Passing a test with a C is uncommon. Don't be tricked by how nice of a guy he is, he wants to watch the world burn.",
    "This Professor is a very helpful, but is extremely difficult. The homework is a lot of work and expect the final to very very difficult.",
    "Think it was his first time teaching 221 so he was disorganized on the syllabus and assignments. Quite a bit of work so wouldn't recommend taking his class before ETAM. He pushes you to develop good programming practices. You'll come out of this class a better individual programmer but probably not with the best grade.",
    "I have perspective transferring from TAMU to another University, this class is hard, and talking to graduate students from other institutions it's not hard to see why, there's typically graduate level DSA covered in this class. It's worth taking for that reason alone. The depth, and content covered makes this the most important class in TAMU CS.",
    "Insufferable lectures, but an easy grader. She may be the professor you want, but she is not the professor you need.",
    "Leyk is a decent teacher for 121. This was the first semester she taught the course I believe so it was kinda all over the place, but she did a good job. HW isn't impossible or unreasonable and there's plenty of help. Also only having a final project and no final exam was nice."
]

sample_reviews = pd.DataFrame(data={'firstName': ['Philip Teresa']*7, 'lastName': ['Ritchey Leyk']*7, 'comment': docs})
docs_preprocessed = preprocess_pipeline(sample_reviews)

for comment in docs_preprocessed:
  print(comment)

he has his own grading criteria which may throw you off tests are divided into weekly quiz which you can redo them for better grade pas are difficult and mimir grading provides limited info but he do provide fast and helpful feedback via office hour or mail i was too late when i realized that so contact him quickly if pa is hurting you
makes the course unnecessarily hard passing a test with a c is uncommon do not be tricked by how nice of a guy he is he wants to watch the world burn
this professor is a very helpful but is extremely difficult the homework is a lot of work and expect the final to very very difficult
think it was his first time teaching so he was disorganized on the syllabus and assignments quite a bit of work so would not recommend taking his class before etam he pushes you to develop good programming practices you will come out of this class a better individual programmer but probably not with the best grade
i have perspective transferring from tamu to another universit

In [66]:

'''array([
       [0.37252759, 0.62747241],
       [0.33527024, 0.66472976],
       [0.04423447, 0.95576553],
       [0.14304011, 0.85695989],
       [0.48898867, 0.51101133],
       [0.59293458, 0.40706542],
       [0.24181395, 0.75818605]])'''
best_estimator.predict_proba(docs_preprocessed)

array([[0.37252759, 0.62747241],
       [0.18490677, 0.81509323],
       [0.04423447, 0.95576553],
       [0.31818556, 0.68181444],
       [0.40490644, 0.59509356],
       [0.59293458, 0.40706542],
       [0.15071307, 0.84928693]])

# Possible Improvements
* Could engineer new features using words that are capitalized in the review
* Remove reviews that're not english