# Preprocessing Setup
Basic preprocessing steps; domain-specific problems are accounted for later.

In [None]:
import pandas as pd
import numpy as np
import re
import spacy
import pkg_resources
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words, wordnet, brown

In [None]:
# !pip install symspellpy

# Fetch the spaCy model and the NLTK corpora used by this notebook.
from spacy.cli import download
# NOTE(review): 'en_core_web_md' is downloaded here, but the later spacy.load(...)
# cell loads 'en_core_web_sm' — confirm which model is intended.
download('en_core_web_md')
nltk.download('words')
nltk.download('omw-1.4')
nltk.download('wordnet')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# from symspellpy import SymSpell, Verbosity

In [None]:
# Colab-only: mount Google Drive at /content/drive, where the reviews CSV is read from.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the filtered reviews from Drive and keep a reproducible 300,000-row sample.
reviews_csv_path = "/content/drive/MyDrive/RMP/reviews_filtered.csv"
reviews = pd.read_csv(reviews_csv_path).sample(n=300000, random_state=1)
# reviews = pd.read_csv("/content/drive/MyDrive/RMP/scraped_comments_with_professor.csv").sample(n=300000, random_state=1)
# reviews[['comment_id', 'firstName', 'lastName', 'prof_class', 'comment', 'clarityRating', 'helpfulRating']].to_csv("/content/drive/MyDrive/RMP/scraped_comments_sample.csv")

In [None]:
# Preview the first rows of the sampled reviews.
reviews.head()

Unnamed: 0.1,Unnamed: 0,firstName,lastName,prof_class,comment,date,clarityRating,helpfulRating,professor_id,comment_id,qualityRating,sentiment
220803,781736,Barbara,Ewell,ENGWRIT,She was very helpful in my writing class.,2012-07-17 14:50:33+00:00,5,5,VGVhY2hlci05NzE2ODU=,UmF0aW5nLTIwNDk4NjIz,5.0,True
101502,588101,Jose,Delpilar,PSYCH101,AMAZING PROFESSOR!!!!!!!!!!!!!!!!!!!!! You won...,2011-12-10 00:05:38+00:00,5,5,VGVhY2hlci0xNTc2Mzg5,UmF0aW5nLTE5NDQ4MDgy,5.0,True
106342,380054,Samuel,Workman,GOV310L,Such a great professor! He's really helpful an...,2012-01-10 18:17:36+00:00,5,5,VGVhY2hlci0xNDQ1OTQ4,UmF0aW5nLTE5NzA1NDYx,5.0,True
66781,173732,Nanete,Maki-Dearsan,ART20,This teacher seemed helpful but then angry whe...,2010-01-26 14:32:30+00:00,2,2,VGVhY2hlci0xMDI0OTAy,UmF0aW5nLTE2NzU4NzE4,2.0,False
229545,24293,Catherine,Bacus,GERO11,"On Campus Course: Organized, Stays on task, En...",2009-11-24 10:00:38+00:00,5,5,VGVhY2hlci0xMDg2NjY0,UmF0aW5nLTE2NDIxNzAy,5.0,True


## Duplicates and Nulls

In [None]:
print("Shape before dropping:", reviews.shape)

# Every step below rebinds `reviews` to a new frame instead of using inplace=True.
# Chained in-place calls on a column selection (df['col'].fillna(..., inplace=True))
# or on a filtered frame do not update the parent frame under pandas copy-on-write,
# which would leave null names in place and break the .lower() calls made on
# firstName/lastName in the preprocessing pipeline.
reviews = reviews.drop_duplicates(subset="comment_id", keep="first")

# drop rows containing only "No Comments" (default value assigned by RMP to a review that didn't enter a comment)
reviews = reviews[reviews["comment"] != "No Comments"]

# drop rows containing NaN comment
reviews = reviews.dropna(subset=["comment"])

# fill null names with empty string
reviews = reviews.fillna({"firstName": "", "lastName": ""})

# qualityRating is the mean of the helpful and clarity ratings; reviews at exactly
# 3.0 are dropped and the remaining ones are labelled positive when above 3.0.
reviews = reviews.assign(qualityRating=(reviews["helpfulRating"] + reviews["clarityRating"]) / 2.0)
reviews = reviews[reviews["qualityRating"] != 3.0]
reviews = reviews.assign(sentiment=reviews["qualityRating"] > 3.0)

print("Shape after dropping:", reviews.shape)
reviews = reviews.reset_index(drop=True)

Shape before dropping: (300000, 12)
Shape after dropping: (300000, 12)


## Removing URLs, Phone Numbers, and Emails

In [None]:
def remove_urls(text):
    """Replace every URL in `text` with a single space.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace. Matching is case-insensitive so that
    links typed as ``HTTP://...`` or ``WWW....`` are removed as well.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        `text` with each URL replaced by ``' '``.
    """
    return re.sub(r'https?://\S+|www\.\S+', ' ', text, flags=re.IGNORECASE)

def remove_phones(text):
    """Replace phone numbers written as ``ddd-ddd-dddd`` with a single space.

    Only the dash-separated 3-3-4 digit layout is recognised; other layouts
    are left untouched.
    """
    phone_pattern = re.compile(r'\d{3}-\d{3}-\d{4}')
    return phone_pattern.sub(' ', text)

def remove_emails(text):
    """Replace e-mail addresses (``local@domain.tld``) in `text` with a single space."""
    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    cleaned = re.sub(email_pattern, ' ', text)
    return cleaned

# Sanity checks: each call prints the sentence with the matched URL / phone / e-mail replaced by a space.
print(remove_urls('Hey! Check out this link: www.somelink.com'))
print(remove_phones("Hey! Check out this phone number: 742-457-0417"))
print(remove_emails("Hey! Check out this email address: nooneuses@yahoo.com"))

Hey! Check out this link:  
Hey! Check out this phone number:  
Hey! Check out this email address:  


## HTML Artifacts

In [None]:
# TODO: Convert HTML entities of quotes -> "'" to normalize
def remove_html_entities(text):
  """Strip HTML character entities from `text`.

  Well-formed entities (``&name;`` / ``&#NN;``) are replaced by a space.
  Leftover ``&#63`` sequences (a question-mark entity missing its ``;``)
  are then deleted outright.
  """
  without_entities = re.sub('&[0-9a-zA-Z#]+;', ' ', text)
  # Second pass: the first pattern requires a trailing ';', so bare "&#63" survives it.
  without_question_marks = re.sub('&#63;?', '', without_entities)
  return without_question_marks

def remove_html_tags(text):
  """Replace short HTML tags (``<`` + 1 to 6 characters + ``>``) with a space."""
  tag_pattern = re.compile('<.{1,6}?>')
  return tag_pattern.sub(' ', text)

# Demo input with quote entities, "&#63" sequences (with and without ';') and a closing tag.
text = "This professor is such an easy &quot;A&quot;, why are y'all struggling &#63;&#63;&#63 </div>"
print(remove_html_entities(text))
print(remove_html_tags(text))

This professor is such an easy  A , why are y'all struggling    </div>
This professor is such an easy &quot;A&quot;, why are y'all struggling &#63;&#63;&#63  


## Emoticon Conversion to Words

In [None]:
#@title Emoticon Mapping
# Maps an emoticon token to a coarse sentiment label ("emopos" / "emoneg").
# Lookups are exact-match on a whitespace-delimited token (see convert_emoticons),
# so keys that contain spaces or regex-like fragments (e.g. ":-[.]", ">:[(\)]")
# only match when that literal text appears as a token.
# Backslashes are written as "\\" so the keys keep their value without relying on
# invalid escape sequences, and keys that were listed twice appear only once.
EMOTICONS = {
    u"<3": "emopos",
    u":‑)": "emopos",
    u":-))": "emopos",
    u":-)))": "emopos",
    u":)": "emopos",
    u":))": "emopos",
    u":)))": "emopos",
    u":-]": "emopos",
    u":]": "emopos",
    u":-3": "emopos",
    u":3": "emopos",
    u":->": "emopos",
    u":>": "emopos",
    u"8-)": "emopos",
    u":-}": "emopos",
    u":}": "emopos",
    u":-)": "emopos",
    u":c)": "emopos",
    u":^)": "emopos",
    u"=]": "emopos",
    u"=)": "emopos",
    u":‑D": "emopos",
    u":D": "emopos",
    u"8‑D": "emopos",
    u"8D": "emopos",
    u"X‑D": "emopos",
    u"XD": "emopos",
    u"=D": "emopos",
    u"=3": "emopos",
    u"B^D": "emopos",
    u":-(": "emoneg",
    u":‑(": "emoneg",
    u":(": "emoneg",
    u":‑c": "emoneg",
    u":c": "emoneg",
    u":‑<": "emoneg",
    u":<": "emoneg",
    u":‑[": "emoneg",
    u":[": "emoneg",
    u":-||": "emoneg",
    u">:[": "emoneg",
    u":{": "emoneg",
    u">:(": "emoneg",
    u":'‑(": "emoneg",
    u":'(": "emoneg",
    u":'‑)": "emopos",
    u":')": "emopos",
    u"D‑':": "emoneg",
    u"D:<": "emoneg",
    u"D:": "emoneg",
    u"D8": "emoneg",
    u"D;": "emoneg",
    u"D=": "emoneg",
    u"DX": "emoneg",
    u";‑)": "emopos",
    u";)": "emopos",
    u"*-)": "emopos",
    u"*)": "emopos",
    u";‑]": "emopos",
    u";]": "emopos",
    u";^)": "emopos",
    u":‑,": "emopos",
    u";D": "emopos",
    u":‑P": "emopos",
    u":P": "emopos",
    u"X‑P": "emopos",
    u"XP": "emopos",
    u":‑Þ": "emopos",
    u":Þ": "emopos",
    u"=p": "emopos",
    u":‑/": "emoneg",
    u":/": "emoneg",
    u":-[.]": "emoneg",
    u">:[(\\)]": "emoneg",
    u">:/": "emoneg",
    u":[(\\)]": "emoneg",
    u"=/": "emoneg",
    u"=[(\\)]": "emoneg",
    u":L": "emoneg",
    u"=L": "emoneg",
    u":‑|": "emoneg",
    u":|": "emoneg",
    u"O:‑)": "emopos",
    u"O:)": "emopos",
    u"0:‑3": "emopos",
    u"0:3": "emopos",
    u"0:‑)": "emopos",
    u"0:)": "emopos",
    u":‑b": "emopos",
    u"(>_<)": "emoneg",
    u"(>_<)>": "emoneg",
    u"^_^": "emopos",
    u"(^_^)/": "emopos",
    u"(^O^)／": "emopos",
    u"(^o^)／": "emopos",
    u"('_')": "emoneg",
    u"(/_;)": "emoneg",
    u"(T_T) (;_;)": "emoneg",
    u"(;_;": "emoneg",
    u"(;_:)": "emoneg",
    u"(;O;)": "emoneg",
    u"(:_;)": "emoneg",
    u"(ToT)": "emoneg",
    u";_;": "emoneg",
    u";-;": "emoneg",
    u";n;": "emoneg",
    u"Q.Q": "emoneg",
    u"T.T": "emoneg",
    u"Q_Q": "emoneg",
    u"(-.-)": "emopos",
    u"(-_-)": "emopos",
    u"(；一_一)": "emopos",
    u"(=_=)": "emoneg",
    u"^m^": "emopos",
    u">^_^<": "emopos",
    u"<^!^>": "emopos",
    u"^/^": "emopos",
    u"（*^_^*）": "emopos",
    u"(^<^) (^.^)": "emopos",
    u"(^^)": "emopos",
    u"(^.^)": "emopos",
    u"(^_^.)": "emopos",
    u"(^_^)": "emopos",
    u"(^J^)": "emopos",
    u"(*^.^*)": "emopos",
    u"(^—^）": "emopos",
    u"(#^.^#)": "emopos",
    u"(*^0^*)": "emopos",
    u"(*^^)v": "emopos",
    u"(^_^)v": "emopos",
    u'(-"-)': "emoneg",
    u"(ーー;)": "emoneg",
    u"(＾ｖ＾)": "emopos",
    u"(＾ｕ＾)": "emopos",
    u"(^)o(^)": "emopos",
    u"(^O^)": "emopos",
    u"(^o^)": "emopos",
    u")^o^(": "emopos",
    u":O o_O": "emoneg",
    u"o_0": "emoneg",
    u"o.O": "emoneg",
    u"(o.o)": "emoneg",
    u"(*￣m￣)": "emoneg",
}

# Normalise every label to a single lower-case, underscore-joined token.
# With the current "emopos"/"emoneg" labels this leaves the values unchanged.
for emote, val in EMOTICONS.items():
  EMOTICONS[emote] = val.lower().replace(',', ' ').replace(' ', '_')

In [None]:
def convert_emoticons(text):
  """Return the sentiment label for `text` when it is a key of EMOTICONS, else `text` unchanged.

  The lookup is an exact match on the whole string, so callers are expected
  to pass one token at a time.
  """
  if text in EMOTICONS:
    return EMOTICONS[text]
  return text
  
# Demo: convert each whitespace-separated token and print the re-joined sentence.
text = "Hello :-) :-)"
text_split = [convert_emoticons(token) for token in text.split()]
print(' '.join(text_split))

Hello emopos emopos


## Contractions

In [None]:
#@title Contraction Mapping
# Lower-case contraction -> expansion lookup used by expand_contraction.
# Both apostrophe-less spellings ("dont") and standard spellings ("don't") are
# listed. Each key appears exactly once; a repeated key in a dict literal
# silently overrides the earlier entry.
contraction_mapping = {
    "dont": "do not",
    "doesnt": "does not",
    "arent": "are not",
    "cant": "can not",
    "couldve": "could have",
    "couldnt": "could not",
    "didnt": "did not",
    "aint": "is not",
    "hes": "he is",
    "shes": "she is",
    "havent": "have not",
    "hasnt": "has not",
    "youll": "you will",
    "ive": "i have",
    "youve": "you have",
    "shouldve": "should have",
    "im": "i am",
    "isnt": "is not",
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "shouldnt": "should not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "this's": "this is",
    "this'll": "this will",
    "thisll": "this will",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "thats": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "theres": "there is",
    "here's": "here is",
    "heres": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "wasnt": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weve": "we have",
    "werent": "were not",
    "weren't": "were not",
    "what'll": "what will",
    "whatll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "whatve": "what have",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "wheres": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "whys": "why is",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "willve": "will have",
    "won't": "will not",
    "wont": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldve": "would have",
    "wouldn't": "would not",
    "wouldnt": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

len(contraction_mapping)

In [None]:
# Number of distinct contractions in the lookup.
# NOTE(review): the saved output for this cell may predate later additions to the mapping — re-run to refresh.
len(contraction_mapping)

120

In [None]:
def expand_contraction(text):
  """Expand a single lower-cased token if it is a known contraction.

  The token is first looked up verbatim in `contraction_mapping`. When that
  fails, punctuation wrapped around the token is set aside and curly
  apostrophes are normalised to "'" for a second lookup, so "can't," expands
  to "cannot," instead of slipping through unexpanded (and later being
  reduced to "can t" when non-letters are stripped). Tokens that still do
  not match, and non-string values, are returned unchanged.

  Parameters
  ----------
  text : str
      One whitespace-delimited token, already lower-cased by the caller.

  Returns
  -------
  str
      The expansion with any surrounding punctuation re-attached, or `text`.
  """
  if text in contraction_mapping:
    return contraction_mapping[text]
  if not isinstance(text, str):
    return text

  # (leading punctuation, core, trailing punctuation); apostrophes stay in the core
  # because keys such as "'cause" start with one. The pattern always matches.
  lead, core, trail = re.match(r"^([^\w'’]*)(.*?)([^\w'’]*)$", text, flags=re.DOTALL).groups()
  core = core.replace("’", "'")
  if core in contraction_mapping:
    return lead + contraction_mapping[core] + trail
  return text

# Demo: lower-case each whitespace-separated token, expand it, and print the re-joined sentence.
text = "You're a pig and I should've slayed you, grrr"
text_split = [expand_contraction(token.lower()) for token in text.split()]
print(' '.join(text_split))

you are a pig and i should have slayed you, grrr


## Slang/Vocabulary

In [None]:
#@title Vocab Mapping
# Slang / abbreviation -> expansion lookup (lower-case keys).
# NOTE(review): no active code in the cells shown here reads this mapping — confirm where it is applied.
vocab_mapping = {
    'ta': 'teaching assistant',
    'biz': 'business',
    'hw': 'homework',
    'hws': 'homeworks',
    # FAQ stands for "frequently asked question(s)"
    'faq': 'frequently asked question',
    'faqs': 'frequently asked questions',
    'mcq': 'multiple choice question',
    'mcqs': 'multiple choice questions',
    'frq': 'free response question',
    'frqs': 'free response questions',
    'ppt': 'powerpoint',
    'ppts': 'powerpoints',
    'ques': 'question',
    'bs': 'bullshit',
    'bsing': 'bullshitting',
    'bsed': 'bullshitted',
    'lol': 'laugh out loud',
    'btw': 'by the way',
    'imo': 'in my opinion',
    'imho': 'in my honest opinion',
    'tbh': 'to be honest',
    'asap': 'as soon as possible',
    'idc': 'i do not care',
    'omg': 'oh my god',
    'ppl': 'people',
    'rip': 'rest in peace',
    'srsly': 'seriously',
    'thx': 'thanks',
    'txt': 'text',
    'ur': 'your',
    'tho': 'though',
    'wtf': 'what the fuck',
    'wth': 'what the heck',
    'bc': 'because',
    'b4': 'before',
    'h8': 'hate',
    'jk': 'just kidding',
    'cuz': 'because'
}

## Spellchecker

In [None]:
# sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# dictionary_path = pkg_resources.resource_filename(
#     "symspellpy", "frequency_dictionary_en_82_765.txt"
# )
# # term_index is the column of the term and count_index is the
# # column of the term frequency
# sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# # lookup suggestions for single-word input strings
# input_term = "memebers"  # misspelling of "members"

# # Verbosity.TOP gets the best suggestion
# suggestion = sym_spell.lookup(input_term, Verbosity.TOP, max_edit_distance=2)
# print(suggestion[0], len(suggestion))

## Stopwords

In [None]:
# amend list of stop words to keep whatever it is we want by removing words from list that we want to keep

# TODO: is the list of stopwords on git complete and accurate or does someone want to read through all 325 stopwords spacy gives and determine which ones to keep?
# stopwords = STOP_WORDS
# stopwords.remove("but")
# stopwords.remove("not")
# stopwords.remove("nor")
# stopwords.remove("never")
# General stop words: titles, single-letter leftovers and a few pronouns / "is".
gen_stops = {
    "mr", "ms", "dr", "doctor", "s", "t", "i", "me", "myself",
    "is", "she", "he", "we", "him", "her", "it",
}
# Domain stop words: course/teaching vocabulary.
domain_stops = {
    "book", "books", "college", "colleges", "lecture", "lectures",
    "university", "universities", "lab", "labs", "hw", "hws",
    "quiz", "quizzes", "prof", "professor", "teacher",
    "class", "classes", "course", "courses",
}
# Combined set consulted by the preprocessing pipeline.
stopwords = gen_stops | domain_stops

## Spacy Setup

In [None]:
# Load the spaCy pipeline with the listed components excluded.
# NOTE(review): an earlier cell downloads 'en_core_web_md' while 'en_core_web_sm'
# is loaded here — confirm which model is intended.
excluded_components = ['lemmatizer', 'parser', 'textcat', 'custom']
nlp = spacy.load('en_core_web_sm', exclude=excluded_components)

# Undersampling
The method currently used to undersample is messy.
In addition, with multinomial Naive Bayes the stats are: (TODO: results not recorded here)

In [None]:
# Split by label (sentiment is boolean, so == 1 / == 0 select the True / False rows).
is_positive = reviews['sentiment'] == 1
is_negative = reviews['sentiment'] == 0
reviews_pos = reviews[is_positive]
reviews_neg = reviews[is_negative]
print(len(reviews_pos), len(reviews_neg))

# Keep every negative review and sample the positives down to a 70/30 positive/negative ratio.
n_pos_to_keep = int(len(reviews_neg)*(.7/.3))
reviews_pos = reviews_pos.sample(n=n_pos_to_keep, random_state=1)

print(len(reviews_pos))

reviews_pos = reviews_pos.reset_index(drop=True)
reviews_neg = reviews_neg.reset_index(drop=True)
reviews = pd.concat([reviews_pos, reviews_neg], ignore_index=True)

label_counts = reviews.sentiment.value_counts()
print(label_counts)
print(label_counts[True]/len(reviews))

221784 78216
182504
True     182504
False     78216
Name: sentiment, dtype: int64
0.7


# Preprocessing Pipeline

In [None]:
# Module-level helpers for optional pipeline steps; none of them is used by
# active (non-commented) code in the cells visible here.
spellchecked_comments = list()
lemm = WordNetLemmatizer()
grades = set('abcdef')
unseen = Counter()

def _alpha_pieces(word):
  """Reduce a lower-cased token to its alphabetic pieces.

  Non a-z characters become separators, runs of three or more identical
  characters collapse to one, and the remainder is split on single spaces
  (an empty token yields ``['']``).
  """
  word = re.sub(r"[^a-z\s]+", ' ', word)      # keep only letters and whitespace
  word = re.sub(r'(.)\1\1+', r'\g<1>', word)  # "soooo" -> "so"
  word = re.sub(r'\s+', ' ', word).strip()
  return word.split(' ')


def preprocess_pipeline(df):
  """Clean every review comment in `df` and return the cleaned strings.

  Per row, the comment is stripped of URLs, phone numbers, e-mail addresses
  and HTML artifacts, then handled token by token: emoticon -> label,
  lower-casing, contraction expansion, and reduction to alphabetic pieces.
  Finally the professor's name tokens and the stop words are dropped.

  Name tokens are matched both as typed and as their alphabetic pieces
  (length > 1), because comment tokens are reduced to plain letters: a
  surname such as "Maki-Dearsan" appears in the cleaned text as "maki" and
  "dearsan" and would otherwise never be removed. Single-letter pieces are
  not added so that an initial does not delete every "a" or "i".

  Tokens are split on any whitespace (not only ' '), so emoticons and
  contractions next to a tab or newline are still recognised.

  Parameters
  ----------
  df : pandas.DataFrame
      Must provide string columns 'comment', 'firstName' and 'lastName'.

  Returns
  -------
  list of str
      One cleaned comment per row, in row order. A running row count is
      printed every 25,000 rows as a progress marker.
  """
  comments_proper = []
  rows = zip(df['comment'], df['firstName'], df['lastName'])
  for cnt, (comment, first_name, last_name) in enumerate(rows, start=1):
    if cnt % 25000 == 0:
      print(cnt)

    fname = first_name.lower().split(' ')
    lname = last_name.lower().split(' ')
    names = set(fname + lname)
    for raw_name in fname + lname:
      names.update(piece for piece in _alpha_pieces(raw_name) if len(piece) > 1)

    comment = remove_urls(comment)
    comment = remove_phones(comment)
    comment = remove_emails(comment)
    comment = remove_html_entities(comment)
    comment = remove_html_tags(comment)

    new_comment_split = []
    for word in comment.split():
      word = convert_emoticons(word)  # must run before lower-casing: keys are case-sensitive
      word = word.lower()
      word = expand_contraction(word)
      new_comment_split.extend(_alpha_pieces(word))

    # Drop empty pieces, professor-name tokens and stop words.
    kept = [
        word for word in new_comment_split
        if word and word not in names and word not in stopwords
    ]
    # Lemmatization, spell checking and a minimum word length are not applied here.
    comments_proper.append(' '.join(kept))
  return comments_proper

# Run the cleaning pipeline over the undersampled frame (prints a row count every 25,000 rows).
comments_proper = preprocess_pipeline(reviews)

25000
50000
75000
100000
125000
150000
175000
200000
225000
250000


In [None]:
unseen_words = Counter()
people = Counter()
def preprocess_pipe(texts):
    """Tally PERSON entities found by the spaCy pipeline into the module-level `people` counter.

    `texts` is streamed through `nlp.pipe` in batches of 200. Nothing is
    returned; the only effect is the update of `people`.
    """
    for doc in nlp.pipe(texts, batch_size=200):
      person_mentions = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
      people.update(person_mentions)

# preprocess_pipe(comments_proper)
# print(unseen_words, len(unseen_words))
# print(people, len(people))

In [None]:
# Stays empty unless the commented-out preprocess_pipe(comments_proper) call is re-enabled.
print(people)

Counter()


In [None]:
# i = 0
# for comment, spellcheck_comment in zip(comments_proper, spellchecked_comments):
#   print(comment)
#   print(spellcheck_comment)
#   print('\n')
#   i += 1
#   if i == 10:
#     break

In [None]:
# Assign the list directly so values are placed by position. Wrapping it in
# pd.Series would align on index labels and silently produce NaN wherever
# `reviews` does not carry a default 0..n-1 index; a length mismatch now raises.
reviews["cleanedComment"] = comments_proper
# reviews["cleanedCommentChecked"] = pd.Series(spellchecked_comments)
reviews['cleanedComment'].head(25)

0     really nice boring but if you attend his you w...
1     pretty easy all grades you on tests with the t...
2     this amazing very knowledgeable very helpful f...
3           very inspirational caring and understanding
4     awesome not a complete blow off but pretty dam...
5     took speech from back in at golden west in cal...
6     very clear when teaches sit in the front row b...
7                  great that you can easily relate too
8     great took this online very clear on what requ...
9     really cares for students expects you to work ...
10    was by far the nicest and most caring that hav...
11            made interesting and fun highly recommend
12    should preface this review by saying that have...
13                                      liked his kinda
14    took his at the of kentucky not murray state a...
15    took ochem and during the summer and got a in ...
16             amazing loved study and you will do well
17    an awesome does not just go over power poi

In [None]:
# Class balance of the frame after undersampling.
reviews['sentiment'].value_counts()

True     182504
False     78216
Name: sentiment, dtype: int64

In [None]:
# Show the raw and cleaned text side by side for the first 50 reviews.
preview = reviews.head(50)
for original_text, cleaned_text in zip(preview['comment'], preview['cleanedComment']):
    print(original_text)
    print(cleaned_text)
    print('\n')

Really nice prof, he is boring but if you attend his lectures you will do really well, provides good summary notes, overall good person! not bad as everyone says he is :)
really nice boring but if you attend his you will do really well provides good summary notes overall good person not bad as everyone says emopos


Class is pretty easy. All he grades you on is 4 tests with the 4th being cumulative. The tests are not hard if you pay attention and take notes. He does give out extra credit in the form of &quot;bumps&quot; you get these by being in his &quot;trial&quot; setup, being a student investigator, or being a student senator. Overall fun, easy class. Take him.
pretty easy all grades you on tests with the th being cumulative the tests are not hard if you pay attention and take notes does give out extra credit in the form of bumps you get these by being in his trial setup being a student investigator or being a student senator overall fun easy take


This professor is amazing.  Very

In [None]:
# dropping rows <= 5
# reviews['wordCount'] = reviews["cleanedComment"].str.split().str.len()
# reviews[['wordCount', 'cleanedComment']].head(5)

# reviews = reviews[reviews['wordCount'] > 5]
# reviews.shape
# Keep only the columns listed below.
kept_columns = ["firstName", "lastName", "comment", "cleanedComment", "clarityRating", "sentiment", "professor_id"]
reviews = reviews.loc[:, kept_columns]

# Document Sentiment Pipeline

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedGroupKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

from sklearn.feature_selection import chi2, SelectPercentile, SelectKBest
from sklearn.pipeline import Pipeline

In [None]:
def evalPerformance(yp, yt, mode="micro"):
    """Print precision, recall, F1, accuracy (as percentages) and the confusion matrix.

    Parameters
    ----------
    yp : array-like
        Predicted labels.
    yt : array-like
        Ground-truth labels.
    mode : str, default "micro"
        Averaging strategy forwarded to ``f1_score`` only.

    Returns
    -------
    None
        Everything is reported through ``print``.

    Notes
    -----
    NOTE(review): precision and recall are computed with scikit-learn's
    default averaging, while F1 uses ``mode``; confirm this mix is intended
    before comparing the printed numbers with each other.
    """
    # Compute every metric before printing so a failure prints nothing partial.
    percent_metrics = (
        ("Precision Score", precision_score(yt, yp)),
        ("Recall Score", recall_score(yt, yp)),
        ("F1 Score", f1_score(yt, yp, average=mode)),
        ("Accuracy Score", accuracy_score(yt, yp)),
    )
    matrix = confusion_matrix(yt, yp)

    for label, value in percent_metrics:
        print(f"{label}: {value * 100}")
    print(matrix)


In [None]:
# Group the reviews by (professor_id, cleanedComment) and display one row per
# group, i.e. one row for each distinct cleaned comment of each professor.
review_ids = reviews.groupby(['professor_id', 'cleanedComment']) 
review_ids.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,firstName,lastName,comment,clarityRating,sentiment
professor_id,cleanedComment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VGVhY2hlci05Mjc1NTI=,cannot say that methods of grading were unfair however many times requirements for a speech dates and other major things about what was going on were changed in the middle of a if you missed then you may receive a changes like these are not emphasized if you do not have your outline in hand on speech day you get a,.,Ramage,I can't say that her methods of grading were u...,1,False
VGVhY2hlci05Mjc1NTI=,does not put up with nonsense or tardiness at all expects respect will teach you the correct way of speaking whether you can adapt to or not what you learn will definitely benefit you in life very organized prompt with returning grades organization worth a lot in a busy schedule,.,Ramage,She does not put up with nonsense or tardines...,4,True
VGVhY2hlci05Mjc1NTI=,maybe the last commenter should have accepted a few lessons from mrs there are at least two misspelled words and at least one misused word in the comments grammar matters mrs a very good learned much more because taught the correct way of doing things,.,Ramage,Maybe the last commenter should have accepted ...,4,True
VGVhY2hlci05Mjc1NTI=,not relly a speech more of just an outline does not grade you speech just your outline so be orginized took and was not satisfied style more of grading the paper based on grammer more of and english then a speech,.,Ramage,Not relly a speech teacher more of just an out...,1,False
VGVhY2hlci05Mjc1NTI=,requires you to print out papers for to grade your speech seems like wants you to fail the if your missing one thing will cause you to not be able to make your speech not the best speech giver either tells the same stories over and over again,.,Ramage,She requires you to print out papers for her t...,3,False
...,...,...,...,...,...,...
VGVhY2hlci0xNjIxNTMz,a great in my opinion just really bad hand writing if you want to do well in his like any other uga you must review your in examples and notes the homework are tough but if you follow all his proofs you will be able to solve them most be do poorly because they are not willing to stay after,Ryan,Scott,"He is a great teacher in my opinion, just real...",3,True
VGVhY2hlci0xNjIxNTMz,a hard has test and curves them all,Ryan,Scott,He's a hard teacher. He has 3 test and curves ...,2,False
VGVhY2hlci0xNjIxNTMz,teaches the material but does not go back if you missed a day curves the test but the test and homework are super hard go to tutoring get to know your fellow students its small and they are your best bet to better understanding the material loves math you can see think asks questions during that you do not know how to answer,Ryan,Scott,He teaches the material but doesn't go back if...,3,False
VGVhY2hlci0xNjIxNTMz,tests are hard you need to study and pay attention wants to make sure students understand the concepts there are tests but are curved in the end,Ryan,Scott,Tests are hard. You need to study and pay atte...,2,False


In [None]:
# Grouping key for cross-validation: the professor id of every review, copied
# into a standalone NumPy array.
groups_professor_id_list = reviews['professor_id'].to_numpy(copy=True)
print(groups_professor_id_list[:5])

# Target: the boolean sentiment label.
y = reviews.loc[:, 'sentiment']
print(y.iloc[:5])

# Feature: the cleaned review text.
X = reviews.loc[:, 'cleanedComment']
X.iloc[:5]

['VGVhY2hlci0xMDA5OTAz' 'VGVhY2hlci0xMTM0MDI3' 'VGVhY2hlci0xNTY2MTcw'
 'VGVhY2hlci0xNjIwMjE4' 'VGVhY2hlci05NjQ4NTY=']
0    True
1    True
2    True
3    True
4    True
Name: sentiment, dtype: bool


0    really nice boring but if you attend his you w...
1    pretty easy all grades you on tests with the t...
2    this amazing very knowledgeable very helpful f...
3          very inspirational caring and understanding
4    awesome not a complete blow off but pretty dam...
Name: cleanedComment, dtype: object

## Pipeline

In [None]:
# Sentiment classifier: TF-IDF features -> chi-squared feature selection ->
# multinomial Naive Bayes. The step names double as the parameter prefixes
# ("vectorizer__", "selector__", "classifier__") used when tuning.
#
# Alternatives kept for reference (not active):
#   vectorizer: CountVectorizer(ngram_range=(1,2), max_df=0.5)
#   vectorizer: TfidfVectorizer(ngram_range=(1,2), min_df=8, stop_words=stopwords)
#   classifier: DecisionTreeClassifier(max_depth=5)
sent_pipeline = Pipeline(steps=[
    ("vectorizer", TfidfVectorizer(min_df=7, ngram_range=(1, 2))),
    ("selector", SelectPercentile(chi2, percentile=30)),
    ("classifier", MultinomialNB(alpha=1.0)),
])

In [None]:
# Search space for the randomized search; keys are "<pipeline step>__<parameter>".
param_list = {
    'vectorizer__ngram_range': [(1, 3)],
    'vectorizer__min_df': [6, 7, 8, 9],
    'selector__percentile': range(10, 41, 2),
    # Smoothing strengths 0.05, 0.10, ..., 0.45. alpha=0 (no smoothing) is left
    # out on purpose: MultinomialNB warns about it and sampled candidates were
    # being spent on a degenerate setting. linspace + round gives exact
    # two-decimal values instead of artefacts such as 0.15000000000000002.
    'classifier__alpha': np.round(np.linspace(0.05, 0.45, 9), 2),
}

In [None]:
# 5-fold splitter that stratifies on the label and is group-aware; the groups
# themselves are supplied when the search is fitted.
sgkf = StratifiedGroupKFold(n_splits = 5)

# Randomized search over param_list, scored with micro-averaged F1.
# random_state pins which 20 parameter combinations get sampled, so re-running
# the notebook evaluates the same candidates and selects the same estimator.
random_search = RandomizedSearchCV(
    sent_pipeline,
    param_list,
    scoring='f1_micro',
    cv=sgkf,
    n_iter=20,
    verbose=3,
    random_state=1,
)

## Cross Validation

In [None]:
# Run the search: every sampled parameter combination is fitted and scored on
# each CV fold. The professor ids are passed as `groups` so the group-aware
# splitter can use them when building the folds.
random_search.fit(X, y, groups=groups_professor_id_list)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END classifier__alpha=0.05, selector__percentile=10, vectorizer__min_df=8, vectorizer__ngram_range=(1, 3);, score=0.929 total time=  25.6s
[CV 2/5] END classifier__alpha=0.05, selector__percentile=10, vectorizer__min_df=8, vectorizer__ngram_range=(1, 3);, score=0.931 total time=  25.6s
[CV 3/5] END classifier__alpha=0.05, selector__percentile=10, vectorizer__min_df=8, vectorizer__ngram_range=(1, 3);, score=0.931 total time=  25.2s
[CV 4/5] END classifier__alpha=0.05, selector__percentile=10, vectorizer__min_df=8, vectorizer__ngram_range=(1, 3);, score=0.932 total time=  25.7s
[CV 5/5] END classifier__alpha=0.05, selector__percentile=10, vectorizer__min_df=8, vectorizer__ngram_range=(1, 3);, score=0.929 total time=  25.6s


  % _ALPHA_MIN


[CV 1/5] END classifier__alpha=0.0, selector__percentile=40, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.928 total time=  25.4s


  % _ALPHA_MIN


[CV 2/5] END classifier__alpha=0.0, selector__percentile=40, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.928 total time=  25.5s


  % _ALPHA_MIN


[CV 3/5] END classifier__alpha=0.0, selector__percentile=40, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.927 total time=  25.5s


  % _ALPHA_MIN


[CV 4/5] END classifier__alpha=0.0, selector__percentile=40, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.929 total time=  25.4s


  % _ALPHA_MIN


[CV 5/5] END classifier__alpha=0.0, selector__percentile=40, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.927 total time=  25.4s
[CV 1/5] END classifier__alpha=0.2, selector__percentile=24, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.934 total time=  25.9s
[CV 2/5] END classifier__alpha=0.2, selector__percentile=24, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.935 total time=  25.6s
[CV 3/5] END classifier__alpha=0.2, selector__percentile=24, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.934 total time=  25.6s
[CV 4/5] END classifier__alpha=0.2, selector__percentile=24, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.935 total time=  25.9s
[CV 5/5] END classifier__alpha=0.2, selector__percentile=24, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.934 total time=  25.9s
[CV 1/5] END classifier__alpha=0.4, selector__percentile=30, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, sc

  % _ALPHA_MIN


[CV 1/5] END classifier__alpha=0.0, selector__percentile=12, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.927 total time=  26.4s


  % _ALPHA_MIN


[CV 2/5] END classifier__alpha=0.0, selector__percentile=12, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.928 total time=  26.3s


  % _ALPHA_MIN


[CV 3/5] END classifier__alpha=0.0, selector__percentile=12, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.927 total time=  26.8s


  % _ALPHA_MIN


[CV 4/5] END classifier__alpha=0.0, selector__percentile=12, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.929 total time=  27.8s


  % _ALPHA_MIN


[CV 5/5] END classifier__alpha=0.0, selector__percentile=12, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.927 total time=  26.6s
[CV 1/5] END classifier__alpha=0.25, selector__percentile=10, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.928 total time=  26.1s
[CV 2/5] END classifier__alpha=0.25, selector__percentile=10, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.930 total time=  26.4s
[CV 3/5] END classifier__alpha=0.25, selector__percentile=10, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.930 total time=  26.1s
[CV 4/5] END classifier__alpha=0.25, selector__percentile=10, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.930 total time=  26.6s
[CV 5/5] END classifier__alpha=0.25, selector__percentile=10, vectorizer__min_df=9, vectorizer__ngram_range=(1, 3);, score=0.928 total time=  27.4s
[CV 1/5] END classifier__alpha=0.25, selector__percentile=32, vectorizer__min_df=8, vectorizer__ngram_range=(1, 3

  % _ALPHA_MIN


[CV 1/5] END classifier__alpha=0.0, selector__percentile=28, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.929 total time=  27.3s


  % _ALPHA_MIN


[CV 2/5] END classifier__alpha=0.0, selector__percentile=28, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.930 total time=  26.8s


  % _ALPHA_MIN


[CV 3/5] END classifier__alpha=0.0, selector__percentile=28, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.928 total time=  28.0s


  % _ALPHA_MIN


[CV 4/5] END classifier__alpha=0.0, selector__percentile=28, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.930 total time=  27.5s


  % _ALPHA_MIN


[CV 5/5] END classifier__alpha=0.0, selector__percentile=28, vectorizer__min_df=7, vectorizer__ngram_range=(1, 3);, score=0.928 total time=  27.6s
[CV 1/5] END classifier__alpha=0.1, selector__percentile=36, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.935 total time=  27.8s
[CV 2/5] END classifier__alpha=0.1, selector__percentile=36, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.936 total time=  28.3s
[CV 3/5] END classifier__alpha=0.1, selector__percentile=36, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.935 total time=  28.3s
[CV 4/5] END classifier__alpha=0.1, selector__percentile=36, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.936 total time=  28.5s
[CV 5/5] END classifier__alpha=0.1, selector__percentile=36, vectorizer__min_df=6, vectorizer__ngram_range=(1, 3);, score=0.935 total time=  28.0s
[CV 1/5] END classifier__alpha=0.15000000000000002, selector__percentile=38, vectorizer__min_df=9, vectorizer__ngram_r

RandomizedSearchCV(cv=StratifiedGroupKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('vectorizer',
                                              TfidfVectorizer(min_df=7,
                                                              ngram_range=(1,
                                                                           2))),
                                             ('selector',
                                              SelectPercentile(percentile=30,
                                                               score_func=<function chi2 at 0x7fba76231cb0>)),
                                             ('classifier', MultinomialNB())]),
                   n_iter=20,
                   param_distributions={'classifier__alpha': array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45]),
                                        'selector__percentile': range(10, 41, 2),
                                        'vectorize

## Estimator Selection

In [None]:
# Keep a handle on the winning pipeline, then report the winning
# hyper-parameters followed by the pipeline itself.
best_estimator = random_search.best_estimator_
print(random_search.best_params_)
print(best_estimator)

{'vectorizer__ngram_range': (1, 3), 'vectorizer__min_df': 6, 'selector__percentile': 36, 'classifier__alpha': 0.1}
Pipeline(steps=[('vectorizer', TfidfVectorizer(min_df=6, ngram_range=(1, 3))),
                ('selector',
                 SelectPercentile(percentile=36,
                                  score_func=<function chi2 at 0x7fba76231cb0>)),
                ('classifier', MultinomialNB(alpha=0.1))])


In [None]:
# best_estimator = Pipeline([
#     ("vectorizer", TfidfVectorizer(ngram_range=(1,2), min_df=8, stop_words=stopwords)),
#     ("selector"  , SelectPercentile(score_func=chi2, percentile=30)),
#     ("classifer" , MultinomialNB())
# ])
# best_estimator.fit(X, y)

# Testing (evaluating the selected estimator on a separately drawn sample)

In [None]:
# Load the evaluation data: a fixed-seed random sample of 120,000 rows from
# the scraped comments file.
# NOTE(review): the training sample was also drawn with random_state=1 (from
# reviews_filtered.csv); confirm that file shares no rows with this one,
# otherwise the scores below are inflated by train/test overlap.
test_reviews = pd.read_csv("/content/drive/MyDrive/RMP/scraped_comments.csv").sample(n=120000, random_state=1)

In [None]:
print("Shape before dropping:", test_reviews.shape)

# Clean the test sample as one chain of non-mutating steps. Each step returns
# a new frame, so nothing is assigned into a filtered slice (the source of
# SettingWithCopyWarning), and the name fill is applied to the frame itself
# rather than through a column-level in-place call, which does not reach the
# parent frame once pandas copy-on-write is active.
test_reviews = (
    test_reviews
    # keep a single row per review
    .drop_duplicates(subset="comment_id", keep="first")
    # drop rows containing only "No Comments" (default value assigned by RMP to a review that didn't enter a comment)
    .loc[lambda df: df["comment"] != "No Comments"]
    # drop rows containing NaN comment
    .dropna(subset=["comment"])
    # fill null names with empty string
    .fillna({"firstName": "", "lastName": ""})
    # quality is the mean of the helpful and clarity ratings
    .assign(qualityRating=lambda df: (df["helpfulRating"] + df["clarityRating"]) / 2.0)
    # drop neutral reviews (qualityRating == 3); the rest are labelled
    # positive (True) when their quality is above 3
    .loc[lambda df: df["qualityRating"] != 3.0]
    .assign(sentiment=lambda df: df["qualityRating"] > 3.0)
    # renumber rows 0..n-1
    .reset_index(drop=True)
)

print("Shape after dropping:", test_reviews.shape)

Shape before dropping: (120000, 16)
Shape after dropping: (106780, 18)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Start from an empty placeholder; it is replaced right below.
comments_proper = []

# Run the test reviews through the notebook's preprocessing pipeline; the
# result is what gets fed to the estimator for prediction.
comments_proper = preprocess_pipeline(test_reviews)

In [None]:
# Predict a sentiment label for every preprocessed test comment with the
# pipeline selected by the randomized search.
sent_predA = best_estimator.predict(comments_proper)

In [None]:
# Renumber the test rows 0..n-1, discarding the old index.
test_reviews = test_reviews.reset_index(drop=True)

## Accuracy

In [None]:
# Report test-set metrics. evalPerformance takes the predictions first and the
# ground-truth labels second.
evalPerformance(sent_predA, test_reviews['sentiment'])

In [None]:
# Manual accuracy cross-check: flag each position where the prediction equals
# the true label, count the hits, and divide by the number of predictions.
matches = [predicted == actual for predicted, actual in zip(sent_predA, test_reviews['sentiment'])]
right = sum(matches)
right / len(sent_predA)

# Possible Improvements
* Could engineer new features using words that are capitalized in the review
* Remove reviews that are not in English