In [None]:
import pandas as pd
import numpy as np
import re
import spacy
import pkg_resources
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter, defaultdict

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words, wordnet, brown

In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the scraped reviews and keep a fixed 200,000-row random sample (random_state=1 makes it repeatable).
reviews = pd.read_csv("/content/drive/MyDrive/RMP/scraped_comments_with_professor.csv").sample(n=200000, random_state=1)
# Disabled: export a handful of columns of the sample back to Drive.
# reviews[['comment_id', 'firstName', 'lastName', 'prof_class', 'comment', 'clarityRating', 'helpfulRating']].to_csv("/content/drive/MyDrive/RMP/scraped_comments_sample.csv")

# Helper Functions/Comment ____

In [None]:
#@title Dropping nulls
# Every step rebinds `reviews` to a new frame instead of mutating in place.
# The previous `reviews[col].fillna(..., inplace=True)` calls were chained
# assignments (a no-op under pandas copy-on-write), and in-place edits on a
# filtered frame triggered SettingWithCopyWarning.
print("Shape before dropping:", reviews.shape)

# keep a single row per comment_id
reviews = reviews.drop_duplicates(subset="comment_id", keep="first")

# drop rows containing only "No Comments" (default value assigned by RMP to a review that didn't enter a comment)
reviews = reviews[reviews["comment"] != "No Comments"]

# drop rows containing NaN comment
reviews = reviews.dropna(subset=["comment"])

# fill null names and classes with empty string so later .lower()/.split() calls are safe
reviews = reviews.fillna({"firstName": "", "lastName": "", "prof_class": ""})

# qualityRating is the mean of the two ratings; exactly-neutral (3.0) reviews are dropped
reviews = reviews.assign(
    qualityRating=(reviews["helpfulRating"] + reviews["clarityRating"]) / 2.0
)
reviews = reviews[reviews["qualityRating"] != 3.0]

# sentiment label: True when qualityRating is above 3.0, False otherwise
reviews = reviews.assign(sentiment=reviews["qualityRating"] > 3.0)

print("Shape after dropping:", reviews.shape)
reviews = reviews.reset_index(drop=True)

Shape before dropping: (200000, 17)
Shape after dropping: (184942, 19)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Name, comment and class columns only, keeping rows where all four are non-null.
review_names = reviews[['firstName', 'lastName', 'comment', 'prof_class']].dropna()

In [None]:
#@title Emoticon Mapping
# Maps an emoticon token to a coarse sentiment tag: "emopos" or "emoneg".
# Keys are looked up by exact match on whitespace-separated tokens, so each key
# must be a literal emoticon with no spaces and no regex syntax.
# Changes from the earlier table:
#   * regex leftovers (":-[.]", ">:[(\)]", ":[(\)]", "=[(\)]") are now the literal
#     emoticons they appear to stand for (":-.", ">:\", ":\", "=\"); the "\)"
#     inside them was also an invalid string escape
#   * space-joined keys ("(T_T) (;_;)", "(^<^) (^.^)", ":O o_O") are split into
#     one key per emoticon
#   * duplicate keys (":-))", "(^^)") are listed once
EMOTICONS = {
    u"<3": "emopos",
    u":‑)": "emopos",
    u":-))": "emopos",
    u":-)))": "emopos",
    u":)": "emopos",
    u":))": "emopos",
    u":)))": "emopos",
    u":-]": "emopos",
    u":]": "emopos",
    u":-3": "emopos",
    u":3": "emopos",
    u":->": "emopos",
    u":>": "emopos",
    u"8-)": "emopos",
    u":-}": "emopos",
    u":}": "emopos",
    u":-)": "emopos",
    u":c)": "emopos",
    u":^)": "emopos",
    u"=]": "emopos",
    u"=)": "emopos",
    u":‑D": "emopos",
    u":D": "emopos",
    u"8‑D": "emopos",
    u"8D": "emopos",
    u"X‑D": "emopos",
    u"XD": "emopos",
    u"=D": "emopos",
    u"=3": "emopos",
    u"B^D": "emopos",
    u":-(": "emoneg",
    u":‑(": "emoneg",
    u":(": "emoneg",
    u":‑c": "emoneg",
    u":c": "emoneg",
    u":‑<": "emoneg",
    u":<": "emoneg",
    u":‑[": "emoneg",
    u":[": "emoneg",
    u":-||": "emoneg",
    u">:[": "emoneg",
    u":{": "emoneg",
    u">:(": "emoneg",
    u":'‑(": "emoneg",
    u":'(": "emoneg",
    u":'‑)": "emopos",
    u":')": "emopos",
    u"D‑':": "emoneg",
    u"D:<": "emoneg",
    u"D:": "emoneg",
    u"D8": "emoneg",
    u"D;": "emoneg",
    u"D=": "emoneg",
    u"DX": "emoneg",
    u";‑)": "emopos",
    u";)": "emopos",
    u"*-)": "emopos",
    u"*)": "emopos",
    u";‑]": "emopos",
    u";]": "emopos",
    u";^)": "emopos",
    u":‑,": "emopos",
    u";D": "emopos",
    u":‑P": "emopos",
    u":P": "emopos",
    u"X‑P": "emopos",
    u"XP": "emopos",
    u":‑Þ": "emopos",
    u":Þ": "emopos",
    u"=p": "emopos",
    u":‑/": "emoneg",
    u":/": "emoneg",
    u":-.": "emoneg",
    u">:\\": "emoneg",
    u">:/": "emoneg",
    u":\\": "emoneg",
    u"=/": "emoneg",
    u"=\\": "emoneg",
    u":L": "emoneg",
    u"=L": "emoneg",
    u":‑|": "emoneg",
    u":|": "emoneg",
    u"O:‑)": "emopos",
    u"O:)": "emopos",
    u"0:‑3": "emopos",
    u"0:3": "emopos",
    u"0:‑)": "emopos",
    u"0:)": "emopos",
    u":‑b": "emopos",
    u"(>_<)": "emoneg",
    u"(>_<)>": "emoneg",
    u"^_^": "emopos",
    u"(^_^)/": "emopos",
    u"(^O^)／": "emopos",
    u"(^o^)／": "emopos",
    u"('_')": "emoneg",
    u"(/_;)": "emoneg",
    u"(T_T)": "emoneg",
    u"(;_;)": "emoneg",
    u"(;_;": "emoneg",
    u"(;_:)": "emoneg",
    u"(;O;)": "emoneg",
    u"(:_;)": "emoneg",
    u"(ToT)": "emoneg",
    u";_;": "emoneg",
    u";-;": "emoneg",
    u";n;": "emoneg",
    u"Q.Q": "emoneg",
    u"T.T": "emoneg",
    u"Q_Q": "emoneg",
    u"(-.-)": "emopos",
    u"(-_-)": "emopos",
    u"(；一_一)": "emopos",
    u"(=_=)": "emoneg",
    u"^m^": "emopos",
    u">^_^<": "emopos",
    u"<^!^>": "emopos",
    u"^/^": "emopos",
    u"（*^_^*）": "emopos",
    u"(^<^)": "emopos",
    u"(^^)": "emopos",
    u"(^.^)": "emopos",
    u"(^_^.)": "emopos",
    u"(^_^)": "emopos",
    u"(^J^)": "emopos",
    u"(*^.^*)": "emopos",
    u"(^—^）": "emopos",
    u"(#^.^#)": "emopos",
    u"(*^0^*)": "emopos",
    u"(*^^)v": "emopos",
    u"(^_^)v": "emopos",
    u'(-"-)': "emoneg",
    u"(ーー;)": "emoneg",
    u"(＾ｖ＾)": "emopos",
    u"(＾ｕ＾)": "emopos",
    u"(^)o(^)": "emopos",
    u"(^O^)": "emopos",
    u"(^o^)": "emopos",
    u")^o^(": "emopos",
    u":O": "emoneg",
    u"o_O": "emoneg",
    u"o_0": "emoneg",
    u"o.O": "emoneg",
    u"(o.o)": "emoneg",
    u"(*￣m￣)": "emoneg",
}

# Normalise every tag to a single lowercase token (commas and spaces become underscores).
# The tags above already have this form; this only guards labels added later.
EMOTICONS = {
    emote: val.lower().replace(',', ' ').replace(' ', '_')
    for emote, val in EMOTICONS.items()
}

In [None]:
#@title Contraction Mapping
# Maps a lower-cased contraction (with or without its apostrophe) to its expansion.
# Lookups are exact-match on a single token.
# Each key is listed once (the repeated "arent" and "what's" entries are gone), and
# "can't" now expands to "can not" to agree with "cant", so both spellings produce
# the same tokens.
contraction_mapping = {
    "dont": "do not",
    "doesnt": "does not",
    "arent": "are not",
    "cant": "can not",
    "couldve": "could have",
    "couldnt": "could not",
    "didnt": "did not",
    "aint": "is not",
    "hes": "he is",
    "shes": "she is",
    "havent": "have not",
    "hasnt": "has not",
    "youll": "you will",
    "ive": "i have",
    "youve": "you have",
    "shouldve": "should have",
    "im": "i am",
    "isnt": "is not",
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "shouldnt": "should not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "this's": "this is",
    "this'll": "this will",
    "thisll": "this will",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "thats": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "theres": "there is",
    "here's": "here is",
    "heres": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "wasnt": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weve": "we have",
    "werent": "were not",
    "weren't": "were not",
    "what'll": "what will",
    "whatll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "whatve": "what have",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "wheres": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "whys": "why is",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "willve": "will have",
    "won't": "will not",
    "wont": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldve": "would have",
    "wouldn't": "would not",
    "wouldnt": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Number of distinct contractions covered (shown as the cell output).
len(contraction_mapping)

147

In [None]:
#@title Vocab Mapping
# Expands classroom abbreviations and chat slang into full words.
# 'faq'/'faqs' previously expanded to "frequently answered question(s)";
# the standard expansion is "frequently asked question(s)".
# NOTE(review): preprocess_pipeline strips digits before it consults this table,
# so the 'b4' and 'h8' entries look unreachable from that code path; confirm.
vocab_mapping = {
    'ta': 'teaching assistant',
    'biz': 'business',
    'hw': 'homework',
    'hws': 'homeworks',
    'faq': 'frequently asked question',
    'faqs': 'frequently asked questions',
    'mcq': 'multiple choice question',
    'mcqs': 'multiple choice questions',
    'frq': 'free response question',
    'frqs': 'free response questions',
    'ppt': 'powerpoint',
    'ppts': 'powerpoints',
    'ques': 'question',
    'bs': 'bullshit',
    'bsing': 'bullshitting',
    'bsed': 'bullshitted',
    'lol': 'laugh out loud',
    'btw': 'by the way',
    'imo': 'in my opinion',
    'imho': 'in my honest opinion',
    'tbh': 'to be honest',
    'asap': 'as soon as possible',
    'idc': 'i do not care',
    'omg': 'oh my god',
    'ppl': 'people',
    'rip': 'rest in peace',
    'srsly': 'seriously',
    'thx': 'thanks',
    'txt': 'text',
    'ur': 'your',
    'tho': 'though',
    'wtf': 'what the fuck',
    'wth': 'what the heck',
    'bc': 'because',
    'b4': 'before',
    'h8': 'hate',
    'jk': 'just kidding',
    'cuz': 'because',
}

## Stopwords

In [None]:
# Short generic stop list (not referenced elsewhere in the visible notebook).
general_stopwords = {"mr", "ms", "dr", "s", "t", "i", "me", "myself", "is", "the"}

# Titles, pronouns and leftover fragments ("s", "t").
gen_stops = {
    "mr", "ms", "dr", "doctor", "s", "t", "i", "me", "myself", "is",
    "she", "he", "we", "him", "her", "it",
}
# Words that describe the teaching setting itself.
domain_stops = {
    "book", "books", "college", "colleges", "lecture", "lectures",
    "university", "universities", "lab", "labs", "hw", "hws", "quiz", "quizzes",
    "prof", "professor", "teacher", "class", "classes", "course", "courses",
}
# Combined stop list: generic plus domain-specific.
stopwords = gen_stops | domain_stops

## Helper Functions

In [None]:
def remove_urls(text):
    """Return ``text`` with each http(s):// link or www.-prefixed address replaced by one space."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(' ', text)

def remove_phones(text):
    """Return ``text`` with every ddd-ddd-dddd phone number replaced by one space."""
    phone_pattern = re.compile(r'\d{3}-\d{3}-\d{4}')
    return phone_pattern.sub(' ', text)

def remove_emails(text):
    """Return ``text`` with every e-mail-shaped span (local@domain.tld) replaced by one space."""
    email_pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
    return email_pattern.sub(' ', text)

def remove_html_entities(text):
  """Strip HTML entities from ``text``.

  Complete entities (``&name;`` / ``&#nn;``) become a single space. Any
  remaining ``&#63`` without its closing semicolon is then deleted outright.
  """
  without_entities = re.sub('&[0-9a-zA-Z#]+;', ' ', text)
  return re.sub('&#63;?', '', without_entities)

def remove_html_tags(text):
  """Replace short tag-like spans ('<', one to six characters, '>') in ``text`` with a space."""
  tag_pattern = re.compile('<.{1,6}?>')
  return tag_pattern.sub(' ', text)

def convert_emoticons(text):
  """Return the EMOTICONS tag for ``text`` when it is a known emoticon, else ``text`` unchanged."""
  if text in EMOTICONS:
    return EMOTICONS[text]
  return text

def expand_contraction(text): # Before expanding contraction, might want to clean of symbols that are not '
  """Return the expansion of ``text`` from contraction_mapping, or ``text`` itself when it has no entry."""
  if text in contraction_mapping:
    return contraction_mapping[text]
  return text

# Preprocessing Pipeline

In [None]:
from collections import defaultdict

# Global word-frequency table. Every call to preprocess_pipeline ADDS to it,
# so reset it (all_words = defaultdict(int)) before a fresh count.
all_words = defaultdict(int)

# Leading/trailing characters that are neither a lowercase letter nor an apostrophe.
_EDGE_PUNCT = re.compile(r"^[^a-z']+|[^a-z']+$")


def _clean_token(token):
  """Normalise one whitespace-separated token and return the resulting list of words."""
  token = convert_emoticons(token)
  # Lower-case, and fold the typographic apostrophe into ASCII so "don’t" is found.
  token = token.lower().replace('\u2019', "'")

  expanded = expand_contraction(token)
  if expanded == token:
    # No exact hit: retry without punctuation stuck to the edges ("don't," / "(i'm").
    # Otherwise the apostrophe is blanked below and fragments such as
    # "don t" / "i m" end up in the vocabulary.
    trimmed = _EDGE_PUNCT.sub('', token)
    candidate = expand_contraction(trimmed)
    if candidate != trimmed:
      expanded = candidate
  token = expanded

  token = re.sub(r"[^a-z\s]+", ' ', token)      # anything that is not a lowercase letter or whitespace becomes a space
  token = re.sub(r'(.)\1\1+', r'\g<1>', token)  # a run of three or more identical characters collapses to one
  token = re.sub(r'\s+', ' ', token)
  return token.strip().split(' ')               # may be [''] when nothing is left


def preprocess_pipeline(df, func=None):
  """Clean every review comment in ``df`` and return the cleaned strings.

  Per comment: URLs, phone numbers, e-mails, HTML entities and short HTML tags
  are blanked. Each space-separated token then has emoticons tagged, is
  lower-cased, has contractions expanded and non-letters removed. Finally words
  are either dropped (when ``func`` says so) or expanded through ``vocab_mapping``.

  Args:
    df: DataFrame with string columns 'comment', 'firstName' and 'lastName'.
    func: optional predicate ``func(word, names)``; a truthy result drops the
      word. ``names`` is the set of lower-cased, space-split first/last name
      parts of the reviewed professor.

  Returns:
    list[str]: one cleaned comment per row, in row order.

  Side effects:
    Increments the global ``all_words`` counter for every word kept, and
    prints the running row count every 25,000 rows.
  """
  comments_proper = []
  rows = zip(df['comment'], df['firstName'], df['lastName'])
  for cnt, (comment, first_name, last_name) in enumerate(rows, start=1):
    names = set(first_name.lower().split(' ') + last_name.lower().split(' '))

    if cnt % 25000 == 0:
      print(cnt)  # progress indicator

    for scrub in (remove_urls, remove_phones, remove_emails,
                  remove_html_entities, remove_html_tags):
      comment = scrub(comment)

    words = []
    for raw_token in comment.split(' '):
      words.extend(_clean_token(raw_token))

    kept = []
    for word in words:
      if func and func(word, names):
        continue  # filtered out (e.g. the professor's own name)
      kept.append(vocab_mapping.get(word, word))

    comment = re.sub(r'\s+', ' ', ' '.join(kept)).strip()

    # comment = ' '.join(word for word in comment.split() if len(word) > 1)

    comments_proper.append(comment)
    # split() rather than split(' '): an empty comment must not count '' as a word
    for word in comment.split():
      all_words[word] += 1
    # spellchecked_comments.append(' '.join(sym_spell.lookup(word, Verbosity.TOP, max_edit_distance=2, include_unknown=True)[0].term for word in comment.split()))
  return comments_proper

In [None]:
# First pass: clean every comment with no word filter (func=None).
# preprocess_pipeline also adds to the global all_words counter as it goes.
comments_proper = preprocess_pipeline(reviews)
# print(len(all_names)) # 49421 before
# print(len(all_words)) # 72182 words

# Keep the cleaned text next to the raw comment.
reviews['cleanedComment'] = comments_proper

25000
50000
75000
100000
125000
150000
175000


## Professor Names
Figuring out how many professor names appear in the comments and how much they contribute to the overall number of features.

In [None]:
# Every distinct lower-cased, space-split piece of a professor's first or last name.
all_names = set()
for first, last in zip(reviews['firstName'], reviews['lastName']):
  all_names.update(first.lower().split(' '))
  all_names.update(last.lower().split(' '))
print(len(all_words))
print(len(all_names))

55404
42997


In [None]:
# Reset the global counter, then re-clean while dropping any word that is part of
# the reviewed professor's own first/last name.
all_words = defaultdict(int)
reviews['cleanedComment'] = preprocess_pipeline(reviews, lambda x, nms: x in nms)

25000
50000
75000
100000
125000
150000
175000


In [None]:
# Vocabulary size, number of name tokens, and how many words are in both.
all_words_set = set(all_words)
print(len(all_words_set))
print(len(all_names))
print(len(all_words_set & all_names)) # Cut a lot of features

43822
42997
4235


In [None]:
# Eyeball cleaned comments for index labels 100..200, separated by a blank line.
for cleaned in reviews.loc[100:200, 'cleanedComment']:
  print(cleaned, end='\n\n')

absolutely fantastic life changing

bad teacher very harsh grader details are all or nothing

professor josh is awesome he is very friendly and has excellent dancing skills he makes things very clear and is very helpful outside of class take your math classes with him

he is good teacher he like teaching and wants to teach he lets you revise your papers

she is nice but not good teacher she is not help me at all very lazy the class was a waste of money and time do not take her class

amazing teacher

she was by far my favorite teacher that i have ever had she was so helpful i had her for psych and she made it so interesting i would recomend her to anyone

easy class show up and do the work all the test questions come from the mcgraw hill homework online if you do all the homework you get for homework grade just do the work

is a wonderful teacher with a lot of knowledge of criminology i found her classes to be very interesting come to class pay attention and read the textbook and you w

## Professor Classes

In [None]:
# regex: ([a-zA-Z]+\d*)
# Count the alphabetic codes found in prof_class (e.g. "math" from "MATH101") and
# how often such a code also occurs as a WORD in the cleaned comment.
# The check is against the comment's word set: a substring test on the whole
# comment also counted e.g. "art" inside "smart".
classes_counter = Counter()
classes_appear_set = set()
cnt = 0
for prof_class, cleaned in zip(reviews['prof_class'], reviews['cleanedComment']):
  classes = re.findall('[a-zA-Z]{2,}', prof_class.lower())
  classes_counter.update(classes)
  comment_words = set(cleaned.split())
  for class_ in classes:
    if class_ in comment_words:
      classes_appear_set.add(class_)
      cnt += 1

print(cnt)
print(len(classes_counter))
print(classes_counter)
# classes_appear_set

# general classes to remove:

33539
13271
Counter({'math': 10799, 'eng': 10257, 'bio': 6276, 'engl': 4820, 'hist': 4462, 'chem': 3819, 'psy': 3700, 'soc': 3132, 'mat': 2873, 'econ': 2595, 'psych': 2244, 'biol': 2161, 'art': 2134, 'psyc': 2096, 'span': 1962, 'comm': 1884, 'acct': 1862, 'phil': 1811, 'bus': 1702, 'enc': 1428, 'mth': 1311, 'phys': 1275, 'com': 1257, 'his': 1049, 'anth': 1024, 'acc': 944, 'bsc': 938, 'chm': 920, 'eco': 831, 'nurs': 811, 'nur': 782, 'pols': 780, 'mus': 751, 'spch': 748, 'mgmt': 746, 'hum': 738, 'fin': 717, 'geog': 693, 'soci': 686, 'mac': 678, 'phi': 668, 'geo': 636, 'phy': 631, 'cis': 613, 'cs': 608, 'stat': 602, 'mgt': 601, 'rel': 600, 'educ': 585, 'govt': 523, 'ma': 515, 'spa': 487, 'english': 480, 'en': 473, 'pol': 454, 'edu': 445, 'mkt': 433, 'music': 429, 'comp': 429, 'ps': 415, 'spc': 412, 'geol': 408, 'ba': 408, 'esl': 398, 'ant': 392, 'che': 385, 'mktg': 380, 'cj': 377, 'fren': 372, 'calc': 321, 'arth': 312, 'poli': 312, 'ch': 306, 'wr': 305, 'sls': 302, 'law': 301, 'lit': 299,

In [None]:
# NOTE: this is not the last expression in the cell, so the notebook discards the
# result; run it as the final line of a cell to see the 1000 most frequent codes.
classes_counter.most_common(1000)

# Abbreviated subject codes as they appear in course identifiers.
course_abbreviated_stopwords = {
    'eng', 'bio', 'engl', 'hist', 'chem', 'psy', 'soc', 'econ', 'psych', 'span', 'comm', 'acct',
    'phil', 'bus', 'enc', 'phys', 'anth', 'acc', 'eco', 'nurs', 'pols', 'spch', 'mgmt', 'geog', 'soci', 'geo',
    'phy', 'stat', 'mgt', 'govt', 'gov', 'spa', 'en', 'pol', 'edu', 'gen', 'mkt', 'music', 'alg', 'calc', 'fren',
    'poli', 'law', 'crim', 'engr', 'art', 'busn',
}

# Generic course-title filler words ('theories' was previously misspelled 'thoeries').
course_fluff_stopwords = {
    'intro', 'introduction', 'introductory', 'basic', 'concept', 'concepts', 'studies', 'discovery', 'background', 'knowledge',
    'materials', 'intermediate', 'advanced', 'managing', 'management', 'theory', 'theories', 'design', 'analysis',
    'science', 'sciences', 'overview', 'laboratory', 'principles', 'practices', 'contemporary', 'modern',
    'civilization',
}

# Full subject names.
# NOTE(review): 'political science' is two words, so it can never equal a single
# token; confirm how this set is matched before relying on that entry.
course_full_stopwords = {
    'math', 'english', 'biology', 'history', 'chemistry', 'psychology', 'sociology', 'spanish', 'communication', 'accounting',
    'philosophy', 'business', 'physics', 'anthropology', 'ecology', 'economics', 'political science', 'management', 'geology',
    'statistics', 'government', 'marketing', 'music', 'algebra', 'calculus', 'french', 'german', 'law', 'criminology', 'engineering',
    'health', 'art', 'theater', 'astrology', 'science', 'sciences', 'writing', 'finance', 'japanese', 'arab', 'religion', 'ethnic',
    'speech', 'culture', 'literature'
}

# Subject full names
# math, english, biology, history, chemistry, psychology, sociology, spanish, communication, accounting, philosophy,
# business, physics, anthropology, ecology, economics, political science, management, geology, statistics,
# government, marketing, music, algebra, calculus, french, german, law, criminology, engineering, health, art, speech,
# theater, astrology, science, sciences, writing, finance, japanese, arab, religion, ethnic, speech

# Fluff??:
# intro, introduction, introductory, basic, concepts, studies, discovery, background, knowledge, materials, intermediate, advanced
# managing, theory, theories, design, analysis, science, sciences, 


In [None]:
# Regex version:
# math*, engl*, bio*, hist*, chem*, psy*, soc*, span*, comm*, acct*, phil*,
# phys*, anth*, eco*, pols*, geo*, stat*, gov*, alg*, cal* or calc*, fren*, 
# germ*, crim*, astr*, sci*, writ*, jap*, reli*, ethn*

# Subject-name stems; each becomes a pattern for "stem followed by at least one more non-space character".
course_prefixes = [
    'math', 'engl', 'bio', 'hist', 'chem', 'psy', 'soc', 'span', 'acct', 'phil',
    'phys', 'anth', 'eco', 'pols', 'geo', 'gov', 'alg', 'calc', 'fren', 'germ',
    'crim', 'astr', 'sci', 'jap', 'relig', 'ethn', 'russ', 'eur', 'asia', 'afri',
    'christ', 'isl', 'ital', 'arch', 'chin',
]

regex_course = [rf'\b{prefix}\S+' for prefix in course_prefixes]
# Two suffix patterns: words ending in "logy" or "phy".
regex_course += [r'\S+logy\b', r'\S+phy\b']

In [None]:
# Count what one pattern (third from the end of regex_course) matches across all cleaned comments.
course = re.compile(regex_course[-3])
matching_course_counter = Counter()
for cleaned in reviews['cleanedComment']:
  matching_course_counter.update(course.findall(cleaned.lower()))
print(matching_course_counter)

Counter({'chinese': 197, 'china': 63, 'chinatown': 3, 'chinchillas': 3, 'chinglish': 1})


In [None]:
# Remove from each cleaned comment the words that equal an alphabetic code of the
# review's own prof_class (e.g. "phi" for "PHI2630"), then recount the vocabulary.
# The cleaned strings are collected and assigned to the column in one step:
# iterrows() yields copies, so writing to review['cleanedComment'] inside the
# loop never reached the DataFrame.
print(f"Feature Count Before: {len(all_words)}")
all_words = defaultdict(int)
comments_without_classes = []
for comment, prof_class in zip(reviews['cleanedComment'], reviews['prof_class']):
  classes = set(re.findall('[a-zA-Z]{2,}', prof_class.lower()))
  kept_words = []
  # split() (not split(' ')) so an empty comment does not add '' to the vocabulary
  for word in comment.split():
    if word in classes:
      continue
    kept_words.append(word)
    all_words[word] += 1
  comments_without_classes.append(' '.join(kept_words))
reviews['cleanedComment'] = comments_without_classes

print(f'Feature Count After: {len(all_words)}')

Feature Count Before: 43822
Feature Count After: 43251


In [None]:
# Print raw comment, class code and cleaned comment for a fixed 1000-row sample.
inspection_sample = reviews.sample(1000, random_state=5)
for raw, prof_class, cleaned in zip(inspection_sample['comment'],
                                    inspection_sample['prof_class'],
                                    inspection_sample['cleanedComment']):
  print(raw)
  print(prof_class, cleaned)
  print()

I'm currently in her ethics class at UNF, and I LOVE that class. She is hilarious! Most of the class time is spent laughing (who wouldn't like a class like that&#63;). There are some assignments, but nothing weekly, or difficult. It's ethics, if you have common sense and an open-mind (with your own EDUCATED opinions!) you will do great in this class.
PHI2630 i am currently in her ethics class at unf and i love that class she is hilarious most of the class time is spent laughing who would not like a class like that there are some assignments but nothing weekly or difficult it is ethics if you have common sense and an open mind with your own educated opinions you will do great in this class

I never expected to encounter an English professor who struggled so badly to write with clarity.  (ie- syllabus,instructions for assignments, etc) I honestly think this results from lack of effort put forth from Ms. Holbrook.  Throughout the semester my recurring thought was, &quot;OMG, I acually am 

## Abbreviation and Stuff???

In [None]:
cleaned_comments = reviews['cleanedComment']

# Single letters plus common short words that are not worth inspecting.
alphabet = set("abcdefghijklmnopqrstuvwxyz")
words_avoid = {
    "all", "up", "is", "so", "to", "be", "for", "on", "he", "his", "the", "had", "of", "and", "are",
    "dr", "she", "her", "oh", "try", "you", "she", "now", "not", "my", "but", "an", "wow", "do", "if", "fun",
    "was", "in", "at", "one", "can", "how", "in", "was", "lot", "get", "it", "put", "any", "got", "our", "him",
    "guy", "am", "law", "end", "we", "us", "use", "ask", "rid", "max", "joy", "no", "did", "sad", "day",
    "fan", "few", "off", "era", "top", "big", "red", "hot", "age", "pie", "age", "by", "mad", "gut", "fed",
    "two", "see", "air", "hug", "bag", "cut", "has", "or", "buy", "sit", "way", "lay", "say", "ok", "mrs",
    "shy", "god", "pay", "kid", "key", "too", "why", "box", "aid", "ton", "art", "met", "dry", "let", "bet",
    "far", "sum", "bad", "yet", "yes", "ass", "set", "boy", "map", "low", "old", "who", "lab", "bam",
    "pop", "sob", "dog", "run", "hat", "lie", "ego", "own", "its", "go", "add", "pet", "job", "due", "may", "nap", "ink",
    "new", "bit", "as", "out", "me", "mr", "man", "ms", "fix",
} | alphabet

# Words of at most three characters, outside words_avoid, seen in a fixed 1000-review sample.
words_all_3 = set()
for cleaned in reviews.sample(1000, random_state=1)['cleanedComment']:
  short_words = {
      word for word in cleaned.split(' ')
      if len(word) <= 3 and word not in words_avoid
  }
  words_all_3 |= short_words

print(words_all_3)
print(len(words_all_3))

{'dud', 'nor', 'per', 'de', 'gpc', 'ekg', 'ten', 'rd', 'cpa', 'alg', 'gad', 'uc', 'cal', 'bus', 'mt', 'cc', 'sky', 'amt', 'raj', 'eet', 'eg', 'el', 'dvc', 'ugh', 'isn', 'chm', 'uva', 'pro', 'ou', 'th', 'cup', 'cid', 'lcc', 'nd', 've', 'bio', 'nku', 'pol', 'beg', 'ges', 'mba', 'uca', 'dnt', 'don', 'wpm', 'jjc', 'def', 'phd', 'yt', 'yhc', 'qs', 'six', 'esp', 'cpr', 're', 'unr', 'um', 'fav', 'gt', 'tea', 'etc', 'pts', 'cs', 'fly', 'sqr', 'net', 'll', 'ob', 'csi', 'psc', 'pa', 'fr', 'ii', 'pre', 'snd', 'ace', 'ut', 'ap', 'tri', 'men', 'ad', 'pt', 'gr', 'la', 'sba', 'arp', 'af', 'pgs', 'min', 'byu', 'sem', 'mid', 'mud', 'tim', 'uw', 'gpa', 'hah', 'pic', 'hs', 'ch', 'utc', 'huh', 'dad', 'asl', 'fi', 'sly', 'ah', 'wcd', 'via', 'da', 'nay', 'sge', 'tv', 'ill', 'fu', 'tfs', 'mwf', 'cat', 'axe', 'ny', 'smu', 'gem', 'bb', 'eng', 'bcc', 'med', 'avg', 'hag', 'cls', 'yea', 'nit', 'tax', 'jim', 'lit', 'nu', 'psy', 'osu', 'nyu', 'adn', 'int', 'ya', 'aap', 'req', 'sls', 'jmu', 'sci', 'hrs', 'si', 'bu',

In [None]:
# For index labels 500..1000: print each comment containing one of the short words
# collected above, and count in how many comments each such word appears.
counter_3_sample = Counter()
for comment in reviews.loc[500:1000, 'cleanedComment']:
  short_hits = [word for word in set(comment.split(' ')) if word in words_all_3]
  counter_3_sample.update(short_hits)
  if short_hits:
    print(comment)
    print()

print(counter_3_sample)

# To Remove:
'''
bio, etc, eng, mails
'''
# Abbreviations / Shortened
'''
etc, def, ed, via, pts, pg, tas(teaching assistants?), phd, req, bro, abt, org, gen, ed(unsure??), lil, fav
'''

this class has a horrifying amount of work you get tested twice per topic before class and during class every class period tuesday and thursday while she means well and knows her stuff she expects way too much for a gen ed requirement only take this class if love groupwork every week and if you are willing to have it take over your life

definately on the easy end of cal classes lets you make a cheat sheet for every test and you can cram anything you want into one side of a full sheet of paper you can use every previous one plus and extra one on the final if you do any homework you can do her tests

i liked her i know some students struggled but i am not sure why the tests were of average difficulty if you study and use apr frequently then you will pass i never went in for office hours and i do not think they were essential she makes class non stressful and interesting she only holds class on friday as needed which is appreciated

probably the single worst teacher i have ever had she i

'\netc, def, ed, via, pts, pg, tas(teaching assistants?), phd, req, bro, abt, org, gen, ed(unsure??), lil, fav\n'

In [None]:
# !pip install wordsegment

In [None]:
# Might need to spellcheck comments before segmenting them

# from wordsegment import load, segment
# load()
# for comment in comments_proper[:200]:
#   new_comment = segment(comment)
#   if comment.split(' ') != new_comment:
#     print(comment.split(' '))
#     print(new_comment)
#     print('\n')

# Pycontractions
Running pycontractions takes much longer than a simple contraction mapping, so we will hold off on it for now.

In [None]:
# !python --version

In [None]:
# Because of weird java dependencies this needs to be done
# !sudo apt-get update -y

# !sudo apt install openjdk-8-jdk
# !sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
# !java -version

In [None]:
# %pip install git+https://github.com/MCFreddie777/language-check.git
# %pip install pycontractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/MCFreddie777/language-check.git
  Cloning https://github.com/MCFreddie777/language-check.git to /tmp/pip-req-build-saqqgiwj
  Running command git clone -q https://github.com/MCFreddie777/language-check.git /tmp/pip-req-build-saqqgiwj
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# from pycontractions import Contractions
# help(Contractions)

Help on class Contractions in module pycontractions.contractions:

class Contractions(builtins.object)
 |  Contractions(w2v_path=None, lang_code='en-US', kv_model=None, api_key=None)
 |  
 |  Expand and contract common English contractions in text.
 |  
 |  Uses a combination of pattern replacement, grammar checking, and Word Mover's Distance.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, w2v_path=None, lang_code='en-US', kv_model=None, api_key=None)
 |      w2v_path is a path to an embedding model used for calculating the Word Mover's Distance.
 |  
 |  contract_texts(self, texts)
 |      Return a generator over an iterable of text where each result has contracted common expansions.
 |  
 |  expand_texts(self, texts, precise=False, scores=False)
 |      Return a generator over an iterable of text where each result has common contractions expanded.
 |      
 |      If precise == True then it will use a much slower method that does not assume all occurrences
 |      of the same

In [None]:
# import gensim.downloader as api
# model = api.load("glove-twitter-100")

In [None]:
# cont = Contractions(kv_model=model)
# cont.load_models() # prevents loading the model on the first expand_texts

In [None]:
# Non-precise
# print(list(cont.expand_texts(["you ain't anything special, she's much faster and has Jamie's dog"])))
# list(cont.expand_texts(["I'd like to know how I'd done that!",
#                             "We're going to the zoo and I don't think I'll be home for dinner.",
#                             "Theyre going to the zoo and she'll be home for dinner."]))

["you have not anything special, she has much faster and has Jamie's dog"]


['I had like to know how I had done that!',
 'we are going to the zoo and I do not think I will be home for dinner.',
 'they are going to the zoo and she will be home for dinner.']

In [None]:
# Precise
# print(list(cont.expand_texts(["you ain't anything special, she's much faster and has Jamie's dog"], precise=True)))
# list(cont.expand_texts(["I'd like to know how I'd done that!",
#                             "We're going to the zoo and I don't think I'll be home for dinner.",
#                             "Theyre going to the zoo and she'll be home for dinner."], precise=True))

["you have not anything special, she has much faster and has Jamie's dog"]


['I would like to know how I had done that!',
 'we are going to the zoo and I do not think I will be home for dinner.',
 'they are going to the zoo and she will be home for dinner.']

In [None]:
# list(cont.expand_texts(["dont you see youre a tool! testing to see if this'll catch contractions that have no apostrophe and have mispellings", "he'll not like what he sees. hell, I don't know what I'm seeing. you think she'll like it?"], precise=True))

["do not you see you are a tool! testing to see if this'll catch contractions that have no apostrophe and have mispellings",
 'he will not like what he sees. hell, I do not know what I am seeing. you think she will like it?']

In [None]:
# One sample
# list(cont.expand_texts(["this'll be a decent test."]))

["this'll be a decent test."]

In [None]:
# def preprocess_pipeline2(df, print_index=25000):
#   cnt = 0 # to keep track of progress
#   comments_proper = []
#   for index, review in df.iterrows():
#     comment = review['comment']
#     fname = review['firstName'].lower().split(' ')
#     lname = review['lastName'].lower().split(' ')
#     names = set(fname + lname)
#     all_names.update(names)

#     cnt += 1
#     if cnt % print_index == 0:
#       print(cnt)

#     comment = remove_urls(comment)
#     comment = remove_phones(comment)
#     comment = remove_emails(comment)
#     comment = remove_html_entities(comment)
#     comment = remove_html_tags(comment)

#     comment = list(cont.expand_texts([comment], precise=True))[0]
#     comment_split = comment.split(' ')
#     new_comment_split = []
#     for i, word in enumerate(comment_split):
#       word = convert_emoticons(word)
#       word = word.lower()
#       # word = expand_contraction(word)
#       word = re.sub("[^a-z\s]+", ' ', word)   # replace each run of characters other than a-z or whitespace with a space
#       word = re.sub(r'(.)\1\1+', '\g<1>', word)  # replace any three character+ sequence with one
#       word = re.sub('\s+', ' ', word)
#       word = word.strip() # trailing whitespace because punctuation replaced by space
#       new_comment_split.extend(word.split(' '))

#     # for i, word in enumerate(new_comment_split):
#     #   if word in names or word in stopwords:
#     #     new_comment_split[i] = ''
#     #   else:
#     #     new_comment_split[i] = vocab_mapping.get(word, word)

#     for i, word in enumerate(new_comment_split):
#       new_comment_split[i] = vocab_mapping.get(word, word)

#     comment = ' '.join(new_comment_split)
#     comment = re.sub('\s+', ' ', comment)
#     comment = comment.strip()

#     # comment = ' '.join(word for word in comment.split() if len(word) > 1)

#     comments_proper.append(comment)
#     for word in comment.split(' '):
#       all_words[word] += 1
#     # spellchecked_comments.append(' '.join(sym_spell.lookup(word, Verbosity.TOP, max_edit_distance=2, include_unknown=True)[0].term for word in comment.split()))
#   return comments_proper

# comments_proper_contractions = preprocess_pipeline2(reviews.loc[:25000], print_index=1000)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000


In [None]:
# Side-by-side check of the two preprocessing outputs for comments 500-599:
# only pairs whose cleaned text differs are shown, first the comments_proper
# version, then the comments_proper_contractions version.
# NOTE(review): in the visible part of the notebook comments_proper_contractions
# is assigned only inside the commented-out preprocess_pipeline2 cell, so that
# cell has to be re-enabled before this one can run on a fresh kernel.
for comm1, comm2 in zip(comments_proper[500:600], comments_proper_contractions[500:600]):
  if comm1 == comm2:
    continue  # identical output from both pipelines, nothing to inspect
  # One call emits the same text as three separate prints: both comments,
  # then two blank lines separating this pair from the next.
  print(comm1, comm2, '\n', sep='\n')

definately on the easy end of cal classes lets you make a cheat sheet for every test and you can cram anything you want into one side of a full sheet of paper you can use every previous one plus and extra one on the final if you do any homework you can do her tests
definately on the easy end of cal classes let us you make a cheat sheet for every test and you can cram anything you want into one side of a full sheet of paper you can use every previous one plus and extra one on the final if you do any homework you can do her tests


the sweets old man ever he lets you resubmit assignments you get low grades on if you attend class and pay attention you should have no problem passing his quizzes he gives alot of them but you have to be willing to put in work it is not a easy a
the sweets old man ever he let us you resubmit assignments you get low grades on if you attend class and pay attention you should have no problem passing his quizzes he gives alot of them but you have to be willing to

# Something

# Spelling Corrections

In [None]:
!sudo apt-get install swig

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 20 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]
Fetched 1,100 kB in 0s (3,433 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/Fr

In [None]:
!pip install jamspell

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jamspell
  Using cached jamspell-0.0.12.tar.gz (174 kB)
Building wheels for collected packages: jamspell
  Building wheel for jamspell (setup.py) ... [?25l[?25hdone
  Created wheel for jamspell: filename=jamspell-0.0.12-cp37-cp37m-linux_x86_64.whl size=1347600 sha256=da3079b9c35d7c626a75e3362d9178a5f24ceb8500cc17387bd549d9d6488e27
  Stored in directory: /root/.cache/pip/wheels/68/df/9c/9b335e69aa0f28e7f508ec0ebefadcc703f168beb52ae7ebe7
Successfully built jamspell
Installing collected packages: jamspell
Successfully installed jamspell-0.0.12


In [None]:
import jamspell

# The language model file en.bin must be present in the working directory.
# NOTE(review): the original note here suggests it has to be obtained separately
# by downloading and extracting a tar archive -- confirm the exact steps.
corrector = jamspell.TSpellCorrector()

# LoadLangModel signals failure by returning False rather than raising (the last
# recorded run of this cell returned False). Stop here in that case so later
# cells do not silently run the corrector without a model.
if not corrector.LoadLangModel('en.bin'):
  raise RuntimeError(
      "jamspell could not load the language model 'en.bin'; "
      "make sure the model file has been downloaded and extracted into the working directory."
  )

False