# Preprocessing Pipeline

plus, bonus Naive Bayes Model at the end

In [1]:
import pandas as pd
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from datetime import datetime
dateparse = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S %z %Z")

In [2]:
reviews = pd.read_csv("Ethan's EDA/scraped_comments.csv", parse_dates=['date'], date_parser=dateparse).sample(n=50000, random_state=1)

In [3]:
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

# TODO: move dropped rows into another CSV file and 

reviews.dropna(subset=["comment"], inplace=True)
# drop rows containing only "No Comments" (default value assigned by RMP to a review that didn't enter a comment)
reviews = reviews[reviews["comment"] != "No Comments"]
# replace all comments with less than 5 words with a NaN
reviews["comment"] = reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
# drop rows containing NaN comment
reviews.dropna(subset=["comment"], inplace=True)

reviews.reset_index(drop=True, inplace=True)

In [4]:
comments_proper = []

for i in range(reviews.shape[0]):
    review = reviews["comment"][i]
    review = re.sub('&([a-zA-z]+|#\d+);', "", review)           # remove HTML codes
    review = re.sub('&#63;?', '', review)                       # HTML code for question mark evades erasure on occasion, handle here
    review = re.sub(r'\s*https?://\S+(\s+|$)', ' ', review)                                     # remove links
    review = re.sub("^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$", ' ', review)         # remove phone numbers
    review = re.sub("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", review)              # remove email addresses
    
    review = re.sub('\d{4,}', ' ', review)                      # keep numbers between 1 and 3 digits
    review = re.sub('[^a-zA-Z\s\d]+', ' ', review)              # remove non-alphabetic characters, spaces, and any remaining digits

    review = re.sub(r'(.)\1\1+', '\g<1>', review)               # replace any three characters in a row with one
    
    review = re.sub('\s+', ' ', review)
    review = review.lower()                                     # lowercase review for uniformity
    comments_proper.append(review)

In [5]:
# amend list of stop words to keep whatever it is we want by removing words from list that we want to keep

# TODO: is the list of stopwords on git complete and accurate or does someone want to read through all 325 stopwords spacy gives and determine which ones to keep?
stopwords = STOP_WORDS
stopwords.remove("but")
stopwords.remove("not")
stopwords.remove("nor")
stopwords.remove("never")

In [6]:
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

def lemmatize_pipe(doc):
    lemma_list = [str(tok.lemma_).lower() for tok in doc if tok.is_alpha and tok.text.lower() not in stopwords]
    return lemma_list

def preprocess_pipe(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

no_stopwords = preprocess_pipe(reviews["comment"])

In [7]:
reviews = reviews.loc[:, ["firstName", "lastName", "comment", "clarityRating"]]
reviews["cleanedComment"] = pd.Series([" ".join(comm) for comm in no_stopwords])
reviews.head()

Unnamed: 0,firstName,lastName,comment,clarityRating,cleanedComment
0,Keith,Halladay,Keith is the best. Not afraid to give his hone...,5,keith good afraid honest opinion grading paper...
1,Charles,Lankau,Lankau was great for 4000 and 5110. He makes c...,4,lankau great make class fun hilarious prepared...
2,James,McGivern,I hated every second of my life that I had to ...,2,hate second life sit class intelligent man but...
3,Howard G.,Tucker,Crazy Man. No grading system. You dont know yo...,5,crazy man grading system not know grade tell s...
4,Paul,Cohen,This teacher is the worst I've ever taken in m...,1,teacher bad take entire schooling career rude ...


In [56]:
reviews["sentiment"] = reviews["clarityRating"].apply(lambda x: 1 if x > 3 else 0 if x == 3 else -1)       # accuracy: 0.7518830305715551 for 50000 comments
#reviews["sentiment"] = reviews["clarityRating"].apply(lambda x: 1 if x > 2.5 else 0)                        # accuracy: 0.8635356668143553 for 50000 comments
reviews.head()

Unnamed: 0,firstName,lastName,comment,clarityRating,cleanedComment,sentiment
0,Keith,Halladay,Keith is the best. Not afraid to give his hone...,5,keith good afraid honest opinion grading paper...,1
1,Charles,Lankau,Lankau was great for 4000 and 5110. He makes c...,4,lankau great make class fun hilarious prepared...,1
2,James,McGivern,I hated every second of my life that I had to ...,2,hate second life sit class intelligent man but...,-1
3,Howard G.,Tucker,Crazy Man. No grading system. You dont know yo...,5,crazy man grading system not know grade tell s...,1
4,Paul,Cohen,This teacher is the worst I've ever taken in m...,1,teacher bad take entire schooling career rude ...,-1


##### Writing cleaned comments to a new csv file so we (hopefully) don't have to run this everytime

In [57]:
#reviews.to_csv("comments_preproc.csv", index=False)

## Naive Bayes model

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(comments_proper).toarray()

In [59]:
y = pd.get_dummies(reviews.loc[:, ["sentiment"]])     # isolate sentiments
y = y.loc[:, "sentiment"]

In [60]:
from sklearn.model_selection import train_test_split
#X_valid = X[0:20000]
#y_valid = y[0:20000]

#X = X[20000:]
#y = y[20000:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [61]:
from sklearn.naive_bayes import MultinomialNB
#from sklearn.naive_bayes import BernoulliNB
model = MultinomialNB().fit(X_train, y_train)
#model = BernoulliNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

#y_pred = model.predict(X_valid)

In [62]:
# Test accuracy of model
from sklearn.metrics import confusion_matrix, accuracy_score

conf_m = confusion_matrix(y_test, y_pred)

acc_score = accuracy_score(y_test, y_pred)

print("Accuracy Score: " + str(acc_score * 100))
print(conf_m)

Accuracy Score: 72.0425343376163
[[1628  302  412]
 [ 280  376  465]
 [ 289  776 4500]]


*Matrix read like:*

* __Top left:__ 0s correctly identifed as 0s (*true negative*)
* __Top right:__ 0s incorrectly predicted as 1s (*false positive*)
* __Bottom left:__ 1s incorrectly predicted as 0s (*false negative*)
* __Bottom right:__ 1s correctly identifed as 1s (*true positive*)

Can extend to 3x3 case, though more complicated, see https://towardsdatascience.com/understanding-the-confusion-matrix-from-scikit-learn-c51d88929c79 for a better explanation than I can muster :)

## Calculate Precision and Recall of Model

See https://en.m.wikipedia.org/wiki/Precision_and_recall

In [63]:
# Precision wrt 0
prec_0 = conf_m[0][0] / float(conf_m[1][0] + conf_m[0][0])

# Recall wrt 0
rec_0 = conf_m[0][0] / float(conf_m[0][1] + conf_m[0][0])

print("Precision={0}; Recall={1}".format(prec_0,rec_0))

Precision=0.8532494758909853; Recall=0.8435233160621761


In [64]:
# Precision wrt 1
prec_1 = conf_m[1][1] / float(conf_m[0][1] + conf_m[1][1])

# Recall wrt 1
rec_1 = conf_m[1][1] / float(conf_m[1][0] + conf_m[1][1])

print("Precision={0}; Recall={1}".format(prec_1,rec_1))

Precision=0.5545722713864307; Recall=0.573170731707317


In [65]:
# More refined way to calculate precision and recall, should've done my research first lol
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")

print("Precision={0}; Recall={1}".format(precision, recall))

Precision=0.7402160534550013; Recall=0.720425343376163
