# Attempt incorporating professor-based split

In [1]:
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_selection import chi2, SelectPercentile

In [2]:
reviews = pd.read_csv("../Data/scraped_comments_with_professor.csv").sample(n=15000, random_state=0)

### Barebones preprocessing

Same as `pausers.ipynb`

In [3]:
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

reviews.dropna(subset=["comment"], inplace=True)
reviews = reviews[reviews["comment"] != "No Comments"]

reviews["comment"] = reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
reviews.dropna(subset=["comment"], inplace=True)

reviews.reset_index(drop=True, inplace=True)

reviews.head()

Unnamed: 0,professor_id,comment_id,firstName,lastName,prof_class,comment,ratingTags,date,attendanceMandatory,grade,clarityRating,difficultyRating,helpfulRating,textbookUse,thumbsDownTotal,thumbsUpTotal,wouldTakeAgain
0,VGVhY2hlci0xNDYwNTM2,UmF0aW5nLTE3NTcyNzc2,Heather,Kruse,PSYC331,She is so incredible and is always willing to ...,,2010-09-03 12:58:29 +0000 UTC,,,5,3,5,3.0,0,0,
1,VGVhY2hlci0xMjkzMDkw,UmF0aW5nLTE2OTMxOTc3,Joseph,Lavalle,SPAN1101,Awesome Teacher. Highly Recommended. Funny guy...,,2010-03-31 02:34:03 +0000 UTC,,,5,1,5,4.0,0,0,
2,VGVhY2hlci0xMjEwNzE2,UmF0aW5nLTMwNjg1MDg2,John,Park,CMSC341,Honestly one of the best compsci professors at...,LOTS OF HOMEWORK--Amazing lectures--Caring,2018-11-08 12:20:45 +0000 UTC,non mandatory,,5,4,5,0.0,4,1,1.0
3,VGVhY2hlci0xMzQyNDg4,UmF0aW5nLTE5MTMzMjgw,Thomas,Buford,BSC110,He is an excellent teacher. He puts all of th...,,2011-10-31 15:37:23 +0000 UTC,,,4,2,4,3.0,0,0,
4,VGVhY2hlci0xMDcxMDM2,UmF0aW5nLTE2MTM3NjQz,Shawn,Kenny,ENGR4000,Fair prof. Very thorough in his coverage of th...,,2009-08-20 15:39:11 +0000 UTC,,,4,3,5,4.0,0,0,


In [4]:
comments_proper = []

for i in range(reviews.shape[0]):
    review = reviews["comment"][i]
    review = re.sub('&([a-zA-z]+|#\d+);', "", review)           # remove HTML codes
    review = re.sub('&#63;?', '', review)                       # HTML code for question mark evades erasure on occasion, handle here
    review = re.sub(r'\s*https?://\S+(\s+|$)', ' ', review)                                     # remove links
    review = re.sub("^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$", ' ', review)         # remove phone numbers
    review = re.sub("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", review)              # remove email addresses

    review = re.sub(r'(.)\1\1+', '\g<1>', review)               # replace any three characters in a row with one

    review = re.sub('[^a-zA-Z]+', ' ', review)                  # remove non-alphabetic characters

    review = re.sub('\s+', ' ', review)
    review = review.lower()                                     # lowercase review for uniformity

    comments_proper.append(review)

In [5]:
reviews = reviews.loc[:, ["professor_id", "firstName", "lastName", "comment", "clarityRating"]]
reviews["cleanedComment"] = pd.Series(comments_proper)
reviews["sentiment"] = reviews["clarityRating"].apply(lambda x: 1 if x > 2.5 else 0)

reviews.head()

Unnamed: 0,professor_id,firstName,lastName,comment,clarityRating,cleanedComment,sentiment
0,VGVhY2hlci0xNDYwNTM2,Heather,Kruse,She is so incredible and is always willing to ...,5,she is so incredible and is always willing to ...,1
1,VGVhY2hlci0xMjkzMDkw,Joseph,Lavalle,Awesome Teacher. Highly Recommended. Funny guy...,5,awesome teacher highly recommended funny guy a...,1
2,VGVhY2hlci0xMjEwNzE2,John,Park,Honestly one of the best compsci professors at...,5,honestly one of the best compsci professors at...,1
3,VGVhY2hlci0xMzQyNDg4,Thomas,Buford,He is an excellent teacher. He puts all of th...,4,he is an excellent teacher he puts all of the ...,1
4,VGVhY2hlci0xMDcxMDM2,Shawn,Kenny,Fair prof. Very thorough in his coverage of th...,4,fair prof very thorough in his coverage of the...,1


### Split comments by professor

* `prof_train` and `prof_test` are professor IDs
* `cnt_train` and `cnt_test` are respective comment counts for each professor

In [6]:
prof_counts = reviews["professor_id"].value_counts()
prof_train, prof_test, cnt_train, cnt_test = train_test_split(prof_counts.index, prof_counts.values, test_size=0.2, random_state=1)

Get corresponding comments for all professors in train and test set separately

In [7]:
comm_train = reviews[reviews["professor_id"].isin(prof_train)]["cleanedComment"]
comm_test = reviews[reviews["professor_id"].isin(prof_test)]["cleanedComment"]

Get sentiments for all professors separately

In [8]:
sent_train = reviews[reviews["professor_id"].isin(prof_train)]["sentiment"]
sent_test = reviews[reviews["professor_id"].isin(prof_test)]["sentiment"]

### Run Naive Bayes Model

In [9]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    acc_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score: " + str(acc_score * 100))
    
    f1 = f1_score(y_test, y_pred, average=mode)
    print("F1 Score: {0}".format(f1 * 100))

In [10]:
cv = CountVectorizer(ngram_range=(1,2), max_features=5000)
X = cv.fit_transform(reviews["cleanedComment"]).toarray()

In [11]:
X_train = cv.transform(comm_train)
model = MultinomialNB().fit(X_train, sent_train)

In [12]:
X_test = cv.transform(comm_test)
sent_pred = model.predict(X_test)

In [13]:
evalPerformance(sent_pred, sent_test)

Accuracy Score: 87.86167960479887
F1 Score: 88.2433905869451
