# Use negation tagging

In [26]:
import pandas as pd
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter

from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.pipeline import Pipeline

In [2]:
reviews = pd.read_csv("../Data/scraped_comments_with_professor.csv")

In [3]:
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

reviews.dropna(subset=["comment"], inplace=True)
reviews = reviews[reviews["comment"] != "No Comments"]

reviews['firstName'].fillna('', inplace=True)
reviews['lastName'].fillna('', inplace=True)

reviews["comment"] = reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
reviews.dropna(subset=["comment"], inplace=True)

In [4]:
reviews["starRating"] = (reviews["clarityRating"] + reviews["helpfulRating"]) / 2
reviews = reviews[reviews["starRating"] != 3.0]         # drop 3 star reviews
reviews.reset_index(drop=True, inplace=True)

reviews["starRating"].value_counts()

 5.0    176812
 4.5     58705
 4.0     52643
 1.0     45128
 2.0     26945
 3.5     22896
 1.5     19230
 2.5     17510
-1.0         1
Name: starRating, dtype: int64

In [5]:
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)

def remove_phones(text):
    return re.sub(r'\d{3}-\d{3}-\d{4}', ' ', text)

def remove_emails(text):
    return re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', ' ', text)

def remove_html_entities(text):
  text = re.sub('&[0-9a-zA-Z#]+;', ' ', text)
  return re.sub('&#63;?', '', text)

def remove_html_tags(text):
  return re.sub('<.{1,6}?>', ' ', text)

In [6]:
negative_words = ["not", "no", "nor", "neither"]
puncts = [".", ",", "!", "?"]

def negationTag(comment):
    new_comment = []
    negation_flag = False
    for token in nltk.word_tokenize(comment):
        if token in negative_words:
            negation_flag = True
            continue
        if negation_flag == True:
            if token in puncts:
                negation_flag = False
            else:
                token = "neg_" + token
        
        new_comment.append(token)
    
    return " ".join(new_comment)

In [38]:
negationTag("This class is not very hard. According to a friend, neither are our other classes.")
negationTag("This class is not great. nor was it informational.")

'This class is neg_great . neg_was neg_it neg_informational .'

In [7]:
EMOTICONS = {
    u":‑)":"emopos",
    u":-))":"emopos",
    u":-)))":"emopos",
    u":)":"emopos",
    u":))":"emopos",
    u":)))":"emopos",
    u":-]":"emopos",
    u":]":"emopos",
    u":-3":"emopos",
    u":3":"emopos",
    u":->":"emopos",
    u":>":"emopos",
    u"8-)":"emopos",
    u":-}":"emopos",
    u":}":"emopos",
    u":-)":"emopos",
    u":c)":"emopos",
    u":^)":"emopos",
    u"=]":"emopos",
    u"=)":"emopos",
    u":‑D":"emopos",
    u":D":"emopos",
    u"8‑D":"emopos",
    u"8D":"emopos",
    u"X‑D":"emopos",
    u"XD":"emopos",
    u"=D":"emopos",
    u"=3":"emopos",
    u"B^D":"emopos",
    u":-))":"emopos",
    u":-(":"emoneg",
    u":‑(":"emoneg",
    u":(":"emoneg",
    u":‑c":"emoneg",
    u":c":"emoneg",
    u":‑<":"emoneg",
    u":<":"emoneg",
    u":‑[":"emoneg",
    u":[":"emoneg",
    u":-||":"emoneg",
    u">:[":"emoneg",
    u":{":"emoneg",
    u">:(":"emoneg",
    u":'‑(":"emoneg",
    u":'(":"emoneg",
    u":'‑)":"emopos",
    u":')":"emopos",
    u"D‑':":"emoneg",
    u"D:<":"emoneg",
    u"D:":"emoneg",
    u"D8":"emoneg",
    u"D;":"emoneg",
    u"D=":"emoneg",
    u"DX":"emoneg",
    u";‑)":"emopos",
    u";)":"emopos",
    u"*-)":"emopos",
    u"*)":"emopos",
    u";‑]":"emopos",
    u";]":"emopos",
    u";^)":"emopos",
    u":‑,":"emopos",
    u";D":"emopos",
    u":‑P":"emopos",
    u":P":"emopos",
    u"X‑P":"emopos",
    u"XP":"emopos",
    u":‑Þ":"emopos",
    u":Þ":"emopos",
    u"=p":"emopos",
    u":‑/":"emoneg",
    u":/":"emoneg",
    u":-[.]":"emoneg",
    u">:[(\)]":"emoneg",
    u">:/":"emoneg",
    u":[(\)]":"emoneg",
    u"=/":"emoneg",
    u"=[(\)]":"emoneg",
    u":L":"emoneg",
    u"=L":"emoneg",
    u":‑|":"emoneg",
    u":|":"emoneg",
    u"O:‑)":"emopos",
    u"O:)":"emopos",
    u"0:‑3":"emopos",
    u"0:3":"emopos",
    u"0:‑)":"emopos",
    u"0:)":"emopos",
    u":‑b":"emopos",
    u"(>_<)":"emoneg",
    u"(>_<)>":"emoneg",
    u"^_^":"emopos",
    u"(^_^)/":"emopos",
    u"(^O^)／":"emopos",
    u"(^o^)／":"emopos",
    u"('_')":"emoneg",
    u"(/_;)":"emoneg",
    u"(T_T) (;_;)":"emoneg",
    u"(;_;":"emoneg",
    u"(;_:)":"emoneg",
    u"(;O;)":"emoneg",
    u"(:_;)":"emoneg",
    u"(ToT)":"emoneg",
    u";_;":"emoneg",
    u";-;":"emoneg",
    u";n;":"emoneg",
    u";n;":"emoneg",
    u"Q.Q":"emoneg",
    u"T.T":"emoneg",
    u"Q_Q":"emoneg",
    u"(-.-)":"emopos",
    u"(-_-)":"emopos",
    u"(；一_一)":"emopos",
    u"(=_=)":"emoneg",
    u"^m^":"emopos",
    u">^_^<":"emopos",
    u"<^!^>":"emopos",
    u"^/^":"emopos",
    u"（*^_^*）" :"emopos",
    u"(^<^) (^.^)":"emopos",
    u"(^^)":"emopos",
    u"(^.^)":"emopos",
    u"(^_^.)":"emopos",
    u"(^_^)":"emopos",
    u"(^^)":"emopos",
    u"(^J^)":"emopos",
    u"(*^.^*)":"emopos",
    u"(^—^）":"emopos",
    u"(#^.^#)":"emopos",
    u"(*^0^*)":"emopos",
    u"(*^^)v":"emopos",
    u"(^_^)v":"emopos",
    u'(-"-)':"emoneg",
    u"(ーー;)":"emoneg",
    u"(＾ｖ＾)":"emopos",
    u"(＾ｕ＾)":"emopos",
    u"(^)o(^)":"emopos",
    u"(^O^)":"emopos",
    u"(^o^)":"emopos",
    u")^o^(":"emopos",
    u":O o_O":"emoneg",
    u"o_0":"emoneg",
    u"o.O":"emoneg",
    u"(o.o)":"emoneg",
    u"(*￣m￣)": "emoneg",
}

for emote, val in EMOTICONS.items():
    EMOTICONS[emote] = val.lower().replace(',', ' ').replace(' ', '_')

def convert_emoticons(text):
  return EMOTICONS.get(text, text)

In [8]:
CONTRACTIONS = {
    "won't": "will not",
    "didn't": "did not",
    "don't": "do not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "isn't": "is not"
}

def expand_contractions(text):
    return CONTRACTIONS.get(text, text)

In [9]:
stopwords = ["he", "her", "she", "him", "guy", "we", "professor", "professors", "prof", "profs", "teacher", "teachers", "mr", "ms", "mrs",
"dr", "doctor", "class", "classes", "course", "courses", "college", "colleges", "university", "universities", "lecture", "lectures", "lab", "labs"]

In [10]:
def preproc(reviews: pd.DataFrame):
    comments_proper = []
    unseen = Counter()

    for index, review in reviews.iterrows():
        comment = review['comment']
        fname = review['firstName'].lower().split(' ')
        lname = review['lastName'].lower().split(' ')
        names = set(fname + lname)

        comment = remove_urls(comment)
        comment = remove_phones(comment)
        comment = remove_emails(comment)
        comment = remove_html_entities(comment)
        comment = remove_html_tags(comment)

        comment_split = comment.split(' ')
        new_comment_split = []
        for i, word in enumerate(comment_split):
            word = convert_emoticons(word)
            word = word.lower()
            word = expand_contractions(word)
            word = re.sub("[^a-zA-Z\s]+", ' ', word)   # replace characters that are not alphabetic, space, or underscore
            word = re.sub(r'(.)\1\1+', '\g<1>', word)  # replace any three character+ sequence with one
            word = re.sub('\s+', ' ', word)
            word = word.strip() # trailing whitespace because punctuation replaced by space
            new_comment_split.extend(word.split(' '))

        # Remove names from the comment
        for i, word in enumerate(new_comment_split):
            if word in names:
                new_comment_split[i] = ''

        # putting negation tagging line here improves accuracy *a lot*
        # probably bc putting it here essentially leads to negation tagging being ignored lol

        comment = ' '.join(new_comment_split)
        comment = re.sub('\s+', ' ', comment)
        comment = comment.strip()

        comment = negationTag(comment)        # tag words following negation words as negated

        comment = ' '.join(word for word in comment.split() if len(word) > 1 and word not in stopwords)

        comments_proper.append(comment)

    return comments_proper

In [11]:
comments_proper = preproc(reviews)

In [12]:
reviews = reviews.loc[:, ["professor_id", "firstName", "lastName", "comment", "starRating"]]
reviews["cleanedComment"] = pd.Series(comments_proper)
reviews["sentiment"] = reviews["starRating"].apply(lambda x: 1 if x > 2.5 else 0)

reviews.head()

Unnamed: 0,professor_id,firstName,lastName,comment,starRating,cleanedComment,sentiment
0,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,"Good teacher, good lectures. Obviously cares a...",5.0,good good obviously cares about the subject ma...,1
1,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,"Good teacher, very lenient with grading and at...",5.0,good very lenient with grading and attendance ...,1
2,VGVhY2hlci0xMjQzMzQ3,Kurt,Douglass,Very difficult class. His grading is hard to ...,1.5,very difficult his grading is hard to understa...,0
3,VGVhY2hlci0xMDExMDU2,Paula,Zobisch,Excellent mentor. Created valuable foundations...,5.0,excellent mentor created valuable foundations ...,1
4,VGVhY2hlci0xMDExMDU2,Paula,Zobisch,Awesome...Had her for two classes at baker col...,5.0,awesome had for two at baker reccommended,1


In [13]:
prof_counts = reviews["professor_id"].value_counts()
prof_train, prof_test, cnt_train, cnt_test = train_test_split(prof_counts.index, prof_counts.values, test_size=0.2, random_state=1)

In [14]:
comm_train = reviews[reviews["professor_id"].isin(prof_train)]["cleanedComment"]
comm_test = reviews[reviews["professor_id"].isin(prof_test)]["cleanedComment"]

sent_train = reviews[reviews["professor_id"].isin(prof_train)]["sentiment"]
sent_test = reviews[reviews["professor_id"].isin(prof_test)]["sentiment"]

In [15]:
pipeline = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1,2), max_df=0.5)), 
    ("selector"  , SelectPercentile(score_func=chi2, percentile=50)),
    ("classifer" , MultinomialNB())
])

In [40]:
from sklearn.metrics import confusion_matrix

def evalPerformance(y_pred, y_test, mode="weighted"):
    acc_score = accuracy_score(y_test, y_pred)
    print("Accuracy: " + str(acc_score * 100))
    
    f1 = f1_score(y_test, y_pred, average=mode)
    print("F1 Score: {0}\n".format(f1 * 100))

    prec = precision_score(y_test, y_pred, average=mode)
    print("Precision: {0}".format(prec * 100))
    rec = recall_score(y_test, y_pred, average=mode)
    print("Recall: {0}\n".format(rec * 100))

    print(confusion_matrix(y_test, y_pred))

In [17]:
sentiment_fit = pipeline.fit(comm_train, sent_train)

In [18]:
sent_pred = sentiment_fit.predict(comm_test)

In [41]:
evalPerformance(sent_pred, sent_test)

Accuracy: 83.7391283852264
F1 Score: 83.98545557803826

Precision: 84.36174673856573
Recall: 83.7391283852264

[[16279  5774]
 [ 8024 54777]]


In [20]:
old_reviews = pd.read_csv("../Data/scraped_comments.csv").sample(n=100000, random_state=1)

old_reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

old_reviews.dropna(subset=["comment"], inplace=True)
old_reviews = old_reviews[old_reviews["comment"] != "No Comments"]
old_reviews["comment"] = old_reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
old_reviews.dropna(subset=["comment"], inplace=True)

old_reviews.reset_index(drop=True, inplace=True)

In [21]:
old_reviews["starRating"] = (old_reviews["clarityRating"] + old_reviews["helpfulRating"]) / 2
old_reviews = old_reviews[old_reviews["starRating"] != 3.0]         # drop 3 star reviews

In [22]:
old_reviews = old_reviews[old_reviews["clarityRating"] == old_reviews["helpfulRating"]]
old_reviews.reset_index(inplace=True, drop=True)

In [23]:
preproc_comments = preproc(old_reviews)

In [24]:
preds = sentiment_fit.predict(preproc_comments)

In [42]:
real_sentiment = old_reviews["starRating"].apply(lambda x: 1 if x > 2.5 else 0)
evalPerformance(preds, real_sentiment)

Accuracy: 85.26323478387904
F1 Score: 85.4010619384567

Precision: 85.59684498324553
Recall: 85.26323478387904

[[11447  3557]
 [ 4513 35244]]
