# Logistic regression model

**uses essentially same preprocessing and broad parameters as `attempt-6.ipynb`**

In [1]:
import pandas as pd
import re
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.metrics import accuracy_score, f1_score

In [2]:
reviews = pd.read_csv("../Data/scraped_comments_with_professor.csv").sample(n=25000, random_state=1)

In [3]:
reviews.drop_duplicates(subset="comment_id", keep="first", inplace=True)

reviews.dropna(subset=["comment"], inplace=True)
reviews = reviews[reviews["comment"] != "No Comments"]

reviews["comment"] = reviews["comment"].apply(lambda x: x if len(x.split()) > 5 else None)
reviews.dropna(subset=["comment"], inplace=True)

### Get star ratings and drop middling ones

In [4]:
reviews["starRating"] = (reviews["clarityRating"] + reviews["helpfulRating"]) / 2
reviews = reviews[reviews["starRating"] != 3.0]         # drop 3 star reviews
reviews.reset_index(drop=True, inplace=True)

reviews["starRating"].value_counts()

5.0    9321
4.5    3088
4.0    2799
1.0    2305
2.0    1412
3.5    1180
1.5    1042
2.5     905
Name: starRating, dtype: int64

### Barebones preprocessing

In [5]:
comments_proper = []

for i in range(reviews.shape[0]):
    review = reviews["comment"][i]
    review = re.sub('&([a-zA-z]+|#\d+);', "", review)           # remove HTML codes
    review = re.sub('&#63;?', '', review)                       # HTML code for question mark evades erasure on occasion, handle here
    review = re.sub(r'\s*https?://\S+(\s+|$)', ' ', review)                                     # remove links
    review = re.sub("^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$", ' ', review)         # remove phone numbers
    review = re.sub("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", review)              # remove email addresses

    review = re.sub(r'(.)\1\1+', '\g<1>', review)               # replace any three characters in a row with one

    review = re.sub('[^a-zA-Z]+', ' ', review)                  # remove non-alphabetic characters

    review = re.sub('\s+', ' ', review)
    review = review.lower()                                     # lowercase review for uniformity

    comments_proper.append(review)

In [6]:
reviews = reviews.loc[:, ["professor_id", "firstName", "lastName", "comment", "starRating"]]
reviews["cleanedComment"] = pd.Series(comments_proper)
reviews["sentiment"] = reviews["starRating"].apply(lambda x: 1 if x > 2.5 else 0)

reviews.head()

Unnamed: 0,professor_id,firstName,lastName,comment,starRating,cleanedComment,sentiment
0,VGVhY2hlci0xNTk4MTk5,Alex,Tourlakes,The class I was dreading the most turned out t...,5.0,the class i was dreading the most turned out t...,1
1,VGVhY2hlci0xMDQwNTQz,Gaby,Fahmy,Loved & benefitted a lot from pathophysiology ...,5.0,loved benefitted a lot from pathophysiology wi...,1
2,VGVhY2hlci05NDAzNzM=,Van,Peterson,Loved this class. awesome prof. just do the ho...,4.0,loved this class awesome prof just do the home...,1
3,VGVhY2hlci0xMzgwODgx,Tara,Thompson,she's great.. easygoing and always ready to help.,3.5,she s great easygoing and always ready to help,1
4,VGVhY2hlci0xMDk4NTQx,Ken,Blier,Great guy! Extremely helpful and teaches compl...,5.0,great guy extremely helpful and teaches comple...,1


### Professor split

In [7]:
prof_counts = reviews["professor_id"].value_counts()                    # professor-based train-test split
prof_train, prof_test, cnt_train, cnt_test = train_test_split(prof_counts.index, prof_counts.values, test_size=0.2, random_state=1)

comm_train = reviews[reviews["professor_id"].isin(prof_train)]["cleanedComment"]
comm_test = reviews[reviews["professor_id"].isin(prof_test)]["cleanedComment"]

sent_train = reviews[reviews["professor_id"].isin(prof_train)]["sentiment"]
sent_test = reviews[reviews["professor_id"].isin(prof_test)]["sentiment"]

### Prepare vectorizer and feature selector

In [8]:
cv = CountVectorizer(ngram_range=(1,2))             # consider adding min_df term (start at 0.001)
X = cv.fit_transform(reviews["cleanedComment"]).toarray()

In [9]:
percbest = SelectPercentile(score_func=chi2, percentile=25)      # select 25% most important features using chi2
percbest.fit_transform(X, reviews["sentiment"])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Create and fit model

In [10]:
X_train = percbest.transform(cv.transform(comm_train))      # get training vector from top 25% of features
model = LogisticRegression(verbose=1, solver="liblinear", random_state=0, C=0.5, penalty="l2").fit(X_train, sent_train)            # train model

[LibLinear]

In [11]:
X_test = percbest.transform(cv.transform(comm_test))
sent_pred = model.predict(X_test)

In [12]:
def evalPerformance(y_pred, y_test, mode="weighted"):
    acc_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score: " + str(acc_score * 100))
    
    f1 = f1_score(y_test, y_pred, average=mode)
    print("F1 Score: {0}".format(f1 * 100))

In [13]:
evalPerformance(sent_pred, sent_test)

Accuracy Score: 91.53379743992814
F1 Score: 91.3535903444728


### Pickle model, vectorizer, and feature selector

In [14]:
from joblib import dump

dump(model, "LR-model.joblib")
dump(cv, "LR-vectorizer.joblib")
dump(percbest, "LR-chi2.joblib")

['LR-chi2.joblib']