# Imports

In [None]:
import pandas as pd
import numpy as np

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('omw-1.4')

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC, RandomForestRegressor as RFR, GradientBoostingClassifier as XGBoost
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# Preprocessing

In [None]:
# `infer_datetime_format` only influences columns named in `parse_dates`; none
# are requested here, so the flag had no effect (and is deprecated in pandas 2.x).
df_ratings = pd.read_csv("Data/clean_ratings.csv")

In [None]:
df_ratings.head()

In [None]:
df_ratings.describe()

In [None]:
df_profs = pd.read_csv("Data/clean_prof_info.csv")

In [None]:
df_profs.head()

In [None]:
df_profs.dtypes

In [None]:
# Trim stray whitespace around the names. The vectorised `.str` accessor leaves
# missing values as NaN instead of raising AttributeError on a float.
df_profs["firstName"] = df_profs["firstName"].str.strip()
df_profs["lastName"] = df_profs["lastName"].str.strip()

In [None]:
df_names = df_profs[["profID", "firstName", "lastName"]]

In [None]:
df_ratings.shape

In [None]:
# Add professor first and last names to df_ratings
df_ratings = df_ratings.merge(df_names, how="inner", on="profID")
df_ratings.head()

In [None]:
# No data lost, all profIDs have match in df_profs
df_ratings.shape

In [None]:
import re
stopword_list = set(stopwords.words("english"))
stopword_list.update([',', '.'])
lemmatizer = WordNetLemmatizer()

# Runs of punctuation that trail a word, i.e. are followed by whitespace or the
# end of the comment. "+" and "-" are not in the set (carried over from the
# earlier pattern -- TODO confirm this is intentional).
_TRAILING_PUNCT_RE = re.compile(r"""[!"#$%&'()*,./:;<=>?@\[\\\]^_`{|}~]+(?=\s|$)""")


def preprocess_comment(row):
    """
    Tokenize, remove stopwords and punctuations at word ends, lemmatize, and then reassemble into one string.

    If any token matches the first or last name of the professor, it is dropped.
    The comparison is case-insensitive and multi-word names are matched word by word.

    All numbers are dropped.

    This is done to eliminate low-impact tokens and reduce vocabulary size.

    String type output required for easier ingestion by sklearn TfidfVectorizer.

    Args:
        row: A record supporting ``row.loc[...]`` with the keys ``"comment"``,
            ``"firstName"`` and ``"lastName"``.

    Returns:
        The surviving lower-cased, lemmatized tokens joined by single spaces.
        A non-string comment (e.g. NaN from an empty cell) yields ``""``.
    """
    comment = row.loc["comment"]
    if not isinstance(comment, str):
        return ""

    # re.sub returns a new string; the result has to be kept for the
    # substitution to have any effect.
    comment = _TRAILING_PUNCT_RE.sub("", comment)

    # Lower-cased name parts of this row's professor; non-string names are skipped.
    name_tokens = set()
    for field in ("firstName", "lastName"):
        name = row.loc[field]
        if isinstance(name, str):
            name_tokens.update(name.lower().split())

    kept = []
    for token in word_tokenize(comment):
        # Lowercase *before* filtering: the stopword list is lower-case, so a
        # capitalised "The" would otherwise slip through.
        token = token.lower()
        if token in stopword_list or token in name_tokens or token.isnumeric():
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

In [None]:
df_ratings["comment"] = df_ratings.apply(preprocess_comment, axis=1)

In [None]:
df_ratings["comment"][:10]

In [None]:
# Denote ratings corresponding to scores of 4 or higher as hot
# Encode hot as 1, not hot as 0
# NOTE(review): a missing (NaN) helpfulRating compares as False here and is
# therefore encoded as 0 ("not hot") -- confirm that is the intended treatment.
df_ratings["Hot"] = df_ratings["helpfulRating"] >= 4
df_ratings["Hot"] = df_ratings["Hot"].astype(int)

In [None]:
X, y = df_ratings["comment"], df_ratings["Hot"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Encoding

## Frequency and TfIdf

In [None]:
tfidf_vectorizer = TfidfVectorizer(strip_accents="ascii")
count_vectorizer = CountVectorizer(strip_accents="ascii")

# Note to future self: remember to set the min_df parameter to limit the dimensionality.
# High-dimensional encodings drastically slow down fitting and grid searching.

In [None]:
train_tfidf = tfidf_vectorizer.fit_transform(X_train)
test_tfidf = tfidf_vectorizer.transform(X_test)
train_count = count_vectorizer.fit_transform(X_train)
test_count = count_vectorizer.transform(X_test)

## Testing the Effect of Specifying `min_df`
Only terms appearing in at least 5 comments are added to the vocabulary.

In [None]:
min_df_vectorizer = TfidfVectorizer(strip_accents="ascii", min_df=5)
min_df_tfidf = min_df_vectorizer.fit_transform(X_train)

In [None]:
# Reduction of dimension by a factor of 3
print(train_tfidf.shape, min_df_tfidf.shape)

## GloVe (Global Vectors for Word Representation)

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Pre-trained GloVe vectors in their native text format
# (the file name suggests the 100-dimensional "6B" set -- TODO confirm).
glove_path = "Data/glove.6B.100d.txt"
# The converted copy is written next to the source file.
word2vec_path = glove_path + ".word2vec"

# Convert the GloVe file to word2vec text format, then load the converted file.
# NOTE(review): the conversion is repeated every time this cell runs.
glove2word2vec(glove_path, word2vec_path)
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)

In [None]:
class Word2VecVectorizer:
    """Encode entire comments by taking the average of the word representations.

    Words the embedding does not know are skipped; a comment without any known
    word is encoded as the all-zero vector.
    """

    def __init__(self, word2vec):
        """
        Args:
            word2vec: Embedding lookup exposing ``get_vector(word)`` that raises
                ``KeyError`` for unknown words (e.g. a gensim ``KeyedVectors``).
        """
        self.word2vec = word2vec
        # Read the width from the embedding itself so vectors of any size work;
        # 100 stays as the fallback for objects without `vector_size`.
        self.dimension = getattr(word2vec, "vector_size", 100)

    def transform(self, data):
        """
        Average the word vectors of every sentence in ``data``.

        Args:
            data: Sized iterable of whitespace-separated strings.

        Returns:
            Array of shape ``(len(data), self.dimension)``; row ``i`` is the
            mean vector of sentence ``i`` (zeros if no word was found).

        Also prints how many sentences had no known word.
        """
        X = np.zeros((len(data), self.dimension))
        empty_count = 0

        for i, sentence in enumerate(data):
            vecs = []
            for word in sentence.split():
                try:
                    vecs.append(self.word2vec.get_vector(word))
                except KeyError:
                    # Out-of-vocabulary word: leave it out of the average.
                    continue

            if vecs:
                X[i] = np.mean(vecs, axis=0)
            else:
                empty_count += 1

        print(f"Number of samples with no words found: {empty_count}/{len(data)}")
        return X

In [None]:
word2vec_vectorizer = Word2VecVectorizer(word2vec)

In [None]:
train_word2vec = word2vec_vectorizer.transform(X_train)
test_word2vec = word2vec_vectorizer.transform(X_test)

# Models

In [None]:
def benchmark(clf, search_grid):
    """
    Grid-search one classifier on each of the three comment encodings.

    For the frequency, TfIdf and Word2Vec encodings in turn, the grid search is
    fitted on the training matrix, the best estimator is taken, and its
    classification report on the test set is printed.

    Args:
        clf: Unfitted scikit-learn classifier to tune.
        search_grid: Parameter grid handed to ``GridSearchCV``.

    Returns:
        Tuple ``(count_clf, tfidf_clf, word2vec_clf)`` of the best estimators,
        in that order.

    NOTE(review): reads the notebook-level ``train_*`` / ``test_*`` matrices and
    ``y_train`` / ``y_test``; an estimator that cannot handle one encoding will
    abort the whole call -- verify per classifier.
    """
    # Candidates are ranked by mean F1 over GridSearchCV's default CV folds.
    search = GridSearchCV(estimator=clf, param_grid=search_grid, scoring="f1", n_jobs=-1, verbose=3)

    # Label 0 is "Not Hot", label 1 is "Hot".
    class_names = ["Not Hot", "Hot"]
    encodings = (
        ("Frequency:", train_count, test_count),
        ("TfIdf:", train_tfidf, test_tfidf),
        ("Word2Vec:", train_word2vec, test_word2vec),
    )

    winners = []
    for position, (heading, train_matrix, test_matrix) in enumerate(encodings):
        if position:
            # Separator goes between reports only, never after the last one.
            print("-" * 100)
        print(heading)

        search.fit(train_matrix, y_train)
        best = search.best_estimator_
        predictions = best.predict(test_matrix)
        print(classification_report(y_test, predictions, target_names=class_names))

        winners.append(best)

    return tuple(winners)

## Random Forest (Classification)

In [None]:
rf = RFC(random_state=42, n_jobs=-1)
rf_grid = {"n_estimators":[50, 100, 200]}
rf_count, rf_tfidf, rf_word2vec = benchmark(rf, rf_grid)

## Gradient Boosting (scikit-learn `GradientBoostingClassifier`, imported as `XGBoost`)

In [None]:
xgboost = XGBoost(random_state=42)
xgb_grid = {"learning_rate":[0.02, 0.1, 0.5]}
xgb_count, xgb_tfidf, xgb_word2vec = benchmark(xgboost, xgb_grid)

## Logistic Regression

In [None]:
log_reg = LogisticRegression(n_jobs=-1, solver="sag")
log_reg_grid = {"C":[0.5, 1, 2]}
log_reg_count, log_reg_tfidf, log_reg_word2vec = benchmark(log_reg, log_reg_grid)

## Naive Bayes

In [None]:
naive_bayes = MultinomialNB()

# alpha is not tuned here: the grid holds only its default value,
# so the search reduces to a plain 5-fold CV
naive_bayes_grid = {"alpha":[1.0]}
naive_bayes_count, naive_bayes_tdidf, naive_bayes_word2vec = benchmark(naive_bayes, naive_bayes_grid)

## Support Vector Machine

In [None]:
svm = LinearSVC(random_state=42)
svm_grid = {"C":[0.5, 1, 2]}
svm_count, svm_tfidf, svm_word2vec = benchmark(svm, svm_grid)

## K-Nearest Neighbors

In [None]:
# Beware of memory issues!
knn = KNeighborsClassifier(n_jobs=-1)
knn_grid = {"n_neighbors":[1, 3, 5]}
knn_count, knn_tfidf, knn_word2vec = benchmark(knn, knn_grid)

## Random Forest (Regression)

Instead of predicting the binary "hot or not" label (0 or 1), this section predicts the integer `helpfulRating` scores.

Note that this column is technically categorical.

For simplicity, we treat it as a continuous variable.

In [None]:
# X_train / X_test carry index *labels* taken from df_ratings, so the matching
# targets are selected by label (.loc). Positional .iloc only agrees with the
# labels while the index is the default 0..n-1 range.
y_regression_train = df_ratings.loc[X_train.index, "helpfulRating"]
y_regression_test = df_ratings.loc[X_test.index, "helpfulRating"]

In [None]:
# There is a small number of NA values in each series (12 and 8 respectively)
# Dropping them and reprocessing X is difficult
# Fill NA values with series mean instead

y_regression_train = y_regression_train.fillna(y_regression_train.mean())
y_regression_test = y_regression_test.fillna(y_regression_test.mean())

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
def eval_regression(actual, predicted, threshold=4):
    """
    Infer the accuracy, precision, and recall based on regression results.

    Also calculate the MAE and RMSE.

    A rating counts as "hot" when it is ``>= threshold``; the classification
    metrics compare the hot / not-hot status of each actual rating with that of
    its prediction. All six metrics are printed; nothing is returned.

    Args:
        actual: True ratings (array-like of numbers).
        predicted: Predicted ratings, same length as ``actual``.
        threshold: Smallest rating regarded as hot (default 4).

    Raises:
        ValueError: If the inputs are empty, differ in length, or contain
            NaN / infinite values.
    """
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"actual and predicted differ in length: {actual.size} != {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("actual and predicted must not be empty")
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("actual and predicted must not contain NaN or infinity")

    # Computed with NumPy so the result does not depend on the `squared=`
    # keyword of sklearn's mean_squared_error, which newer releases dropped.
    errors = actual - predicted
    MAE = float(np.mean(np.abs(errors)))
    RMSE = float(np.sqrt(np.mean(errors ** 2)))

    actual_hot = actual >= threshold
    predicted_hot = predicted >= threshold
    true_hot = int(np.count_nonzero(actual_hot & predicted_hot))
    false_hot = int(np.count_nonzero(~actual_hot & predicted_hot))
    false_not_hot = int(np.count_nonzero(actual_hot & ~predicted_hot))
    true_not_hot = int(np.count_nonzero(~actual_hot & ~predicted_hot))

    def _ratio(numerator, denominator):
        # An undefined ratio (nothing predicted hot / nothing actually hot) is
        # reported as 0.0 instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    total = true_hot + true_not_hot + false_hot + false_not_hot
    accuracy = (true_hot + true_not_hot) / total
    precision = _ratio(true_hot, true_hot + false_hot)
    recall = _ratio(true_hot, true_hot + false_not_hot)
    f1 = _ratio(2 * recall * precision, recall + precision)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print(f"Mean Absolute Error: {MAE}")
    print(f"Root Mean Squared Error: {RMSE}")

In [None]:
rf_reg = RFR(n_jobs=-1, random_state=42)
rf_reg.fit(train_count, y_regression_train)
rf_reg_count_pred = rf_reg.predict(test_count)

In [None]:
print("Count:")
eval_regression(y_regression_test, rf_reg_count_pred)

In [None]:
rf_reg.fit(train_tfidf, y_regression_train)
rf_reg_tfidf_pred = rf_reg.predict(test_tfidf)

In [None]:
print("TfIdf:")
eval_regression(y_regression_test, rf_reg_tfidf_pred)

In [None]:
rf_reg.fit(train_word2vec, y_regression_train)
rf_reg_word2vec_pred = rf_reg.predict(test_word2vec)

In [None]:
print("Word2Vec:")
eval_regression(y_regression_test, rf_reg_word2vec_pred)

## Model Persistence
Save the trained models to disk so we can load them easily in the future.

In [None]:
from joblib import dump, load

In [None]:
import os

# Create the target directory up front so the dumps below do not fail when
# "Models" is missing (e.g. on a fresh checkout).
os.makedirs("Models", exist_ok=True)

# File stem -> fitted estimator; each one is written to Models/<stem>.joblib.
_models_to_save = {
    "rf_count": rf_count,
    "rf_tfidf": rf_tfidf,
    "rf_word2vec": rf_word2vec,
    "xgb_count": xgb_count,
    "xgb_tfidf": xgb_tfidf,
    "xgb_word2vec": xgb_word2vec,
    "log_reg_count": log_reg_count,
    "log_reg_tfidf": log_reg_tfidf,
    "log_reg_word2vec": log_reg_word2vec,
    "svm_count": svm_count,
    "svm_tfidf": svm_tfidf,
    "svm_word2vec": svm_word2vec,
}

for _stem, _estimator in _models_to_save.items():
    dump(_estimator, f"Models/{_stem}.joblib")