# Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guoca\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guoca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\guoca\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\guoca\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [117]:
from sklearn.ensemble import RandomForestClassifier as RFC, RandomForestRegressor as RFR, GradientBoostingClassifier as XGBoost
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Preprocessing

In [4]:
df_ratings = pd.read_csv("Data/clean_ratings.csv", infer_datetime_format=True)

In [5]:
df_ratings.dtypes

profID                   int64
attendanceMandatory       bool
class                   object
comment                 object
date                    object
difficultyRating       float64
grade                   object
helpfulRating          float64
isForCredit               bool
isForOnlineClass          bool
ratingTags              object
wouldTakeAgain            bool
dtype: object

In [6]:
df_ratings.head()

Unnamed: 0,profID,attendanceMandatory,class,comment,date,difficultyRating,grade,helpfulRating,isForCredit,isForOnlineClass,ratingTags,wouldTakeAgain
0,7964,False,ANTHRCUL101,Fricke is the man. Entire class probably took ...,2019-04-28 17:13:12,1.0,A,5.0,False,False,"['Respected', 'Inspirational', 'Amazing Lectur...",True
1,7964,False,ANTHRO101,Tom Fricke is one of those professors you will...,2019-01-08 18:41:24,1.0,A+,5.0,False,False,"['Accessible Outside Class', 'Hilarious', 'Ama...",True
2,7964,False,ANTHRCUL101,Prof. Fricke is amazing. He is hilarious and t...,2018-12-16 03:11:18,1.0,A,5.0,False,False,"['Hilarious', 'Graded By Few Things', 'Caring']",True
3,7964,False,CULTANTHRO101,Such an easy class. Exams were exactly like th...,2018-12-12 10:03:19,1.0,A,5.0,False,False,"['Accessible Outside Class', 'Graded By Few Th...",True
4,7964,False,ANTHRCUL101,Easiest class i have taken at UM. The exams to...,2018-12-11 16:33:00,1.0,A+,5.0,False,False,"['Respected', 'Hilarious', 'Amazing Lectures']",True


In [48]:
df_ratings.describe()

Unnamed: 0,profID,difficultyRating,helpfulRating
count,51650.0,51630.0,51630.0
mean,1038128.0,2.999806,3.801511
std,726240.0,1.1799,1.445799
min,7775.0,1.0,1.0
25%,429404.0,2.0,3.0
50%,895103.0,3.0,4.0
75%,1656242.0,4.0,5.0
max,2880778.0,5.0,5.0


In [7]:
df_profs = pd.read_csv("Data/clean_prof_info.csv")

In [8]:
df_profs.head()

Unnamed: 0,profID,avgDifficulty,avgRating,department,firstName,lastName,numRatings,wouldTakeAgainPercent,fullName
0,7775,2.3,4.4,Mechanical Engineering,Alan,Wineman,34,80.0,Alan Wineman
1,7964,1.6,4.5,Anthropology,Thomas,Fricke,114,95.0,Thomas Fricke
2,8045,2.6,3.2,Biology,Julian,Adams,22,,Julian Adams
3,8189,3.1,4.6,Music,Deborah,Chodacki,8,,Deborah Chodacki
4,10260,4.0,2.3,Political Science,Arlene,Saxonhouse,86,66.0,Arlene Saxonhouse


In [9]:
df_profs.dtypes

profID                     int64
avgDifficulty            float64
avgRating                float64
department                object
firstName                 object
lastName                  object
numRatings                 int64
wouldTakeAgainPercent    float64
fullName                  object
dtype: object

In [10]:
df_profs["firstName"] = df_profs["firstName"].apply(lambda x: x.strip())
df_profs["lastName"] = df_profs["lastName"].apply(lambda x: x.strip())

In [11]:
df_names = df_profs[["profID", "firstName", "lastName"]]

In [12]:
df_ratings.shape

(51650, 12)

In [13]:
df_ratings = df_ratings.merge(df_names, how="inner", on="profID")
df_ratings.head()

Unnamed: 0,profID,attendanceMandatory,class,comment,date,difficultyRating,grade,helpfulRating,isForCredit,isForOnlineClass,ratingTags,wouldTakeAgain,firstName,lastName
0,7964,False,ANTHRCUL101,Fricke is the man. Entire class probably took ...,2019-04-28 17:13:12,1.0,A,5.0,False,False,"['Respected', 'Inspirational', 'Amazing Lectur...",True,Thomas,Fricke
1,7964,False,ANTHRO101,Tom Fricke is one of those professors you will...,2019-01-08 18:41:24,1.0,A+,5.0,False,False,"['Accessible Outside Class', 'Hilarious', 'Ama...",True,Thomas,Fricke
2,7964,False,ANTHRCUL101,Prof. Fricke is amazing. He is hilarious and t...,2018-12-16 03:11:18,1.0,A,5.0,False,False,"['Hilarious', 'Graded By Few Things', 'Caring']",True,Thomas,Fricke
3,7964,False,CULTANTHRO101,Such an easy class. Exams were exactly like th...,2018-12-12 10:03:19,1.0,A,5.0,False,False,"['Accessible Outside Class', 'Graded By Few Th...",True,Thomas,Fricke
4,7964,False,ANTHRCUL101,Easiest class i have taken at UM. The exams to...,2018-12-11 16:33:00,1.0,A+,5.0,False,False,"['Respected', 'Hilarious', 'Amazing Lectures']",True,Thomas,Fricke


In [14]:
# No data lost, all profIDs have match in df_profs
df_ratings.shape

(51650, 14)

In [39]:
import re
stopword_list = set(stopwords.words("english"))
stopword_list.update([',', '.'])
lemmatizer = WordNetLemmatizer()

def preprocess_comment(row):
    """
    Tokenize, remove stopwords and punctuations at word ends, lemmatize, and then reassemble into one string.
    
    If any token matches the first or last name of the professor, it is dropped.
    
    All numbers are dropped.
    
    This is done to eliminate low-impact tokens and reduce vocabulary size.
    
    String type output required for easier ingestion by sklearn TfidfVectorizer.
    """
    comment = row.loc["comment"]
    re.sub(r"['!\"#$%&\'()*,./:;<=>?@[\\]^_`{|}~'] ", ' ', comment)
    tokens = word_tokenize(comment)
    
    ignore_list = stopword_list.copy()
    ignore_list.update([row.loc["firstName"], row.loc["lastName"]])
    
    tokens = [token.lower() for token in tokens if token not in ignore_list and not token.isnumeric()]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return " ".join(tokens)

In [46]:
df_ratings["comment"] = df_ratings.apply(preprocess_comment, axis=1)

In [47]:
df_ratings["comment"][:10]

0    man entire class probably took five hour study...
1    tom one professor never ever forget i found co...
2    prof. amazing he hilarious tell great interest...
3    such easy class exam exactly like practice exa...
4    easiest class taken um the exam took majority ...
5    this easiest course ever want well long look s...
6    easy a. i like talked lot fieldwork made easie...
7    take it with fricke you literally given exam q...
8    easy class lecture brutal easily get topic com...
9    tom hilarious lecture much fun you tell really...
Name: comment, dtype: object

In [49]:
df_ratings["Hot"] = df_ratings["helpfulRating"] >= 4

In [95]:
df_ratings["Hot"] = df_ratings["Hot"].astype(int)

In [97]:
X, y = df_ratings["comment"], df_ratings["Hot"]

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Encoding

## Frequency and TfIdf

In [52]:
tfidf_vectorizer = TfidfVectorizer(strip_accents="ascii")
count_vectorizer = CountVectorizer(strip_accents="ascii")

# note to future self: please remember to set the min_df parameter to limit the dimensionality.
# High-dimensional encoding drastically dlows down fitting and grid searching.

In [53]:
train_tfidf = tfidf_vectorizer.fit_transform(X_train)
test_tfidf = tfidf_vectorizer.transform(X_test)
train_count = count_vectorizer.fit_transform(X_train)
test_count = count_vectorizer.transform(X_test)

## GloVe (Global Vectors for Word Representation)

In [55]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

glove_path = "Data/glove.6B.100d.txt"
word2vec_path = glove_path + ".word2vec"

glove2word2vec(glove_path, word2vec_path)
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)

  "class": algorithms.Blowfish,
  glove2word2vec(glove_path, word2vec_path)


In [86]:
class Word2VecVectorizer:
    """Encode entire comments by taking the average of the word representations"""
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dimension = 100

    def transform(self, data):
        X = np.zeros((len(data), self.dimension))
        n = 0
        empty_count = 0
        
        for sentence in data:
            tokens = sentence.split()
            vecs = []

            for word in tokens:
                try:
                  # throws KeyError if word not found
                  vec = self.word2vec.get_vector(word)
                  vecs.append(vec)
                except KeyError:
                  pass
            
            if len(vecs) > 0:
                vecs = np.array(vecs)
                X[n] = vecs.mean(axis=0)
            else:
                empty_count += 1
                
            n += 1
            
        print(f"Number of samples with no words found: {empty_count}/{len(data)}")
        return X

In [87]:
word2vec_vectorizer = Word2VecVectorizer(word2vec)

In [88]:
train_word2vec = word2vec_vectorizer.transform(X_train)
test_word2vec = word2vec_vectorizer.transform(X_test)

Number of samples with no words found: 22/38737
Number of samples with no words found: 2/12913


# Models

In [112]:
def benchmark(clf, search_grid):
    clf = GridSearchCV(estimator=clf, param_grid=search_grid, scoring="f1", n_jobs=-1, verbose=3)
    
    print("Frequency:")
    clf.fit(train_count, y_train)
    count_clf = clf.best_estimator_
    pred_count = count_clf.predict(test_count)
    
    # 0 for not hot, 1 for hot
    print(classification_report(y_test, pred_count, target_names=["Not Hot", "Hot"]))
    print("-" * 100)
    
    print("TfIdf:")
    clf.fit(train_tfidf, y_train)
    tfidf_clf = clf.best_estimator_
    pred_tfidf = tfidf_clf.predict(test_tfidf)
    
    print(classification_report(y_test, pred_tfidf, target_names=["Not Hot", "Hot"]))
    print("-" * 100)
    
    print("Word2Vec:")
    clf.fit(train_word2vec, y_train)
    word2vec_clf = clf.best_estimator_
    pred_word2vec = word2vec_clf.predict(test_word2vec)
    
    print(classification_report(y_test, pred_word2vec, target_names=["Not Hot", "Hot"]))
    
    return count_clf, tfidf_clf, word2vec_clf

## Random Forest (Classification)

In [113]:
rf = RFC(random_state=42, n_jobs=-1)
rf_grid = {"n_estimators":[50, 100, 200]}
rf_count, rf_tfidf, rf_word2vec = benchmark(rf, rf_grid)

Frequency:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  1.9min remaining:  5.2min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  3.1min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  5.1min finished


              precision    recall  f1-score   support

     Not Hot       0.86      0.56      0.68      4232
         Hot       0.82      0.95      0.88      8681

    accuracy                           0.83     12913
   macro avg       0.84      0.76      0.78     12913
weighted avg       0.83      0.83      0.82     12913

----------------------------------------------------------------------------------------------------
TfIdf:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  1.9min remaining:  5.2min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  3.1min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  4.8min finished


              precision    recall  f1-score   support

     Not Hot       0.86      0.56      0.68      4232
         Hot       0.82      0.96      0.88      8681

    accuracy                           0.83     12913
   macro avg       0.84      0.76      0.78     12913
weighted avg       0.83      0.83      0.81     12913

----------------------------------------------------------------------------------------------------
Word2Vec:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:   39.8s remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  1.2min remaining:   37.2s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.0min finished


              precision    recall  f1-score   support

     Not Hot       0.75      0.53      0.62      4232
         Hot       0.80      0.91      0.85      8681

    accuracy                           0.79     12913
   macro avg       0.78      0.72      0.74     12913
weighted avg       0.78      0.79      0.78     12913



## XGBoost

In [115]:
xgboost = XGBoost(random_state=42)
xgb_grid = {"learning_rate":[0.02, 0.1, 0.5]}
xgb_count, xgb_tfidf, xgb_word2vec = benchmark(xgboost, xgb_grid)

Frequency:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  1.8min remaining:  4.8min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  1.8min remaining:   53.0s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.6min finished


              precision    recall  f1-score   support

     Not Hot       0.80      0.62      0.70      4232
         Hot       0.83      0.93      0.88      8681

    accuracy                           0.83     12913
   macro avg       0.82      0.77      0.79     12913
weighted avg       0.82      0.83      0.82     12913

----------------------------------------------------------------------------------------------------
TfIdf:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  2.6min remaining:  7.1min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  2.7min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  3.8min finished


              precision    recall  f1-score   support

     Not Hot       0.80      0.63      0.70      4232
         Hot       0.84      0.92      0.88      8681

    accuracy                           0.83     12913
   macro avg       0.82      0.78      0.79     12913
weighted avg       0.82      0.83      0.82     12913

----------------------------------------------------------------------------------------------------
Word2Vec:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  4.7min remaining: 13.0min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  4.8min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  6.9min finished


              precision    recall  f1-score   support

     Not Hot       0.74      0.58      0.65      4232
         Hot       0.81      0.90      0.85      8681

    accuracy                           0.79     12913
   macro avg       0.77      0.74      0.75     12913
weighted avg       0.79      0.79      0.79     12913



## Logistic Regression

In [123]:
log_reg = LogisticRegression(n_jobs=-1, solver="sag")
log_reg_grid = {"C":[0.5, 1, 2]}
log_reg_count, log_reg_tfidf, log_reg_word2vec = benchmark(log_reg, log_reg_grid)

Frequency:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    9.2s remaining:   25.5s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:    9.3s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   11.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


              precision    recall  f1-score   support

     Not Hot       0.80      0.70      0.75      4232
         Hot       0.86      0.92      0.89      8681

    accuracy                           0.85     12913
   macro avg       0.83      0.81      0.82     12913
weighted avg       0.84      0.85      0.84     12913

----------------------------------------------------------------------------------------------------
TfIdf:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    0.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:    1.0s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    1.4s finished


              precision    recall  f1-score   support

     Not Hot       0.81      0.71      0.76      4232
         Hot       0.87      0.92      0.89      8681

    accuracy                           0.85     12913
   macro avg       0.84      0.81      0.82     12913
weighted avg       0.85      0.85      0.85     12913

----------------------------------------------------------------------------------------------------
Word2Vec:
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:    2.2s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:    2.7s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.0s finished


              precision    recall  f1-score   support

     Not Hot       0.73      0.65      0.68      4232
         Hot       0.84      0.88      0.86      8681

    accuracy                           0.80     12913
   macro avg       0.78      0.76      0.77     12913
weighted avg       0.80      0.80      0.80     12913



## Naive Bayes

In [120]:
naive_bayes = MultinomialNB()

# no searchable parameters, set the grid with default value
# this simplifies to simple 5-fold CV
naive_bayes_grid = {"alpha":[1.0]}
naive_bayes_count, naive_bayes_tdidf, naive_bayes_word2vec = benchmark(naive_bayes, naive_bayes_grid)

Frequency:
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.7s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


              precision    recall  f1-score   support

     Not Hot       0.75      0.77      0.76      4232
         Hot       0.88      0.88      0.88      8681

    accuracy                           0.84     12913
   macro avg       0.82      0.82      0.82     12913
weighted avg       0.84      0.84      0.84     12913

----------------------------------------------------------------------------------------------------
TfIdf:
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.2s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


              precision    recall  f1-score   support

     Not Hot       0.91      0.45      0.60      4232
         Hot       0.78      0.98      0.87      8681

    accuracy                           0.80     12913
   macro avg       0.85      0.71      0.73     12913
weighted avg       0.82      0.80      0.78     12913

----------------------------------------------------------------------------------------------------
Word2Vec:
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.0s finished


ValueError: Negative values in data passed to MultinomialNB (input X)

## Support Vector Machine

In [126]:
svm = SVC(random_state=42)
svm_grid = {"C":[1]}
svm_count, svm_tfidf, svm_word2vec = benchmark(svm, svm_grid)

Frequency:
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  9.7min remaining: 14.5min


KeyboardInterrupt: 

## Random Forest (Regression)

In [127]:
def eval_regression(actual, predicted):
    """Infer the accuracy, precision, and recall based on regression results."""
    true_hot = 0
    true_not_hot = 0
    false_hot = 0
    false_not_hot = 0
    
    for m, n in np.nditer([actual, predicted]):
        if m >= 4:
            if n >= 4:
                true_hot += 1
            else:
                false_not_hot += 1
        else:
            if n >= 4:
                false_hot += 1
            else:
                true_not_hot += 1
    
    total = true_hot + true_not_hot + false_hot + false_not_hot
    return (true_hot + true_not_hot) / total, true_hot / (true_hot + false_hot), true_hot / (true_hot + false_not_hot)

In [129]:
rf_reg = RFR(n_jobs=-1, random_state=42)
rf_reg.fit(train_count, y_train)
rf_reg_count_pred = rf_reg.predict(test_count)

In [130]:
print("Count:")
print(eval_regression(y_test, rf_reg_count_pred))

Count:


ZeroDivisionError: division by zero

In [None]:
rf_reg.fit(train_tfidf, y_train)
rf_reg_tfidf_pred = rf_reg.predict(test_tfidf)

In [None]:
print("TfIdf:")
print(eval_regression(y_test, rf_reg_tfidf_pred))

In [None]:
rf_reg.fit(train_word2vec, y_train)
rf_reg_word2vec_pred = rf_reg.predict(test_word2vec)

In [None]:
print("Word2Vec:")
print(eval_regression(y_test, rf_reg_word2vec_pred))
print("-" * 100)

## Model Persistence

In [137]:
np.savez_compressed("Models/Encodings/train_count.npz", train_count)
np.savez_compressed("Models/Encodings/train_tfidf.npz", train_tfidf)
np.savez_compressed("Models/Encodings/train_word2vec.npz", train_word2vec)
np.savez_compressed("Models/Encodings/test_count.npz", test_count)
np.savez_compressed("Models/Encodings/test_tfidf.npz", test_tfidf)
np.savez_compressed("Models/Encodings/test_word2vec.npz", test_word2vec)

In [142]:
from joblib import dump, load

In [145]:
dump(rf_count, "Models/rf_count.joblib")
dump(rf_tfidf, "Models/rf_tfidf.joblib")
dump(rf_word2vec, "Models/rf_word2vec.joblib")
dump(xgb_count, "Models/xgb_count.joblib")
dump(xgb_tfidf, "Models/xgb_tfidf.joblib")
dump(xgb_word2vec, "Models/xgb_word2vec.joblib")
dump(log_reg_count, "Models/log_reg_count.joblib")
dump(log_reg_tfidf, "Models/log_reg_tfidf.joblib")
dump(log_reg_word2vec, "Models/log_reg_word2vec.joblib")

['Models/log_reg_word2vec.joblib']