In [3]:
# # from tabulate import tabulate
# %matplotlib inline
# # import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# import numpy as np
# import pickle
# from gensim.models.word2vec import Word2Vec
# from collections import Counter, defaultdict
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import Pipeline
# from spacy.lang.en import English


# # Classifiers
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC
# from sklearn.linear_model import SGDClassifier
# from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import ExtraTreesClassifier

# # Evaluation
# from sklearn import metrics

# Locations of the GloVe embedding text files that are parsed further down.
# NOTE(review): these are absolute paths on one Windows machine; they must be
# adjusted before the notebook can run anywhere else.
GLOVE_6B_50D_PATH = "C:\\Users\\Ukachi\\PycharmProjects\\paper_codes\\files\\glove_6B.txt"
GLOVE_27B_200D_PATH = "C:\\Users\\Ukachi\\PycharmProjects\\paper_codes\\files\\glove.twitter.27B.200d.txt"
# Encoding used to decode the word column when the GloVe files are read as bytes.
encoding="utf-8"

In [11]:
import numpy as np
import pickle

In [None]:
# !pip install gensim

In [10]:
def load_file(filepath):
    """
    Read a pickled object back from disk.

    Parameters
    ----------
    filepath: str
        Path of the pickle file to read.

    Returns
    -------
    The object that was stored in the file, whatever its type.
    """
    # NOTE(review): unpickling can execute arbitrary code, so this should only
    # ever be pointed at files produced by this project.
    with open(filepath, 'rb') as handle:
        return pickle.load(handle)
    
def store_data(filepath, data):
    """
    Serialize an object to disk with pickle and print a confirmation.

    Parameters
    ----------
    filepath: str The path where data is stored
    data: The data being stored (any picklable object)

    Returns
    -------
    None
    """
    # The context manager guarantees the handle is flushed and closed, even if
    # pickling raises; a bare open() passed straight to pickle.dump is only
    # closed whenever the interpreter happens to collect it.
    with open(filepath, "wb") as f:
        pickle.dump(data, f)
    print("Data stored successfully")
    


def split_data(data, label, percentage, random_state=None):
    """
    Split features and targets into a training part and a test part.

    Args:
        data: data
        label: target
        percentage: test size, forwarded as ``test_size``
        random_state: optional seed forwarded to ``train_test_split`` so the
            split can be reproduced; the default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        X_train, X_test, y_train, y_test

    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=percentage, random_state=random_state
    )
    return X_train, X_test, y_train, y_test




In [None]:
def arrange_data():
    """
    Placeholder that was left without a body.

    A ``def`` with no body is a SyntaxError, so the stub raises explicitly
    until the intended logic is written.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("arrange_data has not been implemented yet")

In [None]:
# Load the pickled IMDB data and pull out the review texts and their labels.
# NOTE(review): assumes the pickle holds an object with `.review` and
# `.sentiment` attributes (e.g. a DataFrame) - confirm against the file.
imdb_data = load_file("C:\\Users\\Ukachi\\PycharmProjects\\paper_codes\\files\\imdb_data.pkl")
X = imdb_data.review
y = imdb_data.sentiment


# spaCy English language object; only the text of each token is used below.
nlp = English()

# Tokenize every review exactly once. Running the tokenizer over the whole
# corpus a second time just to build the vocabulary doubles the slowest step.
X_data = [[token.text for token in nlp(review)] for review in X]

# The flat token stream and the vocabulary are derived from the lists above.
token_list = [token for tokens in X_data for token in tokens]
all_words = set(token_list)
print(len(all_words))

store_data("X_data.pkl", X_data)


In [None]:
# Pickle the vocabulary set to all_words.pkl in the working directory.
store_data("all_words.pkl", all_words)

In [None]:
# Number of tokenized reviews.
len(X_data)

In [None]:
# Reviews have different token counts, so they are stored as a 1-D object
# array with one token list per slot. Letting np.array() infer the layout from
# ragged nested lists raises a ValueError on current NumPy releases.
data = np.empty(len(X_data), dtype=object)
for i, tokens in enumerate(X_data):
    data[i] = tokens
target = np.array(y)


# Splitting the data set
train_data, test_data, train_label, test_label = split_data(data, target, 0.2)

results = {}

In [None]:
# Number of samples in the held-out test split.
len(test_data)

In [None]:
# Word2Vec is fitted on every tokenized review, training and test alike.
# Only the texts are involved, never the test labels, so this is acceptable.
# NOTE(review): `size=`, `wv.index2word` and `wv.syn0` are the pre-4.0 gensim
# names - confirm the installed gensim version still provides them.
model = Word2Vec(X_data, size=100, window=5, min_count=5, workers=2)

# Map each vocabulary word to its learned vector.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

store_data("w2v.pkl", w2v)

In [8]:
# reading the GLOVE files
import struct


def _read_glove_vectors(path, text_encoding):
    """Parse a GloVe text file into a ``{word: float32 vector}`` dictionary.

    Each non-empty line is split on whitespace; the first field is the word
    and the remaining fields are the vector components.

    Parameters
    ----------
    path: str
        Location of the GloVe text file.
    text_encoding: str
        Encoding used to decode the word field.

    Returns
    -------
    dict mapping str to a 1-D ``np.float32`` array.
    """
    vectors = {}
    # The file is read as bytes and only the word field is decoded.
    with open(path, "rb") as infile:
        for line in infile:
            parts = line.split()
            if not parts:
                # A blank line has no word field; skip it rather than crash.
                continue
            vectors[parts[0].decode(text_encoding)] = np.array(parts[1:], dtype=np.float32)
    return vectors


glove_small = _read_glove_vectors(GLOVE_6B_50D_PATH, encoding)
glove_big = _read_glove_vectors(GLOVE_27B_200D_PATH, encoding)
            


In [12]:
# Report how many words each GloVe dictionary contains.
print(len(glove_small))
print(len(glove_big))

# Pickle both dictionaries to the working directory.
store_data("glove_small.pkl", glove_small)
store_data("glove_big.pkl", glove_big)

400000
1193514
Data stored successfully
Data stored successfully


In [None]:
# NOTE(review): this displays the entire glove_small dictionary; consider
# previewing only a handful of entries instead.
glove_small

In [None]:
# All glove_small vectors as a list, in the dictionary's iteration order.
list_glove_small = list(glove_small.values())

In [None]:
# Peek at the first four vectors.
list_glove_small[0: 4]

In [None]:
# Number of words in the word2vec dictionary.
len(w2v)

In [None]:
# Rebind X and y to the array versions of the tokenized reviews and labels.
# NOTE(review): X and y previously referred to the raw review texts and
# sentiment column, so re-running earlier cells after this one changes meaning.
X = data
y = target

In [None]:
# Inspect the third entry of X.
X[2]

In [None]:
# The classifiers built on count and TF-IDF vectorizers were already evaluated,
# so this notebook moves straight on to the embedding-based classifiers.

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenized document by the mean of its word vectors.

    Parameters
    ----------
    word2vec: mapping of token -> 1-D numeric vector
        All vectors are expected to share one length.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Read the dimensionality from this mapping's own vectors. Looking a
        # key of the global `glove_small` up in `word2vec` only works when the
        # two vocabularies happen to share that key, and needs the global.
        if len(word2vec) > 0:
            self.dim = len(next(iter(word2vec.values())))
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the object fits a Pipeline."""
        return self

    def transform(self, X):
        """Return one row per document: the mean vector of its known tokens.

        Tokens missing from ``word2vec`` are ignored; a document with no known
        token becomes a zero vector of length ``dim``.
        """
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    """Represent each tokenized document by an IDF-weighted mean of word vectors.

    Parameters
    ----------
    word2vec: mapping of token -> 1-D numeric vector
        All vectors are expected to share one length.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Filled by fit(): token -> IDF weight.
        self.word2weight = None
        # Read the dimensionality from this mapping's own vectors. Looking a
        # key of the global `glove_small` up in `word2vec` only works when the
        # two vocabularies happen to share that key, and needs the global.
        if len(word2vec) > 0:
            self.dim = len(next(iter(word2vec.values())))
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Learn one IDF weight per token from the tokenized documents in ``X``."""
        # The documents are already token lists, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one row per document: the mean of ``vector * weight`` over known tokens.

        Tokens missing from ``word2vec`` are ignored; a document with no known
        token becomes a zero vector of length ``dim``. The sum is divided by
        the number of known tokens, not by the sum of the weights.
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [None]:
# Number of words in the larger GloVe dictionary.
len(glove_big)

In [None]:
# Extra-trees pipelines: one per embedding dictionary (glove_small, glove_big,
# w2v), each with a plain-mean and a TF-IDF-weighted vectorizer in front.
def _new_extra_trees():
    """Return a fresh, unfitted ExtraTreesClassifier with 200 estimators."""
    return ExtraTreesClassifier(n_estimators=200)


etree_glove_small = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_small)),
    ("extra trees", _new_extra_trees()),
])
etree_glove_small_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)),
    ("extra trees", _new_extra_trees()),
])
etree_glove_big = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_big)),
    ("extra trees", _new_extra_trees()),
])
etree_glove_big_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_big)),
    ("extra trees", _new_extra_trees()),
])
etree_w2v = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", _new_extra_trees()),
])
etree_w2v_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("extra trees", _new_extra_trees()),
])

In [None]:
# (result key, pipeline) pairs evaluated by the loop further down.
# NOTE(review): the last key is "etree_tfidf_w2v", while every other key
# mirrors its variable name (here etree_w2v_tfidf) - confirm this is intended.
all_etree_models = [
    ("etree_glove_small", etree_glove_small),
    ("etree_glove_small_tfidf", etree_glove_small_tfidf),
    ("etree_glove_big", etree_glove_big),
    ("etree_glove_big_tfidf", etree_glove_big_tfidf),
    ("etree_w2v", etree_w2v),
    ("etree_tfidf_w2v", etree_w2v_tfidf)
]

# Collects the F1 score of each extra-trees pipeline, keyed by name.
results = {}

In [None]:
def use_classifier(pipeline, classifier_name,
                   X_train=train_data, y_train=train_label,
                   X_test=test_data, y_test=test_label):
    """
    Fit a pipeline, print its test-set report and return its F1 score.

    Args:
        pipeline: object exposing ``fit(X, y)`` and ``predict(X)``
        classifier_name: label used in the printed messages
        X_train: training inputs
        y_train: training targets
        X_test: evaluation inputs
        y_test: evaluation targets

        The four data defaults are bound to the notebook's train/test split as
        it exists when this function is defined, so a later re-split needs
        this cell to be re-run.

    Returns:
        The value of ``metrics.f1_score(y_test, predictions)``.

    """
    print(f"Working on {classifier_name}")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    report = metrics.classification_report(y_test, y_pred, target_names=["Negative", "Positive"])
    print(report)

    score = metrics.f1_score(y_test, y_pred)
    print(f"F1-score for {classifier_name} = {score}")

    return score

In [None]:
# Fit and evaluate every extra-trees pipeline, recording its F1 score by name.
for name, pipeline in all_etree_models:
    results[name] = use_classifier(pipeline, name)

In [None]:
# etree_2 is another name for the same dict object as `results`, not a copy.
etree_2 = results

print(etree_2)



In [None]:
# F1 results per classifier family for the IMDB data set.
# The original cell had no `=`, no commas and no values, so it could not be
# parsed. Only the extra-trees results exist at this point in the notebook;
# the other entries are None placeholders.
# TODO: fill them in once the corresponding result dicts have been computed.
all_results = {
    "IMDB_etree": etree_2,
    "IMDB_lr": None,
    "IMDB_SVM": None,
    "IMDB_NB": None,
    "IMDB_DT": None,
    "IMDB_RF": None,
}

In [None]:
# Pickle the extra-trees scores.
# NOTE(review): unlike the other result files, this name has no ".pkl" suffix.
store_data("etree_results", etree_2)

In [None]:
# NOTE(review): `etree_1` is not assigned anywhere in the visible notebook
# (only `etree_2` is); on a fresh kernel this cell raises NameError.
etree_1

In [None]:
# Logistic-regression pipelines: one per embedding dictionary (glove_small,
# glove_big, w2v), each with a plain-mean and a TF-IDF-weighted vectorizer.
def _new_logreg():
    """Return a fresh, unfitted LogisticRegression with this notebook's settings."""
    return LogisticRegression(max_iter=10000, tol=0.1, solver="lbfgs")


lr_glove_small = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_small)),
    ("logistic regression", _new_logreg()),
])
lr_glove_small_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)),
    ("logistic regression", _new_logreg()),
])
lr_glove_big = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_big)),
    ("logistic regression", _new_logreg()),
])
lr_glove_big_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_big)),
    ("logistic regression", _new_logreg()),
])
lr_w2v = Pipeline([
    ("w2v vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("logistic regression", _new_logreg()),
])
lr_w2v_tfidf = Pipeline([
    ("w2v vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("logistic regression", _new_logreg()),
])

In [None]:
# (result key, pipeline) pairs for the logistic-regression experiments.
all_lr_models = [
    ("lr_glove_small", lr_glove_small),
    ("lr_glove_small_tfidf", lr_glove_small_tfidf),
    ("lr_glove_big", lr_glove_big),
    ("lr_glove_big_tfidf", lr_glove_big_tfidf),
    ("lr_w2v", lr_w2v),
    ("lr_w2v_tfidf", lr_w2v_tfidf)
]

# Fit and evaluate each pipeline, recording its F1 score by name.
lr_results = {}
for name, pipeline in all_lr_models:
    lr_results[name] = use_classifier(pipeline, name)

In [None]:
# Pickle the logistic-regression scores.
# NOTE(review): the file is named "lr_w2v_results.pkl" although lr_results
# also holds the GloVe-based pipelines.
store_data("lr_w2v_results.pkl", lr_results)

In [None]:
# Show the F1 score recorded for each logistic-regression pipeline.
print(lr_results)

In [None]:
# Linear SVM pipelines, built, evaluated and stored once per penalty.
# After the loop the svm_* names refer to the pipelines of the last penalty.
for penalty in ["l2", "l1"]:
    svm_step = f"SVM_{penalty}"

    svm_glove_small = Pipeline([
        ("glove vectorizer", MeanEmbeddingVectorizer(glove_small)),
        (svm_step, LinearSVC(penalty=penalty, tol=1e-3, dual=False)),
    ])
    svm_glove_small_tfidf = Pipeline([
        ("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)),
        (svm_step, LinearSVC(penalty=penalty, tol=1e-3, dual=False)),
    ])
    svm_glove_big = Pipeline([
        ("glove vectorizer", MeanEmbeddingVectorizer(glove_big)),
        (svm_step, LinearSVC(penalty=penalty, tol=1e-3, dual=False)),
    ])
    svm_glove_big_tfidf = Pipeline([
        ("glove vectorizer", TfidfEmbeddingVectorizer(glove_big)),
        (svm_step, LinearSVC(penalty=penalty, tol=1e-3, dual=False)),
    ])
    svm_w2v = Pipeline([
        ("glove vectorizer", MeanEmbeddingVectorizer(w2v)),
        (svm_step, LinearSVC(penalty=penalty, tol=1e-3, dual=False)),
    ])
    svm_w2v_tfidf = Pipeline([
        ("glove vectorizer", TfidfEmbeddingVectorizer(w2v)),
        (svm_step, LinearSVC(penalty=penalty, tol=1e-3, dual=False)),
    ])

    # Result keys carry the penalty so the two runs stay distinguishable.
    all_svm_models = [
        (f"svm_glove_small_{penalty}", svm_glove_small),
        (f"svm_glove_small_tfidf_{penalty}", svm_glove_small_tfidf),
        (f"svm_glove_big_{penalty}", svm_glove_big),
        (f"svm_glove_big_tfidf_{penalty}", svm_glove_big_tfidf),
        (f"svm_w2v_{penalty}", svm_w2v),
        (f"svm_w2v_tfidf_{penalty}", svm_w2v_tfidf),
    ]

    # A fresh dict per penalty; each one is pickled to its own file.
    svm_results = {}
    for name, pipeline in all_svm_models:
        svm_results[name] = use_classifier(pipeline, name)

    store_data(f"svm_results_{penalty}.pkl", svm_results)

In [None]:
# Re-evaluate the svm_* pipelines under penalty-free result keys.
# NOTE(review): when the penalty loop cell ran just before this one, these
# names hold the pipelines of its last iteration ("l1"), so this refits those
# same models - confirm the repetition is intended.
all_svm_models = [
    ("svm_glove_small", svm_glove_small),
    ("svm_glove_small_tfidf", svm_glove_small_tfidf),
    ("svm_glove_big", svm_glove_big),
    ("svm_glove_big_tfidf", svm_glove_big_tfidf),
    ("svm_w2v", svm_w2v),
    ("svm_w2v_tfidf", svm_w2v_tfidf)
]

# Fit and evaluate each pipeline, recording its F1 score by name.
svm_results = {}
for name, pipeline in all_svm_models:
    svm_results[name] = use_classifier(pipeline, name)
    
store_data("svm_results.pkl", svm_results)

In [19]:
# SGD-trained linear classifiers, built, evaluated and stored once per penalty.
# NOTE(review): the classifier step is labelled "SVM_<penalty>" although the
# estimator is SGDClassifier; the label is kept unchanged.
for penalty in ["l2", "l1"]:
    sgd_step = f"SVM_{penalty}"

    sgd_glove_small = Pipeline([
        ("glove vectorizer", MeanEmbeddingVectorizer(glove_small)),
        (sgd_step, SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)),
    ])
    sgd_glove_small_tfidf = Pipeline([
        ("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)),
        (sgd_step, SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)),
    ])
    sgd_glove_big = Pipeline([
        ("glove vectorizer", MeanEmbeddingVectorizer(glove_big)),
        (sgd_step, SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)),
    ])
    sgd_glove_big_tfidf = Pipeline([
        ("glove vectorizer", TfidfEmbeddingVectorizer(glove_big)),
        (sgd_step, SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)),
    ])
    sgd_w2v = Pipeline([
        ("glove vectorizer", MeanEmbeddingVectorizer(w2v)),
        (sgd_step, SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)),
    ])
    sgd_w2v_tfidf = Pipeline([
        ("glove vectorizer", TfidfEmbeddingVectorizer(w2v)),
        (sgd_step, SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)),
    ])

    # Result keys carry the penalty so the two runs stay distinguishable.
    all_sgd_models = [
        (f"sgd_glove_small_{penalty}", sgd_glove_small),
        (f"sgd_glove_small_tfidf_{penalty}", sgd_glove_small_tfidf),
        (f"sgd_glove_big_{penalty}", sgd_glove_big),
        (f"sgd_glove_big_tfidf_{penalty}", sgd_glove_big_tfidf),
        (f"sgd_w2v_{penalty}", sgd_w2v),
        (f"sgd_w2v_tfidf_{penalty}", sgd_w2v_tfidf),
    ]

    # A fresh dict per penalty; each one is pickled to its own file.
    sgd_results = {}
    for name, pipeline in all_sgd_models:
        sgd_results[name] = use_classifier(pipeline, name)

    store_data(f"sgd_results_{penalty}.pkl", sgd_results)

Working on sgd_glove_small_l2




              precision    recall  f1-score   support

    Negative       0.75      0.77      0.76      4939
    Positive       0.77      0.75      0.76      5061

   micro avg       0.76      0.76      0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000

F1-score for sgd_glove_small_l2 = 0.7579835308294838
Working on sgd_glove_small_tfidf_l2




              precision    recall  f1-score   support

    Negative       0.68      0.85      0.76      4939
    Positive       0.81      0.60      0.69      5061

   micro avg       0.73      0.73      0.73     10000
   macro avg       0.74      0.73      0.72     10000
weighted avg       0.74      0.73      0.72     10000

F1-score for sgd_glove_small_tfidf_l2 = 0.6903737259343148
Working on sgd_glove_big_l2




              precision    recall  f1-score   support

    Negative       0.79      0.87      0.83      4939
    Positive       0.86      0.78      0.82      5061

   micro avg       0.83      0.83      0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

F1-score for sgd_glove_big_l2 = 0.8194949599916866
Working on sgd_glove_big_tfidf_l2




              precision    recall  f1-score   support

    Negative       0.88      0.68      0.77      4939
    Positive       0.75      0.91      0.82      5061

   micro avg       0.80      0.80      0.80     10000
   macro avg       0.81      0.80      0.79     10000
weighted avg       0.81      0.80      0.79     10000

F1-score for sgd_glove_big_tfidf_l2 = 0.8194679241925439
Working on sgd_w2v_l2




              precision    recall  f1-score   support

    Negative       0.86      0.85      0.85      4939
    Positive       0.85      0.87      0.86      5061

   micro avg       0.86      0.86      0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

F1-score for sgd_w2v_l2 = 0.8588223768002351
Working on sgd_w2v_tfidf_l2




              precision    recall  f1-score   support

    Negative       0.86      0.85      0.86      4939
    Positive       0.85      0.87      0.86      5061

   micro avg       0.86      0.86      0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

F1-score for sgd_w2v_tfidf_l2 = 0.8614180929095355
Data stored successfully
Working on sgd_glove_small_l1




              precision    recall  f1-score   support

    Negative       0.72      0.82      0.77      4939
    Positive       0.80      0.69      0.74      5061

   micro avg       0.76      0.76      0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000

F1-score for sgd_glove_small_l1 = 0.7425491439441979
Working on sgd_glove_small_tfidf_l1




              precision    recall  f1-score   support

    Negative       0.69      0.84      0.75      4939
    Positive       0.80      0.62      0.70      5061

   micro avg       0.73      0.73      0.73     10000
   macro avg       0.74      0.73      0.73     10000
weighted avg       0.74      0.73      0.73     10000

F1-score for sgd_glove_small_tfidf_l1 = 0.7009096960284004
Working on sgd_glove_big_l1




              precision    recall  f1-score   support

    Negative       0.85      0.80      0.83      4939
    Positive       0.82      0.86      0.84      5061

   micro avg       0.83      0.83      0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

F1-score for sgd_glove_big_l1 = 0.8383118134515322
Working on sgd_glove_big_tfidf_l1




              precision    recall  f1-score   support

    Negative       0.92      0.57      0.70      4939
    Positive       0.69      0.95      0.80      5061

   micro avg       0.76      0.76      0.76     10000
   macro avg       0.80      0.76      0.75     10000
weighted avg       0.80      0.76      0.75     10000

F1-score for sgd_glove_big_tfidf_l1 = 0.8018040591330494
Working on sgd_w2v_l1




              precision    recall  f1-score   support

    Negative       0.89      0.81      0.84      4939
    Positive       0.83      0.90      0.86      5061

   micro avg       0.85      0.85      0.85     10000
   macro avg       0.86      0.85      0.85     10000
weighted avg       0.86      0.85      0.85     10000

F1-score for sgd_w2v_l1 = 0.8608218140503693
Working on sgd_w2v_tfidf_l1




              precision    recall  f1-score   support

    Negative       0.81      0.90      0.85      4939
    Positive       0.89      0.80      0.84      5061

   micro avg       0.85      0.85      0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

F1-score for sgd_w2v_tfidf_l1 = 0.8424122487240914
Data stored successfully


In [21]:
# Bernoulli naive Bayes pipelines, one per embedding dictionary and vectorizer.
# NOTE(review): the classifier step and the output file are labelled "mnb"
# although the estimator is BernoulliNB; both labels are kept unchanged.
def _new_bernoulli_nb():
    """Return a fresh, unfitted BernoulliNB with alpha=0.01."""
    return BernoulliNB(alpha=0.01)


bnb_glove_small = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_small)),
    ("mnb", _new_bernoulli_nb()),
])
bnb_glove_small_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)),
    ("mnb", _new_bernoulli_nb()),
])
bnb_glove_big = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_big)),
    ("mnb", _new_bernoulli_nb()),
])
bnb_glove_big_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_big)),
    ("mnb", _new_bernoulli_nb()),
])
bnb_w2v = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("mnb", _new_bernoulli_nb()),
])
bnb_w2v_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("mnb", _new_bernoulli_nb()),
])

# (result key, pipeline) pairs in evaluation order.
all_bnb_models = [
    ("bnb_glove_small", bnb_glove_small),
    ("bnb_glove_small_tfidf", bnb_glove_small_tfidf),
    ("bnb_glove_big", bnb_glove_big),
    ("bnb_glove_big_tfidf", bnb_glove_big_tfidf),
    ("bnb_w2v", bnb_w2v),
    ("bnb_w2v_tfidf", bnb_w2v_tfidf),
]

# Fit and evaluate each pipeline, recording its F1 score by name.
bnb_results = {}
for name, pipeline in all_bnb_models:
    bnb_results[name] = use_classifier(pipeline, name)

store_data("mnb_results.pkl", bnb_results)

Working on bnb_glove_small
              precision    recall  f1-score   support

    Negative       0.61      0.69      0.65      4939
    Positive       0.66      0.58      0.61      5061

   micro avg       0.63      0.63      0.63     10000
   macro avg       0.63      0.63      0.63     10000
weighted avg       0.63      0.63      0.63     10000

F1-score for bnb_glove_small = 0.6136387517074707
Working on bnb_glove_small_tfidf
              precision    recall  f1-score   support

    Negative       0.66      0.69      0.67      4939
    Positive       0.68      0.65      0.66      5061

   micro avg       0.67      0.67      0.67     10000
   macro avg       0.67      0.67      0.67     10000
weighted avg       0.67      0.67      0.67     10000

F1-score for bnb_glove_small_tfidf = 0.6628768652928637
Working on bnb_glove_big
              precision    recall  f1-score   support

    Negative       0.70      0.75      0.73      4939
    Positive       0.74      0.69      0.71   

In [22]:
# Decision-tree pipelines, one per embedding dictionary and vectorizer.
def _new_decision_tree():
    """Return a fresh, unfitted DecisionTreeClassifier with default settings."""
    return DecisionTreeClassifier()


dt_glove_small = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_small)),
    ("dt", _new_decision_tree()),
])
dt_glove_small_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)),
    ("dt", _new_decision_tree()),
])
dt_glove_big = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_big)),
    ("dt", _new_decision_tree()),
])
dt_glove_big_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_big)),
    ("dt", _new_decision_tree()),
])
dt_w2v = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("dt", _new_decision_tree()),
])
dt_w2v_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("dt", _new_decision_tree()),
])

# (result key, pipeline) pairs in evaluation order.
all_dt_models = [
    ("dt_glove_small", dt_glove_small),
    ("dt_glove_small_tfidf", dt_glove_small_tfidf),
    ("dt_glove_big", dt_glove_big),
    ("dt_glove_big_tfidf", dt_glove_big_tfidf),
    ("dt_w2v", dt_w2v),
    ("dt_w2v_tfidf", dt_w2v_tfidf),
]

# Fit and evaluate each pipeline, recording its F1 score by name.
dt_results = {}
for name, pipeline in all_dt_models:
    dt_results[name] = use_classifier(pipeline, name)

store_data("dt_results.pkl", dt_results)

Working on dt_glove_small
              precision    recall  f1-score   support

    Negative       0.63      0.65      0.64      4939
    Positive       0.65      0.64      0.64      5061

   micro avg       0.64      0.64      0.64     10000
   macro avg       0.64      0.64      0.64     10000
weighted avg       0.64      0.64      0.64     10000

F1-score for dt_glove_small = 0.6421798582692884
Working on dt_glove_small_tfidf
              precision    recall  f1-score   support

    Negative       0.63      0.64      0.63      4939
    Positive       0.64      0.64      0.64      5061

   micro avg       0.64      0.64      0.64     10000
   macro avg       0.64      0.64      0.64     10000
weighted avg       0.64      0.64      0.64     10000

F1-score for dt_glove_small_tfidf = 0.641434657683543
Working on dt_glove_big
              precision    recall  f1-score   support

    Negative       0.65      0.67      0.66      4939
    Positive       0.67      0.65      0.66      506

In [24]:
# Random-forest pipelines, one per embedding dictionary and vectorizer.
def _new_random_forest():
    """Return a fresh, unfitted RandomForestClassifier with 200 estimators."""
    return RandomForestClassifier(n_estimators=200)


rf_glove_small = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_small)),
    ("rf", _new_random_forest()),
])
rf_glove_small_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)),
    ("rf", _new_random_forest()),
])
rf_glove_big = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(glove_big)),
    ("rf", _new_random_forest()),
])
rf_glove_big_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(glove_big)),
    ("rf", _new_random_forest()),
])
rf_w2v = Pipeline([
    ("glove vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("rf", _new_random_forest()),
])
rf_w2v_tfidf = Pipeline([
    ("glove vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("rf", _new_random_forest()),
])

# (result key, pipeline) pairs in evaluation order.
all_rf_models = [
    ("rf_glove_small", rf_glove_small),
    ("rf_glove_small_tfidf", rf_glove_small_tfidf),
    ("rf_glove_big", rf_glove_big),
    ("rf_glove_big_tfidf", rf_glove_big_tfidf),
    ("rf_w2v", rf_w2v),
    ("rf_w2v_tfidf", rf_w2v_tfidf),
]

# Fit and evaluate each pipeline, recording its F1 score by name.
rf_results = {}
for name, pipeline in all_rf_models:
    rf_results[name] = use_classifier(pipeline, name)

store_data("rf_results.pkl", rf_results)

Working on rf_glove_small
              precision    recall  f1-score   support

    Negative       0.74      0.75      0.74      4939
    Positive       0.75      0.75      0.75      5061

   micro avg       0.75      0.75      0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000

F1-score for rf_glove_small = 0.7487606583382908
Working on rf_glove_small_tfidf
              precision    recall  f1-score   support

    Negative       0.73      0.74      0.74      4939
    Positive       0.74      0.73      0.74      5061

   micro avg       0.74      0.74      0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000

F1-score for rf_glove_small_tfidf = 0.7387978142076502
Working on rf_glove_big
              precision    recall  f1-score   support

    Negative       0.79      0.79      0.79      4939
    Positive       0.80      0.79      0.80      50