# Word Embedding Classifier Pipeline

In [20]:
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import sklearn.neural_network
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from collections import OrderedDict

# Custom functions 
from utils import print_gridsearch_results, test_on_estimator, plot_cv_single_param

RANDOM_STATE = 123

## Data prep

In [2]:
data_dir = 'data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

print("Shape of data\n---------------")
print(f"x_train_df shape: {x_train_df.shape} ")
print(f"y_train_df shape: {y_train_df.shape} ")

# Get the text as a list of strings
x_train_text = x_train_df['text'].values
y_train = y_train_df['is_positive_sentiment'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Dataset Exploration

In [3]:
num_train_samples = x_train_text.shape[0]
num_positive_train_samples = np.count_nonzero(y_train == 1)
fraction_positive_train = float(num_positive_train_samples) / float(num_train_samples)


print(f"Total number of training samples = {num_train_samples}")
print(f"Fraction positive training samples = {fraction_positive_train}")


Total number of training samples = 2400
Fraction positive training samples = 0.5


## Word Embedding

In [4]:
zip_file_path = os.path.join(
    'pretrained_embedding_vectors/',
    'glove.6B.50d.txt.zip')

word_embeddings = pd.read_csv(
    zip_file_path,
    header=None, sep=' ', index_col=0,
    nrows=100000, compression='zip', encoding='utf-8', quoting=3)

# Build a dict that will map from string word to 50-dim vector
word_list = word_embeddings.index.values.tolist()
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

In [5]:
import re

x_train_embeddings = np.zeros((num_train_samples, 50))

unique_vocab_words = set()

for ind in range(num_train_samples):
    sample = x_train_text[ind]
    stripped_sample = re.sub(r'[^\w\s]', '', sample).lower()
    sample_embedding = []
    #print(f"Current sample = {stripped_sample}")
    for word in stripped_sample.split(' '):
        if word in word2vec.keys() and word not in text.ENGLISH_STOP_WORDS:
            sample_embedding.append(word2vec[word])
            unique_vocab_words.add(word)
            
    if len(sample_embedding) == 0:
        sample_embedding = [0] * 50
        
    sample_embedding = np.array(sample_embedding)
    avg_sample_embedding = np.nanmean(sample_embedding, axis=0)
    x_train_embeddings[ind] = avg_sample_embedding

In [6]:
# Size of vocab set
len(unique_vocab_words)

3902

## Cross validation 

In [7]:
cv_splitter = sklearn.model_selection.StratifiedKFold(n_splits=5)
#cv_splitter.get_n_splits(X, y)

## Hyperparameter searcher: Random Forest classifier

In [8]:
random_forest_classifier = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=RANDOM_STATE)

In [9]:
# Random Forest hyperparameter grid
random_forest_parameter_grid = dict(
    n_estimators = [20, 50, 100],
    max_depth = [5, 10],
    #min_samples_leaf = [1, 3, 5],
    #random_state=[101, 202],  # try two possible seeds to initialize parameters
    random_state=[100],
    )

In [10]:
random_forest_grid_searcher = sklearn.model_selection.GridSearchCV(
    random_forest_classifier,
    random_forest_parameter_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    refit=False)

### Fit on Word Embeddings

In [11]:
random_forest_grid_searcher.fit(x_train_embeddings, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(max_depth=5, random_state=123),
             param_grid={'max_depth': [5, 10], 'n_estimators': [20, 50, 100],
                         'random_state': [100]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [12]:
random_forest_grid_search_results_df = pd.DataFrame(random_forest_grid_searcher.cv_results_).copy()
n_trials_grid_search = random_forest_grid_search_results_df.shape[0]

In [13]:
param_keys = ['param_n_estimators', 'param_max_depth']

# Rearrange row order so it is easy to skim
random_forest_grid_search_results_df.sort_values(param_keys, inplace=True)
random_forest_grid_search_results_df[param_keys + ['mean_train_score', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_n_estimators,param_max_depth,mean_train_score,mean_test_score,rank_test_score
0,20,5,0.840937,0.72875,5
3,20,10,0.980625,0.726667,6
1,50,5,0.849167,0.73375,4
4,50,10,0.987083,0.74625,2
2,100,5,0.848229,0.735,3
5,100,10,0.989062,0.75,1


In [14]:
best_random_forest = random_forest_classifier.set_params(**random_forest_grid_searcher.best_params_)
best_random_forest.fit(x_train_embeddings, y_train)

RandomForestClassifier(max_depth=10, random_state=100)

## Hyperparameter searcher: Gradient Boosted Tree classifier

In [78]:
# Gradient Boosted Tree classifier with default values
gbtree_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=5, min_samples_leaf=1, random_state=100)

In [79]:
# Gradient Boosted Tree hyperparameter grid
gbtree_parameter_grid = dict(
    n_estimators = [20, 50, 100],
    max_depth = [5, 10],
    #min_samples_leaf = [1, 3, 5],
    #random_state=[101, 202],  # try two possible seeds to initialize parameters
    random_state=[100],
    )

In [80]:
gbtree_grid_searcher = sklearn.model_selection.GridSearchCV(
    gbtree_classifier,
    gbtree_parameter_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    refit=False)

In [81]:
gbtree_grid_searcher.fit(x_train_embeddings, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(max_depth=5,
                                                  random_state=100),
             param_grid={'max_depth': [5, 10], 'n_estimators': [20, 50, 100],
                         'random_state': [100]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [82]:
gbtree_grid_search_results_df = pd.DataFrame(gbtree_grid_searcher.cv_results_).copy()
n_trials_grid_search = gbtree_grid_search_results_df.shape[0]

In [83]:
param_keys = ['param_n_estimators', 'param_max_depth']

# Rearrange row order so it is easy to skim
gbtree_grid_search_results_df.sort_values(param_keys, inplace=True)

In [84]:
gbtree_grid_search_results_df[param_keys + ['mean_train_score', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_n_estimators,param_max_depth,mean_train_score,mean_test_score,rank_test_score
0,20,5,0.909583,0.727083,4
3,20,10,0.996354,0.7075,6
1,50,5,0.973958,0.743333,2
4,50,10,0.996354,0.720417,5
2,100,5,0.995729,0.744583,1
5,100,10,0.996354,0.737917,3


In [85]:
best_gbtree = gbtree_classifier.set_params(**gbtree_grid_searcher.best_params_)
best_gbtree.fit(x_train_embeddings, y_train)

GradientBoostingClassifier(max_depth=5, random_state=100)

## Figures

In [None]:
fig, loss_ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), sharex=True, sharey=True)

for run_id in range(n_runs):
    tr_label = 'train log loss' if run_id == 0 else ''
    te_label = 'test log loss' if run_id == 0 else ''
    
    loss_ax.plot(np.log2(size_list), tr_loss_arr[:,run_id], 'bd', label=tr_label)
    loss_ax.plot(np.log2(size_list), te_loss_arr[:,run_id], 'rd', label=te_label)

loss_ax.set_xticks(np.log2(size_list));
loss_ax.set_xticklabels(size_list);
loss_ax.xaxis.grid(False);

loss_ax.set_ylim([0, 0.8]); # Don't touch this please
loss_ax.set_yticks(np.arange(0, 0.8, 0.1));
loss_ax.set_title("Log Loss vs Size")
loss_ax.set_ylabel('log loss');
loss_ax.set_xlabel('size');
loss_ax.legend(loc='upper right');

## Hyperparameter Search: L1-Logistic Regression

In [26]:
lasso = sklearn.linear_model.LogisticRegression(
    penalty='l1', solver='saga', random_state=101)

In [34]:
lasso_hyperparameter_grid_by_name = dict(
    C=np.logspace(0, 8, 9),
    max_iter=[20, 30, 40], # sneaky way to do "early stopping" 
                       # we'll take either iter 20 or iter 40 in training process, by best valid performance
    )

In [35]:
lasso_searcher = sklearn.model_selection.GridSearchCV(
    lasso,
    lasso_hyperparameter_grid_by_name,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    refit=False)

In [36]:
lasso_searcher.fit(x_train_embeddings, y_train)









GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(C=10.0, max_iter=20, penalty='l1',
                                          random_state=101, solver='saga'),
             param_grid={'C': array([1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07,
       1.e+08]),
                         'max_iter': [20, 30, 40]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [37]:
lasso_search_results_df = pd.DataFrame(lasso_searcher.cv_results_).copy()

In [38]:
param_keys = ['param_C', 'param_max_iter']

# Rearrange row order so it is easy to skim
lasso_search_results_df.sort_values(param_keys, inplace=True)

In [39]:
lasso_search_results_df[param_keys + ['mean_train_score', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_C,param_max_iter,mean_train_score,mean_test_score,rank_test_score
0,1.0,20,0.770729,0.755833,26
1,1.0,30,0.770625,0.755833,26
2,1.0,40,0.770833,0.756667,8
3,10.0,20,0.769063,0.757083,1
4,10.0,30,0.769688,0.756667,8
5,10.0,40,0.770104,0.756667,8
6,100.0,20,0.769167,0.756667,8
7,100.0,30,0.769479,0.756667,8
8,100.0,40,0.769792,0.756667,8
9,1000.0,20,0.769167,0.757083,1


In [33]:
best_lasso = lasso.set_params(**lasso_searcher.best_params_)
best_lasso.fit(x_train_embeddings, y_train)



LogisticRegression(C=10.0, max_iter=20, penalty='l1', random_state=101,
                   solver='saga')

## Load test set for predictions

In [41]:
x_test_df = pd.read_csv(os.path.join(data_dir, 'x_test.csv'))

# Get the text as a list of strings
x_test_text = x_test_df['text'].values
num_test_samples = x_test_text.shape[0]

In [43]:
x_test_embeddings = np.zeros((num_test_samples, 50))

for ind in range(num_test_samples):
    sample = x_test_text[ind]
    stripped_sample = re.sub(r'[^\w\s]', '', sample).lower()
    sample_embedding = []
    #print(f"Current sample = {stripped_sample}")
    for word in stripped_sample.split(' '):
        if word in word2vec.keys() and word not in text.ENGLISH_STOP_WORDS:
            sample_embedding.append(word2vec[word])
            unique_vocab_words.add(word)
            
    if len(sample_embedding) == 0:
        sample_embedding = [0] * 50
        
    sample_embedding = np.array(sample_embedding)
    avg_sample_embedding = np.nanmean(sample_embedding, axis=0)
    x_test_embeddings[ind] = avg_sample_embedding

In [44]:
best_lasso_yhat_test = best_lasso.predict_proba(x_test_embeddings)[:,1]
outfile = os.path.join('word_embedding_preds', 'yproba1_test.txt')
np.savetxt(outfile, best_lasso_yhat_test)

# MLP Classifier

In [28]:
mlp = sklearn.neural_network.MLPClassifier(solver='lbfgs')
mlp_parameters = {
     'mlp__hidden_layer_sizes': [4,16,64],
     'mlp__alpha': [0.0, 0.0001, 0.01, 1.00],
     'mlp__max_iter': [100,500, 1000], # sneaky way to do "early stopping" 
}
mlp_pipeline_tuple = ('mlp', mlp)

In [11]:
my_parameter_grid_by_name = dict(
    hidden_layer_sizes=[
        4,
        16,
        64,
        ],
    alpha=[
        0.0,
        0.0001,
        0.01,
        1.00,
        ],
    random_state=[  # try two possible seeds to initialize parameters
        101, 202,
        ],
    )

In [29]:
embedding_mlp_pipeline = Pipeline([
#     embedding_pipeline_tuple,
    mlp_pipeline_tuple,
 ])
embedding_mlp_full_grid = { 
#     **embedding_parameters,
    **mlp_parameters
}


embedding_mlp_grid_searcher = GridSearchCV(
    embedding_mlp_pipeline, 
    embedding_mlp_full_grid, 
    cv=cv_splitter, 
    n_jobs=3, 
    verbose=3,
    scoring='balanced_accuracy',
    return_train_score=True
)
embedding_mlp_grid_searcher.fit(x_train_embeddings, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    6.6s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   42.2s
[Parallel(n_jobs=3)]: Done 180 out of 180 | elapsed:  1.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('mlp', MLPClassifier(solver='lbfgs'))]),
             n_jobs=3,
             param_grid={'mlp__alpha': [0.0, 0.0001, 0.01, 1.0],
                         'mlp__hidden_layer_sizes': [4, 16, 64],
                         'mlp__max_iter': [100, 500, 1000]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [30]:
print_gridsearch_results(embedding_mlp_grid_searcher, list(embedding_mlp_full_grid.keys()))

Dataframe has shape: (36, 23)
Number of trials used in grid search:  36


Unnamed: 0,param_mlp__hidden_layer_sizes,param_mlp__alpha,param_mlp__max_iter,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
27,4,1.0,100,0.792604,0.750833,0.159112,1
29,4,1.0,1000,0.814375,0.744583,1.550458,2
28,4,1.0,500,0.816979,0.742917,0.734303,3
18,4,0.01,100,0.798646,0.7425,0.149641,4
9,4,0.0001,100,0.80375,0.741667,0.134256,5
0,4,0.0,100,0.795,0.74125,0.233203,6
1,4,0.0,500,0.803333,0.7375,0.683363,7
20,4,0.01,1000,0.801667,0.735833,1.21589,8
2,4,0.0,1000,0.810417,0.735,0.799426,9
10,4,0.0001,500,0.792917,0.729167,0.601615,10
