# Word Embedding Classifier Pipeline

In [109]:
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from collections import OrderedDict

RANDOM_STATE = 123

## Data prep

In [110]:
data_dir = 'data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

print("Shape of data\n---------------")
print(f"x_train_df shape: {x_train_df.shape} ")
print(f"y_train_df shape: {y_train_df.shape} ")

# Get the text as a list of strings
x_train_text = x_train_df['text'].values
y_train = y_train_df['is_positive_sentiment'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Dataset Exploration

In [111]:
num_train_samples = x_train_text.shape[0]
num_positive_train_samples = np.count_nonzero(y_train == 1)
fraction_positive_train = float(num_positive_train_samples) / float(num_train_samples)


print(f"Total number of training samples = {num_train_samples}")
print(f"Fraction positive training samples = {fraction_positive_train}")


Total number of training samples = 2400
Fraction positive training samples = 0.5


## Word Embedding

In [112]:
zip_file_path = os.path.join(
    'pretrained_embedding_vectors/',
    'glove.6B.50d.txt.zip')

word_embeddings = pd.read_csv(
    zip_file_path,
    header=None, sep=' ', index_col=0,
    nrows=100000, compression='zip', encoding='utf-8', quoting=3)

# Build a dict that will map from string word to 50-dim vector
word_list = word_embeddings.index.values.tolist()
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

# Show some examples of word embeddings
# Each word will get mapped to a 
n_words = len(word2vec.keys())

print("Loaded pretrained embeddings for %d possible words" % n_words)
print("Each embedding vector has %d dimensions" % (
    list(word2vec.values())[0].size))

print("word2vec['london'] = ")
print(word2vec['london'])

print("word2vec['england'] = ")
print(word2vec['england'])

Loaded pretrained embeddings for 99998 possible words
Each embedding vector has 50 dimensions
word2vec['london'] = 
[ 0.032886   0.99637   -0.6975    -0.58375    0.053403  -0.35758
 -0.96735   -0.048156  -0.23417   -0.31642   -0.080246   0.0075121
 -0.69211   -0.19357    0.040528   0.74492    0.079019  -0.13893
 -1.5938     0.33824    2.5535     0.87576   -0.1597     0.85763
 -0.68158   -1.3948     0.13189    0.10129   -0.7461     0.67386
  2.5619    -0.19922    0.76751   -0.4867     0.39738   -0.6253
  0.63504   -0.1989    -0.0953    -0.22472    0.61698   -0.21968
  0.2584    -0.39371    0.47571    0.57736   -0.55713   -0.6259
  0.60789   -0.30978  ]
word2vec['england'] = 
[-3.6165e-01 -1.0607e-01 -1.1168e+00 -6.7270e-01 -1.6521e-01  6.8828e-02
 -1.1727e+00  7.1667e-01  1.6573e-01 -7.5759e-01 -1.4659e-01  3.5785e-01
 -6.9141e-01 -1.2047e+00  1.5224e-01  6.3566e-01  8.7442e-01 -6.1517e-01
 -1.7471e+00  3.5292e-01  2.2251e-02  3.8990e-01  2.0703e-02  7.4169e-01
  2.4097e-03 -1.3529e+00 

In [125]:
import re

x_train_embeddings = np.zeros((num_train_samples, 50))

for ind in range(num_train_samples):
    sample = x_train_text[ind]
    stripped_sample = re.sub(r'[^\w\s]', '', sample).lower()
    sample_embedding = []
    #print(f"Current sample = {stripped_sample}")
    for word in stripped_sample.split(' '):
        if word in word2vec.keys() and word not in text.ENGLISH_STOP_WORDS:
            sample_embedding.append(word2vec[word])
    
    if len(sample_embedding) == 0:
        sample_embedding = [0] * 50
        
    sample_embedding = np.array(sample_embedding)
    avg_sample_embedding = np.nanmean(sample_embedding, axis=0)
    x_train_embeddings[ind] = avg_sample_embedding

## Cross validation 

In [126]:
cv_splitter = sklearn.model_selection.StratifiedKFold(n_splits=5)
#cv_splitter.get_n_splits(X, y)

## Hyperparameter searcher: Random Forest classifier

In [127]:
random_forest_classifier = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=RANDOM_STATE)

In [128]:
# Random Forest hyperparameter grid
random_forest_parameter_grid = dict(
    n_estimators = [20, 50, 100],
    max_depth = [5, 10],
    #min_samples_leaf = [1, 3, 5],
    #random_state=[101, 202],  # try two possible seeds to initialize parameters
    random_state=[100],
    )

In [129]:
random_forest_grid_searcher = sklearn.model_selection.GridSearchCV(
    random_forest_classifier,
    random_forest_parameter_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    refit=False)

### Fit on Word Embeddings

In [130]:
random_forest_grid_searcher.fit(x_train_embeddings, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(max_depth=5, random_state=123),
             param_grid={'max_depth': [5, 10], 'n_estimators': [20, 50, 100],
                         'random_state': [100]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [131]:
random_forest_grid_search_results_df = pd.DataFrame(random_forest_grid_searcher.cv_results_).copy()
n_trials_grid_search = random_forest_grid_search_results_df.shape[0]

In [132]:
param_keys = ['param_n_estimators', 'param_max_depth']

# Rearrange row order so it is easy to skim
random_forest_grid_search_results_df.sort_values(param_keys, inplace=True)
random_forest_grid_search_results_df[param_keys + ['mean_train_score', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_n_estimators,param_max_depth,mean_train_score,mean_test_score,rank_test_score
0,20,5,0.840937,0.72875,5
3,20,10,0.980625,0.726667,6
1,50,5,0.849167,0.73375,4
4,50,10,0.987083,0.74625,2
2,100,5,0.848229,0.735,3
5,100,10,0.989062,0.75,1


In [133]:
best_random_forest = random_forest_classifier.set_params(**random_forest_grid_searcher.best_params_)
best_random_forest.fit(x_train_embeddings, y_train)

RandomForestClassifier(max_depth=10, random_state=100)

## Hyperparameter searcher: Gradient Boosted Tree classifier

In [78]:
# Gradient Boosted Tree classifier with default values
gbtree_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=5, min_samples_leaf=1, random_state=100)

In [79]:
# Gradient Boosted Tree hyperparameter grid
gbtree_parameter_grid = dict(
    n_estimators = [20, 50, 100],
    max_depth = [5, 10],
    #min_samples_leaf = [1, 3, 5],
    #random_state=[101, 202],  # try two possible seeds to initialize parameters
    random_state=[100],
    )

In [80]:
gbtree_grid_searcher = sklearn.model_selection.GridSearchCV(
    gbtree_classifier,
    gbtree_parameter_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    refit=False)

In [81]:
gbtree_grid_searcher.fit(x_train_embeddings, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(max_depth=5,
                                                  random_state=100),
             param_grid={'max_depth': [5, 10], 'n_estimators': [20, 50, 100],
                         'random_state': [100]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [82]:
gbtree_grid_search_results_df = pd.DataFrame(gbtree_grid_searcher.cv_results_).copy()
n_trials_grid_search = gbtree_grid_search_results_df.shape[0]

In [83]:
param_keys = ['param_n_estimators', 'param_max_depth']

# Rearrange row order so it is easy to skim
gbtree_grid_search_results_df.sort_values(param_keys, inplace=True)

In [84]:
gbtree_grid_search_results_df[param_keys + ['mean_train_score', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_n_estimators,param_max_depth,mean_train_score,mean_test_score,rank_test_score
0,20,5,0.909583,0.727083,4
3,20,10,0.996354,0.7075,6
1,50,5,0.973958,0.743333,2
4,50,10,0.996354,0.720417,5
2,100,5,0.995729,0.744583,1
5,100,10,0.996354,0.737917,3


In [85]:
best_gbtree = gbtree_classifier.set_params(**gbtree_grid_searcher.best_params_)
best_gbtree.fit(x_train_embeddings, y_train)

GradientBoostingClassifier(max_depth=5, random_state=100)

## Figures

In [None]:
fig, loss_ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), sharex=True, sharey=True)

for run_id in range(n_runs):
    tr_label = 'train log loss' if run_id == 0 else ''
    te_label = 'test log loss' if run_id == 0 else ''
    
    loss_ax.plot(np.log2(size_list), tr_loss_arr[:,run_id], 'bd', label=tr_label)
    loss_ax.plot(np.log2(size_list), te_loss_arr[:,run_id], 'rd', label=te_label)

loss_ax.set_xticks(np.log2(size_list));
loss_ax.set_xticklabels(size_list);
loss_ax.xaxis.grid(False);

loss_ax.set_ylim([0, 0.8]); # Don't touch this please
loss_ax.set_yticks(np.arange(0, 0.8, 0.1));
loss_ax.set_title("Log Loss vs Size")
loss_ax.set_ylabel('log loss');
loss_ax.set_xlabel('size');
loss_ax.legend(loc='upper right');

## Hyperparameter Search: L1-Logistic Regression

In [95]:
lasso = sklearn.linear_model.LogisticRegression(
    penalty='l1', solver='saga', random_state=101)

In [102]:
lasso_hyperparameter_grid_by_name = dict(
    C=np.logspace(0, 8, 9),
    max_iter=[20, 40], # sneaky way to do "early stopping" 
                       # we'll take either iter 20 or iter 40 in training process, by best valid performance
    )

In [103]:
lasso_searcher = sklearn.model_selection.GridSearchCV(
    lasso,
    lasso_hyperparameter_grid_by_name,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    refit=False)

In [104]:
lasso_searcher.fit(x_train_embeddings, y_train)







GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(penalty='l1', random_state=101,
                                          solver='saga'),
             param_grid={'C': array([1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07,
       1.e+08]),
                         'max_iter': [20, 40]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [105]:
lasso_search_results_df = pd.DataFrame(lasso_searcher.cv_results_).copy()

In [106]:
param_keys = ['param_C', 'param_max_iter']

# Rearrange row order so it is easy to skim
lasso_search_results_df.sort_values(param_keys, inplace=True)

In [107]:
lasso_search_results_df[param_keys + ['mean_train_score', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_C,param_max_iter,mean_train_score,mean_test_score,rank_test_score
0,1.0,20,0.770729,0.755833,18
1,1.0,40,0.770833,0.756667,8
2,10.0,20,0.769063,0.757083,1
3,10.0,40,0.770104,0.756667,8
4,100.0,20,0.769167,0.756667,8
5,100.0,40,0.769792,0.756667,8
6,1000.0,20,0.769167,0.757083,1
7,1000.0,40,0.769792,0.756667,8
8,10000.0,20,0.769167,0.757083,1
9,10000.0,40,0.769792,0.756667,8


In [108]:
best_lasso = lasso.set_params(**lasso_searcher.best_params_)
best_lasso.fit(x_train_embeddings, y_train)



LogisticRegression(C=10.0, max_iter=20, penalty='l1', random_state=101,
                   solver='saga')