# Bag-of-Words Classifier Pipeline

In [1]:
import os

import numpy as np
import pandas as pd
import sklearn
import sklearn.base
import sklearn.linear_model
import sklearn.model_selection
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Seed handed to the RandomForestClassifier constructor further down.
# NOTE(review): the hyperparameter grids in this notebook list their own
# random_state value (100), so this constant is not the seed used during the searches.
RANDOM_STATE = 123

## Data prep

In [2]:
# Directory holding the review CSVs, relative to the working directory.
data_dir = 'data_reviews'
x_train_path = os.path.join(data_dir, 'x_train.csv')
y_train_path = os.path.join(data_dir, 'y_train.csv')

# Load the feature table and the label table.
x_train_df = pd.read_csv(x_train_path)
y_train_df = pd.read_csv(y_train_path)

# Report the dimensions of both frames as a quick sanity check.
print("Shape of data\n---------------")
print(f"x_train_df shape: {x_train_df.shape} ")
print(f"y_train_df shape: {y_train_df.shape} ")

# Pull the review text and the sentiment labels out as arrays.
x_train_text = x_train_df['text'].values
y_train = y_train_df['is_positive_sentiment'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Dataset Exploration

In [3]:
# Class balance: number of training rows and the share labelled 1.
num_train_samples = len(x_train_text)
num_positive_train_samples = np.count_nonzero(y_train == 1)
fraction_positive_train = num_positive_train_samples / num_train_samples

print(
    f"Total number of training samples = {num_train_samples}",
    f"Fraction positive training samples = {fraction_positive_train}",
    sep="\n",
)


Total number of training samples = 2400
Fraction positive training samples = 0.5


## Basic comparison of two vectorizers: raw term counts vs. TF-IDF

In [4]:
# Learn the bag-of-words vocabulary (English stop words removed) and encode the training
# text in one pass. fit_transform already returns the document-term matrix, so a second
# transform() over the same text only repeats the work.
# NOTE(review): the vocabulary is learned from every training row, including rows that
# later serve as validation folds in the cross-validated grid searches.
count_vectorizer = CountVectorizer(stop_words='english')
x_train_text_count = count_vectorizer.fit_transform(x_train_text).toarray()

In [5]:
# Learn the TF-IDF vocabulary and weights (English stop words removed) and encode the
# training text in one pass. fit_transform already returns the weighted document-term
# matrix, so a second transform() over the same text only repeats the work.
# NOTE(review): the weights are learned from every training row, including rows that
# later serve as validation folds in the cross-validated grid searches.
tf_vectorizer = TfidfVectorizer(stop_words='english')
x_train_text_tf = tf_vectorizer.fit_transform(x_train_text).toarray()

## Cross validation 

In [7]:
# Five stratified folds, shared by every grid search in this notebook so that their
# cross-validation scores are computed on the same splits.
cv_splitter = sklearn.model_selection.StratifiedKFold(
    n_splits=5,
)

## Hyperparameter Search: Random Forest classifier

In [8]:
# Template forest handed to the grid search; the grid supplies its own n_estimators,
# max_depth and random_state for each candidate.
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=RANDOM_STATE,
)

In [9]:
# Random forest search space: ensemble size crossed with tree depth, at one fixed seed.
# min_samples_leaf is not searched.
random_forest_parameter_grid = {
    'n_estimators': [20, 50, 100],
    'max_depth': [5, 10],
    'random_state': [100],
}

In [10]:
# Exhaustive search over the forest grid, scored by balanced accuracy on the shared folds.
# Train scores are kept for the overfitting comparison; no final refit is performed.
random_forest_grid_searcher = sklearn.model_selection.GridSearchCV(
    estimator=random_forest_classifier,
    param_grid=random_forest_parameter_grid,
    cv=cv_splitter,
    scoring='balanced_accuracy',
    refit=False,
    return_train_score=True,
)

### Fit on count-vector features

In [25]:
# Run the forest grid search on the raw term-count features.
random_forest_grid_searcher.fit(X=x_train_text_count, y=y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(max_depth=10, n_estimators=50,
                                              random_state=100),
             param_grid={'max_depth': [5, 10], 'n_estimators': [20, 50, 100],
                         'random_state': [100]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [26]:
# Tabulate the cross-validation results of the most recent forest search, one row per
# grid candidate.
random_forest_grid_search_results_df = pd.DataFrame(
    data=random_forest_grid_searcher.cv_results_,
).copy()
n_trials_grid_search = len(random_forest_grid_search_results_df)

In [27]:
param_keys = ['param_n_estimators', 'param_max_depth']
summary_columns = [*param_keys, 'mean_train_score', 'mean_test_score', 'rank_test_score']

# Order rows by ensemble size, then depth, so the table is easy to skim.
random_forest_grid_search_results_df.sort_values(by=param_keys, inplace=True)
random_forest_grid_search_results_df.loc[:, summary_columns]

Unnamed: 0,param_n_estimators,param_max_depth,mean_train_score,mean_test_score,rank_test_score
0,20,5,0.70875,0.679167,6
3,20,10,0.789792,0.722917,5
1,50,5,0.788542,0.735,4
4,50,10,0.839896,0.75875,1
2,100,5,0.820312,0.752917,3
5,100,10,0.838958,0.75375,2


### Fit on TF-IDF features

In [22]:
# Re-run the same forest grid search on the TF-IDF features; this replaces the results
# the searcher held from any previous fit.
random_forest_grid_searcher.fit(X=x_train_text_tf, y=y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(max_depth=10, n_estimators=50,
                                              random_state=100),
             param_grid={'max_depth': [5, 10], 'n_estimators': [20, 50, 100],
                         'random_state': [100]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [23]:
# Tabulate the cross-validation results of the most recent forest search, one row per
# grid candidate.
random_forest_grid_search_results_df = pd.DataFrame(
    data=random_forest_grid_searcher.cv_results_,
).copy()
n_trials_grid_search = len(random_forest_grid_search_results_df)

In [24]:
param_keys = ['param_n_estimators', 'param_max_depth']
summary_columns = [*param_keys, 'mean_train_score', 'mean_test_score', 'rank_test_score']

# Order rows by ensemble size, then depth, so the table is easy to skim.
random_forest_grid_search_results_df.sort_values(by=param_keys, inplace=True)
random_forest_grid_search_results_df.loc[:, summary_columns]

Unnamed: 0,param_n_estimators,param_max_depth,mean_train_score,mean_test_score,rank_test_score
0,20,5,0.702188,0.64875,6
3,20,10,0.786354,0.708333,5
1,50,5,0.795,0.7275,4
4,50,10,0.844479,0.7425,1
2,100,5,0.831042,0.735833,3
5,100,10,0.845833,0.74,2


In [29]:
# Build the final forest from a fresh clone. set_params() returns the very object it was
# called on, so configuring random_forest_classifier directly would silently rewrite the
# template estimator that random_forest_grid_searcher also holds.
# NOTE(review): random_forest_grid_searcher is fitted on both the count and the TF-IDF
# features in earlier cells, so best_params_ reflects whichever fit ran last. Confirm it
# comes from the count-vector search, since the model is trained on count features here.
best_random_forest = sklearn.base.clone(random_forest_classifier).set_params(
    **random_forest_grid_searcher.best_params_
)
best_random_forest.fit(x_train_text_count, y_train)

RandomForestClassifier(max_depth=10, n_estimators=50, random_state=100)

## Hyperparameter Search: Gradient Boosted Tree classifier

In [31]:
# Template gradient boosted tree classifier handed to the grid search; the grid supplies
# its own n_estimators, max_depth and random_state for each candidate.
gbtree_classifier = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=1,
    random_state=100,
)

In [32]:
# Gradient boosted tree search space: number of boosting stages crossed with tree depth,
# at one fixed seed. min_samples_leaf is not searched.
gbtree_parameter_grid = {
    'n_estimators': [20, 50, 100],
    'max_depth': [5, 10],
    'random_state': [100],
}

In [33]:
# Exhaustive search over the boosted-tree grid, scored by balanced accuracy on the shared
# folds. Train scores are kept for comparison; no final refit is performed.
gbtree_grid_searcher = sklearn.model_selection.GridSearchCV(
    estimator=gbtree_classifier,
    param_grid=gbtree_parameter_grid,
    cv=cv_splitter,
    scoring='balanced_accuracy',
    refit=False,
    return_train_score=True,
)

In [35]:
# Run the boosted-tree grid search on the raw term-count features.
gbtree_grid_searcher.fit(X=x_train_text_count, y=y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(max_depth=5,
                                                  random_state=100),
             param_grid={'max_depth': [5, 10], 'n_estimators': [20, 50, 100],
                         'random_state': [100]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [36]:
# Tabulate the boosted-tree cross-validation results, one row per grid candidate.
gbtree_grid_search_results_df = pd.DataFrame(
    data=gbtree_grid_searcher.cv_results_,
).copy()
n_trials_grid_search = len(gbtree_grid_search_results_df)

In [37]:
# Columns that identify a grid candidate; also used as the sort order.
param_keys = ['param_n_estimators', 'param_max_depth']

# Order rows by number of stages, then depth, so the table is easy to skim.
gbtree_grid_search_results_df.sort_values(by=param_keys, inplace=True)

In [38]:
# Show only the hyperparameters next to the train/validation scores and the rank.
gbtree_grid_search_results_df.loc[
    :, [*param_keys, 'mean_train_score', 'mean_test_score', 'rank_test_score']
]

Unnamed: 0,param_n_estimators,param_max_depth,mean_train_score,mean_test_score,rank_test_score
0,20,5,0.74,0.696667,6
3,20,10,0.787292,0.72,5
1,50,5,0.799687,0.7275,4
4,50,10,0.865104,0.764167,2
2,100,5,0.847188,0.7625,3
5,100,10,0.918021,0.778333,1


In [41]:
# Build the final boosted-tree model from a fresh clone. set_params() returns the very
# object it was called on, so configuring gbtree_classifier directly would silently
# rewrite the template estimator that gbtree_grid_searcher also holds.
best_gbtree = sklearn.base.clone(gbtree_classifier).set_params(
    **gbtree_grid_searcher.best_params_
)
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so this fit was not
# allowed to finish in the recorded run.
best_gbtree.fit(x_train_text_count, y_train)

KeyboardInterrupt: 

## Figures

In [None]:
# Scatter train (blue diamonds) and test (red diamonds) log loss against size, drawing one
# series per run on a single axes.
# NOTE(review): `plt`, `n_runs`, `size_list`, `tr_loss_arr` and `te_loss_arr` are not
# imported or assigned anywhere in the visible notebook, and this cell carries no
# execution count. It presumably came from another notebook; confirm where these inputs
# should come from before running it.
fig, loss_ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), sharex=True, sharey=True)

for run_id in range(n_runs):
    # Label only the first run so the legend gets one entry per series.
    tr_label = 'train log loss' if run_id == 0 else ''
    te_label = 'test log loss' if run_id == 0 else ''
    
    # Column `run_id` of each loss array is plotted at x = log2(size).
    loss_ax.plot(np.log2(size_list), tr_loss_arr[:,run_id], 'bd', label=tr_label)
    loss_ax.plot(np.log2(size_list), te_loss_arr[:,run_id], 'rd', label=te_label)

# Ticks sit at the log2 positions but are labelled with the raw sizes.
loss_ax.set_xticks(np.log2(size_list));
loss_ax.set_xticklabels(size_list);
loss_ax.xaxis.grid(False);

loss_ax.set_ylim([0, 0.8]); # Don't touch this please
loss_ax.set_yticks(np.arange(0, 0.8, 0.1));
loss_ax.set_title("Log Loss vs Size")
loss_ax.set_ylabel('log loss');
loss_ax.set_xlabel('size');
loss_ax.legend(loc='upper right');

## Hyperparameter Search: L1-Logistic Regression

In [42]:
# Template L1-penalised logistic regression using the saga solver; C and max_iter are
# supplied by the grid search.
lasso = sklearn.linear_model.LogisticRegression(
    solver='saga',
    penalty='l1',
    random_state=101,
)

In [43]:
# Search space for the L1 logistic regression.
#   C        : nine values, one per decade from 1e-4 to 1e4.
#   max_iter : treated as a crude form of early stopping; the search keeps whichever of
#              20 or 40 iterations scores best on the validation folds.
lasso_hyperparameter_grid_by_name = {
    'C': np.logspace(start=-4, stop=4, num=9),
    'max_iter': [20, 40],
}

In [44]:
# Exhaustive search over the L1 logistic regression grid, scored by balanced accuracy on
# the shared folds. Train scores are kept for comparison; no final refit is performed.
lasso_searcher = sklearn.model_selection.GridSearchCV(
    estimator=lasso,
    param_grid=lasso_hyperparameter_grid_by_name,
    cv=cv_splitter,
    scoring='balanced_accuracy',
    refit=False,
    return_train_score=True,
)

In [45]:
# Run the L1 logistic regression grid search on the raw term-count features.
lasso_searcher.fit(X=x_train_text_count, y=y_train)





GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(penalty='l1', random_state=101,
                                          solver='saga'),
             param_grid={'C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
       1.e+04]),
                         'max_iter': [20, 40]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [46]:
# Tabulate the L1 logistic regression cross-validation results, one row per candidate.
lasso_search_results_df = pd.DataFrame(
    data=lasso_searcher.cv_results_,
).copy()

In [48]:
# Build the final L1 logistic regression from a fresh clone. set_params() returns the very
# object it was called on, so configuring `lasso` directly would silently rewrite the
# template estimator that lasso_searcher also holds.
best_lasso = sklearn.base.clone(lasso).set_params(**lasso_searcher.best_params_)
best_lasso.fit(x_train_text_count, y_train)



LogisticRegression(C=10000.0, max_iter=20, penalty='l1', random_state=101,
                   solver='saga')