# Project B: Problem 1 Classifier Pipeline

In [36]:
import os

import numpy as np
import pandas as pd
import sklearn.base
import sklearn.ensemble
import sklearn.linear_model
import sklearn.model_selection
import sklearn.tree

## Import cleaned data (using HW4 data for now)

In [37]:
# Location of the HW4 product-review data, relative to this notebook.
DATA_DIR = "../hw4/data_product_reviews/"

# Raw training features and labels as DataFrames.
x_tr_df = pd.read_csv(os.path.join(DATA_DIR, 'x_train.csv.zip'))
y_tr_df = pd.read_csv(os.path.join(DATA_DIR, 'y_train.csv'))

# Cap every feature value at 1.0; labels are the first column of the label file.
x_tr_NF = np.minimum(x_tr_df.values, 1.0).copy()
y_tr_N = y_tr_df.values[:, 0].copy()

# Quick sanity summary of what was loaded.
print("Training data")
print(f"x_tr_NF.shape: {x_tr_NF.shape}")
print(f"y_tr_N.shape : {y_tr_N.shape}")
print(f"mean(y_tr_N) : {np.mean(y_tr_N):.3f}")

Training data
x_tr_NF.shape: (6346, 7729)
y_tr_N.shape : (6346,)
mean(y_tr_N) : 0.500


## Even smaller dataset
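The HW4 dataset took too long to train on while testing the pipeline, so the cells below replace `x_tr_NF` and `y_tr_N` with a tiny synthetic dataset.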

In [58]:
# Tiny hand-specified training set: 12 scalar inputs with binary labels.
# It replaces the x_tr_NF / y_tr_N arrays so the pipeline below runs quickly.
N = 12

x_tr_N = np.array([
    -0.975, -0.825, -0.603, -0.378, -0.284, -0.102,
     0.169,  0.311,  0.431,  0.663,  0.795,  0.976,
])
# sklearn expects a 2-D (N, 1) feature array rather than a flat vector.
x_tr_NF = x_tr_N.reshape(N, 1)

y_tr_N = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1])

In [59]:
def make_dataset(n_examples=10, seed=101, flip_fraction=0.15):
    """Generate a noisy 1-D binary classification dataset.

    Inputs are roughly evenly spaced on [-1, 1] with a small Gaussian jitter.
    Labels follow the rule ``y = 1 if x > 0 else 0``, after which a randomly
    chosen subset of them is flipped.

    Args:
        n_examples: number of examples to generate (cast to int).
        seed: seed for the ``np.random.RandomState`` that drives both the
            jitter and the choice of which labels to flip.
        flip_fraction: fraction of labels to flip; the number flipped is
            ``floor(flip_fraction * n_examples)``.

    Returns:
        Tuple ``(x_NF, y_N)``: features with shape (N, 1) and int32 labels
        with shape (N,).
    """
    N = int(n_examples)
    rng = np.random.RandomState(seed)

    # Evenly spaced grid plus jitter (the jitter is the first draw from rng).
    jitter_N = 0.05 * rng.randn(N)
    x_N = np.linspace(-1, 1, N) + jitter_N

    # Noise-free labels: positive class exactly when x is positive.
    y_N = (x_N > 0).astype(np.int32)

    # Flip the labels at a random subset of positions (second draw from rng).
    n_flip = int(np.floor(flip_fraction * N))
    flip_ids = rng.permutation(np.arange(N))[:n_flip]
    y_N[flip_ids] = 1 - y_N[flip_ids]

    return x_N[:, np.newaxis], y_N

In [60]:
# Validation set: M synthetic examples from make_dataset with a fixed seed (201),
# so the same validation split is produced on every run.
M = 100
x_va_MF, y_va_M = make_dataset(n_examples=M, seed=201)

## Dataset Exploration

In [61]:
# Class-balance summary of the current training set.
num_train_samples = len(x_tr_NF)
num_positive_train_samples = np.count_nonzero(y_tr_N == 1)
fraction_positive_train = num_positive_train_samples / num_train_samples

print(f"Total number of training samples = {num_train_samples}")
print(f"Fraction positive training samples = {fraction_positive_train}")


Total number of training samples = 12
Fraction positive training samples = 0.5


## Cross validation 

In [62]:
# Stratified 5-fold splitter, passed as `cv=` to the grid searches below so
# every model is scored on the same folds.
cv_splitter = sklearn.model_selection.StratifiedKFold(n_splits=5)

## Hyperparameter Search: Gradient Boosted Tree Classifier

In [63]:
# Base gradient boosted tree classifier handed to the grid search.
# n_estimators, max_depth and random_state are all overridden by the values in
# the hyperparameter grid; min_samples_leaf is the only setting made here that
# the search leaves fixed.
gbtree_classifier = sklearn.ensemble.GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=1,
    random_state=100,
)

In [88]:
# Hyperparameter grid explored for the gradient boosted tree.
# Not searched for now: min_samples_leaf in [1, 3, 5] and extra seeds [101, 202].
gbtree_parameter_grid = {
    'n_estimators': [20, 50, 100],
    'max_depth': [5, 10],
    # A single fixed seed, so configurations differ only in the settings above.
    'random_state': [100],
}

In [89]:
# Exhaustive search over the gradient boosted tree grid, scored by balanced
# accuracy on the shared CV folds. Train scores are kept for inspection, and
# no final model is refit here (refit=False).
gbtree_grid_searcher = sklearn.model_selection.GridSearchCV(
    estimator=gbtree_classifier,
    param_grid=gbtree_parameter_grid,
    cv=cv_splitter,
    scoring='balanced_accuracy',
    return_train_score=True,
    refit=False,
)

In [90]:
# Run the cross-validated grid search on the current training set.
gbtree_grid_searcher.fit(x_tr_NF, y_tr_N)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(max_depth=5, n_estimators=20,
                                                  random_state=101),
             param_grid={'max_depth': [5, 10], 'n_estimators': [20, 50, 100],
                         'random_state': [100]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

## Pick the best version

In [91]:
# Collect the per-configuration CV results into a DataFrame for inspection.
gbtree_grid_search_results_df = pd.DataFrame(gbtree_grid_searcher.cv_results_).copy()
print(f"Dataframe has shape: {gbtree_grid_search_results_df.shape}")

# One row per hyperparameter configuration tried.
n_trials_grid_search = len(gbtree_grid_search_results_df)

print("Dataframe has columns:")
for column_name in gbtree_grid_search_results_df.columns:
    print(f"-- {column_name}")

Dataframe has shape: (6, 23)
Dataframe has columns:
-- mean_fit_time
-- std_fit_time
-- mean_score_time
-- std_score_time
-- param_max_depth
-- param_n_estimators
-- param_random_state
-- params
-- split0_test_score
-- split1_test_score
-- split2_test_score
-- split3_test_score
-- split4_test_score
-- mean_test_score
-- std_test_score
-- rank_test_score
-- split0_train_score
-- split1_train_score
-- split2_train_score
-- split3_train_score
-- split4_train_score
-- mean_train_score
-- std_train_score


In [94]:
# Result columns that hold the searched hyperparameter values.
param_keys = ['param_n_estimators', 'param_max_depth']

# Order rows by hyperparameter value so the table is easy to skim.
gbtree_grid_search_results_df = gbtree_grid_search_results_df.sort_values(by=param_keys)

In [95]:
# Train vs. held-out scores (and rank) for each hyperparameter setting.
gbtree_grid_search_results_df.loc[
    :, [*param_keys, 'mean_train_score', 'mean_test_score', 'rank_test_score']
]

Unnamed: 0,param_n_estimators,param_max_depth,mean_train_score,mean_test_score,rank_test_score
0,20,5,1.0,0.75,1
3,20,10,1.0,0.75,1
1,50,5,1.0,0.75,1
4,50,10,1.0,0.75,1
2,100,5,1.0,0.75,1
5,100,10,1.0,0.75,1


In [96]:
# Build the final model from the winning hyperparameters and train it on the
# full training set.
# Clone first: set_params() modifies the estimator it is called on and returns
# that same object, so calling it on gbtree_classifier directly would make
# best_gbtree an alias of the base estimator and silently change what the grid
# search starts from whenever cells are re-run.
best_gbtree = sklearn.base.clone(gbtree_classifier).set_params(
    **gbtree_grid_searcher.best_params_)
best_gbtree.fit(x_tr_NF, y_tr_N)

GradientBoostingClassifier(max_depth=5, n_estimators=20, random_state=100)

## Figures

In [None]:
# Scatter plot of train/test log loss against model size, one marker per run.
#
# NOTE(review): this cell has no execution count and references names that no
# visible cell defines: plt, n_runs, size_list, tr_loss_arr and te_loss_arr.
# As written it will fail under Restart & Run All. TODO: import
# matplotlib.pyplot as plt and compute these inputs, or remove the cell.
# NOTE(review): the indexing below implies tr_loss_arr / te_loss_arr are 2-D
# with one column per run and one row per entry of size_list -- confirm.
fig, loss_ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), sharex=True, sharey=True)

for run_id in range(n_runs):
    # Only the first run carries a label, so the legend shows one entry per series.
    tr_label = 'train log loss' if run_id == 0 else ''
    te_label = 'test log loss' if run_id == 0 else ''
    
    # x positions are log2(size): blue diamonds for train, red diamonds for test.
    loss_ax.plot(np.log2(size_list), tr_loss_arr[:,run_id], 'bd', label=tr_label)
    loss_ax.plot(np.log2(size_list), te_loss_arr[:,run_id], 'rd', label=te_label)

# Ticks sit at log2(size) but are labelled with the raw size values.
loss_ax.set_xticks(np.log2(size_list));
loss_ax.set_xticklabels(size_list);
loss_ax.xaxis.grid(False);

loss_ax.set_ylim([0, 0.8]); # Don't touch this please
loss_ax.set_yticks(np.arange(0, 0.8, 0.1));
loss_ax.set_title("Log Loss vs Size")
loss_ax.set_ylabel('log loss');
loss_ax.set_xlabel('size');
loss_ax.legend(loc='upper right');

## Hyperparameter Search: L1-Logistic Regression

In [97]:
# Base L1-penalized logistic regression handed to the grid search.
# The 'saga' solver is used with the L1 penalty, and the seed is fixed.
lasso = sklearn.linear_model.LogisticRegression(
    penalty='l1',
    solver='saga',
    random_state=101,
)

In [98]:
# Hyperparameter grid for the L1 logistic regression.
lasso_hyperparameter_grid_by_name = {
    # Nine values of C, log-spaced from 1e-4 to 1e4.
    'C': np.logspace(-4, 4, num=9),
    # Sneaky way to do "early stopping": the search keeps whichever of
    # iteration 20 or iteration 40 gives the better validation performance.
    'max_iter': [20, 40],
}

In [99]:
# Exhaustive search over the L1 logistic regression grid, scored by balanced
# accuracy on the shared CV folds. Train scores are kept for inspection, and
# no final model is refit here (refit=False).
lasso_searcher = sklearn.model_selection.GridSearchCV(
    estimator=lasso,
    param_grid=lasso_hyperparameter_grid_by_name,
    cv=cv_splitter,
    scoring='balanced_accuracy',
    return_train_score=True,
    refit=False,
)

In [100]:
# Run the cross-validated grid search on the current training set.
lasso_searcher.fit(x_tr_NF, y_tr_N)



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(penalty='l1', random_state=101,
                                          solver='saga'),
             param_grid={'C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
       1.e+04]),
                         'max_iter': [20, 40]},
             refit=False, return_train_score=True, scoring='balanced_accuracy')

In [102]:
# Collect the per-configuration CV results into a DataFrame for inspection.
lasso_search_results_df = pd.DataFrame(lasso_searcher.cv_results_).copy()

In [103]:
# Build the final model from the winning hyperparameters and train it on the
# full training set.
# Clone first: set_params() modifies the estimator it is called on and returns
# that same object, so calling it on lasso directly would make best_lasso an
# alias of the base estimator and silently change what the grid search starts
# from whenever cells are re-run.
best_lasso = sklearn.base.clone(lasso).set_params(**lasso_searcher.best_params_)
best_lasso.fit(x_tr_NF, y_tr_N)

LogisticRegression(C=10.0, max_iter=20, penalty='l1', random_state=101,
                   solver='saga')