# Split-Source Classifiers
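An ensemble of per-source classifiers: a separate model is trained for each review source (Amazon, IMDB, Yelp), and each test review is scored by the model for its own source.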

In [1]:
import os

import numpy as np
import pandas as pd
import sklearn
import sklearn.neural_network
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Custom functions
from utils import (
    print_gridsearch_results, 
    test_on_estimator, 
    plot_cv_train_test, 
    plot_cv_folds, 
    analysis_of_mistakes,
)

# Shared seed, passed as `random_state` to the classifiers defined in this notebook
RANDOM_STATE = 123

In [2]:
# Plotting utils
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('seaborn') # pretty matplotlib plots

import seaborn as sns
sns.set('notebook', font_scale=1.25, style='whitegrid')

## Data prep

In [3]:
data_dir = 'data_reviews'
# Where to output predictions on the test set
output_dir = 'split_source_predictions'

# Training inputs, training labels and test inputs
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
x_test_df = pd.read_csv(os.path.join(data_dir, 'x_test.csv'))

print("Shape of data\n---------------")
for frame_name, frame in (
    ('x_train_df', x_train_df),
    ('x_test_df', x_test_df),
    ('y_train_df', y_train_df),
):
    print(f"{frame_name} shape: {frame.shape} ")

# Get list of website names
x_train_df['website_name'].unique()

Shape of data
---------------
x_train_df shape: (2400, 2) 
x_test_df shape: (600, 2) 
y_train_df shape: (2400, 1) 


array(['amazon', 'imdb', 'yelp'], dtype=object)

In [4]:
# Attach the labels to the review rows (column-wise join on the row index)
combined_dataset = pd.concat([x_train_df, y_train_df], axis='columns')
combined_dataset

Unnamed: 0,website_name,text,is_positive_sentiment
0,amazon,Oh and I forgot to also mention the weird colo...,0
1,amazon,THAT one didn't work either.,0
2,amazon,Waste of 13 bucks.,0
3,amazon,"Product is useless, since it does not have eno...",0
4,amazon,None of the three sizes they sent with the hea...,0
...,...,...,...
2395,yelp,The sweet potato fries were very good and seas...,1
2396,yelp,I could eat their bruschetta all day it is dev...,1
2397,yelp,Ambience is perfect.,1
2398,yelp,We ordered the duck rare and it was pink and t...,1


### Split train data

In [5]:
# Amazon reviews only: raw text as inputs, sentiment flag as labels
is_amazon_review = combined_dataset['website_name'].eq('amazon')
amazon_train = combined_dataset.loc[is_amazon_review]
amazon_x_train = amazon_train['text'].to_numpy()
amazon_y_train = amazon_train['is_positive_sentiment'].to_numpy()

In [6]:
# IMDB reviews only: raw text as inputs, sentiment flag as labels
is_imdb_review = combined_dataset['website_name'].eq('imdb')
imdb_train = combined_dataset.loc[is_imdb_review]
imdb_x_train = imdb_train['text'].to_numpy()
imdb_y_train = imdb_train['is_positive_sentiment'].to_numpy()

In [7]:
# Yelp reviews only: raw text as inputs, sentiment flag as labels
is_yelp_review = combined_dataset['website_name'].eq('yelp')
yelp_train = combined_dataset.loc[is_yelp_review]
yelp_x_train = yelp_train['text'].to_numpy()
yelp_y_train = yelp_train['is_positive_sentiment'].to_numpy()

### Split test data

In [8]:
# Partition the test reviews by source; each subset keeps its original row index
test_source = x_test_df['website_name']
amazon_x_df = x_test_df.loc[test_source == 'amazon']
imdb_x_df = x_test_df.loc[test_source == 'imdb']
yelp_x_df = x_test_df.loc[test_source == 'yelp']

## Define stable CV-splitter

In [9]:
# Unshuffled stratified 5-fold splitter, shared by every grid search below
cv_splitter = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=False)

## Construct TFIDF vectorizer

In [10]:
# Static parameters based on an earlier grid search, which explored
#   tf__min_df in {1, 2}, tf__max_df in (0.05, 0.5), tf__ngram_range in [(1, 1), (1, 2)]
tf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=1,
    ngram_range=(1, 2),
)

# Named ("tf") step placed first in each pipeline
tf_pipeline_tuple = ("tf", tf_vectorizer)


## L1-Logistic Regression Classifier

In [11]:
# L1-penalised logistic regression.
# Uses the explicitly imported class: `import sklearn` alone does not load the
# `sklearn.linear_model` submodule, so the attribute lookup only worked when
# some other import happened to pull it in.
lasso = LogisticRegression(penalty='l1', solver='saga', random_state=RANDOM_STATE)
# logit_lasso hyperparameter grid
lasso_parameters = {
    'lasso__C': np.logspace(-3, 3, 7),  # 1e-3 ... 1e3, one value per decade
    'lasso__max_iter': [20, 40, 60],  # sneaky way to do "early stopping"
}
# Named ("lasso") step placed after the vectorizer in each pipeline
lasso_pipeline_tuple = ('lasso', lasso)


### Train L1-LR classifier on Amazon data

In [12]:
# Amazon-only pipeline: TF-IDF features -> L1 logistic regression
amazon_tf_lasso_pipeline = Pipeline(steps=[tf_pipeline_tuple, lasso_pipeline_tuple])

# Only the classifier hyperparameters are searched; the vectorizer stays fixed
amazon_tf_lasso_full_grid = dict(lasso_parameters)

amazon_tf_lasso_grid_searcher = GridSearchCV(
    estimator=amazon_tf_lasso_pipeline,
    param_grid=amazon_tf_lasso_full_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    n_jobs=3,
    verbose=3,
)
amazon_tf_lasso_grid_searcher.fit(amazon_x_train, amazon_y_train)

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  31 tasks      | elapsed:    1.5s
[Parallel(n_jobs=3)]: Done 105 out of 105 | elapsed:    9.9s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('lasso',
                                        LogisticRegression(penalty='l1',
                                                           random_state=123,
                                                           solver='saga'))]),
             n_jobs=3,
             param_grid={'lasso__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'lasso__max_iter': [20, 40, 60]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [13]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(amazon_tf_lasso_grid_searcher, list(amazon_tf_lasso_full_grid))

Dataframe has shape: (21, 22)
Number of trials used in grid search:  21


Unnamed: 0,param_lasso__C,param_lasso__max_iter,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
17,100.0,60,0.998125,0.81,0.621128,1
16,100.0,40,0.998125,0.8075,0.470451,2
15,100.0,20,0.998125,0.80375,0.258704,3
20,1000.0,60,0.998125,0.80125,0.806793,4
19,1000.0,40,0.998125,0.8,0.563114,5
12,10.0,20,0.9975,0.7975,0.083791,6
13,10.0,40,0.997812,0.79625,0.12228,7
18,1000.0,20,0.998125,0.795,0.285502,8
14,10.0,60,0.997812,0.795,0.158897,9
11,1.0,60,0.757188,0.7525,0.028437,10


In [16]:
# Number of terms learned by the refit TF-IDF step
len(amazon_tf_lasso_grid_searcher.best_estimator_.named_steps['tf'].vocabulary_)

4233

### Train L1-LR pipeline on IMDB data

In [17]:
# IMDB-only pipeline: TF-IDF features -> L1 logistic regression
imdb_tf_lasso_pipeline = Pipeline(steps=[tf_pipeline_tuple, lasso_pipeline_tuple])

# Only the classifier hyperparameters are searched; the vectorizer stays fixed
imdb_tf_lasso_full_grid = dict(lasso_parameters)

imdb_tf_lasso_grid_searcher = GridSearchCV(
    estimator=imdb_tf_lasso_pipeline,
    param_grid=imdb_tf_lasso_full_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    n_jobs=3,
    verbose=3,
)
imdb_tf_lasso_grid_searcher.fit(imdb_x_train, imdb_y_train)

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  74 tasks      | elapsed:    2.3s
[Parallel(n_jobs=3)]: Done 100 out of 105 | elapsed:    9.4s remaining:    0.5s
[Parallel(n_jobs=3)]: Done 105 out of 105 | elapsed:   10.4s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('lasso',
                                        LogisticRegression(penalty='l1',
                                                           random_state=123,
                                                           solver='saga'))]),
             n_jobs=3,
             param_grid={'lasso__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'lasso__max_iter': [20, 40, 60]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [18]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(imdb_tf_lasso_grid_searcher, list(imdb_tf_lasso_full_grid))

Dataframe has shape: (21, 22)
Number of trials used in grid search:  21


Unnamed: 0,param_lasso__C,param_lasso__max_iter,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
20,1000.0,60,1.0,0.75625,1.279127,1
15,100.0,20,1.0,0.75375,0.410488,2
19,1000.0,40,1.0,0.75375,0.897405,2
17,100.0,60,1.0,0.75,0.886791,4
18,1000.0,20,1.0,0.74875,0.436201,5
16,100.0,40,1.0,0.74875,0.689389,5
12,10.0,20,0.995,0.745,0.127068,7
14,10.0,60,0.996562,0.73875,0.205664,8
13,10.0,40,0.995937,0.73625,0.161936,9
11,1.0,60,0.632188,0.59375,0.04926,10


In [20]:
# Number of terms learned by the refit TF-IDF step
len(imdb_tf_lasso_grid_searcher.best_estimator_.named_steps['tf'].vocabulary_)

6749

### Train L1-LR pipeline on Yelp data

In [21]:
# Yelp-only pipeline: TF-IDF features -> L1 logistic regression
yelp_tf_lasso_pipeline = Pipeline(steps=[tf_pipeline_tuple, lasso_pipeline_tuple])

# Only the classifier hyperparameters are searched; the vectorizer stays fixed
yelp_tf_lasso_full_grid = dict(lasso_parameters)

yelp_tf_lasso_grid_searcher = GridSearchCV(
    estimator=yelp_tf_lasso_pipeline,
    param_grid=yelp_tf_lasso_full_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    n_jobs=3,
    verbose=3,
)
yelp_tf_lasso_grid_searcher.fit(yelp_x_train, yelp_y_train)

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  74 tasks      | elapsed:    1.8s
[Parallel(n_jobs=3)]: Done 100 out of 105 | elapsed:    6.8s remaining:    0.3s
[Parallel(n_jobs=3)]: Done 105 out of 105 | elapsed:    7.5s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('lasso',
                                        LogisticRegression(penalty='l1',
                                                           random_state=123,
                                                           solver='saga'))]),
             n_jobs=3,
             param_grid={'lasso__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'lasso__max_iter': [20, 40, 60]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [22]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(yelp_tf_lasso_grid_searcher, list(yelp_tf_lasso_full_grid))

Dataframe has shape: (21, 22)
Number of trials used in grid search:  21


Unnamed: 0,param_lasso__C,param_lasso__max_iter,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
16,100.0,40,0.9975,0.7875,0.491751,1
19,1000.0,40,0.9975,0.78625,0.610252,2
15,100.0,20,0.9975,0.78625,0.271898,3
20,1000.0,60,0.9975,0.785,0.90434,4
17,100.0,60,0.9975,0.78375,0.653458,5
18,1000.0,20,0.9975,0.78125,0.305852,6
12,10.0,20,0.997188,0.7775,0.090821,7
13,10.0,40,0.997188,0.775,0.138057,8
14,10.0,60,0.9975,0.76875,0.173914,9
11,1.0,60,0.7425,0.72,0.029209,10


In [23]:
# Number of terms learned by the refit TF-IDF step
len(yelp_tf_lasso_grid_searcher.best_estimator_.named_steps['tf'].vocabulary_)

4613

## Generate L1-LR test set predictions

In [25]:
tf_lasso_filename = os.path.join(output_dir, 'splitsource_tf_lasso_yproba1_test.txt')
# np.savetxt does not create missing directories
os.makedirs(output_dir, exist_ok=True)

# P(positive) from each source-specific model, on that source's test reviews only
amazon_yhat_positive_proba = amazon_tf_lasso_grid_searcher.best_estimator_.predict_proba(amazon_x_df['text'].values)[:, 1]
imdb_yhat_positive_proba = imdb_tf_lasso_grid_searcher.best_estimator_.predict_proba(imdb_x_df['text'].values)[:, 1]
yelp_yhat_positive_proba = yelp_tf_lasso_grid_searcher.best_estimator_.predict_proba(yelp_x_df['text'].values)[:, 1]

# Put every prediction back on the test row it came from, so the saved file
# follows the row order of x_test_df even if the sources are not stored as
# contiguous amazon / imdb / yelp blocks.
tf_lasso_yproba1_test = pd.Series(np.nan, index=x_test_df.index, dtype=float)
for source_df, source_proba in (
    (amazon_x_df, amazon_yhat_positive_proba),
    (imdb_x_df, imdb_yhat_positive_proba),
    (yelp_x_df, yelp_yhat_positive_proba),
):
    tf_lasso_yproba1_test.loc[source_df.index] = source_proba

# A row from a source without a model would otherwise be dropped silently
assert tf_lasso_yproba1_test.notna().all(), "some test rows received no prediction (unexpected website_name?)"

np.savetxt(tf_lasso_filename, tf_lasso_yproba1_test.to_numpy())

## MLP Pipeline on Split data

In [26]:
# Multi-layer perceptron classifier trained with the L-BFGS solver
mlp = sklearn.neural_network.MLPClassifier(
    random_state=RANDOM_STATE,
    solver='lbfgs',
)
# MLP hyperparameter grid (3 * 4 * 4 = 48 candidates)
mlp_parameters = dict([
    ('mlp__hidden_layer_sizes', [16, 32, 64]),
    ('mlp__alpha', [0.0001, 0.01, 1, 10]),
    ('mlp__max_iter', [50, 100, 200, 500]),  # sneaky way to do "early stopping"
])
# Named ("mlp") step placed after the vectorizer in each pipeline
mlp_pipeline_tuple = ('mlp', mlp)

### Train MLP pipeline on Amazon data

In [None]:
# Amazon-only pipeline: TF-IDF features -> MLP
amazon_tf_mlp_pipeline = Pipeline(steps=[tf_pipeline_tuple, mlp_pipeline_tuple])

# Only the classifier hyperparameters are searched; the vectorizer stays fixed
amazon_tf_mlp_full_grid = dict(mlp_parameters)

amazon_tf_mlp_grid_searcher = GridSearchCV(
    estimator=amazon_tf_mlp_pipeline,
    param_grid=amazon_tf_mlp_full_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    n_jobs=3,
    verbose=3,
)
amazon_tf_mlp_grid_searcher.fit(amazon_x_train, amazon_y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    6.2s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   59.7s


In [47]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(amazon_tf_mlp_grid_searcher, list(amazon_tf_mlp_full_grid))

Dataframe has shape: (48, 23)
Number of trials used in grid search:  48


Unnamed: 0,param_mlp__hidden_layer_sizes,param_mlp__alpha,param_mlp__max_iter,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
30,32,1.0,200,0.998125,0.80375,1.870596,1
32,64,1.0,50,0.998125,0.80375,3.189671,1
28,32,1.0,50,0.998125,0.80375,1.391155,1
29,32,1.0,100,0.998125,0.80375,1.625681,1
31,32,1.0,500,0.998125,0.80375,1.832335,1
24,16,1.0,50,0.998125,0.8025,0.806531,6
25,16,1.0,100,0.998125,0.8025,0.813847,6
26,16,1.0,200,0.998125,0.8025,0.835278,6
27,16,1.0,500,0.998125,0.8025,0.816622,6
35,64,1.0,500,0.998125,0.8025,3.26104,6


### Train MLP pipeline on IMDB data

In [48]:
# IMDB-only pipeline: TF-IDF features -> MLP
imdb_tf_mlp_pipeline = Pipeline([
    tf_pipeline_tuple,
    mlp_pipeline_tuple,
])
# Only the classifier hyperparameters are searched; the vectorizer stays fixed
imdb_tf_mlp_full_grid = {
    **mlp_parameters
}


imdb_tf_mlp_grid_searcher = GridSearchCV(
    imdb_tf_mlp_pipeline,
    imdb_tf_mlp_full_grid,
    cv=cv_splitter,
    # Same worker count as every other grid search in this notebook (was 5 here,
    # while the recorded output of this cell shows 3 workers).
    n_jobs=3,
    verbose=3,
    scoring='balanced_accuracy',
    return_train_score=True
)
imdb_tf_mlp_grid_searcher.fit(imdb_x_train, imdb_y_train)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   10.4s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 240 out of 240 | elapsed:  3.8min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('mlp',
                                        MLPClassifier(random_state=123,
                                                      solver='lbfgs'))]),
             n_jobs=3,
             param_grid={'mlp__alpha': [0.0001, 0.01, 1, 10],
                         'mlp__hidden_layer_sizes': [16, 32, 64],
                         'mlp__max_iter': [50, 100, 200, 500]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [49]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(imdb_tf_mlp_grid_searcher, list(imdb_tf_mlp_full_grid))

Dataframe has shape: (48, 23)
Number of trials used in grid search:  48


Unnamed: 0,param_mlp__hidden_layer_sizes,param_mlp__alpha,param_mlp__max_iter,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
32,64,1.0,50,1.0,0.76,5.163308,1
31,32,1.0,500,1.0,0.76,3.472196,1
30,32,1.0,200,1.0,0.76,3.307467,1
29,32,1.0,100,1.0,0.76,3.197352,1
34,64,1.0,200,1.0,0.75875,6.490193,5
25,16,1.0,100,1.0,0.75875,1.531367,5
27,16,1.0,500,1.0,0.75875,1.561036,5
35,64,1.0,500,1.0,0.75875,6.062405,5
26,16,1.0,200,1.0,0.75875,1.434991,5
24,16,1.0,50,1.0,0.7575,1.776293,10


### Train MLP pipeline on Yelp data

In [50]:
# Yelp-only pipeline: TF-IDF features -> MLP
yelp_tf_mlp_pipeline = Pipeline(steps=[tf_pipeline_tuple, mlp_pipeline_tuple])

# Only the classifier hyperparameters are searched; the vectorizer stays fixed
yelp_tf_mlp_full_grid = dict(mlp_parameters)

yelp_tf_mlp_grid_searcher = GridSearchCV(
    estimator=yelp_tf_mlp_pipeline,
    param_grid=yelp_tf_mlp_full_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    n_jobs=3,
    verbose=3,
)
yelp_tf_mlp_grid_searcher.fit(yelp_x_train, yelp_y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    9.0s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 240 out of 240 | elapsed:  2.7min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('mlp',
                                        MLPClassifier(random_state=123,
                                                      solver='lbfgs'))]),
             n_jobs=3,
             param_grid={'mlp__alpha': [0.0001, 0.01, 1, 10],
                         'mlp__hidden_layer_sizes': [16, 32, 64],
                         'mlp__max_iter': [50, 100, 200, 500]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [52]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(yelp_tf_mlp_grid_searcher, list(yelp_tf_mlp_full_grid))

Dataframe has shape: (48, 23)
Number of trials used in grid search:  48


Unnamed: 0,param_mlp__hidden_layer_sizes,param_mlp__alpha,param_mlp__max_iter,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
37,16,10.0,100,0.993125,0.79125,0.929766,1
43,32,10.0,500,0.993437,0.79125,1.423992,1
38,16,10.0,200,0.993125,0.79125,1.022622,1
42,32,10.0,200,0.993437,0.79125,1.341438,1
41,32,10.0,100,0.993437,0.79125,1.4083,1
40,32,10.0,50,0.993437,0.79125,1.420648,1
39,16,10.0,500,0.993125,0.79125,0.968841,1
47,64,10.0,500,0.993437,0.79,2.278597,8
46,64,10.0,200,0.993437,0.79,2.364537,8
36,16,10.0,50,0.993125,0.79,0.764004,8


## Generate MLP test set predictions

In [54]:
tf_mlp_filename = os.path.join(output_dir, 'splitsource_tf_mlp_yproba1_test.txt')
# np.savetxt does not create missing directories
os.makedirs(output_dir, exist_ok=True)

# P(positive) from each source-specific model, on that source's test reviews only
amazon_yhat_positive_proba = amazon_tf_mlp_grid_searcher.best_estimator_.predict_proba(amazon_x_df['text'].values)[:, 1]
imdb_yhat_positive_proba = imdb_tf_mlp_grid_searcher.best_estimator_.predict_proba(imdb_x_df['text'].values)[:, 1]
yelp_yhat_positive_proba = yelp_tf_mlp_grid_searcher.best_estimator_.predict_proba(yelp_x_df['text'].values)[:, 1]

# Put every prediction back on the test row it came from, so the saved file
# follows the row order of x_test_df even if the sources are not stored as
# contiguous amazon / imdb / yelp blocks.
tf_mlp_yproba1_test = pd.Series(np.nan, index=x_test_df.index, dtype=float)
for source_df, source_proba in (
    (amazon_x_df, amazon_yhat_positive_proba),
    (imdb_x_df, imdb_yhat_positive_proba),
    (yelp_x_df, yelp_yhat_positive_proba),
):
    tf_mlp_yproba1_test.loc[source_df.index] = source_proba

# A row from a source without a model would otherwise be dropped silently
assert tf_mlp_yproba1_test.notna().all(), "some test rows received no prediction (unexpected website_name?)"

np.savetxt(tf_mlp_filename, tf_mlp_yproba1_test.to_numpy())

## Gradient Boosted Tree pipeline on split data

In [55]:
# Base gradient-boosted tree model; the three size/depth settings below are
# the ones the grid search varies.
gbtree_classifier = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=1,
    random_state=RANDOM_STATE,
)
# GradientBoosting-Tree hyperparameter grid (3 * 3 * 3 = 27 candidates)
gbtree_parameters = dict(
    gbtree__n_estimators=[100, 200, 300],
    gbtree__max_depth=[5, 10, 20],
    gbtree__min_samples_leaf=[1, 3, 5],
)
# Named ("gbtree") step placed after the vectorizer in each pipeline
gbtree_pipeline_tuple = ('gbtree', gbtree_classifier)


### Train GBTree on Amazon data

In [58]:
# Amazon-only pipeline: TF-IDF features -> gradient-boosted trees
amazon_tf_gbtree_pipeline = Pipeline(steps=[tf_pipeline_tuple, gbtree_pipeline_tuple])

# Only the classifier hyperparameters are searched; the vectorizer stays fixed
amazon_tf_gbtree_full_grid = dict(gbtree_parameters)

amazon_tf_gbtree_grid_searcher = GridSearchCV(
    estimator=amazon_tf_gbtree_pipeline,
    param_grid=amazon_tf_gbtree_full_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    n_jobs=3,
    verbose=3,
)
amazon_tf_gbtree_grid_searcher.fit(amazon_x_train, amazon_y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   14.1s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 135 out of 135 | elapsed:  2.0min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('gbtree',
                                        GradientBoostingClassifier(max_depth=5,
                                                                   random_state=123))]),
             n_jobs=3,
             param_grid={'gbtree__max_depth': [5, 10, 20],
                         'gbtree__min_samples_leaf': [1, 3, 5],
                         'gbtree__n_estimators': [100, 200, 300]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [59]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(amazon_tf_gbtree_grid_searcher, list(amazon_tf_gbtree_full_grid))

Dataframe has shape: (27, 23)
Number of trials used in grid search:  27


Unnamed: 0,param_gbtree__n_estimators,param_gbtree__max_depth,param_gbtree__min_samples_leaf,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
10,200,10,1,0.998125,0.77875,2.556552,1
18,100,20,1,0.998125,0.77625,2.438857,2
1,200,5,1,0.998125,0.775,1.467556,3
3,100,5,3,0.90375,0.775,0.478909,3
11,300,10,1,0.998125,0.775,3.774988,3
0,100,5,1,0.924063,0.775,0.87852,6
4,200,5,3,0.939688,0.77375,0.954777,7
20,300,20,1,0.998125,0.77375,8.050723,7
9,100,10,1,0.998125,0.77375,1.301967,7
19,200,20,1,0.998125,0.77125,5.200863,10


### Train GBTree on IMDB data

In [60]:
# IMDB-only pipeline: TF-IDF features -> gradient-boosted trees
imdb_tf_gbtree_pipeline = Pipeline(steps=[tf_pipeline_tuple, gbtree_pipeline_tuple])

# Only the classifier hyperparameters are searched; the vectorizer stays fixed
imdb_tf_gbtree_full_grid = dict(gbtree_parameters)

imdb_tf_gbtree_grid_searcher = GridSearchCV(
    estimator=imdb_tf_gbtree_pipeline,
    param_grid=imdb_tf_gbtree_full_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    n_jobs=3,
    verbose=3,
)
imdb_tf_gbtree_grid_searcher.fit(imdb_x_train, imdb_y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   17.2s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done 135 out of 135 | elapsed:  2.8min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('gbtree',
                                        GradientBoostingClassifier(max_depth=5,
                                                                   random_state=123))]),
             n_jobs=3,
             param_grid={'gbtree__max_depth': [5, 10, 20],
                         'gbtree__min_samples_leaf': [1, 3, 5],
                         'gbtree__n_estimators': [100, 200, 300]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [61]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(imdb_tf_gbtree_grid_searcher, list(imdb_tf_gbtree_full_grid))

Dataframe has shape: (27, 23)
Number of trials used in grid search:  27


Unnamed: 0,param_gbtree__n_estimators,param_gbtree__max_depth,param_gbtree__min_samples_leaf,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
18,100,20,1,0.999063,0.7375,4.280513,1
11,300,10,1,0.999063,0.73625,6.651887,2
0,100,5,1,0.9525,0.73375,1.026561,3
2,300,5,1,0.99875,0.73125,3.208768,4
3,100,5,3,0.90875,0.73,0.832723,5
19,200,20,1,0.999063,0.73,8.286133,6
10,200,10,1,0.999063,0.72625,4.513877,7
9,100,10,1,0.99875,0.725,2.329493,8
12,100,10,3,0.945625,0.72375,1.612523,9
4,200,5,3,0.935625,0.72125,1.606114,10


### Train GBTree on Yelp data

In [62]:
# Yelp-only pipeline: TF-IDF features -> gradient-boosted trees
yelp_tf_gbtree_pipeline = Pipeline(steps=[tf_pipeline_tuple, gbtree_pipeline_tuple])

# Only the classifier hyperparameters are searched; the vectorizer stays fixed
yelp_tf_gbtree_full_grid = dict(gbtree_parameters)

yelp_tf_gbtree_grid_searcher = GridSearchCV(
    estimator=yelp_tf_gbtree_pipeline,
    param_grid=yelp_tf_gbtree_full_grid,
    scoring='balanced_accuracy',
    cv=cv_splitter,
    return_train_score=True,
    n_jobs=3,
    verbose=3,
)
yelp_tf_gbtree_grid_searcher.fit(yelp_x_train, yelp_y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   11.7s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 135 out of 135 | elapsed:  1.9min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('gbtree',
                                        GradientBoostingClassifier(max_depth=5,
                                                                   random_state=123))]),
             n_jobs=3,
             param_grid={'gbtree__max_depth': [5, 10, 20],
                         'gbtree__min_samples_leaf': [1, 3, 5],
                         'gbtree__n_estimators': [100, 200, 300]},
             return_train_score=True, scoring='balanced_accuracy', verbose=3)

In [63]:
# Summarise the search via the utils helper; second argument names the searched parameters
print_gridsearch_results(yelp_tf_gbtree_grid_searcher, list(yelp_tf_gbtree_full_grid))

Dataframe has shape: (27, 23)
Number of trials used in grid search:  27


Unnamed: 0,param_gbtree__n_estimators,param_gbtree__max_depth,param_gbtree__min_samples_leaf,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
0,100,5,1,0.923125,0.73625,0.861517,1
6,100,5,5,0.854375,0.73625,0.50933,1
11,300,10,1,0.9975,0.73375,3.945416,3
1,200,5,1,0.9975,0.73375,1.477007,4
9,100,10,1,0.9975,0.7325,1.409518,5
2,300,5,1,0.9975,0.73125,2.145985,6
3,100,5,3,0.895,0.72875,0.580733,7
4,200,5,3,0.935312,0.72875,1.06719,7
5,300,5,3,0.950313,0.72875,1.625991,7
10,200,10,1,0.9975,0.72875,2.728589,7


### Generate GBTree test set predictions

In [64]:
tf_gbtree_filename = os.path.join(output_dir, 'splitsource_tf_gbtree_yproba1_test.txt')
# np.savetxt does not create missing directories
os.makedirs(output_dir, exist_ok=True)

# P(positive) from each source-specific model, on that source's test reviews only
amazon_yhat_positive_proba = amazon_tf_gbtree_grid_searcher.best_estimator_.predict_proba(amazon_x_df['text'].values)[:, 1]
imdb_yhat_positive_proba = imdb_tf_gbtree_grid_searcher.best_estimator_.predict_proba(imdb_x_df['text'].values)[:, 1]
yelp_yhat_positive_proba = yelp_tf_gbtree_grid_searcher.best_estimator_.predict_proba(yelp_x_df['text'].values)[:, 1]

# Put every prediction back on the test row it came from, so the saved file
# follows the row order of x_test_df even if the sources are not stored as
# contiguous amazon / imdb / yelp blocks.
tf_gbtree_yproba1_test = pd.Series(np.nan, index=x_test_df.index, dtype=float)
for source_df, source_proba in (
    (amazon_x_df, amazon_yhat_positive_proba),
    (imdb_x_df, imdb_yhat_positive_proba),
    (yelp_x_df, yelp_yhat_positive_proba),
):
    tf_gbtree_yproba1_test.loc[source_df.index] = source_proba

# A row from a source without a model would otherwise be dropped silently
assert tf_gbtree_yproba1_test.notna().all(), "some test rows received no prediction (unexpected website_name?)"

np.savetxt(tf_gbtree_filename, tf_gbtree_yproba1_test.to_numpy())