# This is a single file that will run all relevant tests and output the corresponding results
### NOTE1: Cell 3 has a hard coded path to the imbd data to avoid downloading and unzipping in code, so please adjust that path such that it points to the correct aclImbd folder in your directory
### NOTE2: The final models for each model-data section are hardcoded with the parameters we found to work best (ie the GSCV results are not dynamically populating the final models' parameters)

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

##### Get the data

In [2]:
def getIMBD(path):
    posData = []
    posTarget = []
    negData = []
    negTarget = []
    i = 0
    directory = os.path.join(path, "pos")
    for filename in os.listdir(directory):
        f = open(os.path.join(directory, filename), "r", encoding="utf8")
        posData.append(f.read())
        posTarget.append(1)
        f.close()
        i = i + 1
    i = 0
    directory = os.path.join(path, "neg")
    for filename in os.listdir(directory):
        f = open(os.path.join(directory, filename), "r", encoding="utf8")
        negData.append(f.read())
        negTarget.append(0)
        f.close()
        i = i + 1
    return (posData + negData), (posTarget + negTarget)

In [3]:
twenty_train = fetch_20newsgroups(subset='train', remove=(['headers', 'footers', 'quotes']), shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', remove=(['headers', 'footers', 'quotes']), shuffle=True)
imbd_train_data, imbd_train_target = getIMBD("..\\aclImdb\\train")
imbd_test_data, imbd_test_target = getIMBD("..\\aclImdb\\test")

# Logistic Regression - Twenty News Group
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [4]:
twentyLogRegPreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
twentyLogRegPipe = Pipeline([
    ('ppp', twentyLogRegPreProcessingPipe),
    ('clf', LogisticRegression())
])
twentyLogRegParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__penalty': ['l2'],
    'clf__dual': [False],
    'clf__tol': [0.0001],
    'clf__C': [1.0],
    'clf__fit_intercept': [False],
    'clf__intercept_scaling': [1.0],
    'clf__class_weight': [None],
    'clf__random_state': [None],
    'clf__solver': ['lbfgs'],
    'clf__max_iter': [100],
    'clf__multi_class': ['auto'],
    'clf__l1_ratio': [None]
}
twentyLogRegGSCV = GridSearchCV(twentyLogRegPipe, twentyLogRegParam, cv=10, n_jobs=-1, verbose=1000)
twentyLogRegGSCV = twentyLogRegGSCV.fit(twenty_train.data, twenty_train.target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.5s
Pickling array (shape=(11314,), dtype=int32).
[Paral

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:  1.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  1.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:  1.3min
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  3.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  3.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed:  3.2min
Pickling array (shape=(11314,), dtype=int32).[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed:  3.2min

Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:  3.4min
Pickl

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:  5.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  5.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed:  5.2min
Pickl

[Parallel(n_jobs=-1)]: Done 163 tasks      | elapsed:  6.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:  6.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  6.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed:  6.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 167 tasks      | elapsed:  6.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:  8.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 206 tasks      | elapsed:  8.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:  8.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed:  8.7min
Pickl

[Parallel(n_jobs=-1)]: Done 246 tasks      | elapsed: 10.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 247 tasks      | elapsed: 10.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed: 10.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed: 10.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed: 10.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Para

[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed: 11.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 289 tasks      | elapsed: 11.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 290 tasks      | elapsed: 11.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 291 tasks      | elapsed: 11.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed: 11.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Para

[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed: 13.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 331 tasks      | elapsed: 13.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 332 tasks      | elapsed: 13.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 13.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 334 tasks      | elapsed: 13.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Para

##### The best score and associated parameters followed by a table of more detailed results

In [5]:
twentyLogRegGSCV.best_score_

0.7476577691355842

In [6]:
for param_name in sorted(twentyLogRegParam.keys()):
    print("%s: %r" % (param_name, twentyLogRegGSCV.best_params_[param_name]))

clf__C: 1.0
clf__class_weight: None
clf__dual: False
clf__fit_intercept: False
clf__intercept_scaling: 1.0
clf__l1_ratio: None
clf__max_iter: 100
clf__multi_class: 'auto'
clf__penalty: 'l2'
clf__random_state: None
clf__solver: 'lbfgs'
clf__tol: 0.0001
ppp__vect__lowercase: True
ppp__vect__max_df: 0.3
ppp__vect__max_features: 100000
ppp__vect__min_df: 3
ppp__vect__ngram_range: (1, 1)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [7]:
df = pd.DataFrame(twentyLogRegGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__class_weight,param_clf__dual,param_clf__fit_intercept,param_clf__intercept_scaling,param_clf__l1_ratio,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,3.115836,0.081352,0.199192,0.029401,1,,False,False,1,,...,0.738556,0.738516,0.731621,0.758865,0.720249,0.755338,0.721282,0.732279,0.014411,22
1,7.480905,0.098841,0.275776,0.026153,1,,False,False,1,,...,0.733275,0.735866,0.723649,0.757979,0.722913,0.746441,0.718611,0.729273,0.013339,25
2,12.697232,0.162128,0.367342,0.0516,1,,False,False,1,,...,0.732394,0.733216,0.725421,0.75266,0.716696,0.747331,0.720392,0.728213,0.012562,34
3,3.01713,0.147388,0.187499,0.013974,1,,False,False,1,,...,0.739437,0.737633,0.734278,0.763298,0.720249,0.757117,0.723954,0.733604,0.015202,16
4,7.416729,0.069514,0.277145,0.029625,1,,False,False,1,,...,0.730634,0.733216,0.732507,0.753546,0.720249,0.747331,0.71683,0.729273,0.012261,25
5,12.833184,0.252045,0.35616,0.033634,1,,False,False,1,,...,0.730634,0.734099,0.728078,0.753546,0.717584,0.747331,0.721282,0.728743,0.012429,31
6,4.171541,0.163564,0.196177,0.026087,1,,False,False,1,,...,0.743838,0.757067,0.734278,0.749113,0.738899,0.762456,0.741763,0.741648,0.012123,4
7,13.318709,0.351049,0.337776,0.052527,1,,False,False,1,,...,0.747359,0.751767,0.725421,0.746454,0.730018,0.757117,0.726625,0.736168,0.01264,7
8,18.708656,0.449756,0.432591,0.052462,1,,False,False,1,,...,0.743838,0.75,0.723649,0.746454,0.735346,0.743772,0.729297,0.734665,0.010336,13
9,3.958235,0.294432,0.223565,0.047993,1,,False,False,1,,...,0.755282,0.760601,0.738707,0.763298,0.742451,0.764235,0.737311,0.747658,0.013055,1


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [8]:
twentyLogRegPreProcessingPipe.fit(twenty_train.data)
X_train_LogReg_twenty = twentyLogRegPreProcessingPipe.transform(twenty_train.data)
X_test_LogReg_twenty = twentyLogRegPreProcessingPipe.transform(twenty_test.data)
twentyLogReg = LogisticRegression()
twentyLogReg.fit(X_train_LogReg_twenty, twenty_train.target)
y_pred_LogReg_twenty = twentyLogReg.predict(X_test_LogReg_twenty)
metrics.accuracy_score(twenty_test.target, y_pred_LogReg_twenty)



0.6347583643122676

In [9]:
metrics.confusion_matrix(twenty_test.target, y_pred_LogReg_twenty)

array([[ 83,   5,   4,   1,   1,   1,  25,   6,  13,   5,   4,   4,   2,
         13,  20,  99,  11,  21,   0,   1],
       [  2, 242,  14,   9,   7,  20,  48,   4,   5,   2,   1,   8,   9,
          3,  11,   1,   1,   2,   0,   0],
       [  0,  26, 214,  34,   6,  17,  45,   3,   7,   1,   0,   1,   5,
          8,  17,   6,   2,   2,   0,   0],
       [  0,  13,  30, 219,  16,   8,  60,   5,   3,   0,   0,   2,  27,
          2,   6,   1,   0,   0,   0,   0],
       [  0,   7,  13,  19, 207,   9,  69,   4,   4,   0,   0,   2,  26,
          4,  14,   5,   1,   0,   1,   0],
       [  0,  34,  15,   6,   3, 264,  50,   1,   0,   2,   1,   1,   5,
          3,   8,   1,   0,   1,   0,   0],
       [  0,   1,   2,  13,   5,   1, 326,  11,   9,   5,   1,   0,   8,
          0,   6,   1,   1,   0,   0,   0],
       [  0,   3,   2,   0,   1,   1,  47, 273,  25,   1,   2,   2,  13,
          3,  17,   2,   1,   2,   1,   0],
       [  1,   2,   2,   1,   1,   1,  29,  27, 289,   3,   3,  

In [10]:
print(metrics.classification_report(twenty_test.target, y_pred_LogReg_twenty, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.67      0.26      0.37       319
           comp.graphics       0.66      0.62      0.64       389
 comp.os.ms-windows.misc       0.66      0.54      0.60       394
comp.sys.ibm.pc.hardware       0.67      0.56      0.61       392
   comp.sys.mac.hardware       0.81      0.54      0.65       385
          comp.windows.x       0.80      0.67      0.73       395
            misc.forsale       0.32      0.84      0.46       390
               rec.autos       0.70      0.69      0.69       396
         rec.motorcycles       0.67      0.73      0.70       398
      rec.sport.baseball       0.87      0.79      0.83       397
        rec.sport.hockey       0.90      0.87      0.88       399
               sci.crypt       0.78      0.64      0.70       396
         sci.electronics       0.59      0.55      0.57       393
                 sci.med       0.65      0.75      0.70       396
         

# Logistic Regression - IBMD Movie Reviews
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [11]:
imbdLogRegPreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
imbdLogRegPipe = Pipeline([
    ('ppp', imbdLogRegPreProcessingPipe),
    ('clf', LogisticRegression())
])
imbdLogRegParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__penalty': ['l2'],
    'clf__dual': [False],
    'clf__tol': [0.0001],
    'clf__C': [1.0],
    'clf__fit_intercept': [False],
    'clf__intercept_scaling': [1.0],
    'clf__class_weight': [None],
    'clf__random_state': [None],
    'clf__solver': ['lbfgs'],
    'clf__max_iter': [100],
    'clf__multi_class': ['auto'],
    'clf__l1_ratio': [None]
}
imbdLogRegGSCV = GridSearchCV(imbdLogRegPipe, imbdLogRegParam, cv=10, n_jobs=-1, verbose=1000)
imbdLogRegGSCV = imbdLogRegGSCV.fit(imbd_train_data, imbd_train_target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.1s
Pickling array (shape=(22500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.5s
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    5.9s

Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=

[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  3.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  3.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:  3.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  4.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  4.3min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  7.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:  7.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 107 tasks      | elapsed:  7.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  7.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  7.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:  7.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 111 tasks      | elapsed:  8.1min
Pickling array (shap

Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed: 11.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 11.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 162 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 163 tasks      | elapsed: 11.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed: 11.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 12.0min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype

[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed: 15.9min
Pickling array (shape=(22500,), dtype=int32).[Parallel(n_jobs=-1)]: Done 215 tasks      | elapsed: 15.9min

Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed: 15.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed: 15.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 218 tasks      | elapsed: 16.0min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 219 tasks      | elapsed: 16.0min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed: 20.0min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 20.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 20.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 20.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed: 20.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 20.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed: 20.3min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 321 tasks      | elapsed: 23.8min
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed: 23.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 323 tasks      | elapsed: 24.0min
[Parallel(n_jobs=-1)]: Done 324 tasks      | elapsed: 24.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 325 tasks      | elapsed: 24.3min
[Parallel(n_jobs=-1)]: Done 326 tasks      | elapsed: 24.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 327 tasks      | elapsed: 24.6min
Pickling array (shap

##### The best score and associated parameters followed by a table of more detailed results

In [12]:
imbdLogRegGSCV.best_score_

0.86088

In [13]:
for param_name in sorted(imbdLogRegParam.keys()):
    print("%s: %r" % (param_name, imbdLogRegGSCV.best_params_[param_name]))

clf__C: 1.0
clf__class_weight: None
clf__dual: False
clf__fit_intercept: False
clf__intercept_scaling: 1.0
clf__l1_ratio: None
clf__max_iter: 100
clf__multi_class: 'auto'
clf__penalty: 'l2'
clf__random_state: None
clf__solver: 'lbfgs'
clf__tol: 0.0001
ppp__vect__lowercase: True
ppp__vect__max_df: 0.4
ppp__vect__max_features: 10000
ppp__vect__min_df: 3
ppp__vect__ngram_range: (1, 3)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [14]:
df = pd.DataFrame(imbdLogRegGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__class_weight,param_clf__dual,param_clf__fit_intercept,param_clf__intercept_scaling,param_clf__l1_ratio,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,4.596287,0.169305,0.490514,0.038701,1,,False,False,1,,...,0.8356,0.846,0.8448,0.8276,0.8456,0.8632,0.8416,0.84444,0.012796,17
1,14.673386,0.113024,0.694018,0.044869,1,,False,False,1,,...,0.8504,0.8732,0.8572,0.8456,0.8684,0.88,0.8564,0.85952,0.012684,12
2,31.110434,1.767497,1.002485,0.141073,1,,False,False,1,,...,0.8508,0.8724,0.8604,0.8468,0.8672,0.88,0.8596,0.86012,0.01204,6
3,5.202582,0.223678,0.483498,0.039177,1,,False,False,1,,...,0.836,0.8468,0.8456,0.8276,0.8464,0.862,0.8412,0.84452,0.012347,16
4,15.805909,0.555543,0.72233,0.063049,1,,False,False,1,,...,0.848,0.8724,0.8572,0.8472,0.8676,0.8796,0.858,0.85972,0.012141,10
5,29.691892,0.255847,0.919886,0.041605,1,,False,False,1,,...,0.8516,0.8732,0.8608,0.8468,0.8644,0.8816,0.8588,0.86008,0.012356,8
6,4.820853,0.060804,0.474468,0.052257,1,,False,False,1,,...,0.792,0.7976,0.7996,0.7972,0.8092,0.8224,0.8044,0.80228,0.012872,35
7,15.957071,0.134804,0.799309,0.053475,1,,False,False,1,,...,0.822,0.84,0.8376,0.8188,0.84,0.8416,0.8312,0.83228,0.011588,29
8,30.356741,0.195807,0.998012,0.059962,1,,False,False,1,,...,0.8208,0.8396,0.84,0.8188,0.8404,0.8408,0.8364,0.83348,0.010616,24
9,4.842649,0.089375,0.478697,0.033354,1,,False,False,1,,...,0.7984,0.8052,0.8072,0.8036,0.814,0.8324,0.8096,0.81056,0.011939,33


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [15]:
imbdLogRegPreProcessingPipe.fit(imbd_train_data)
X_train_LogReg_imbd = imbdLogRegPreProcessingPipe.transform(imbd_train_data)
X_test_LogReg_imbd = imbdLogRegPreProcessingPipe.transform(imbd_test_data)
imbdLogReg = LogisticRegression()
imbdLogReg.fit(X_train_LogReg_imbd, imbd_train_target)
y_pred_LogReg_imbd = imbdLogReg.predict(X_test_LogReg_imbd)
metrics.accuracy_score(imbd_test_target, y_pred_LogReg_imbd)



0.84296

In [16]:
metrics.confusion_matrix(imbd_test_target, y_pred_LogReg_imbd)

array([[10689,  1811],
       [ 2115, 10385]], dtype=int64)

In [17]:
print(metrics.classification_report(imbd_test_target, y_pred_LogReg_imbd))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84     12500
           1       0.85      0.83      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



# Decision Tree Classifier - Twenty News Group
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [19]:
twentyDecTrePreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
twentyDecTrePipe = Pipeline([
    ('ppp', twentyDecTrePreProcessingPipe),
    ('clf', DecisionTreeClassifier())
])
twentyDecTreParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__criterion': ['gini'],
    'clf__splitter': ['best'],
    'clf__max_depth': [None],
    'clf__min_samples_split': [2],
    'clf__min_samples_leaf': [1],
    'clf__min_weight_fraction_leaf': [0.0],
    'clf__max_features': [None],
    'clf__random_state': [None],
    'clf__max_leaf_nodes': [None],
    'clf__min_impurity_decrease': [0.0],
    'clf__class_weight': [None],
}
twentyDecTreGSCV = GridSearchCV(twentyDecTrePipe, twentyDecTreParam, cv=10, n_jobs=-1, verbose=1000)
twentyDecTreGSCV = twentyDecTreGSCV.fit(twenty_train.data, twenty_train.target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.9s
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=

[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:  2.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  2.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  2.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:  2.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dt

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  5.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  5.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed:  5.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed:  5.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:  5.6min
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:  8.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed:  8.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  8.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  8.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed:  8.5min
Pickl

[Parallel(n_jobs=-1)]: Done 163 tasks      | elapsed: 10.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed: 10.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 10.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed: 10.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 167 tasks      | elapsed: 10.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed: 13.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 206 tasks      | elapsed: 13.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed: 14.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed: 14.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed: 14.2min
Pickl

[Parallel(n_jobs=-1)]: Done 246 tasks      | elapsed: 16.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 247 tasks      | elapsed: 16.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed: 16.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed: 16.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed: 16.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed: 18.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 289 tasks      | elapsed: 18.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 290 tasks      | elapsed: 19.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 291 tasks      | elapsed: 19.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed: 19.1min
Pickl

[Parallel(n_jobs=-1)]: Done 329 tasks      | elapsed: 22.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed: 22.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 331 tasks      | elapsed: 22.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 332 tasks      | elapsed: 22.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 22.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Para

##### The best score and associated parameters followed by a table of more detailed results

In [20]:
twentyDecTreGSCV.best_score_

0.48930528548700725

In [21]:
for param_name in sorted(twentyDecTreParam.keys()):
    print("%s: %r" % (param_name, twentyDecTreGSCV.best_params_[param_name]))

clf__class_weight: None
clf__criterion: 'gini'
clf__max_depth: None
clf__max_features: None
clf__max_leaf_nodes: None
clf__min_impurity_decrease: 0.0
clf__min_samples_leaf: 1
clf__min_samples_split: 2
clf__min_weight_fraction_leaf: 0.0
clf__random_state: None
clf__splitter: 'best'
ppp__vect__lowercase: True
ppp__vect__max_df: 0.4
ppp__vect__max_features: 10000
ppp__vect__min_df: 2
ppp__vect__ngram_range: (1, 1)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [22]:
df = pd.DataFrame(twentyDecTreGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__class_weight,param_clf__criterion,param_clf__max_depth,param_clf__max_features,param_clf__max_leaf_nodes,param_clf__min_impurity_decrease,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,8.485921,0.141636,0.176358,0.02338,,gini,,,,0,...,0.488556,0.498233,0.485385,0.506206,0.484902,0.479537,0.487979,0.48683,0.01128,2
1,12.856859,0.12956,0.27638,0.032368,,gini,,,,0,...,0.487676,0.4947,0.462356,0.494681,0.46714,0.482206,0.485307,0.480113,0.011486,22
2,18.313725,0.229579,0.378407,0.0592,,gini,,,,0,...,0.47007,0.499117,0.483614,0.494681,0.48135,0.484875,0.482636,0.482234,0.009458,16
3,8.759743,0.230496,0.176704,0.015862,,gini,,,,0,...,0.485035,0.500883,0.465899,0.490248,0.475133,0.483096,0.481745,0.480997,0.009757,21
4,12.764579,0.454024,0.249605,0.019807,,gini,,,,0,...,0.5,0.488516,0.454384,0.492021,0.488455,0.474199,0.483526,0.479494,0.013053,23
5,17.340568,0.156643,0.348924,0.034651,,gini,,,,0,...,0.485915,0.484099,0.460585,0.505319,0.486679,0.482206,0.483526,0.481085,0.011853,19
6,11.100911,0.101316,0.175833,0.016234,,gini,,,,0,...,0.484155,0.496466,0.45527,0.485816,0.491119,0.466192,0.47106,0.47534,0.013997,32
7,22.824365,0.20443,0.283118,0.022736,,gini,,,,0,...,0.475352,0.492049,0.472985,0.487589,0.48135,0.477758,0.463936,0.474191,0.010078,36
8,27.807246,0.260524,0.364327,0.035904,,gini,,,,0,...,0.484155,0.486749,0.470328,0.494681,0.492895,0.465302,0.473731,0.477373,0.013543,28
9,10.089492,0.13257,0.168763,0.009402,,gini,,,,0,...,0.481514,0.477032,0.464128,0.486702,0.48579,0.462633,0.479074,0.474898,0.008124,34


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [23]:
twentyDecTrePreProcessingPipe.fit(twenty_train.data)
X_train_DecTre_twenty = twentyDecTrePreProcessingPipe.transform(twenty_train.data)
X_test_DecTre_twenty = twentyDecTrePreProcessingPipe.transform(twenty_test.data)
twentyDecTre = DecisionTreeClassifier()
twentyDecTre.fit(X_train_DecTre_twenty, twenty_train.target)
y_pred_DecTre_twenty = twentyDecTre.predict(X_test_DecTre_twenty)
metrics.accuracy_score(twenty_test.target, y_pred_DecTre_twenty)

0.38728093467870417

In [24]:
metrics.confusion_matrix(twenty_test.target, y_pred_DecTre_twenty)

array([[ 61,  15,   3,   9,   6,   5,   7,  19,   8,   4,  13,   4,   6,
         14,  12,  54,   8,  15,  22,  34],
       [  1, 181,  34,  19,  13,  29,   7,  21,  11,   3,   9,   5,  17,
         13,  10,   1,   2,   8,   4,   1],
       [  4,  36, 132,  38,  20,  37,   9,  33,  10,   3,   5,   7,  14,
         11,  11,   2,  11,   4,   5,   2],
       [  3,  28,  30, 123,  39,  20,  14,  31,  10,   0,   4,   9,  45,
         13,   5,   4,   4,   2,   4,   4],
       [  3,  38,  12,  31, 141,   7,  17,  40,   8,   2,   8,   8,  31,
          7,   8,   3,   7,   8,   5,   1],
       [  5,  48,  41,  14,  13, 162,   5,  29,   4,   4,   5,   7,  21,
          7,   6,   4,   4,   7,   7,   2],
       [  1,  21,   4,  10,  20,   8, 219,  34,   9,   3,   9,   2,  20,
          7,   7,   3,   5,   3,   2,   3],
       [  7,  35,   8,   9,   8,  11,  14, 173,  21,   8,   9,   3,  19,
         21,  15,   4,  13,   8,   5,   5],
       [ 11,  18,   8,   6,  13,   3,  14,  43, 179,   8,  15,  

In [25]:
print(metrics.classification_report(twenty_test.target, y_pred_DecTre_twenty, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.26      0.19      0.22       319
           comp.graphics       0.27      0.47      0.34       389
 comp.os.ms-windows.misc       0.39      0.34      0.36       394
comp.sys.ibm.pc.hardware       0.39      0.31      0.35       392
   comp.sys.mac.hardware       0.40      0.37      0.38       385
          comp.windows.x       0.45      0.41      0.43       395
            misc.forsale       0.61      0.56      0.58       390
               rec.autos       0.23      0.44      0.30       396
         rec.motorcycles       0.53      0.45      0.49       398
      rec.sport.baseball       0.52      0.37      0.43       397
        rec.sport.hockey       0.46      0.56      0.51       399
               sci.crypt       0.58      0.44      0.50       396
         sci.electronics       0.26      0.27      0.26       393
                 sci.med       0.34      0.38      0.36       396
         

# Decision Tree Classifier - IBMD Movie Reviews
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [26]:
imbdDecTrePreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
imbdDecTrePipe = Pipeline([
    ('ppp', imbdDecTrePreProcessingPipe),
    ('clf', DecisionTreeClassifier())
])
imbdDecTreParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__criterion': ['gini'],
    'clf__splitter': ['best'],
    'clf__max_depth': [None],
    'clf__min_samples_split': [2],
    'clf__min_samples_leaf': [1],
    'clf__min_weight_fraction_leaf': [0.0],
    'clf__max_features': [None],
    'clf__random_state': [None],
    'clf__max_leaf_nodes': [None],
    'clf__min_impurity_decrease': [0.0],
    'clf__class_weight': [None],
}
imbdDecTreGSCV = GridSearchCV(imbdDecTrePipe, imbdDecTreParam, cv=10, n_jobs=-1, verbose=1000)
imbdDecTreGSCV = imbdDecTreGSCV.fit(imbd_train_data, imbd_train_target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   24.1s
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   25.4s
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=

[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:  8.0min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  8.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  8.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:  8.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:  8.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  8.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  9.3min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 19.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed: 19.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 107 tasks      | elapsed: 19.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed: 19.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed: 20.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed: 20.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 111 tasks      | elapsed: 20.6min
Pickling array (shap

Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed: 28.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 29.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 162 tasks      | elapsed: 29.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 163 tasks      | elapsed: 29.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed: 29.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 29.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype

[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 39.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed: 39.7min
[Parallel(n_jobs=-1)]: Done 215 tasks      | elapsed: 39.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed: 39.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed: 40.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 218 tasks      | elapsed: 40.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 219 tasks      | elapsed: 40.3min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed: 50.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 50.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 50.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 50.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed: 50.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 50.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed: 50.8min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 321 tasks      | elapsed: 59.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed: 59.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 323 tasks      | elapsed: 60.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 324 tasks      | elapsed: 60.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 325 tasks      | elapsed: 60.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 326 tasks      | elapsed: 60.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 327 tasks      | elapsed: 61.3min
Pickling array (shap

##### The best score and associated parameters followed by a table of more detailed results

In [27]:
imbdDecTreGSCV.best_score_

0.71744

In [28]:
for param_name in sorted(imbdDecTreParam.keys()):
    print("%s: %r" % (param_name, imbdDecTreGSCV.best_params_[param_name]))

clf__class_weight: None
clf__criterion: 'gini'
clf__max_depth: None
clf__max_features: None
clf__max_leaf_nodes: None
clf__min_impurity_decrease: 0.0
clf__min_samples_leaf: 1
clf__min_samples_split: 2
clf__min_weight_fraction_leaf: 0.0
clf__random_state: None
clf__splitter: 'best'
ppp__vect__lowercase: True
ppp__vect__max_df: 0.4
ppp__vect__max_features: 10000
ppp__vect__min_df: 3
ppp__vect__ngram_range: (1, 3)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [29]:
df = pd.DataFrame(imbdDecTreGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__class_weight,param_clf__criterion,param_clf__max_depth,param_clf__max_features,param_clf__max_leaf_nodes,param_clf__min_impurity_decrease,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,24.111879,0.822174,0.427985,0.026977,,gini,,,,0,...,0.7104,0.7192,0.7116,0.7216,0.6964,0.702,0.7268,0.71228,0.009873,20
1,36.787525,1.189227,0.669415,0.044335,,gini,,,,0,...,0.7072,0.7292,0.7072,0.7048,0.7076,0.7208,0.7216,0.71188,0.009737,23
2,50.423897,1.215271,0.84292,0.033516,,gini,,,,0,...,0.6996,0.7064,0.7064,0.7096,0.7056,0.7196,0.72,0.71156,0.007274,27
3,25.51023,0.652245,0.448396,0.025997,,gini,,,,0,...,0.7216,0.7136,0.7068,0.7128,0.7004,0.7104,0.7412,0.71436,0.010731,9
4,35.934046,0.872416,0.632812,0.02007,,gini,,,,0,...,0.7176,0.7192,0.7056,0.7012,0.7108,0.7272,0.7204,0.71496,0.008804,5
5,49.607715,0.531786,0.85548,0.021482,,gini,,,,0,...,0.7152,0.7316,0.706,0.696,0.7188,0.7132,0.7152,0.71468,0.008894,6
6,30.55368,1.146425,0.443984,0.026612,,gini,,,,0,...,0.712,0.706,0.7048,0.7236,0.7228,0.6984,0.7336,0.71428,0.010111,10
7,55.597433,1.060411,0.720947,0.040097,,gini,,,,0,...,0.712,0.718,0.702,0.7076,0.7248,0.6932,0.726,0.7112,0.010228,31
8,71.63033,0.781425,0.942187,0.034268,,gini,,,,0,...,0.7132,0.716,0.7124,0.708,0.7272,0.7144,0.7228,0.71544,0.005893,4
9,29.40636,0.668915,0.446656,0.04012,,gini,,,,0,...,0.7132,0.71,0.7156,0.7208,0.72,0.7096,0.7264,0.71616,0.004903,3


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [30]:
imbdDecTrePreProcessingPipe.fit(imbd_train_data)
X_train_DecTre_imbd = imbdDecTrePreProcessingPipe.transform(imbd_train_data)
X_test_DecTre_imbd = imbdDecTrePreProcessingPipe.transform(imbd_test_data)
imbdDecTre = DecisionTreeClassifier()
imbdDecTre.fit(X_train_DecTre_imbd, imbd_train_target)
y_pred_DecTre_imbd = imbdDecTre.predict(X_test_DecTre_imbd)
metrics.accuracy_score(imbd_test_target, y_pred_DecTre_imbd)

0.70084

In [31]:
metrics.confusion_matrix(imbd_test_target, y_pred_DecTre_imbd)

array([[8829, 3671],
       [3808, 8692]], dtype=int64)

In [32]:
print(metrics.classification_report(imbd_test_target, y_pred_DecTre_imbd))

              precision    recall  f1-score   support

           0       0.70      0.71      0.70     12500
           1       0.70      0.70      0.70     12500

    accuracy                           0.70     25000
   macro avg       0.70      0.70      0.70     25000
weighted avg       0.70      0.70      0.70     25000



# Linear SVC - Twenty News Group
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [33]:
twentyLinSVCPreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
twentyLinSVCPipe = Pipeline([
    ('ppp', twentyLinSVCPreProcessingPipe),
    ('clf', LinearSVC())
])
twentyLinSVCParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__penalty': ['l2'],
    'clf__loss': ['squared_hinge'],
    'clf__dual': [True],
    'clf__tol': [0.0001],
    'clf__C': [1.0],
    'clf__multi_class': ['ovr'],
    'clf__fit_intercept': [True],
    'clf__intercept_scaling': [1.0],
    'clf__random_state': [None],
    'clf__max_iter': [1000]
}
twentyLinSVCGSCV = GridSearchCV(twentyLinSVCPipe, twentyLinSVCParam, cv=10, n_jobs=-1, verbose=1000)
twentyLinSVCGSCV = twentyLinSVCGSCV.fit(twenty_train.data, twenty_train.target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.6s
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:  1.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  1.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:  1.3min
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  3.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed:  3.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:  3.3min
Pickl

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed:  4.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  4.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  4.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed:  4.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 127 tasks      | elapsed:  4.9min
Pickl

[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:  6.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  6.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed:  6.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 167 tasks      | elapsed:  6.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  6.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 206 tasks      | elapsed:  8.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed:  8.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:  8.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed:  8.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  8.3min
Pickl

[Parallel(n_jobs=-1)]: Done 247 tasks      | elapsed:  9.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:  9.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed:  9.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed:  9.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 251 tasks      | elapsed:  9.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Para

[Parallel(n_jobs=-1)]: Done 289 tasks      | elapsed: 11.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 290 tasks      | elapsed: 11.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 291 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed: 11.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 294 tasks      | elapsed: 11.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dt

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 331 tasks      | elapsed: 13.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 332 tasks      | elapsed: 13.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 13.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 334 tasks      | elapsed: 13.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 335 tasks      | elapsed: 13.2min
Pickl

##### The best score and associated parameters followed by a table of more detailed results

In [34]:
twentyLinSVCGSCV.best_score_

0.7275941311649284

In [35]:
for param_name in sorted(twentyLinSVCParam.keys()):
    print("%s: %r" % (param_name, twentyLinSVCGSCV.best_params_[param_name]))

clf__C: 1.0
clf__dual: True
clf__fit_intercept: True
clf__intercept_scaling: 1.0
clf__loss: 'squared_hinge'
clf__max_iter: 1000
clf__multi_class: 'ovr'
clf__penalty: 'l2'
clf__random_state: None
clf__tol: 0.0001
ppp__vect__lowercase: True
ppp__vect__max_df: 0.3
ppp__vect__max_features: 100000
ppp__vect__min_df: 2
ppp__vect__ngram_range: (1, 2)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [36]:
df = pd.DataFrame(twentyLinSVCGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__dual,param_clf__fit_intercept,param_clf__intercept_scaling,param_clf__loss,param_clf__max_iter,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,3.383622,0.074279,0.199037,0.023826,1,True,True,1,squared_hinge,1000,...,0.713908,0.704947,0.698849,0.734043,0.699822,0.725979,0.69724,0.70594,0.013453,19
1,7.716829,0.158689,0.273437,0.03714,1,True,True,1,squared_hinge,1000,...,0.707746,0.709364,0.695306,0.73227,0.692718,0.720641,0.691006,0.7032,0.013742,31
2,12.87494,0.219926,0.346874,0.040624,1,True,True,1,squared_hinge,1000,...,0.708627,0.708481,0.696191,0.731383,0.698046,0.718861,0.687444,0.703642,0.012799,28
3,3.195691,0.136224,0.178125,0.022317,1,True,True,1,squared_hinge,1000,...,0.713028,0.704064,0.699734,0.738475,0.697158,0.725979,0.696349,0.705674,0.014736,22
4,7.661218,0.112869,0.261501,0.02311,1,True,True,1,squared_hinge,1000,...,0.709507,0.704947,0.70062,0.734043,0.702487,0.715302,0.692787,0.704614,0.012445,25
5,12.949946,0.145925,0.348437,0.031289,1,True,True,1,squared_hinge,1000,...,0.705986,0.706714,0.69442,0.728723,0.70071,0.715302,0.689225,0.702581,0.011904,34
6,4.522358,0.075484,0.198282,0.026169,1,True,True,1,squared_hinge,1000,...,0.71919,0.742933,0.708592,0.728723,0.71048,0.730427,0.718611,0.719109,0.011579,16
7,11.28931,0.216351,0.298925,0.024668,1,True,True,1,squared_hinge,1000,...,0.737676,0.737633,0.725421,0.734043,0.713144,0.75,0.71683,0.727594,0.012521,1
8,16.509289,0.237606,0.382749,0.031437,1,True,True,1,squared_hinge,1000,...,0.742958,0.74735,0.718335,0.728723,0.709591,0.736655,0.723063,0.724324,0.013401,4
9,4.153693,0.159527,0.201212,0.020743,1,True,True,1,squared_hinge,1000,...,0.732394,0.739399,0.710363,0.733156,0.71492,0.741993,0.712378,0.722114,0.013918,10


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [37]:
twentyLinSVCPreProcessingPipe.fit(twenty_train.data)
X_train_LinSVC_twenty = twentyLinSVCPreProcessingPipe.transform(twenty_train.data)
X_test_LinSVC_twenty = twentyLinSVCPreProcessingPipe.transform(twenty_test.data)
twentyLinSVC = LinearSVC()
twentyLinSVC.fit(X_train_LinSVC_twenty, twenty_train.target)
y_pred_LinSVC_twenty = twentyLinSVC.predict(X_test_LinSVC_twenty)
metrics.accuracy_score(twenty_test.target, y_pred_LinSVC_twenty)

0.6228093467870419

In [38]:
metrics.confusion_matrix(twenty_test.target, y_pred_LinSVC_twenty)

array([[125,   4,   9,   1,   2,   1,   4,  14,   8,   4,   3,   6,   3,
          7,   8,  63,  13,  12,  10,  22],
       [  6, 257,  18,  10,   7,  21,   6,   8,   4,   2,   2,  12,   9,
          5,   7,   2,   1,   5,   2,   5],
       [  3,  25, 191,  42,  14,  22,   7,  20,   6,   2,   1,   7,   7,
         12,  15,   1,   4,   6,   5,   4],
       [  0,  18,  33, 223,  21,  12,  22,  12,   1,   0,   4,   6,  21,
          6,   6,   2,   3,   1,   1,   0],
       [  2,  11,  14,  14, 230,  11,  13,  20,   3,   1,   1,   5,  25,
          7,  10,   4,   3,   2,   5,   4],
       [  0,  41,  19,   3,   9, 282,   6,   6,   1,   3,   2,   3,   4,
          5,   4,   3,   1,   2,   0,   1],
       [  1,   7,   5,  23,  11,   3, 254,  20,   9,   4,   1,   1,  13,
         10,   7,   2,   6,   4,   5,   4],
       [  2,   7,   4,   1,   5,   2,  10, 289,  19,   2,   2,   2,   9,
          7,  13,   4,   3,   6,   5,   4],
       [  4,   7,  12,   1,   7,   2,   5,  38, 247,   5,   4,  

In [39]:
print(metrics.classification_report(twenty_test.target, y_pred_LinSVC_twenty, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.48      0.39      0.43       319
           comp.graphics       0.59      0.66      0.62       389
 comp.os.ms-windows.misc       0.54      0.48      0.51       394
comp.sys.ibm.pc.hardware       0.62      0.57      0.60       392
   comp.sys.mac.hardware       0.70      0.60      0.64       385
          comp.windows.x       0.74      0.71      0.73       395
            misc.forsale       0.68      0.65      0.66       390
               rec.autos       0.48      0.73      0.58       396
         rec.motorcycles       0.71      0.62      0.66       398
      rec.sport.baseball       0.87      0.76      0.81       397
        rec.sport.hockey       0.86      0.87      0.87       399
               sci.crypt       0.65      0.63      0.64       396
         sci.electronics       0.58      0.49      0.53       393
                 sci.med       0.61      0.69      0.65       396
         

# Linear SVC - IBMD Movie Reviews
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [40]:
imbdLinSVCPreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
imbdLinSVCPipe = Pipeline([
    ('ppp', imbdLinSVCPreProcessingPipe),
    ('clf', LinearSVC())
])
imbdLinSVCParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__penalty': ['l2'],
    'clf__loss': ['squared_hinge'],
    'clf__dual': [True],
    'clf__tol': [0.0001],
    'clf__C': [1.0],
    'clf__multi_class': ['ovr'],
    'clf__fit_intercept': [True],
    'clf__intercept_scaling': [1.0],
    'clf__random_state': [None],
    'clf__max_iter': [1000]
}
imbdLinSVCGSCV = GridSearchCV(imbdLinSVCPipe, imbdLinSVCParam, cv=10, n_jobs=-1, verbose=1000)
imbdLinSVCGSCV = imbdLinSVCGSCV.fit(imbd_train_data, imbd_train_target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.3s
Pickling array (shape=(22500,), dtype=int32).[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.7s

Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done   4 tasks 

[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:  3.5min
Pickling array (shape=(22500,), dtype=int32).[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  3.5min

Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  3.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:  3.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:  4.0min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  4.0min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  4.2min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  7.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:  7.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 107 tasks      | elapsed:  7.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  7.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  7.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:  7.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 111 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)

Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed: 11.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 11.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 162 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 163 tasks      | elapsed: 11.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed: 11.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 11.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype

[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 15.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 215 tasks      | elapsed: 15.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed: 15.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed: 15.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 218 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 219 tasks      | elapsed: 15.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)

[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed: 19.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 19.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 19.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 19.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed: 19.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 19.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed: 19.9min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 321 tasks      | elapsed: 23.2min
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed: 23.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 323 tasks      | elapsed: 23.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 324 tasks      | elapsed: 23.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 325 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 326 tasks      | elapsed: 23.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 327 tasks      | elapsed: 24.0min
Pickling array (shap

##### The best score and associated parameters followed by a table of more detailed results

In [41]:
imbdLinSVCGSCV.best_score_

0.82984

In [42]:
for param_name in sorted(imbdLinSVCParam.keys()):
    print("%s: %r" % (param_name, imbdLinSVCGSCV.best_params_[param_name]))

clf__C: 1.0
clf__dual: True
clf__fit_intercept: True
clf__intercept_scaling: 1.0
clf__loss: 'squared_hinge'
clf__max_iter: 1000
clf__multi_class: 'ovr'
clf__penalty: 'l2'
clf__random_state: None
clf__tol: 0.0001
ppp__vect__lowercase: True
ppp__vect__max_df: 0.5
ppp__vect__max_features: 10000
ppp__vect__min_df: 3
ppp__vect__ngram_range: (1, 3)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [43]:
df = pd.DataFrame(imbdLinSVCGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__dual,param_clf__fit_intercept,param_clf__intercept_scaling,param_clf__loss,param_clf__max_iter,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,4.746778,0.055692,0.502305,0.039445,1,True,True,1,squared_hinge,1000,...,0.7884,0.8016,0.8148,0.7908,0.8152,0.8336,0.8028,0.80848,0.016002,30
1,15.096862,0.157941,0.706837,0.068081,1,True,True,1,squared_hinge,1000,...,0.8216,0.8308,0.8268,0.8116,0.8416,0.8408,0.8316,0.82712,0.014301,7
2,29.276928,0.163819,0.890161,0.069851,1,True,True,1,squared_hinge,1000,...,0.82,0.834,0.8336,0.8148,0.8408,0.8428,0.8328,0.82924,0.01258,4
3,4.835676,0.074249,0.436332,0.025552,1,True,True,1,squared_hinge,1000,...,0.7888,0.8048,0.8152,0.7928,0.8148,0.8332,0.8028,0.80892,0.015147,29
4,14.997263,0.167395,0.712038,0.059413,1,True,True,1,squared_hinge,1000,...,0.8188,0.8324,0.8228,0.8144,0.8396,0.844,0.8316,0.82712,0.014575,7
5,29.358516,0.197061,0.939197,0.044214,1,True,True,1,squared_hinge,1000,...,0.8164,0.8356,0.8336,0.8188,0.8376,0.8416,0.8312,0.82848,0.012284,6
6,4.998239,0.079135,0.45663,0.024266,1,True,True,1,squared_hinge,1000,...,0.7376,0.7356,0.7552,0.7488,0.7692,0.7828,0.7676,0.75432,0.018103,35
7,15.594535,0.185717,0.771743,0.049738,1,True,True,1,squared_hinge,1000,...,0.79,0.8152,0.8192,0.8056,0.828,0.8168,0.8168,0.81124,0.014571,24
8,30.09844,0.127237,0.972734,0.060244,1,True,True,1,squared_hinge,1000,...,0.7932,0.8176,0.8196,0.8016,0.8256,0.8192,0.818,0.81172,0.014762,23
9,5.00386,0.108914,0.470423,0.04112,1,True,True,1,squared_hinge,1000,...,0.748,0.746,0.7696,0.7592,0.7844,0.7892,0.7744,0.76576,0.01872,32


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [44]:
imbdLinSVCPreProcessingPipe.fit(imbd_train_data)
X_train_LinSVC_imbd = imbdLinSVCPreProcessingPipe.transform(imbd_train_data)
X_test_LinSVC_imbd = imbdLinSVCPreProcessingPipe.transform(imbd_test_data)
imbdLinSVC = LinearSVC()
imbdLinSVC.fit(X_train_LinSVC_imbd, imbd_train_target)
y_pred_LinSVC_imbd = imbdLinSVC.predict(X_test_LinSVC_imbd)
metrics.accuracy_score(imbd_test_target, y_pred_LinSVC_imbd)

0.81104

In [45]:
metrics.confusion_matrix(imbd_test_target, y_pred_LinSVC_imbd)

array([[10483,  2017],
       [ 2707,  9793]], dtype=int64)

In [46]:
print(metrics.classification_report(imbd_test_target, y_pred_LinSVC_imbd))

              precision    recall  f1-score   support

           0       0.79      0.84      0.82     12500
           1       0.83      0.78      0.81     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000



# AdaBoost Classifier - Twenty News Group
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [47]:
twentyAdaBooPreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
twentyAdaBooPipe = Pipeline([
    ('ppp', twentyAdaBooPreProcessingPipe),
    ('clf', AdaBoostClassifier())
])
twentyAdaBooParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__base_estimator': [twentyDecTre],
    'clf__n_estimators': [50],
    'clf__learning_rate': [1.0],
    'clf__algorithm': ['SAMME.R'],
    'clf__random_state': [None]
}
twentyAdaBooGSCV = GridSearchCV(twentyAdaBooPipe, twentyAdaBooParam, cv=10, n_jobs=-1, verbose=1000)
twentyAdaBooGSCV = twentyAdaBooGSCV.fit(twenty_train.data, twenty_train.target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-21daf6d3ecc643e687a35ac5861102b2.pkl
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling arra

[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   42.6s
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-72cd843b19cb40c58d28ae6d02be6166.pkl
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   43.9s
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed: 21.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-f7ceced090cb4fc3be3609a22d165928.pkl
[Parallel(n_jobs=-1)]: Done  22 tasks      | elapsed: 21.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 22.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-ccdcd338933842448461dda573599db0.pkl
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 22.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 23.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-419e7e025e25461785d2a34f17993873.pkl
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 23.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed: 24.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-58927857a9c5490d8112a8e648b2ae63.pkl
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed: 24.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 25.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-ec63b6924fff4c8e9e994e9269bb6de3.pkl
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed: 26.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 28.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-d5809fe7c60844bdbc6de3633abd4899.pkl
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 28.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed: 29.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-f53a993ba4d3476fb6a4fa7fa276548d.pkl
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed: 29.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 31.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-f62c2624a6bc4d6b819e5e3e7a6b402a.pkl
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed: 31.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 117 tasks      | elapsed: 32.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-2b2a36ab9e79427b8b9419a447a55aae.pkl
[Parallel(n_jobs=-1)]: Done 118 tasks      | elapsed: 32.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 33.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-9b9e3ed0bf5546dcbbc94ba4b190b774.pkl
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed: 33.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed: 34.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-c42272d2bc864d388afe9cf22c6e8c2b.pkl
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 34.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-3dbdd9d7e6764b4385ad63f18b61bcf9.pkl
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 35.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('fea

[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 36.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-c378e0f0931544b69b356bf41e3460c0.pkl
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed: 36.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 177 tasks      | elapsed: 37.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-92fcee58a79a42279e6999e252da4f13.pkl
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed: 37.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed: 38.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-97c77a1f7dd048eaa25b595901ef6ff4.pkl
[Parallel(n_jobs=-1)]: Done 190 tasks      | elapsed: 38.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed: 40.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-d690b07d7e104a35931d786943badd85.pkl
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed: 40.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 42.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-55fd0333e766409ea55d34e727b9bb3b.pkl
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed: 42.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed: 43.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-0148944f7f694b0c96f4cac89a0f9072.pkl
[Parallel(n_jobs=-1)]: Done 226 tasks      | elapsed: 43.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 237 tasks      | elapsed: 45.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-85d98527be0d4386a1226a803da1da8d.pkl
[Parallel(n_jobs=-1)]: Done 238 tasks      | elapsed: 45.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed: 46.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-a25ade48a7ad4b8e8eff08e96c19137a.pkl
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed: 46.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 261 tasks      | elapsed: 47.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-9cac47c4e6834f509a6be5aa594ceb02.pkl
[Parallel(n_jobs=-1)]: Done 262 tasks      | elapsed: 47.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed: 48.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-a45a366bf0854cfe8a54a63fe9fba072.pkl
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed: 48.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 285 tasks      | elapsed: 49.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-dc19d983c7934f6191fa08dbf3118b8e.pkl
[Parallel(n_jobs=-1)]: Done 286 tasks      | elapsed: 49.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed: 50.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-4cb8e297e4ed478499a5d30a236a7a51.pkl
[Parallel(n_jobs=-1)]: Done 298 tasks      | elapsed: 50.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 309 tasks      | elapsed: 51.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-bb8635f560d84935b9715b3068f95389.pkl
[Parallel(n_jobs=-1)]: Done 310 tasks      | elapsed: 51.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 321 tasks      | elapsed: 53.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-5be64880761c4a7c8e8c7c4df0167ec5.pkl
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed: 53.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 55.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-4dce1bbbacec45bbaf6d438dcd726eb9.pkl
[Parallel(n_jobs=-1)]: Done 334 tasks      | elapsed: 55.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

[Parallel(n_jobs=-1)]: Done 345 tasks      | elapsed: 56.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Memmapping (shape=(6619, 1, 20), dtype=float64) to new file C:\Users\Tyler\AppData\Local\Temp\joblib_memmapping_folder_1984_7381848825\1984-2281383102024-25a5b54d1e45465689b803d68134fa27.pkl
[Parallel(n_jobs=-1)]: Done 346 tasks      | elapsed: 56.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(20,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(6619,

##### The best score and associated parameters followed by a table of more detailed results

In [48]:
twentyAdaBooGSCV.best_score_

0.4887749690648754

In [49]:
for param_name in sorted(twentyAdaBooParam.keys()):
    print("%s: %r" % (param_name, twentyAdaBooGSCV.best_params_[param_name]))

clf__algorithm: 'SAMME.R'
clf__base_estimator: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
clf__learning_rate: 1.0
clf__n_estimators: 50
clf__random_state: None
ppp__vect__lowercase: True
ppp__vect__max_df: 0.5
ppp__vect__max_features: 10000
ppp__vect__min_df: 2
ppp__vect__ngram_range: (1, 1)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [50]:
df = pd.DataFrame(twentyAdaBooGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__algorithm,param_clf__base_estimator,param_clf__learning_rate,param_clf__n_estimators,param_clf__random_state,param_ppp__vect__lowercase,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,14.447738,1.567092,0.226007,0.073202,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.483275,0.488516,0.463242,0.503546,0.489343,0.491103,0.487088,0.486123,0.011648,4
1,497.094753,584.329146,0.362689,0.107866,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.502641,0.507951,0.470328,0.491135,0.471581,0.491103,0.469279,0.485328,0.012841,6
2,25.976797,1.446154,0.430074,0.095277,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.483275,0.492933,0.481842,0.494681,0.484014,0.47331,0.47195,0.481439,0.00767,21
3,15.089701,1.618458,0.23727,0.067899,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.488556,0.50265,0.488043,0.485816,0.487567,0.481317,0.494212,0.487096,0.007263,3
4,19.232486,1.729667,0.326562,0.065345,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.486796,0.489399,0.462356,0.485816,0.480462,0.475089,0.473731,0.476224,0.008579,30
5,24.0936,1.627989,0.404688,0.065345,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.486796,0.481449,0.470328,0.486702,0.491119,0.487544,0.476402,0.482941,0.005945,13
6,21.028911,1.190226,0.25404,0.062798,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.480634,0.493816,0.454384,0.47961,0.487567,0.485765,0.469279,0.472865,0.015435,36
7,38.258323,1.027887,0.360938,0.074755,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.482394,0.501767,0.479185,0.48227,0.484902,0.468861,0.470169,0.476047,0.011886,31
8,43.206916,1.476012,0.451562,0.064971,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.497359,0.492933,0.487157,0.492021,0.496448,0.459964,0.476402,0.480202,0.015774,23
9,19.08471,1.087686,0.236707,0.04105,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.471831,0.487633,0.452613,0.495567,0.494671,0.459964,0.495102,0.477285,0.014747,27


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [51]:
twentyAdaBooPreProcessingPipe.fit(twenty_train.data)
X_train_AdaBoo_twenty = twentyAdaBooPreProcessingPipe.transform(twenty_train.data)
X_test_AdaBoo_twenty = twentyAdaBooPreProcessingPipe.transform(twenty_test.data)
twentyAdaBoo = AdaBoostClassifier()
twentyAdaBoo.fit(X_train_AdaBoo_twenty, twenty_train.target)
y_pred_AdaBoo_twenty = twentyAdaBoo.predict(X_test_AdaBoo_twenty)
metrics.accuracy_score(twenty_test.target, y_pred_AdaBoo_twenty)

0.3432023366967605

In [52]:
metrics.confusion_matrix(twenty_test.target, y_pred_AdaBoo_twenty)

array([[  0,   0,   0,   0,   0,   2,   2,   0,   1,   0,   2,   4, 196,
          1,   5,  88,   6,   1,   9,   2],
       [  0,  96,  12,   8,   5,  15,   3,   1,   0,   0,   2,   2, 238,
          0,   7,   0,   0,   0,   0,   0],
       [  0,  21, 131,  18,  12,  19,   1,   1,   0,   0,   1,   0, 180,
          1,   6,   0,   2,   0,   1,   0],
       [  0,  14,  22, 106,   5,   3,   6,   0,   0,   0,   1,   2, 226,
          2,   5,   0,   0,   0,   0,   0],
       [  0,   3,   1,  24, 120,   0,   8,   0,   0,   0,   2,   4, 216,
          0,   3,   2,   0,   0,   2,   0],
       [  0,   9,  25,   3,   3, 155,   1,   0,   0,   0,   0,   3, 187,
          0,   4,   0,   0,   1,   3,   1],
       [  0,   5,   4,  21,   8,   1, 201,   6,   2,   0,   3,   1, 132,
          0,   3,   2,   1,   0,   0,   0],
       [  0,   0,   2,   7,   0,   2,   6, 116,   4,   0,   1,   1, 245,
          0,   1,   1,   9,   0,   1,   0],
       [  0,   0,   0,  11,   0,   1,   6,   6, 126,   1,   1,  

In [53]:
print(metrics.classification_report(twenty_test.target, y_pred_AdaBoo_twenty, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.00      0.00      0.00       319
           comp.graphics       0.62      0.25      0.35       389
 comp.os.ms-windows.misc       0.66      0.33      0.44       394
comp.sys.ibm.pc.hardware       0.50      0.27      0.35       392
   comp.sys.mac.hardware       0.71      0.31      0.43       385
          comp.windows.x       0.76      0.39      0.52       395
            misc.forsale       0.79      0.52      0.62       390
               rec.autos       0.80      0.29      0.43       396
         rec.motorcycles       0.92      0.32      0.47       398
      rec.sport.baseball       0.77      0.18      0.29       397
        rec.sport.hockey       0.65      0.53      0.58       399
               sci.crypt       0.83      0.41      0.54       396
         sci.electronics       0.08      0.87      0.15       393
                 sci.med       0.83      0.16      0.27       396
         

# AdaBoost Classifier - IBMD Movie Reviews
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [54]:
imbdAdaBooPreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
imbdAdaBooPipe = Pipeline([
    ('ppp', imbdAdaBooPreProcessingPipe),
    ('clf', AdaBoostClassifier())
])
imbdAdaBooParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__base_estimator': [imbdDecTre],
    'clf__n_estimators': [50],
    'clf__learning_rate': [1.0],
    'clf__algorithm': ['SAMME.R'],
    'clf__random_state': [None]
}
imbdAdaBooGSCV = GridSearchCV(imbdAdaBooPipe, imbdAdaBooParam, cv=10, n_jobs=-1, verbose=1000)
imbdAdaBooGSCV = imbdAdaBooGSCV.fit(imbd_train_data, imbd_train_target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2

[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  4.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:  5.0min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done  47 tasks      | elapsed:  7.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  7.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 10.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed: 10.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 13.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 13.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed: 17.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done  99 tasks      | elapsed: 17.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 115 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed: 21.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed: 24.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 24.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed: 27.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed: 27.8min
[Parallel(n_jobs=-1)]: Done 151 tasks      | elapsed: 27.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_sample

[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed: 29.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 167 tasks      | elapsed: 29.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 183 tasks      | elapsed: 32.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 33.0min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed: 36.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed: 36.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed: 40.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 218 tasks      | elapsed: 40.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed: 44.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 235 tasks      | elapsed: 44.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 251 tasks      | elapsed: 47.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 252 tasks      | elapsed: 47.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 50.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 50.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 285 tasks      | elapsed: 52.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 286 tasks      | elapsed: 52.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 302 tasks      | elapsed: 55.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 303 tasks      | elapsed: 56.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 319 tasks      | elapsed: 59.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed: 59.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed: 63.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 337 tasks      | elapsed: 63.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float6

[Parallel(n_jobs=-1)]: Done 353 tasks      | elapsed: 67.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 1, 2), dtype=float64).
[Parallel(n_jobs=-1)]: Done 354 out of 360 | elapsed: 67.7min remaining:  1.1min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(2,), dtype=int32).
Pickling array (shape=(1,), dtype=int64).
Pickling array (shape=(4649,), dtype=[('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]).
Pickling array (shape=(4649, 

##### The best score and associated parameters followed by a table of more detailed results

In [55]:
imbdAdaBooGSCV.best_score_

0.71736

In [56]:
for param_name in sorted(imbdAdaBooParam.keys()):
    print("%s: %r" % (param_name, imbdAdaBooGSCV.best_params_[param_name]))

clf__algorithm: 'SAMME.R'
clf__base_estimator: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
clf__learning_rate: 1.0
clf__n_estimators: 50
clf__random_state: None
ppp__vect__lowercase: True
ppp__vect__max_df: 0.4
ppp__vect__max_features: 10000
ppp__vect__min_df: 3
ppp__vect__ngram_range: (1, 3)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [57]:
df = pd.DataFrame(imbdAdaBooGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__algorithm,param_clf__base_estimator,param_clf__learning_rate,param_clf__n_estimators,param_clf__random_state,param_ppp__vect__lowercase,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,24.640606,0.806658,0.445585,0.027843,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.7096,0.718,0.71,0.71,0.7212,0.7104,0.7336,0.71552,0.00815,3
1,35.716688,0.900366,0.655086,0.037491,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.7096,0.718,0.704,0.7104,0.7016,0.72,0.7144,0.71116,0.007248,32
2,49.961094,0.795826,0.836821,0.034192,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.71,0.7168,0.7072,0.714,0.718,0.7152,0.7144,0.71516,0.00582,4
3,24.444622,0.530167,0.418749,0.011693,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.718,0.7108,0.7028,0.7204,0.7084,0.7088,0.7304,0.71356,0.008689,16
4,35.634827,0.741453,0.628123,0.018221,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.7124,0.7184,0.7056,0.702,0.7016,0.72,0.72,0.71368,0.008912,13
5,49.224085,0.692611,0.867527,0.036272,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.7116,0.7156,0.7052,0.7004,0.7212,0.7192,0.7056,0.71236,0.008479,24
6,30.425321,0.98691,0.433552,0.011734,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.714,0.706,0.7048,0.7156,0.7156,0.7092,0.7372,0.71404,0.00858,10
7,55.694503,0.935295,0.715624,0.03409,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.7088,0.7236,0.6972,0.72,0.7196,0.7108,0.7196,0.71364,0.007931,15
8,71.477118,1.051296,0.940677,0.055664,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.7112,0.72,0.7108,0.7152,0.7184,0.72,0.7232,0.71508,0.006027,5
9,29.341814,0.874661,0.442465,0.018502,SAMME.R,"DecisionTreeClassifier(class_weight=None, crit...",1,50,,True,...,0.7156,0.7044,0.7104,0.7088,0.7112,0.7064,0.7256,0.71244,0.006303,23


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [58]:
imbdAdaBooPreProcessingPipe.fit(imbd_train_data)
X_train_AdaBoo_imbd = imbdAdaBooPreProcessingPipe.transform(imbd_train_data)
X_test_AdaBoo_imbd = imbdAdaBooPreProcessingPipe.transform(imbd_test_data)
imbdAdaBoo = AdaBoostClassifier()
imbdAdaBoo.fit(X_train_AdaBoo_imbd, imbd_train_target)
y_pred_AdaBoo_imbd = imbdAdaBoo.predict(X_test_AdaBoo_imbd)
metrics.accuracy_score(imbd_test_target, y_pred_AdaBoo_imbd)

0.80172

In [59]:
metrics.confusion_matrix(imbd_test_target, y_pred_AdaBoo_imbd)

array([[ 9503,  2997],
       [ 1960, 10540]], dtype=int64)

In [60]:
print(metrics.classification_report(imbd_test_target, y_pred_AdaBoo_imbd))

              precision    recall  f1-score   support

           0       0.83      0.76      0.79     12500
           1       0.78      0.84      0.81     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000



# Random Forest Classifier - Twenty News Group
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [62]:
twentyRndForPreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
twentyRndForPipe = Pipeline([
    ('ppp', twentyRndForPreProcessingPipe),
    ('clf', RandomForestClassifier())
])
twentyRndForParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__n_estimators': [100],
    'clf__criterion': ['gini'],
    'clf__max_depth': [None],
    'clf__min_samples_split': [2],
    'clf__min_samples_leaf': [1],
    'clf__min_weight_fraction_leaf': [0.0],
    'clf__max_features': ['auto'],
    'clf__max_leaf_nodes': [None],
    'clf__min_impurity_decrease': [0.0],
    'clf__bootstrap': [True],
    'clf__oob_score': [False],
    'clf__random_state': [None],
    'clf__class_weight': [None],
}
twentyRndForGSCV = GridSearchCV(twentyRndForPipe, twentyRndForParam, cv=10, n_jobs=-1, verbose=1000)
twentyRndForGSCV = twentyRndForGSCV.fit(twenty_train.data, twenty_train.target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   23.3s
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:  4.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  4.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  4.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:  4.6min
Pickl

[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed: 11.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 11.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 11.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed: 12.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed: 12.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Para

[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed: 18.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed: 19.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed: 19.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed: 19.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed: 19.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed: 23.3min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dtype=int32).
Pickling array (shape=(1124,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 23.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed: 23.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 167 tasks      | elapsed: 23.6min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 23.7min
Pickl

[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed: 31.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10191,), dtype=int32).
Pickling array (shape=(1123,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 206 tasks      | elapsed: 31.1min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10174,), dtype=int32).
Pickling array (shape=(1140,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 207 tasks      | elapsed: 31.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed: 31.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed: 32.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Para

[Parallel(n_jobs=-1)]: Done 247 tasks      | elapsed: 38.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed: 38.5min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 249 tasks      | elapsed: 38.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 250 tasks      | elapsed: 38.7min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 251 tasks      | elapsed: 38.9min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Para

Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10176,), dtype=int32).
Pickling array (shape=(1138,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 289 tasks      | elapsed: 43.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10178,), dtype=int32).
Pickling array (shape=(1136,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 290 tasks      | elapsed: 43.0min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 291 tasks      | elapsed: 43.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 292 tasks      | elapsed: 43.2min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed: 43.5min
[Para

[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed: 51.4min
[Parallel(n_jobs=-1)]: Done 331 tasks      | elapsed: 51.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10182,), dtype=int32).
Pickling array (shape=(1132,), dtype=int32).
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10185,), dtype=int32).
Pickling array (shape=(1129,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 332 tasks      | elapsed: 51.4min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10186,), dtype=int32).
Pickling array (shape=(1128,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 51.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10188,), dtype=int32).
Pickling array (shape=(1126,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 334 tasks      | elapsed: 51.8min
[Parallel(n_jobs=-1)]: Done 335 tasks      | elapsed: 51.8min
Pickling array (shape=(11314,), dtype=int32).
Pickling array (shape=(10190,), dt

##### The best score and associated parameters followed by a table of more detailed results

In [63]:
twentyRndForGSCV.best_score_

0.6701431854339756

In [64]:
for param_name in sorted(twentyRndForParam.keys()):
    print("%s: %r" % (param_name, twentyRndForGSCV.best_params_[param_name]))

clf__bootstrap: True
clf__class_weight: None
clf__criterion: 'gini'
clf__max_depth: None
clf__max_features: 'auto'
clf__max_leaf_nodes: None
clf__min_impurity_decrease: 0.0
clf__min_samples_leaf: 1
clf__min_samples_split: 2
clf__min_weight_fraction_leaf: 0.0
clf__n_estimators: 100
clf__oob_score: False
clf__random_state: None
ppp__vect__lowercase: True
ppp__vect__max_df: 0.5
ppp__vect__max_features: 100000
ppp__vect__min_df: 2
ppp__vect__ngram_range: (1, 2)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [65]:
df = pd.DataFrame(twentyRndForGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__bootstrap,param_clf__class_weight,param_clf__criterion,param_clf__max_depth,param_clf__max_features,param_clf__max_leaf_nodes,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,21.470054,0.436407,0.304734,0.033612,True,,gini,,auto,,...,0.644366,0.673145,0.64039,0.667553,0.650977,0.659253,0.638468,0.648931,0.013769,20
1,25.463158,0.441348,0.383366,0.039107,True,,gini,,auto,,...,0.648768,0.658127,0.639504,0.662234,0.660746,0.657473,0.633126,0.647074,0.011797,25
2,30.285887,0.419302,0.456065,0.045565,True,,gini,,auto,,...,0.638204,0.659011,0.650133,0.660461,0.657194,0.664591,0.635797,0.647163,0.012429,23
3,20.626489,0.165671,0.271139,0.023803,True,,gini,,auto,,...,0.639965,0.657244,0.643047,0.661348,0.664298,0.663701,0.637578,0.649019,0.012643,19
4,25.114607,0.220666,0.36163,0.032472,True,,gini,,auto,,...,0.647887,0.672261,0.642161,0.651596,0.646536,0.654804,0.62333,0.645572,0.0134,33
5,30.123475,0.271507,0.453953,0.04734,True,,gini,,auto,,...,0.659331,0.661661,0.637733,0.659574,0.656306,0.66548,0.632235,0.647163,0.014519,23
6,29.641564,0.6077,0.319844,0.027497,True,,gini,,auto,,...,0.65493,0.677562,0.663419,0.657801,0.67762,0.673488,0.646483,0.660951,0.010951,12
7,65.82943,1.131054,0.442187,0.029687,True,,gini,,auto,,...,0.659331,0.694346,0.666076,0.684397,0.669627,0.678826,0.645592,0.667138,0.01424,8
8,71.824054,1.044788,0.523856,0.036453,True,,gini,,auto,,...,0.666373,0.688163,0.672276,0.66578,0.685613,0.69395,0.650935,0.668994,0.014689,5
9,25.104714,0.407396,0.297306,0.016082,True,,gini,,auto,,...,0.651408,0.678445,0.653676,0.655142,0.665187,0.674377,0.64203,0.657239,0.011016,14


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [66]:
twentyRndForPreProcessingPipe.fit(twenty_train.data)
X_train_RndFor_twenty = twentyRndForPreProcessingPipe.transform(twenty_train.data)
X_test_RndFor_twenty = twentyRndForPreProcessingPipe.transform(twenty_test.data)
twentyRndFor = RandomForestClassifier()
twentyRndFor.fit(X_train_RndFor_twenty, twenty_train.target)
y_pred_RndFor_twenty = twentyRndFor.predict(X_test_RndFor_twenty)
metrics.accuracy_score(twenty_test.target, y_pred_RndFor_twenty)



0.39776951672862454

In [67]:
metrics.confusion_matrix(twenty_test.target, y_pred_RndFor_twenty)

array([[ 67,  11,  10,   5,   5,   9,   4,  23,  23,   9,   5,   8,  13,
          9,  12,  62,   8,  14,   8,  14],
       [  6, 162,  43,  18,  27,  35,  23,  20,   8,   6,   4,   4,  14,
          4,   2,   5,   1,   5,   2,   0],
       [  7,  41, 205,  28,  15,  15,   7,  24,   7,   9,   4,   3,   9,
          5,   7,   0,   1,   4,   2,   1],
       [ 12,  37,  55, 121,  35,  14,  37,  20,   7,   4,   6,   5,  14,
          4,   6,   4,   2,   5,   2,   2],
       [  5,  32,  27,  52, 154,  14,  20,  25,   7,   6,   4,   4,  17,
          3,   5,   1,   1,   7,   1,   0],
       [  8,  50,  70,  12,  18, 155,  14,  14,   9,   4,   5,   6,  13,
          3,   3,   3,   1,   4,   3,   0],
       [  4,  14,  12,  21,  16,   7, 253,  21,   7,   7,   4,   3,   7,
          0,   4,   3,   3,   3,   1,   0],
       [ 13,  20,  17,  10,  15,  11,  20, 179,  15,  17,   7,   5,  26,
          8,  13,   2,   3,   6,   5,   4],
       [ 25,  18,  10,   5,  11,   5,  16,  61, 163,   9,  11,  

In [68]:
print(metrics.classification_report(twenty_test.target, y_pred_RndFor_twenty, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.18      0.21      0.19       319
           comp.graphics       0.28      0.42      0.33       389
 comp.os.ms-windows.misc       0.35      0.52      0.42       394
comp.sys.ibm.pc.hardware       0.39      0.31      0.34       392
   comp.sys.mac.hardware       0.37      0.40      0.39       385
          comp.windows.x       0.44      0.39      0.41       395
            misc.forsale       0.50      0.65      0.57       390
               rec.autos       0.27      0.45      0.34       396
         rec.motorcycles       0.40      0.41      0.40       398
      rec.sport.baseball       0.40      0.43      0.42       397
        rec.sport.hockey       0.55      0.64      0.59       399
               sci.crypt       0.60      0.47      0.53       396
         sci.electronics       0.23      0.19      0.21       393
                 sci.med       0.49      0.38      0.43       396
         

# Random Forest Classifier - IBMD Movie Reviews
##### Set up a pre-processing pipeline separately from the main-pipeline so that later we can pre-process before training our final model on the train dataset.  Define the parameters over which to do the GSCV and then perform the GSCV on the set of permutations of those parameters.

In [69]:
imbdRndForPreProcessingPipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('norm', Normalizer())
])
imbdRndForPipe = Pipeline([
    ('ppp', imbdRndForPreProcessingPipe),
    ('clf', RandomForestClassifier())
])
imbdRndForParam = {
    'ppp__vect__strip_accents': ['ascii'],
    'ppp__vect__lowercase': [True],
    'ppp__vect__stop_words': ['english'],
    'ppp__vect__ngram_range': [(1,1), (1,2), (1,3)],
    'ppp__vect__max_df': [0.3, 0.4, 0.5],
    'ppp__vect__min_df': [2, 3],
    'ppp__vect__max_features': [10000, 100000],
    'clf__n_estimators': [100],
    'clf__criterion': ['gini'],
    'clf__max_depth': [None],
    'clf__min_samples_split': [2],
    'clf__min_samples_leaf': [1],
    'clf__min_weight_fraction_leaf': [0.0],
    'clf__max_features': ['auto'],
    'clf__max_leaf_nodes': [None],
    'clf__min_impurity_decrease': [0.0],
    'clf__bootstrap': [True],
    'clf__oob_score': [False],
    'clf__random_state': [None],
    'clf__class_weight': [None],
}
imbdRndForGSCV = GridSearchCV(imbdRndForPipe, imbdRndForParam, cv=10, n_jobs=-1, verbose=1000)
imbdRndForGSCV = imbdRndForGSCV.fit(imbd_train_data, imbd_train_target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   44.4s
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   45.0s
Pickling array (shape=(22500,), dtype=

[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed: 11.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 12.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed: 12.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 12.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed: 13.4min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 40.2min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed: 40.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 107 tasks      | elapsed: 40.4min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed: 40.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed: 41.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed: 41.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 111 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)

[Parallel(n_jobs=-1)]: Done 159 tasks      | elapsed: 53.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed: 53.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 54.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 162 tasks      | elapsed: 54.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 163 tasks      | elapsed: 54.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed: 54.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 55.5min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 69.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed: 69.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 215 tasks      | elapsed: 69.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed: 69.8min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed: 70.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 218 tasks      | elapsed: 70.5min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 219 tasks      | elapsed: 70.6min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed: 84.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 84.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 85.4min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 85.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed: 85.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 85.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed: 86.1min
Pickling array (shap

[Parallel(n_jobs=-1)]: Done 321 tasks      | elapsed: 98.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed: 98.9min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 323 tasks      | elapsed: 99.7min
[Parallel(n_jobs=-1)]: Done 324 tasks      | elapsed: 99.7min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 325 tasks      | elapsed: 100.3min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 326 tasks      | elapsed: 100.6min
Pickling array (shape=(22500,), dtype=int32).
Pickling array (shape=(2500,), dtype=int32).
[Parallel(n_jobs=-1)]: Done 327 tasks      | elapsed: 101.3min
[Parallel(n_jobs=

##### The best score and associated parameters followed by a table of more detailed results

In [70]:
imbdRndForGSCV.best_score_

0.8522

In [71]:
for param_name in sorted(imbdRndForParam.keys()):
    print("%s: %r" % (param_name, imbdRndForGSCV.best_params_[param_name]))

clf__bootstrap: True
clf__class_weight: None
clf__criterion: 'gini'
clf__max_depth: None
clf__max_features: 'auto'
clf__max_leaf_nodes: None
clf__min_impurity_decrease: 0.0
clf__min_samples_leaf: 1
clf__min_samples_split: 2
clf__min_weight_fraction_leaf: 0.0
clf__n_estimators: 100
clf__oob_score: False
clf__random_state: None
ppp__vect__lowercase: True
ppp__vect__max_df: 0.5
ppp__vect__max_features: 100000
ppp__vect__min_df: 2
ppp__vect__ngram_range: (1, 2)
ppp__vect__stop_words: 'english'
ppp__vect__strip_accents: 'ascii'


In [72]:
df = pd.DataFrame(imbdRndForGSCV.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__bootstrap,param_clf__class_weight,param_clf__criterion,param_clf__max_depth,param_clf__max_features,param_clf__max_leaf_nodes,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,41.960152,0.845471,0.621101,0.068219,True,,gini,,auto,,...,0.8396,0.8472,0.8436,0.8224,0.8508,0.8448,0.8472,0.83944,0.008998,34
1,51.99537,0.355676,0.800865,0.052711,True,,gini,,auto,,...,0.8464,0.8512,0.8424,0.8228,0.8548,0.8528,0.8436,0.84396,0.008866,14
2,66.594327,0.367912,1.050442,0.069891,True,,gini,,auto,,...,0.8488,0.8468,0.848,0.8248,0.8532,0.8452,0.8436,0.84292,0.007734,17
3,41.266229,0.22483,0.566452,0.039175,True,,gini,,auto,,...,0.8436,0.8508,0.8376,0.8232,0.8544,0.8424,0.8392,0.83972,0.008813,33
4,52.190157,0.324156,0.790664,0.053077,True,,gini,,auto,,...,0.846,0.8548,0.834,0.8292,0.8552,0.8476,0.8412,0.8424,0.008472,20
5,66.34753,0.538432,0.968181,0.02591,True,,gini,,auto,,...,0.8464,0.8476,0.8432,0.8304,0.8504,0.8436,0.8468,0.84324,0.006067,16
6,48.152978,0.497628,0.648265,0.036268,True,,gini,,auto,,...,0.858,0.8516,0.8416,0.834,0.8436,0.8564,0.8428,0.8454,0.007465,13
7,378.678791,364.257245,1.053095,0.117386,True,,gini,,auto,,...,0.8512,0.8584,0.854,0.8344,0.8524,0.8552,0.8476,0.8512,0.006432,2
8,98.389726,5.657128,1.234164,0.070403,True,,gini,,auto,,...,0.8524,0.8552,0.844,0.8332,0.8612,0.8544,0.8564,0.84988,0.007676,6
9,46.436692,0.334707,0.645855,0.028292,True,,gini,,auto,,...,0.8436,0.8568,0.84,0.8296,0.8428,0.8436,0.8412,0.84076,0.007062,30


##### Use the pre-processing pipe to pre-process both the train and test data separately, then fit a model with the above best parameters to the train data and evaluate the result on the test data

In [73]:
imbdRndForPreProcessingPipe.fit(imbd_train_data)
X_train_RndFor_imbd = imbdRndForPreProcessingPipe.transform(imbd_train_data)
X_test_RndFor_imbd = imbdRndForPreProcessingPipe.transform(imbd_test_data)
imbdRndFor = RandomForestClassifier()
imbdRndFor.fit(X_train_RndFor_imbd, imbd_train_target)
y_pred_RndFor_imbd = imbdRndFor.predict(X_test_RndFor_imbd)
metrics.accuracy_score(imbd_test_target, y_pred_RndFor_imbd)



0.73584

In [74]:
metrics.confusion_matrix(imbd_test_target, y_pred_RndFor_imbd)

array([[10016,  2484],
       [ 4120,  8380]], dtype=int64)

In [75]:
print(metrics.classification_report(imbd_test_target, y_pred_RndFor_imbd))

              precision    recall  f1-score   support

           0       0.71      0.80      0.75     12500
           1       0.77      0.67      0.72     12500

    accuracy                           0.74     25000
   macro avg       0.74      0.74      0.73     25000
weighted avg       0.74      0.74      0.73     25000

