# Challenge : prediction of the sex of individuals based on their picture.
###  by Benjamin LAZARD

First, let us import all libraries that we will use for this study.

In [1]:
#basic python packages for plotting and array management
import numpy as np
import matplotlib.pyplot as plt

#for data import
import pandas as pd

#Preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

#Classifiers
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, PassiveAggressiveClassifier, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier

#Ensemble methods and crossvalidation
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier,RandomForestClassifier, VotingClassifier

#PostProcessing
from sklearn.metrics import classification_report, confusion_matrix



#Because oh boy some computations take an amazing amount of time !
import time

# To compute the personalized score
def compute_pred_score(y_true, y_pred):
    y_pred_unq =  np.unique(y_pred)
    for i in y_pred_unq:
        if((i != -1) & (i!= 1) & (i!= 0) ):
            raise ValueError('The predictions can contain only -1, 1, or 0!')
    y_comp = y_true * y_pred
    score = float(10*np.sum(y_comp == -1) + np.sum(y_comp == 0))
    score /= y_comp.shape[0]
    return score

Then, let us import the data

In [2]:
X_train_fname = 'training_templates.csv'
y_train_fname = 'training_labels.txt'
X_test_fname  = 'testing_templates.csv'
X_train = pd.read_csv(X_train_fname, sep=',', header=None).values
X_test  = pd.read_csv(X_test_fname,  sep=',', header=None).values
y_train = np.loadtxt(y_train_fname, dtype=np.int)

Now, let us sum up what we are dealing with:

In [3]:
print("We will train our algorithm based on a set of %d pictures, each with %d features."%(X_train.shape[0],X_train.shape[1]))
print("Then we will test it on a set of %d pictures with the same number of features."%(X_test.shape[0]))
print("\nThe training set consists of labels: ")
print(np.unique(y_train))
print("for exemple '-1' = women and '1' = men")
print("There are exactly %d men and %d women" %((y_train == -1).sum(),(y_train == 1).sum() ))

We will train our algorithm based on a set of 105600 pictures, each with 128 features.
Then we will test it on a set of 8496 pictures with the same number of features.

The training set consists of labels: 
[-1  1]
for exemple '-1' = women and '1' = men
There are exactly 52800 men and 52800 women


## Step 1: 
trying to preprocess the data, and check the difference in score for 

In [11]:
X_train_scaled = preprocessing.scale(X)

#score of logistic regression with regular data
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("Logistic regression / training test / regular score= %0.3f"%(score))

#Score with standardized data
clf.fit(X_train_scaled, y_train)
y_pred_train = clf.predict(X_train_scaled)
score = compute_pred_score(y_train, y_pred_train)
print("Logistic regression / training test / scaled score= %0.3f"%(score))

Logistic regression / training test / regular score= 0.639
Logistic regression / training test / scaled score= 0.636


So at least for this particular example, scaling does not seem very useful

# Step 2
## RidgeClassifier
Trying a few linear models on both the regular and scaled data

In [16]:
#Classification based on ridge regression
clf = RidgeClassifier(normalize=False)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("RidgeClassifier / training test / regular / score= %0.3f"%(score))

#same thing, but with normalzed data
clf = RidgeClassifier(normalize=True)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("RidgeClassifier / training test / normalized / score= %0.3f"%(score))


RidgeClassifier / training test / regular / score= 0.646
RidgeClassifier / training test / normalized / score= 0.867


From this we can deduce that
- It does take long to process
- Normalized data still do not score very well

Because this method is very fast to compute, we can now try to optimize the regularization paramter alpha performing a grid search

In [66]:
clf = RidgeClassifierCV(alphas = [1e-7, 1e-5, 1e-3, 1e-2, 0.1 , 1, 10, 1e2, 1e3, 1e5, 1e7], normalize=False)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("RidgeClassifierCV / training test / regular / score= %0.3f"%(score))
print("chosen alpha = %.2f"%(clf.alpha_))

RidgeClassifierCV / training test / regular / score= 0.646
chosen alpha =0.10


So we can narrow our search around 0.1, after several tries, I was led to the following test

In [70]:
clf = RidgeClassifierCV(alphas = [0.32, 0.33, 0.34, 0.345, 0.35, 0.355, 0.36], normalize=False)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("RidgeClassifierCV / training test / regular / score= %0.3f"%(score))
print("chosen alpha =%.3f"%(clf.alpha_))

RidgeClassifierCV / training test / regular / score= 0.646
chosen alpha =0.350


Showing that the score cannot be better that 0.646 with $\alpha = 0.35$
So let us move on to another classifier such as... 
## Perceptron

In [88]:
#Simple perceptron
clf = Perceptron()
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("Perceptron / training test / regular / score= %0.3f"%(score))

#Fine tuning its outcome (after several tries for eta0)
tuned_parameters = [{'eta0': [0.0001, 0.005, 0.01, 0.02, 1],}]
clf = GridSearchCV(Perceptron(n_jobs=-1, n_iter=10), param_grid= tuned_parameters, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("Perceptron / training test / regular / score= %0.3f for params= %a"%(score,clf.best_params_))

Perceptron / training test / regular / score= 0.902
Perceptron / training test / regular / score= 0.921 for params={'eta0': 0.0001}


This is still no better than the logisitic regressor. And the eta0 parameter seems to have only very little impact. Scaled data gave even worse results and therefore are not shown here.
Let us move on to a variation of the Perceptron :
## the Passive AgressiveClassifier

In [100]:
#Simple PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier()
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("PassiveAggressiveClassifier / training test / regular / score= %0.3f"%(score))

#Fine tuning its outcome (after several tries)
tuned_parameters = [{'C': [ 0.01, 0.02, 0.025, 0.03, 0.04, 0.1],
                     'loss': ['hinge', 'squared_hinge'],
                     'n_iter': [10, 20],
                     'n_jobs' :[-1]
                    }]
clf = GridSearchCV(PassiveAggressiveClassifier(n_iter=10, n_jobs=2), param_grid= tuned_parameters, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("PassiveAggressiveClassifier / training test / regular / score= %0.3f for params= %a"%(score,clf.best_params_))

PassiveAggressiveClassifier / training test / regular / score= 0.765
PassiveAggressiveClassifier / training test / regular / score= 0.683 for params= {'n_jobs': -1, 'loss': 'hinge', 'n_iter': 20, 'C': 0.04}


So it is still worse than the **logisticRegressor**, but it is not so bad.
## Let us try out the SGDClassifier

In [116]:
#Simple SGDClassifier
clf = SGDClassifier()
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("SGDClassifier / training test / regular / score= %0.3f"%(score))

#Fine tuning its outcome (after several tries)
sgd_tuned_parameters = [{'loss': ['log', 'hinge'],
                         'penalty' : ['none', 'l2'],
                         'alpha': [0.0001],
                         'eta0' : [0.008, 0.1, 0.2],
                         'learning_rate': ['optimal'],
                         'n_iter': [10],
                         'n_jobs' :[-1],
                        },
                        {'loss': ['log', 'hinge'],
                         'penalty' : ['l2', 'elasticnet'],
                         'alpha': [0.0001],
                         'learning_rate': ['constant', 'invscaling'],
                         'eta0' : [0.001, 0.1, 1, 10],
                         'n_iter': [10],
                         'n_jobs' :[-1],
                        },
                        {'loss': ['log', 'hinge'],
                         'penalty' : ['l2'],
                         'alpha': [0.0001],
                         'learning_rate': ['invscaling'],
                         'eta0' : [0.01, 0.1, 0.5, 1],
                         'power_t' :[0.1, 0.5, 1],
                         'n_iter': [10],
                         'n_jobs' :[-1],
                        },
                        {'loss': ['hinge'],
                         'penalty' : ['elasticnet'],
                         'l1_ratio' : [0.05, 0.1, 0.15],
                         'alpha': [0.0001],
                         'learning_rate': ['invscaling'],
                         'eta0' : [0.008, 0.1, 0.2],
                         'power_t' :[0.2, 0.5, 0.7],
                         'n_iter': [10],
                         'n_jobs' :[-1],
                        }]
clf = GridSearchCV(SGDClassifier(), param_grid= sgd_tuned_parameters, n_jobs= -1, cv=5)
clf.fit(X_train_scaled, y_train)
y_pred_train = clf.predict(X_train_scaled)
score = compute_pred_score(y_train, y_pred_train)
print("SGDClassifier / training test / regular / score= %0.3f for params= %a"%(score, clf.best_params_))

SGDClassifier / training test / regular / score= 0.648
SGDClassifier / training test / regular / score= 0.630 for params= {'penalty': 'l2', 'eta0': 0.1, 'n_iter': 10, 'learning_rate': 'invscaling', 'n_jobs': -1, 'alpha': 0.0001, 'loss': 'hinge'}


I tested, and it works a lot better with scaled data
The true grid is 
```python
    sgd_tuned_parameters = [{'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                     'penalty' : ['none', 'l2', 'l1', 'elasticnet'],
                     'alpha': [0.0001],
                     'learning_rate': ['constant', 'optimal', 'invscaling'],
                     'eta0' : [0.0001, 0.001, 0.1, 1],
                     'n_iter': [20],
                     'n_jobs' :[-1],
                     'power_t' :[0.5]
                    }]
```
But is takes too much time to run as it involves a lot of parameters. Moreover, it is irrelevant to test power_t when there is no invscaling involved for example.

**score= 0.636 for params= {'learning_rate': 'optimal', 'loss': 'log', 'n_iter': 20, 'penalty': 'none', 'n_jobs': -1, 'alpha': 0.0001}**  
led me to choose alpha, keep the log and hinge loss only in mind  
**score= 0.637 for params= {'eta0': 1, 'loss': 'log', 'n_iter': 10, 'penalty': 'elasticnet', 'learning_rate': 'invscaling', 'n_jobs': -1, 'alpha': 0.0001}**

OK, so I get a better score using  
**score= 0.632 for params= {'penalty': 'elasticnet', 'eta0': 0.1, 'n_iter': 10, 'l1_ratio': 0.1, 'learning_rate': 'invscaling', 'n_jobs': -1, 'power_t': 0.5, 'alpha': 0.0001, 'loss': 'hinge'}**

For the first time, we get something better than with logisticregressions  
**score= 0.630 for params= {'penalty': 'l2', 'eta0': 0.1, 'n_iter': 10, 'learning_rate': 'invscaling', 'n_jobs': -1, 'alpha': 0.0001, 'loss': 'hinge'}**

# Step 3

Now that we have tried all the linear models, as computing time drastically increases with other methods, we will have to adapt the data for other tries (maybe then, once we find good estimators, we can train them on the full set of features).


In [15]:
# dimensionality reduction ratio:
ratio_dr = 70 # as a percentage
my_ncomp = round(X_train.shape[1]*ratio_dr/100) # effective number of features retained
print("%d features selected out of %d (%d %%) for LDA"%(my_ncomp, X_train.shape[1], ratio_dr))

#Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis(n_components=my_ncomp, solver='svd').fit(X_train, y_train)
X_lda = lda.transform(X_train)
print(X_lda.shape)

y_pred_train = lda.predict(X_train)
score = compute_pred_score(y_train, y_pred_train)
print("LDAClassifier / training test / regular / score= %0.3f for dimensionality= %a"%(score, my_ncomp))

90 features selected out of 128 (70 %) for LDA
(105600, 1)
LDAClassifier / training test / regular / score= 0.647 for dimensionality= 90


In [3]:
# dimensionality reduction ratio:
ratio_dr = 100 # as a percentage
my_ncomp = round(X_train.shape[1]*ratio_dr/100) # effective number of features retained
print("%d features selected out of %d (%d %%) for PCA"%(my_ncomp, X_train.shape[1], ratio_dr))

#Principal components analysis
pca = PCA(n_components=my_ncomp, svd_solver='full', whiten=True).fit(X_train)
X_pca = pca.transform(X_train)

#Explained variance ratio
print((pca.explained_variance_ratio_*1000).round())

#Classify with the best classifier found earlier
clf = SGDClassifier(penalty='l2', eta0=0.1, n_iter=10, learning_rate='invscaling', n_jobs=-1, alpha=0.0001, loss='hinge')#penalty='l2', eta0=0.1, n_iter=10, learning_rate='invscaling', n_jobs=-1, alpha=0.0001, loss='hinge'
clf.fit(X_pca, y_train)
y_pred_train = clf.predict(X_pca)
score = compute_pred_score(y_train, y_pred_train)
print("SGD / training test / pca nf= %d / score= %0.3f"%(my_ncomp, score))

#Because of the previous analysis, we can try to simplify the number of features:
ratio_dr = 60
my_ncomp = round(X_train.shape[1]*ratio_dr/100)
pca = PCA(n_components=my_ncomp, svd_solver='full', whiten=True).fit(X_train)
X_pca = pca.transform(X_train)
print((pca.explained_variance_ratio_*1000).round())
clf = SGDClassifier(penalty='l2', eta0=0.1, n_iter=10, learning_rate='invscaling', n_jobs=-1, alpha=0.0001, loss='hinge')#penalty='l2', eta0=0.1, n_iter=10, learning_rate='invscaling', n_jobs=-1, alpha=0.0001, loss='hinge'
clf.fit(X_pca, y_train)
y_pred_train = clf.predict(X_pca)
score = compute_pred_score(y_train, y_pred_train)
print("SGD / training test / pca nf= %d / score= %0.3f"%(my_ncomp, score))

128 features selected out of 128 (100 %) for PCA
[ 31.  23.  22.  21.  21.  20.  20.  20.  19.  18.  18.  18.  17.  17.  16.
  16.  16.  15.  15.  15.  14.  14.  14.  13.  13.  12.  12.  12.  12.  12.
  11.  11.  11.  11.  11.  10.  10.  10.  10.  10.   9.   9.   9.   9.   9.
   9.   8.   8.   8.   8.   8.   8.   8.   8.   7.   7.   7.   7.   7.   7.
   7.   7.   6.   6.   6.   6.   6.   6.   6.   6.   6.   5.   5.   5.   5.
   5.   5.   5.   5.   5.   5.   4.   4.   4.   4.   4.   4.   4.   4.   4.
   4.   4.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.
   2.   2.   2.   2.   2.   2.   2.   2.   2.   1.   1.   1.   1.   1.   1.
   1.   1.   1.   1.   1.   1.   0.   0.]
SGD / training test / pca nf= 128 / score= 0.629
[ 31.  23.  22.  21.  21.  20.  20.  20.  19.  18.  18.  18.  17.  17.  16.
  16.  16.  15.  15.  15.  14.  14.  14.  13.  13.  12.  12.  12.  12.  12.
  11.  11.  11.  11.  11.  10.  10.  10.  10.  10.   9.   9.   9.   9.   9.
   9.   8.   8.   8.   8

I did not manage to understand how LDA works... I even  think there might be a problem, because for n_comp = 0, I still had the very same score.

So as for PCA, it does decrease a lot the score on the previous best estimator, when dimensionality is reduced... But it is not much of a surprise.  

The array printed before shows the realtive importance of the different parameters. As The most significant parameter has a variance ratio of 31 (%1e5), maybe it is safe to consider only features that have more than 6(%e15) variance ratio. That corresponds to 74 features : **roughly 60%** of he original set.

# Step 4
Now let us use pca simplification to compute a SVC. It takes so much time, that we will also train it on a subset of the global data
## SVC + PCA

In [128]:
#ratio of training data among the total set available that will be used for fitting the SVC classifiers
ratio_sd = 50 #as a percentage
n_sd = X_train.shape[0]*ratio_sd/100
X_train_svc, y_train_svc = shuffle(X_pca, y_train, n_samples=n_sd )

#Simple SVC
#X_train_svc.flags['C_CONTIGUOUS'] # check True to improve the speed of the algorithm
print("Currently training the SVC with following parameters : \n PCA reduction ratio= %d \n Training Sample Ratio= %d"%(ratio_dr, ratio_sd))
clf = SVC(kernel='rbf', gamma='auto', cache_size=2500)

start = time.time()
clf.fit(X_train_svc, y_train_svc)
print("total timed used for fitting: %0.3f s"%(time.time() - start))

y_pred_train = clf.predict(X_pca)
score = compute_pred_score(y_train, y_pred_train)
print("default SVC / training test / pca nf= %d / score= %0.3f"%(my_ncomp, score))
print("\n\nClassification report")
print(classification_report(y_train, y_pred_train))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train, y_pred_train))

  indices = indices[:max_n_samples]


Currently training the SVC with following parameters : 
 PCA reduction ratio= 100 
 Training Sample Ratio= 50


KeyboardInterrupt: 

So it is the best estimator so far, let us perform a grid search to make sure we get the best of it.

In [29]:
#ratio of training data among the total set available that will be used for fitting the SVC classifiers
ratio_sd = 50 #as a percentage
n_sd = X_train.shape[0]*ratio_sd/100
X_train_svc, y_train_svc = shuffle(X_pca, y_train, n_samples=n_sd )

#definining the search parameters
svc_tuned_parameters = [{'kernel': ['rbf'], 
                         'gamma': [.1, 10, 1000, 'auto'],
                         'C': [1],
                         'cache_size' : [1000],
                        }]

#Performing the gridsearch
clf = GridSearchCV(SVC(shrinking=True), param_grid= svc_tuned_parameters, n_jobs= 4, cv=3)
start = time.time()
clf.fit(X_train_svc, y_train_svc)
print("total time used for GridSearch fitting: %0.3f s"%(time.time() - start))
y_pred_train = clf.predict(X_pca)

#Score and detailed feedback
score = compute_pred_score(y_train, y_pred_train)
print("Tuned / training test reduced to %d samples / pca nf= %d / score= %0.3f for params= %a"%( n_sd, my_ncomp, score, clf.best_params_))

results = pd.DataFrame(clf.cv_results_)
print(results)

print("\n\nClassification report for the best estimator")
print(classification_report(y_train, y_pred_train))
print("\n\nConfusion matrix for the best estimator")
print(confusion_matrix(y_train, y_pred_train))

  indices = indices[:max_n_samples]


total time used for GridSearch fitting: 3692.115 s
Tuned / training test reduced to 52800 samples / pca nf= 77 / score= 0.166 for params= {'gamma': 'auto', 'cache_size': 1000, 'kernel': 'rbf', 'C': 1}


NameError: name 'display' is not defined

Because the ideal gridsearch below would be way too time-consuming, 
```python
svc_tuned_parameters = [{'kernel': ['rbf'], 
                         'gamma': [1e-3, 1e-4, 'auto'],
                         'C': [0.01, 0.1, 1, 10],
                         'cache_size' : [1000],
                        },
                        {'kernel': ['poly'],
                         'C': [0.1, 1, 10],
                         'deg': [3, 5],
                         'cache_size' : [1000],
                        }]
```
I had to adapt.  

I started with default parameters, and tried to choose gamma (within a range like logspace (1e-3, 1e3)). Obviously, the best result is always obtained for *'auto'*, which, according to the documentation corresponds to $\frac{1}{n_{sd}}$

Now let us try to select C.

In [30]:
#definining the search parameters
svc_tuned_parameters = [{'kernel': ['rbf'], 
                         'gamma': ['auto'],
                         'C': [0.0001, 0.1, 1, 100, 1000, 10000],
                         'cache_size' : [1000],
                        }]

#Performing the gridsearch
clf = GridSearchCV(SVC(shrinking=True), param_grid= svc_tuned_parameters, n_jobs= 4, cv=3)
start = time.time()
clf.fit(X_train_svc, y_train_svc)
print("total time used for GridSearch fitting: %0.3f s"%(time.time() - start))
y_pred_train = clf.predict(X_pca)

#Score and detailed feedback
score = compute_pred_score(y_train, y_pred_train)
print("Tuned SVM/ training test reduced to %d samples / pca nf= %d / score= %0.3f for params= %a"%( n_sd, my_ncomp, score, clf.best_params_))

results = pd.DataFrame(clf.cv_results_)
print(results)

print("\n\nClassification report for the best estimator")
print(classification_report(y_train, y_pred_train))
print("\n\nConfusion matrix for the best estimator")
print(confusion_matrix(y_train, y_pred_train))

total time used for GridSearch fitting: 8092.161 s
Tuned / training test reduced to 52800 samples / pca nf= 77 / score= 0.116 for params= {'gamma': 'auto', 'C': 100, 'kernel': 'rbf', 'cache_size': 1000}
   mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
0    6804.517463        95.351618         0.500152          0.500152  0.0001   
1    2615.046944        30.464952         0.962481          0.967472     0.1   
2      63.202128        16.544076         0.976742          0.991487       1   
3      71.447156        16.906226         0.977008          0.999991     100   
4      70.249998        16.874936         0.977008          0.999991   10000   

  param_cache_size param_gamma param_kernel  \
0             1000        auto          rbf   
1             1000        auto          rbf   
2             1000        auto          rbf   
3             1000        auto          rbf   
4             1000        auto          rbf   

                                 

It is frankly awesome : we get a score of 0.116 on the training test with params= {'gamma': 'auto', 'C': 100, 'kernel': 'rbf', 'cache_size': 1000}. Never performed so good before. I just should pay attention to overfitting, but it looks good.

In [33]:
clf = SVC(kernel='rbf', gamma='auto', C=100, cache_size=2500, probability=True)

start = time.time()
clf.fit(X_train_svc, y_train_svc)
print("total timed used for fitting: %0.3f s"%(time.time() - start))

y_pred_train = clf.predict(X_pca)
score = compute_pred_score(y_train, y_pred_train)
print("Optimal SVC / training test / pca nf= %d / score= %0.3f"%(my_ncomp, score))
print("\n\nClassification report")
print(classification_report(y_train, y_pred_train))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train, y_pred_train))

total timed used for fitting: 2052.918 s
default SVC / training test / pca nf= 77 / score= 0.116


Classification report
             precision    recall  f1-score   support

         -1       0.99      0.99      0.99     52800
          1       0.99      0.99      0.99     52800

avg / total       0.99      0.99      0.99    105600



Confusion matrix
[[52155   645]
 [  578 52222]]


## Here comes the label 0
Improvement of the score by attribution of the label '0' when probability of prediction in one of the classes is too slow

In [75]:
#Test on single value :
test = X_pca[0].reshape(1,-1)
print("predicted class = %d with probaset = %s" % (clf.predict(test),clf.predict_proba(test) ))

#Test on a larger scale:
test = X_pca[10:15]
print("predicted classes = %s with probaset = %s" % (clf.predict(test),clf.predict_proba(test) ))

#Full scale probability set
prediction_set = clf.predict_proba(X_pca) #For each index of X_pca, the proba that it belongs to class -1 and then 1
print(prediction_set[0], prediction_set[10:15])

#Adaptating the classifier
y_pred_train_with0 = np.zeros(y_pred_train.shape) #just to initialize the size
y_pred_train_with0[prediction_set[:,0] > 0.85] = -1 #if we know that more that 85% that it is class -1
y_pred_train_with0[prediction_set[:,1] > 0.85] = 1  #if we know that more that 85% that it is class 1
#The other values are already set to 0 (less than 85% chance that it belongs to one class or the other)

#Comparing the score with and without this postprocessing
score = compute_pred_score(y_train, y_pred_train)
print("Optimal SVC / training test / pca nf= %d / WITHOUT 0 processing score= %0.3f"% (my_ncomp, score))
score = compute_pred_score(y_train, y_pred_train_with0)
print("Optimal SVC / training test / pca nf= %d / WITH 0 processing score= %0.3f"% (my_ncomp, score))

predicted class = 1 with probaset = [[ 0.00478337  0.99521663]]
predicted classes = [1 1 1 1 1] with probaset = [[ 0.01570091  0.98429909]
 [ 0.01096714  0.98903286]
 [ 0.04554413  0.95445587]
 [ 0.01569454  0.98430546]
 [ 0.38671378  0.61328622]]
[ 0.00478337  0.99521663] [[ 0.01570091  0.98429909]
 [ 0.01096714  0.98903286]
 [ 0.04554413  0.95445587]
 [ 0.01569454  0.98430546]
 [ 0.38671378  0.61328622]]
Optimal SVC / training test / pca nf= 77 / WITHOUT 0 processing score= 0.116
Optimal SVC / training test / pca nf= 77 / WITH 0 processing score= 0.083


Trying this processus with the test data and sending it online

In [74]:
#Adapting the test set
XX = pca.transform(X_test)
prediction_set = clf.predict_proba(XX) #For each index of XX, the proba that it belongs to class -1 and then 1

#Adaptating the classifier
y_pred_test_with0 = np.zeros(XX.shape[0]) #just to initialize the size
y_pred_test_with0[prediction_set[:,0] > 0.7] = -1
y_pred_test_with0[prediction_set[:,1] > 0.7] = 1
#The other values are already set to 0

#saving results to a textfile
np.savetxt('y_pred2.txt', y_pred_test_with0, fmt='%d')

F\*\*\* ! It is still a "low" score of 0.3 on the test set... 
As I don't want to spend more time fine-tuning this estimator, or trying ensemble methods with it, because it has a long computing time, I will try other classifiers

# Interlude

## Making sure we have the appropriate datasets
As I tried many things before, I want to make sure that we start this section with a clean, full dataset

In [89]:
############### Standardization of the data
myScaler = StandardScaler()
X_train_scaled = myScaler.fit_transform(X_train)
X_test_scaled = myScaler.transform(X_test)

############### PCA : manual procedure
# ratio_dr = 100 # dimensionality reduction ratio as a percentage
# my_ncomp = round(X_train.shape[1]*ratio_dr/100) # effective number of features retained
# print("%d features selected out of %d (%d %%) for PCA"%(my_ncomp, X_train.shape[1], ratio_dr))

# #PCA in effect with manual selection
# pca_scaled = PCA(svd_solver='full', whiten=True, n_components=my_ncomp).fit(X_train_scaled)
# X_pca_scaled = pca_scaled.transform(X_train_scaled)

############### PCA : automatic procedure
#select the variance ratio such that the number of components explains at least this ratio. 
var_ratio_min = 99.9 #as a percentage (float number 0<num<100 strictly)

#PCA in effect with automatic selection of variables to keep
pca_scaled = PCA(svd_solver='full', whiten=True, n_components=var_ratio_min/100).fit(X_train_scaled)
X_pca_scaled = pca_scaled.transform(X_train_scaled)
print("%d features selected out of %d (%d %%) for PCA which explains %d %% of variance"%(pca_scaled.n_components_, X_train.shape[1], pca_scaled.n_components_/X_train.shape[1]*100, pca_scaled.explained_variance_ratio_.sum()*100))

#Explained variance ratio
print("\n explained variance ratio as a 'per thousand' ratio for each of the selected features")
print((pca_scaled.explained_variance_ratio_*1000).round())

################ Observation Selection and Mixing
#ratio of training data among the total set available that will be used for fitting the SVC classifiers
ratio_sd = 100 #as a percentage
n_sd = X_train.shape[0]*ratio_sd/100 #effective number of observations retained
print("%d observations selected out of %d (%d %%) for Shuffling and training"%(n_sd, X_train.shape[0], ratio_sd))

X_train_scaled_shuffled, y_train_scaled_shuffled = shuffle(X_pca_scaled, y_train, n_samples=n_sd)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance

 explained variance ratio as a 'per thousand' ratio for each of the selected features
[ 30.  23.  22.  21.  21.  20.  20.  19.  19.  18.  18.  18.  17.  17.  16.
  16.  16.  15.  15.  15.  14.  14.  14.  13.  13.  12.  12.  12.  12.  12.
  11.  11.  11.  11.  10.  10.  10.  10.  10.  10.   9.   9.   9.   9.   9.
   9.   8.   8.   8.   8.   8.   8.   8.   8.   8.   7.   7.   7.   7.   7.
   7.   7.   6.   6.   6.   6.   6.   6.   6.   6.   6.   5.   5.   5.   5.
   5.   5.   5.   5.   5.   5.   4.   4.   4.   4.   4.   4.   4.   4.   4.
   4.   4.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   2.
   2.   2.   2.   2.   2.   2.   2.   2.   2.   1.   1.   1.   1.   1.   1.
   1.   1.   1.   1.   1.   1.]
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


So let me sum up what we've got so far :
+ ** X_train, X_test, y_train, y_test** : the original dataset
+ ** X_train_scaled, , X_test** : the standardized dataset, obtained with *myScaler* trained on X_train
+ ** X_pca** : the result of the pca with *my_ncomp* components on the training set: use *pca.transform()* to replicate transformation
+ ** X_pca_scaled** : the result of the pca with all the components obtained with *X_train_scaled*  : use *pca_scaled.transform()* to replicate transformation
+ ** X_train_scaled_shuffled, y_train_scaled_shuffled** : standardization + pca + shuffle

I will try to use the last one as it is supposed to be the most appropriate to give all most significant variables obtained with PCA the same importance (standardization), and should not depend on the order of the objects wisited (shuffle)

As I am growing fed up of repeating over and over the same processes, let me introduce a few fuctions we will use later
## A few useful homemade functions
+ If computing time is long, or if there is overfitting, we might want to train the algorithm on a smaller sample, that is to say less variables, and less observations at the same time.
+ On top of that we want to be able to easily adapt the 0_labels technique
+ Finally we want to save quickly the resulting classifier

In [3]:
def makeTimeSignificant(t_seconds):
    #transforms seconds into hours, minutes, and seconds
    m, s = divmod(t_seconds, 60)
    h, m = divmod(m, 60)
    return "%dh%02dm%02ds" % (h, m, s)

def predict_0_labels(XX, clf, threshold=0.7, without=False):
    #Add the 0 labels to a prediction to increase score
    #check whether the classifier is compatible
    can_predict_proba = getattr(clf, "predict_proba", None)
    if callable(can_predict_proba):
        print("0_labels enabled")
        start = time.time()
        prediction_set = clf.predict_proba(XX) #For each index of XX, the proba that it belongs to class -1 and then 1
        print("total timed used for predicting: %s"%(makeTimeSignificant(time.time() - start)))

        #Adaptating the classifier
        y_pred_with0 = np.zeros(XX.shape[0]) #just to initialize the size
        y_pred_with0[prediction_set[:,0] > threshold] = -1
        y_pred_with0[prediction_set[:,1] > threshold] = 1
        #The other values are already set to 0
        
        if(without):
            y_pred = np.ones(XX.shape[0])
            y_pred[prediction_set[:,0] >= 0.5] = -1
            return y_pred_with0, y_pred
        else:
            return y_pred_with0
    else:
        print("0_labels disabled")
        start = time.time()
        y_pred = clf.predict(XX)
        print("total timed used for predicting: %s"%(makeTimeSignificant(time.time() - start)))
        if(without):
            return y_pred, y_pred
        else:
            return y_pred

def prepare_dataset(XX_train, y_train, XX_test, var_ratio_min=99.9, ratio_sd=100):
    #Scale it 
    myScaler = StandardScaler()
    XX_train_scaled = myScaler.fit_transform(XX_train)

    #select the most significant features
    pca_scaled = PCA(svd_solver='full', whiten=True, n_components=var_ratio_min/100).fit(XX_train_scaled)
    XX_pca_scaled = pca_scaled.transform(XX_train_scaled)
    print("%d features selected out of %d (%d %%) for PCA which explains %d %% of variance"%(pca_scaled.n_components_, XX_train.shape[1], pca_scaled.n_components_/XX_train.shape[1]*100, pca_scaled.explained_variance_ratio_.sum()*100))

    #print("\n explained variance ratio as a 'per thousand' ratio for each of the selected features")
    #print((pca_scaled.explained_variance_ratio_*1000).round())

    #Select a certain amount of observations
    n_sd = XX_train.shape[0]*ratio_sd/100 #effective number of observations retained
    print("%d observations selected out of %d (%d %%) for Shuffling and training"%(n_sd, XX_train.shape[0], ratio_sd))

    #Shuffle it
    XX_train_scaled_shuffled, yy_train_scaled_shuffled = shuffle(XX_pca_scaled, y_train, n_samples=n_sd)
    
    #Adapt the test set accordingly
    XX_test_scaled = myScaler.transform(XX_test)
    XX_test_scaled_pca = pca_scaled.transform(XX_test_scaled)
    
    return XX_train_scaled_shuffled, yy_train_scaled_shuffled, XX_test_scaled_pca

def save_prediction(X_test, clf, trial_number, threshold=0.7):
    y_pred = predict_0_labels(X_test, clf, threshold=threshold)
    np.savetxt('y_pred_' + str(trial_number) + '.txt', y_pred, fmt='%d')

# Step 5
Nearest Neighbours classification
## K-neighbours

In [91]:
#the number of neighbours in the k-neighbours algorithm
K = 5 #5 is the default value. If it runs fast, gridsearch can be intempted with many different values
weight_scheme = 'uniform' # could be inverse of 'distance'
leafSize = 30
#Trying the algorithm on scaled data
clf = KNeighborsClassifier(n_neighbors=K, 
                           weights= weight_scheme,  
                           leaf_size= leafSize, 
                           algorithm='ball_tree', metric='minkowski', p=2,
                           n_jobs=4)

start = time.time()
clf.fit(X_train_scaled_shuffled, y_train_scaled_shuffled)
print("total timed used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

start = time.time()
y_pred_train = clf.predict(X_train_scaled_shuffled)
print("total timed used for predicting: %s"%(makeTimeSignificant(time.time() - start)))

score = compute_pred_score(y_train_scaled_shuffled, y_pred_train)
print("Default K-neighbours classifier/ training test / pca nf= %d / score= %0.3f"%(pca_scaled.n_components_, score))
print("\n\nClassification report")
print(classification_report(y_train_scaled_shuffled, y_pred_train))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_scaled_shuffled, y_pred_train))

total timed used for fitting: 2.711 s
Default K-neighbours classifier/ training test / pca nf= 126 / score= 0.220


Classification report
             precision    recall  f1-score   support

         -1       0.97      0.99      0.98     52800
          1       0.99      0.97      0.98     52800

avg / total       0.98      0.98      0.98    105600



Confusion matrix
[[52233   567]
 [ 1758 51042]]


Because it took the hell of a long time, I will try a smaller dataset, and better parameters for quicker analysis

In [12]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=95, ratio_sd=90)

#the number of neighbours in the k-neighbours algorithm
K = 10 #5 is the default value. Big values reduce overfitting
weight_scheme = 'uniform' # could be inverse of 'distance'
leafSize = 50
clf = KNeighborsClassifier(n_neighbors=K, 
                           weights= weight_scheme,  
                           leaf_size= leafSize, 
                           algorithm='ball_tree', metric='minkowski', p=2,
                           n_jobs=4)

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total timed used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train_with0 = predict_0_labels(X_train_adapt, clf)
score = compute_pred_score(y_train_adapt, y_pred_train_with0)
print("K-Neighbours / training test ns=%d / pca nf= %d / score= %0.3f"%(X_train_adapt.shape[0], X_train_adapt.shape[1], score))
# print("\n\nConfusion matrix")
# print(confusion_matrix(y_train_scaled_shuffled, y_pred_train))

#Saving results
print("now for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=3)

98 features selected out of 128 (76 %) for PCA which explains 95 % of variance
95040 observations selected out of 105600 (90 %) for Shuffling and training


  indices = indices[:max_n_samples]


total timed used for fitting: 0:00:01
0_labels enabled
total timed used for predicting: 0:10:25
K-Neighbours / training test ns=95040 / pca nf= 98 / score= 0.117
now for the test set
0_labels enabled
total timed used for predicting: 0:00:54


**Oh my god it takes so much time !!!** I am changing my classifier illico presto. The score on the training set is not too bad though ! Again, I am facing overfitting. Because the score obtained for the test set is only 0.4...

I will come back to this method later, because I don't want to loose to much time optimizing a technique that might reveal itself suboptimal later.

## Nearest Centroids

In [136]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)


clf = NearestCentroid()

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total timed used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(X_train_adapt, clf)
score = compute_pred_score(y_train_adapt, y_pred_train)
print("NearestCentroid / training test ns=%d / pca nf= %d / score= %0.3f"%(X_train_adapt.shape[0], X_train_adapt.shape[1], score))
print("\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=4)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total timed used for fitting: 0:00:00
total timed used for predicting: 0:00:00
NearestCentroid / training test / pca nf= 126 / score= 0.646

Confusion matrix
[[49383  3417]
 [ 3410 49390]]

Now for the test set
total timed used for predicting: 0:00:00


Which is a terrible score when compared to the best estimators found before.

Let us move on to a new technique :

# Step 6
## Gaussian processes

In [133]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)


clf = GaussianProcessClassifier(n_jobs=4)

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total timed used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train_with0 = predict_0_labels(X_train_adapt, clf)
score = compute_pred_score(y_train_adapt, y_pred_train_with0)
print("GaussianProcessClassifier / training test ns=%d / pca nf= %d / score= %0.3f"%(X_train_adapt.shape[0], X_train_adapt.shape[1], score))
print("\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train_with0))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=5)

98 features selected out of 128 (76 %) for PCA which explains 95 % of variance
73920 observations selected out of 105600 (70 %) for Shuffling and training


  indices = indices[:max_n_samples]


MemoryError: 

# Step 7
## Naive Bayes methods

In [10]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)


clf = GaussianNB(priors= [0.5, 0.5])

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total timed used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train_with0 = predict_0_labels(X_train_adapt, clf)
score = compute_pred_score(y_train_adapt, y_pred_train_with0)
print("GaussianNB / training test ns=%d / pca nf= %d / score= %0.3f"%(X_train_adapt.shape[0], X_train_adapt.shape[1], score))
print("\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train_with0))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=6)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total timed used for fitting: 0:00:00
0_labels enabled
total timed used for predicting: 0:00:00
NearestCentroid / training test ns=105600 / pca nf= 126 / score= 1.121

Confusion matrix
[[42430  4770  5600]
 [    0     0     0]
 [ 5125  6342 41333]]

Now for the test set
0_labels enabled
total timed used for predicting: 0:00:00


Which is not a satisfactory result
## BernouilliNB

In [9]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)


clf = BernoulliNB(class_prior= [0.5, 0.5])

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total timed used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train_with0 = predict_0_labels(X_train_adapt, clf, threshold=0.7)
score = compute_pred_score(y_train_adapt, y_pred_train_with0)
print("BernouilliNB / training test ns= %d/ pca nf= %d / score= %0.3f"%(X_train_adapt.shape[0], X_train_adapt.shape[1], score))
print("\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train_with0))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=7)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total timed used for fitting: 0:00:00
0_labels enabled
total timed used for predicting: 0:00:00
NearestCentroid / training test / pca nf= 126 / score= 0.857

Confusion matrix
[[27951 22679  2170]
 [    0     0     0]
 [ 2382 22294 28124]]

Now for the test set
0_labels enabled
total timed used for predicting: 0:00:00


Which is still quite unsatisfactory
# Step 8
## Decision Trees

In [16]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=90, ratio_sd=100)


clf = DecisionTreeClassifier(max_depth=15, min_samples_split=100 )

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total timed used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train_with0 = predict_0_labels(X_train_adapt, clf, threshold=0.7)
score = compute_pred_score(y_train_adapt, y_pred_train_with0)
print("DecisionTree / training test ns=%d / pca nf= %d / score= %0.3f"%(X_train_adapt.shape[0], X_train_adapt.shape[1], score))
print("\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train_with0))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=8)

85 features selected out of 128 (66 %) for PCA which explains 90 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total timed used for fitting: 0:00:16
0_labels enabled
total timed used for predicting: 0:00:00
DecisionTree / training test ns=105600 / pca nf= 85 / score= 0.734

Confusion matrix
[[41158  7860  3782]
 [    0     0     0]
 [ 2397  7878 42525]]

Now for the test set
0_labels enabled
total timed used for predicting: 0:00:00


I will not perform Gridsearch just for the parameters of the trees as it is easy to find trees that just perfectly fit the data...But amazingly overfits it (the eight prediction test set I uploaded scored more than 2 !!!)

This is why I will use a pipe, in order to find the best amount of features that can be used to train the data, and also the best tree_depth based on a gridsearch that takes both into account. I will make cross-validation significant : cv=10

Actually, I could have done that before with SVC, but as SVC take a long time to compute, I left it the way I had it. It is more doable with decision trees which take each something like 16s, as opposed to SVC which take 8m

## Cross-validated  trees + PCA

In [26]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Defining the pipe predictor
pipe = Pipeline([ ('reduce_dim', PCA()), ('classify', DecisionTreeClassifier())])

#Selecting parameters for grid-search
N_FEATURES_RATIOS = np.array([98, 99.9]) #as a percentage
N_FEATURES_OPTIONS = np.round(X_train_adapt.shape[1]*N_FEATURES_RATIOS/100).astype('int').tolist()
DEPTH_OPTIONS = [13, 14]
SPLIT_OPTIONS = [2, 5, 7]
SPLITTER_OPTIONS = ['best', 'random']

param_grid = [{'reduce_dim': [PCA(svd_solver='full')],
               'reduce_dim__n_components': N_FEATURES_OPTIONS,
               'classify__max_depth': DEPTH_OPTIONS,
               'classify__min_samples_split' : SPLIT_OPTIONS,
               'classify__splitter' : SPLITTER_OPTIONS
              }]

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=4)

#Fitting the grid
start = time.time()
grid.fit(X_train_adapt, y_train_adapt)
print("total time used for GridSearch fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Feedback on the best parameters, and each parameter performance:
print("\n============best params found %a"%(grid.best_params_))
print('\n============Classification report')
results = pd.DataFrame(grid.cv_results_)[['rank_test_score',
                                          'mean_test_score', 
                                          'mean_train_score', 
                                          'param_reduce_dim__n_components',
                                          'param_classify__max_depth', 
                                          'param_classify__min_samples_split',
                                          'param_classify__splitter'
                                         ]]
results['param_reduce_dim__n_components'] = results['param_reduce_dim__n_components']/126*100
results.columns = ['rank test', 'score test', 'score train', 'PCA ratio_f', 'max_depth', 'min sample split', 'splitter']
results = results.sort(columns='rank test', ascending=True)
print(results)


126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total time used for GridSearch fitting: 0:08:31

  svd_solver='full', tol=0.0, whiten=False), 'classify__max_depth': 14}

    rank test  score test  score train PCA ratio_f max_depth min sample split  \
13          1    0.734924     0.928369         100        14                2   
5           2    0.734100     0.901165         100        13                5   
21          3    0.733551     0.923894         100        14                7   
1           4    0.733523     0.902384         100        13                2   
17          5    0.733485     0.926458         100        14                5   
12          5    0.733485     0.929744      97.619        14                2   
0           7    0.733116     0.903608      97.619        13                2   
9           7    0.733116     0.899470         100        13                7   
16          9    0.732481     0.927708      97.619        14                5   
20         10    0.732358     0.924941      97.619        14        



Ideal param_grid (my computer remained idle, I don't know why. Afraid of the work ahead ?), that I split into several sets to try to get the best out of it
```python
#Selecting parameters for grid-search
N_FEATURES_RATIOS = np.array([85, 90, 95, 99.9]) #as a percentage
N_FEATURES_OPTIONS = np.round(X_train_adapt.shape[1]*N_FEATURES_RATIOS/100).astype('int').tolist()
DEPTH_OPTIONS = np.arange(10, 20, 3)
SPLIT_OPTIONS = np.arange(1, 100, 20)
```

After a long study, it seems that it is always best to keep most of the features, and that optimal tree_depth is around 13.

In [30]:
#Further customizing
param_grid = [{'reduce_dim__n_components': [126],
               'classify__max_depth': [15],
               'classify__min_samples_split' : [2, 4, 7],
               'classify__splitter' : ['best']
              },
              {'reduce_dim__n_components': [126],
               'classify__max_depth': [14],
               'classify__min_samples_split' : [2,3,4, 10],
               'classify__splitter' : ['best']
              },
              {'reduce_dim__n_components': [126],
               'classify__max_depth': [13],
               'classify__min_samples_split' : [4,5,6,7, 10],
               'classify__splitter' : ['best']
              }]

grid = GridSearchCV(pipe, param_grid=param_grid, cv=6, n_jobs=4)

#Fitting the grid
start = time.time()
grid.fit(X_train_adapt, y_train_adapt)
print("total time used for GridSearch fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Feedback on the best parameters, and each parameter performance:
print("\n============best params found %a"%(grid.best_params_))
print('\n============Classification report')
results = pd.DataFrame(grid.cv_results_)[['rank_test_score',
                                          'mean_test_score', 
                                          'mean_train_score', 
                                          'param_reduce_dim__n_components',
                                          'param_classify__max_depth', 
                                          'param_classify__min_samples_split',
                                         ]]
results['param_reduce_dim__n_components'] = results['param_reduce_dim__n_components']/126*100
results.columns = ['rank test', 'score test', 'score train', 'PCA ratio_f', 'max_depth', 'min sample split']
results = results.sort(columns='rank test', ascending=True)
print(results)

total time used for GridSearch fitting: 0:09:46


    rank test  score test  score train PCA ratio_f max_depth min sample split
4           1    0.735502     0.920188         100        14                3
5           2    0.735161     0.919445         100        14                4
3           3    0.734650     0.920244         100        14                2
0           4    0.734574     0.941483         100        15                2
1           5    0.734394     0.940354         100        15                4
7           6    0.733295     0.893483         100        13                4
6           7    0.733191     0.911920         100        14               10
9           8    0.733011     0.892129         100        13                6
2           9    0.732945     0.935894         100        15                7
8          10    0.732481     0.892852         100        13                5
10         11    0.732330     0.891161         100        13                7
11         12 



We will therefore keep this last optimal set of values for next PCA. Basically, max_depth=14, min_samples_split=3.
As Test score is not very high anyway, we will try to use ensemble methods on trees to get a better result later, and do not save the results just yet. But the trees used for Bagging or Extratrees will be based on this particular study.
# Step 9
A few ensemble methods based on trees
## Extra-trees

In [18]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Parameters as defined by the previous study... 
#We might however want to reduce the depth
#change the number of estimators

#clf = ExtraTreesClassifier(max_depth=16, n_estimators=70, min_samples_split=3, bootstrap=True, n_jobs=4)
clf = ExtraTreesClassifier(max_depth=None, n_estimators=70, min_samples_split=10, min_samples_leaf=10, bootstrap=True, max_features=round(126**.5), n_jobs=4)

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_adapt, clf=clf, threshold=0.6)

#Score
score = compute_pred_score(y_train_adapt, y_pred_train)
print("Score with ExtraTrees estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=9)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total time used for fitting: 0:00:07
0_labels enabled
total timed used for predicting: 0:00:00
Score with ExtraTrees estimator 0.188


Confusion matrix
[[42853  9823   124]
 [    0     0     0]
 [   90  7879 44831]]

Now for the test set
0_labels enabled
total timed used for predicting: 0:00:00


Despite playing  with the data, it did not lead to a good result, therefore i will directly try something else.
It overfits way too much or is not accurate (I get more than 0.8) It is a very fast technique though
## GradientBoostingClassifier

In [23]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Parameters as defined by the previous study... 
#We might however want to reduce the depth
#change the number of estimators
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6)

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_adapt, clf=clf, threshold=0.7)

#Score
score = compute_pred_score(y_train_adapt, y_pred_train)
print("Score with Gradient Boosting estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=10)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total time used for fitting: 0:12:31
0_labels enabled
total timed used for predicting: 0:00:00
Score with Gradient Boosting estimator 0.158


Confusion matrix
[[47570  4779   451]
 [    0     0     0]
 [  322  4153 48325]]

Now for the test set
0_labels enabled
total timed used for predicting: 0:00:00


After 16m, we obtain for GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=7) a train score of 0.088 and a test score of 0.33: not too shabby. It is the best estimato obtained so far

With (n_estimators=70, learning_rate=0.1, max_depth=10), after 36m, the training score is 0.008, but there is overfitting, because the testscore is 0.38.

With (n_estimators=100, learning_rate=0.1, max_depth=5), after 8m trainscore is 0.237, and test_score is >0.46  
With (n_estimators=100, learning_rate=0.01, max_depth=7), after 17m, trainscore is 0.646, and test_score is not tested  
With (n_estimators=100, learning_rate=0.1, max_depth=6), after 13m, trainscore is 0.158, and test score is 0.4

In [29]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Parameters as defined by the previous study... 
#We might however want to reduce the depth
#change the number of estimators
clf = RandomForestClassifier(n_estimators=100, max_depth=12, max_features=70 , bootstrap=True, n_jobs=4)

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_adapt, clf=clf, threshold=0.7)

#Score
score = compute_pred_score(y_train_adapt, y_pred_train)
print("Score with RandomForestClassifier estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=11)


126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total time used for fitting: 0:09:04
0_labels enabled
total timed used for predicting: 0:00:01
Score with RandomForestClassifier estimator 0.333


Confusion matrix
[[41710  9457  1633]
 [    0     0     0]
 [  112  8278 44410]]

Now for the test set
0_labels enabled
total timed used for predicting: 0:00:00


# step 10
## Neural Networks

In [33]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Parameters as defined by the previous study... 
#We might however want to reduce the depth
#change the number of estimators
clf = MLPClassifier()

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_adapt, clf=clf, threshold=0.75)

#Score
score = compute_pred_score(y_train_adapt, y_pred_train)
print("Score with basic MLPClassifier estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=12)


126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total time used for fitting: 0:00:29
0_labels enabled
total timed used for predicting: 0:00:00
Score with basic RandomForestClassifier estimator 0.002


Confusion matrix
[[52731    65     4]
 [    0     0     0]
 [    0    54 52746]]

Now for the test set
0_labels enabled
total timed used for predicting: 0:00:00


Woaw, this is amazing ! 
It is by far the best classifier with only 0.29 as a score on the test set.

Even more amazing, the computing time is ridiculous compared to other methods. Let us try to customize it via gridsearch

In [35]:
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Further customizing
param_grid = [{'alpha' : np.logspace(-5, 3, 5),
               'activation' : ['logistic', 'tanh', 'relu']
              }]

grid = GridSearchCV(MLPClassifier(), param_grid=param_grid, cv=6, n_jobs=4)

#Fitting the grid
start = time.time()
grid.fit(X_train_adapt, y_train_adapt)
print("total time used for GridSearch fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Feedback on the best parameters, and each parameter performance:
print("\n============best params found %a"%(grid.best_params_))
print('\n============Classification report')
results = pd.DataFrame(grid.cv_results_)[['rank_test_score',
                                          'mean_test_score', 
                                          'mean_train_score', 
                                          'param_alpha', 
                                          'param_activation',
                                         ]]
results.columns = ['rank test', 'score test', 'score train', 'alpha', 'activation']
results = results.sort(columns='rank test', ascending=True)
print(results)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


total time used for GridSearch fitting: 0:17:52


    rank test  score test  score train  alpha activation
7           1    0.981600     0.995559    0.1       tanh
12          2    0.981184     0.996941    0.1       relu
1           3    0.980256     0.999975  0.001   logistic
11          4    0.979242     0.999674  0.001       relu
10          5    0.978930     0.999326  1e-05       relu
0           6    0.978769     0.999970  1e-05   logistic
2           7    0.978655     0.982739    0.1   logistic
5           8    0.975492     0.999864  1e-05       tanh
6           9    0.974271     0.999828  0.001       tanh
13         10    0.934896     0.934786     10       relu
8          11    0.932377     0.932898     10       tanh
3          12    0.930833     0.931405     10   logistic
4          13    0.500000     0.500000   1000   logistic
9          13    0.500000     0.500000   1000       tanh
14         13    0.500000     0.500000   1000       relu




In [36]:
#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=grid.best_estimator_, trial_number=13)


Now for the test set
0_labels enabled
total timed used for predicting: 0:00:00


Woaw !!!! So far it is the best estimator found with a score on the test set of 0.197...

In [4]:
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Further customizing
param_grid = [{'alpha' : np.logspace(-3, 2, 5),
               'activation' : ['tanh', 'relu'],
               'solver' : ['adam', 'sgd'],
               'batch_size' : [200,  400]
              },
              {'alpha' : np.logspace(-3, 2, 5),
               'activation' : ['tanh', 'relu'],
               'solver' : ['lbfgs']
              }]

grid = GridSearchCV(MLPClassifier(), param_grid=param_grid, cv=5, n_jobs=4, verbose=100)

#Fitting the grid
start = time.time()
grid.fit(X_train_adapt, y_train_adapt)
print("total time used for GridSearch fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Feedback on the best parameters, and each parameter performance:
print("\n============best params found %a"%(grid.best_params_))
print('\n============Classification report')
results = pd.DataFrame(grid.cv_results_)[['rank_test_score',
                                          'mean_test_score', 
                                          'mean_train_score', 
                                          'param_alpha', 
                                          'param_activation',
                                          'param_solver',
                                          'param_batch_size'
                                         ]]
results.columns = ['rank test', 'score test', 'score train', 'alpha', 'activation', 'solver', 'batch']
results = results.sort(columns='rank test', ascending=True)
print(results)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 29.5min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 89.0min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 102.4min finished


total time used for GridSearch fitting: 1h42m48.174471616744995


    rank test  score test  score train      alpha activation solver batch
30          1    0.982670     0.995675   0.316228       relu   adam   400
49          2    0.982585     0.993897        100       relu  lbfgs   NaN
44          3    0.982225     0.990490        100       tanh  lbfgs   NaN
10          4    0.981657     0.991764   0.316228       tanh   adam   400
29          5    0.981070     0.989219   0.316228       relu    sgd   200
28          6    0.980966     0.989567   0.316228       relu   adam   200
24          7    0.980047     0.999695  0.0177828       relu   adam   200
26          8    0.979981     0.999941  0.0177828       relu   adam   400
8           9    0.979735     0.985095   0.316228       tanh   adam   200
22         10    0.979489     0.999941      0.001       relu   adam   400
48         11    0.979261     0.999972    5.62341       relu  lbfgs   NaN
25         12    0.979223     0.994164  0.0177



In [5]:
#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=grid.best_estimator_, trial_number=14)


Now for the test set
0_labels enabled
total timed used for predicting: 0h00m0.029016733169555664


In [7]:
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Further customizing
param_grid = [{'alpha' : np.logspace(-1, 1, 3),
               'activation' : ['relu'],
               'solver' : ['adam'],
               'batch_size' : [200,  400, 600, 800]
              },
              {'alpha' : np.logspace(1, 4, 3),
               'activation' : ['tanh'],
               'solver' : ['lbfgs']
              },
             {'alpha' : [0.3, 0.5],
               'activation' : ['relu'],
               'solver' : ['adam'],
               'batch_size' : [400, 500],
               
              },
              {'alpha' : np.logspace(-1, 1, 3),
               'activation' : ['relu'],
               'solver' : ['sgd'],
               'batch_size' : [200],
               'learning_rate' : ['constant', 'adaptive']
              }, 
              {'alpha' : np.logspace(-1, 1, 3),
               'activation' : ['relu'],
               'solver' : ['sgd'],
               'batch_size' : [200],
               'learning_rate' : ['invscaling'],
               'power_t' : [0.2, 0.5, 0.7]
              }]


grid = GridSearchCV(MLPClassifier(), param_grid=param_grid, cv=5, n_jobs=4, verbose=12)

#Fitting the grid
start = time.time()
grid.fit(X_train_adapt, y_train_adapt)
print("total time used for GridSearch fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Feedback on the best parameters, and each parameter performance:
print("\n============best params found %a"%(grid.best_params_))
print('\n============Classification report')
results = pd.DataFrame(grid.cv_results_)[['rank_test_score',
                                          'mean_test_score', 
                                          'mean_train_score', 
                                          'param_alpha', 
                                          'param_activation',
                                          'param_solver',
                                          'param_batch_size',
                                          'param_learning_rate',
                                          'param_power_t'
                                         ]]
results.columns = ['rank test', 'score test', 'score train', 'alpha', 'activation', 'solver', 'batch', 'learn rate', 'invscaling power']
results = results.sort(columns='rank test', ascending=True)
print(results)

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=grid.best_estimator_, trial_number=15)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training
Fitting 5 folds for each of 34 candidates, totalling 170 fits


  indices = indices[:max_n_samples]
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   36.8s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:   39.8s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:   46.1s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:   48.4s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done  16 tasks      | elaps

total time used for GridSearch fitting: 1h06m19s




KeyError: "['learning_rate' 'power_t'] not in index"

In [9]:
#Feedback on the best parameters, and each parameter performance:
print("\n============best params found %a"%(grid.best_params_))
print('\n============Classification report')
results = pd.DataFrame(grid.cv_results_)[['rank_test_score',
                                          'mean_test_score', 
                                          'mean_train_score', 
                                          'param_alpha', 
                                          'param_activation',
                                          'param_solver',
                                          'param_batch_size',
                                          'param_learning_rate',
                                          'param_power_t'
                                         ]]
results.columns = ['rank test', 'score test', 'score train', 'alpha', 'activation', 'solver', 'batch', 'learn rate', 'invscaling power']
results = results.sort(columns='rank test', ascending=True)
print(results)

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=grid.best_estimator_, trial_number=15)



    rank test  score test  score train    alpha activation solver batch  \
18          1    0.982131     0.994013      0.5       relu   adam   500   
16          2    0.982083     0.997050      0.3       relu   adam   500   
15          2    0.982083     0.995975      0.3       relu   adam   400   
7           4    0.981761     0.992474        1       relu   adam   800   
17          5    0.981667     0.992287      0.5       relu   adam   400   
0           6    0.981610     0.997474      0.1       relu   adam   200   
6           7    0.981420     0.989957        1       relu   adam   600   
2           8    0.981042     0.999837      0.1       relu   adam   600   
3           9    0.980966     0.999891      0.1       relu   adam   800   
1          10    0.980871     0.999458      0.1       relu   adam   400   
19         11    0.980331     0.992860      0.1       relu    sgd   200   
20         12    0.980208     0.993104      0.1       relu    sgd   200   
5          13    0.9799



In [11]:
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Further customizing
param_grid = [{'alpha' : [0.3],
               'activation' : ['relu'],
               'solver' : ['adam'],
               'batch_size' : [450,500]
              },
             {'alpha' : [0.3],
               'activation' : ['relu'],
               'solver' : ['adam'],
               'batch_size' : [500],
               'hidden_layer_sizes': [(100,), (126,), (250,)]
              }]


grid = GridSearchCV(MLPClassifier(), param_grid=param_grid, cv=5, n_jobs=4, verbose=5)

#Fitting the grid
start = time.time()
grid.fit(X_train_adapt, y_train_adapt)
print("total time used for GridSearch fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Feedback on the best parameters, and each parameter performance:
print("\n============best params found \n %a"%(grid.best_params_))
print('\n============Classification report')
results = pd.DataFrame(grid.cv_results_)[['rank_test_score',
                                          'mean_test_score', 
                                          'mean_train_score', 
                                          'param_alpha', 
                                          'param_activation',
                                          'param_solver',
                                          'param_batch_size',
                                          'param_hidden_layer_sizes'
                                         ]]
results.columns = ['rank test', 'score test', 'score train', 'alpha', 'activation', 'solver', 'batch', 'layers']
results = results.sort(columns='rank test', ascending=True)
print(results)

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=grid.best_estimator_, trial_number=16)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training
Fitting 5 folds for each of 5 candidates, totalling 25 fits


  indices = indices[:max_n_samples]
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:  4.8min finished


total time used for GridSearch fitting: 0h05m19s


   rank test  score test  score train alpha activation solver batch  layers
4          1    0.982491     0.997659   0.3       relu   adam   500  (250,)
3          2    0.982273     0.997441   0.3       relu   adam   500  (126,)
0          3    0.982225     0.996773   0.3       relu   adam   450     NaN
2          4    0.982036     0.997334   0.3       relu   adam   500  (100,)
1          5    0.981638     0.997088   0.3       relu   adam   500     NaN

Now for the test set
0_labels enabled
total timed used for predicting: 0h00m00s




## Ensemble methods + Neural Networks

In [32]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

#Parameters as defined by the previous study... 
#We might however want to reduce the depth
#change the number of estimators

clf_bag = MLPClassifier(batch_size=400, activation='relu', solver='adam', alpha=0.316228)
clf = BaggingClassifier(base_estimator=clf_bag, n_estimators=70, n_jobs=4, verbose=5, max_samples=0.3, oob_score=True)

#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_adapt, clf=clf, threshold=0.85)

#Score
score = compute_pred_score(y_train_adapt, y_pred_train)
print("Score with bagging + MPLClassifier estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=17, threshold=0.85)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  6.1min remaining:  6.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.2min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.2min finished


total time used for fitting: 0h06m34s
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   19.7s remaining:   19.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   20.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   20.1s finished


total timed used for predicting: 0h00m20s
Score with basic personnalized estimator 0.054


Confusion matrix
[[50089  2697    14]
 [    0     0     0]
 [   11  2745 50044]]

Now for the test set
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.4s remaining:    2.4s


total timed used for predicting: 0h00m03s


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.9s finished


Best estimator so far (score on the testing set) is 
```python
clf_bag = MLPClassifier(batch_size=400, activation='relu', solver='adam', alpha=0.316228) #best so far
clf = BaggingClassifier(base_estimator=clf_bag, n_estimators=12, n_jobs=4, verbose=5)
```

Oh no, wait ! Even better is :

```python
clf_bag = MLPClassifier(batch_size=400, activation='relu', solver='adam', alpha=0.316228) #best so far
clf = BaggingClassifier(base_estimator=clf_bag, n_estimators=20, n_jobs=4, verbose=5, max_samples=0.5, oob_score=True)
```

As a matter of facts it is with a threshold of 0.85:

```python
clf_bag = MLPClassifier(batch_size=400, activation='relu', solver='adam', alpha=0.316228)
clf = BaggingClassifier(base_estimator=clf_bag, n_estimators=40, n_jobs=4, verbose=5, max_samples=0.5, oob_score=True)
```

Best score is currently in the cell

## Neurons + Bagging + Voting

In [43]:
clf_bag = MLPClassifier(batch_size=400, activation='relu', solver='adam', alpha=0.316228)
clf1 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.2, oob_score=True)
clf2 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.4, oob_score=True)
clf3 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.6, oob_score=True)
clf4 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.8, oob_score=True)
clf5 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=1.0, oob_score=True)
clf = VotingClassifier(estimators=[('clf1', clf1),('clf2', clf2),('clf3', clf3),('clf4', clf4),('clf5', clf5)], n_jobs=4, voting='soft')

#Fitting
start = time.time()
clf1.fit(X_train_adapt, y_train_adapt)
clf2.fit(X_train_adapt, y_train_adapt)
clf3.fit(X_train_adapt, y_train_adapt)
clf4.fit(X_train_adapt, y_train_adapt)
clf5.fit(X_train_adapt, y_train_adapt)
clf.fit(X_train_adapt, y_train_adapt)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_adapt, clf=clf, threshold=0.82)

#Score
score = compute_pred_score(y_train_adapt, y_pred_train)
print("Score with bagging + MPLClassifier estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=18, threshold=0.82)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  3.5min remaining:  3.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.7min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.7min finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  4.9min remaining:  4.9min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  5.2min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  5.2min finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  5.9min remaining:  5.9min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.2min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.2min finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  7.2min remaining:  7.2min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  7.4min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  7.4min finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.9min remaining:  8.9min


total time used for fitting: 1h13m05s
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   11.6s remaining:   11.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.1s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   10.7s remaining:   10.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.3s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.3s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   12.3s remaining:   12.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   11.1s remaining:   11.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   11.4s remaining:   11.4s


total timed used for predicting: 0h01m01s
Score with bagging + MPLClassifier estimator 0.043


Confusion matrix
[[50695  2090    15]
 [    0     0     0]
 [   18  2071 50711]]

Now for the test set
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.4s remaining:    1.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.4s remaining:    1.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.4s remaining:    1.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.5s remaining:    1.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.4s remaining:    1.4s


total timed used for predicting: 0h00m09s


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.7s finished


# OMG

![Yeah!](https://s-media-cache-ak0.pinimg.com/236x/5d/77/05/5d7705ea5bbb9f0f93b239971ad02078.webp)
---
> Now I got the #2nd best classifier with
```python
clf_bag = MLPClassifier(batch_size=400, activation='relu', solver='adam', alpha=0.316228)
clf1 = BaggingClassifier(base_estimator=clf_bag, n_estimators=40, n_jobs=4, verbose=5, max_samples=0.3, oob_score=True)
clf2 = BaggingClassifier(base_estimator=clf_bag, n_estimators=40, n_jobs=4, verbose=5, max_samples=0.2, oob_score=True)
clf3 = BaggingClassifier(base_estimator=clf_bag, n_estimators=40, n_jobs=4, verbose=5, max_samples=0.7, oob_score=True)
clf4 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=1.0, oob_score=True)
clf = VotingClassifier(estimators=[('clf1', clf1),('clf2', clf2),('clf3', clf3),('clf4', clf4)], n_jobs=4, voting='soft')
```
and a probability threshold of 0.82 to decide if the 0 label should be invoked

Below is my last attempt to perform better: it is the same thing, but with a different preprocessing (without PCA). It was not conclusive

In [48]:
#Scale it 
myScaler = StandardScaler()
X_train_scaled = myScaler.fit_transform(X_train)

#Shuffle it
X_train_scaled_shuffled, y_train_scaled_shuffled = shuffle(X_train_scaled, y_train)

#Adapt the test set accordingly
X_test_scaled = myScaler.transform(X_test)

clf_bag = MLPClassifier(batch_size=400, activation='relu', solver='adam', alpha=0.316228)
clf1 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.2, oob_score=True)
clf2 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.3, oob_score=True)
clf3 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.4, oob_score=True)
clf4 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.5, oob_score=True)
clf5 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=5, max_samples=0.6, oob_score=True)
clf = VotingClassifier(estimators=[('clf1', clf1),('clf2', clf2),('clf3', clf3),('clf4', clf4),('clf5', clf5)], n_jobs=4, voting='soft')

#Fitting
start = time.time()
# clf1.fit(X_train_adapt, y_train_adapt)
# clf2.fit(X_train_adapt, y_train_adapt)
# clf3.fit(X_train_adapt, y_train_adapt)
# clf4.fit(X_train_adapt, y_train_adapt)
# clf5.fit(X_train_adapt, y_train_adapt)
clf.fit(X_train_scaled_shuffled, y_train_scaled_shuffled)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_scaled_shuffled, clf=clf, threshold=0.82)

#Score
score = compute_pred_score(y_train_scaled_shuffled, y_pred_train)
print("Score with bagging + MPLClassifier estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_scaled_shuffled, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_scaled, clf=clf, trial_number=19, threshold=0.82)

total time used for fitting: 1h39m50s
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   13.0s remaining:   13.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.5s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.5s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   12.4s remaining:   12.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.8s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.8s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   13.1s remaining:   13.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.6s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.6s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   12.8s remaining:   12.8s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.1s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   12.5s remaining:   12.5s


total timed used for predicting: 0h01m07s
Score with bagging + MPLClassifier estimator 0.047


Confusion matrix
[[50621  2144    35]
 [    0     0     0]
 [   33  2093 50674]]

Now for the test set
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.6s remaining:    1.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.6s remaining:    1.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.5s remaining:    1.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.5s remaining:    1.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.5s remaining:    1.5s


total timed used for predicting: 0h00m10s


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s finished


In [None]:
clf_bag = MLPClassifier(batch_size=400, activation='relu', solver='adam', alpha=0.316228)
clf_bag2 = GradientBoostingClassifier(n_estimators= 100, learning_rate=0.1, max_depth=7)
clf_bag3 = SVC(kernel='rbf', gamma='auto', C=100, cache_size=2500, probability=True)

clf1 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=1, max_samples=0.5, oob_score=True)
clf2 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=1, max_samples=1.0, oob_score=True)

clf3 = BaggingClassifier(base_estimator=clf_bag2, n_estimators=5, n_jobs=4, verbose=1, max_samples=0.5, oob_score=True)
clf4 = BaggingClassifier(base_estimator=clf_bag3, n_estimators=5, n_jobs=4, verbose=1, max_samples=0.5, max_features=75, oob_score=True)
clf5 = BaggingClassifier(base_estimator=clf_bag, n_estimators=50, n_jobs=4, verbose=1, max_samples=0.2, oob_score=True)
clf = VotingClassifier(estimators=[('clf1', clf1),('clf2', clf2),('clf3', clf3),('clf4', clf4),('clf5', clf5)], n_jobs=4, voting='soft')

#Fitting
start = time.time()
clf1.fit(X_train_adapt, y_train_adapt)
clf2.fit(X_train_adapt, y_train_adapt)
clf3.fit(X_train_adapt, y_train_adapt)
clf4.fit(X_train_adapt, y_train_adapt)
clf5.fit(X_train_adapt, y_train_adapt)
clf.fit(X_train_adapt, y_train_adapt)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_adapt, clf=clf, threshold=0.82)

#Score
score = compute_pred_score(y_train_adapt, y_pred_train)
print("Score with bagging + MPLClassifier estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=18, threshold=0.82)

![ZUT!](http://www.femoticons.net/images/posts/crying_emoticon_for_facebook.jpg)
I am now only #23 out of the 77 contestants. I must try to find an even better solution

## Desperate trial that shows it is still possible to make score better !

## Come Back on simpler methods

In [9]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

clf_bag = MLPClassifier(batch_size='auto', activation='relu', solver='adam', alpha=0.1, tol=0.001)
clf = BaggingClassifier(base_estimator=clf_bag, 
                        n_estimators=40, 
                        max_samples=0.3,
                        max_features= 0.6,
                        n_jobs=4, 
                        verbose=5)
#Fitting
start = time.time()
clf.fit(X_train_scaled_shuffled, y_train_scaled_shuffled)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_scaled_shuffled, clf=clf, threshold=0.73)

#Score
score = compute_pred_score(y_train_scaled_shuffled, y_pred_train)
print("Score with bagging + MPLClassifier estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_scaled_shuffled, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_scaled, clf=clf, trial_number=20, threshold=0.73)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.0min remaining:  2.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.0min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.0min finished


total time used for fitting: 0h02m01s
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    7.6s remaining:    7.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    8.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    8.2s finished


total timed used for predicting: 0h00m08s
Score with bagging + MPLClassifier estimator 0.087


Confusion matrix
[[48794  3947    59]
 [    0     0     0]
 [   90  3786 48924]]

Now for the test set
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.4s remaining:    1.4s


total timed used for predicting: 0h00m02s


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.8s finished


 This is slightly better than before on the test set

In [70]:
#Customizing the number of features and observations
X_train_adapt, y_train_adapt, X_test_adapt = prepare_dataset(X_train, y_train, X_test, var_ratio_min=99.9, ratio_sd=100)

clf_bag = MLPClassifier(batch_size='auto', activation='relu', solver='adam', alpha=0.1, tol=0.0007)
clf = BaggingClassifier(base_estimator=clf_bag, 
                        n_estimators=40, 
                        max_samples=0.35,
                        max_features= 0.62,
                        n_jobs=4, 
                        verbose=5)
#Fitting
start = time.time()
clf.fit(X_train_adapt, y_train_adapt)
print("total time used for fitting: %s"%(makeTimeSignificant(time.time() - start)))

#Predicting
y_pred_train = predict_0_labels(XX=X_train_adapt, clf=clf, threshold=0.73)

#Score
score = compute_pred_score(y_train_adapt, y_pred_train)
print("Score with bagging + MPLClassifier estimator %0.3f"%(score))
print("\n\nConfusion matrix")
print(confusion_matrix(y_train_adapt, y_pred_train))

#Saving results
print("\nNow for the test set")
save_prediction(X_test=X_test_adapt, clf=clf, trial_number=21, threshold=0.73)

126 features selected out of 128 (98 %) for PCA which explains 99 % of variance
105600 observations selected out of 105600 (100 %) for Shuffling and training


  indices = indices[:max_n_samples]
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.7min remaining:  2.7min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min finished


total time used for fitting: 0h02m46s
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    8.1s remaining:    8.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    8.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    8.9s finished


total timed used for predicting: 0h00m09s
Score with bagging + MPLClassifier estimator 0.068


Confusion matrix
[[50055  2650    95]
 [    0     0     0]
 [  134  2211 50455]]

Now for the test set
0_labels enabled


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    1.5s remaining:    1.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.9s finished


total timed used for predicting: 0h00m02s


## That was Even better !