In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("train_imperson_without4n7_balanced_data.csv")

Delete columns that have zero variance

In [3]:
col_to_delete = [k for k in train.columns if train[k].std()==0]

Of which there are

In [4]:
len(col_to_delete)

74

In [5]:
train = train.drop(col_to_delete,axis=1)
trainX = train[train.columns[:-1]]
trainY = train["155"]

In [6]:
trainX0 = trainX[trainY==0]
trainX1 = trainX[trainY>0]

In [7]:
len(trainX)

97044

# Feature selection

Build a data table to hold information regarding the remaining variables

(Some `pandas` operations are easier if the feature name is a regular column but it's nice for that to be the index too)

In [8]:
features = pd.DataFrame(data = trainX.columns, columns = ['feature'], index = train.columns[:-1])

In [9]:
features["mu0"] = features.feature.apply(lambda f: trainX0[f].mean())
features["mu1"] = features.feature.apply(lambda f: trainX1[f].mean())

features["sigma0"] = features.feature.apply(lambda f: trainX0[f].std())
features["sigma1"] = features.feature.apply(lambda f: trainX1[f].std())

features["rho"] = features.feature.apply(lambda f: train[f].corr(train["155"]))

features.head()

Unnamed: 0,feature,mu0,mu1,sigma0,sigma1,rho
5,5,0.005345,0.00716,0.018728,0.011431,0.05841
6,6,0.005345,0.00716,0.018728,0.011431,0.05841
8,8,0.349147,0.038526,0.450527,0.006735,-0.438183
9,9,0.349147,0.038526,0.450527,0.006735,-0.438183
14,14,0.999567,1.0,0.020799,0.0,0.014712


## correlation

Sorting features by the absolute value of $\rho$, since a negative correlation is as predictive as a postive one

In [10]:
features["absrho"]=abs(features["rho"])

In [11]:
features.sort_values(by="absrho", ascending=False).loc[features["absrho"]>0.1]

Unnamed: 0,feature,mu0,mu1,sigma0,sigma1,rho,absrho
67,67,0.7086695,0.045289,0.260226,0.15774,-0.838937,0.838937
71,71,0.004843164,0.676456,0.069425,0.467833,0.708561,0.708561
50,50,0.4031573,0.999979,0.490537,0.00454,0.652165,0.652165
51,51,0.5964099,2.1e-05,0.490622,0.00454,-0.651828,0.651828
47,47,0.5018176,2.2e-05,0.450187,0.004542,-0.618997,0.618997
68,68,0.449054,0.921829,0.470411,0.268404,0.525254,0.525254
38,38,0.659198,0.768314,0.132439,0.024954,0.496848,0.496848
82,82,0.3597495,0.091727,0.329202,0.055956,-0.493602,0.493602
73,73,0.486233,0.92187,0.499816,0.268378,0.477183,0.477183
146,146,0.05687225,0.0,0.079131,0.0,-0.453058,0.453058


## t-tests

t-tests establish the probability $p$ that we see a least that difference in the mean of a feature for the two conditions under the null hypothesis that they come from the same distribution

In [12]:
from scipy.stats import ttest_ind

In [13]:
features["t-test p"] = features.feature.apply(
    lambda f: ttest_ind(trainX0[f],
                        trainX1[f],
                        equal_var=False)[1]
)
features.head()

Unnamed: 0,feature,mu0,mu1,sigma0,sigma1,rho,absrho,t-test p
5,5,0.005345,0.00716,0.018728,0.011431,0.05841,0.05841,4.460255e-74
6,6,0.005345,0.00716,0.018728,0.011431,0.05841,0.05841,4.460255e-74
8,8,0.349147,0.038526,0.450527,0.006735,-0.438183,0.438183,0.0
9,9,0.349147,0.038526,0.450527,0.006735,-0.438183,0.438183,0.0
14,14,0.999567,1.0,0.020799,0.0,0.014712,0.014712,4.583503e-06


Sort from lowest probability

In [14]:
features.sort_values(by="t-test p")

Unnamed: 0,feature,mu0,mu1,sigma0,sigma1,rho,absrho,t-test p
154,154,0.334940,0.022008,0.458596,0.006407,-0.434536,0.434536,0.000000
94,94,0.055851,0.000289,0.229636,0.016984,-0.168196,0.168196,0.000000
77,77,0.038930,0.008852,0.095921,0.038162,-0.201783,0.201783,0.000000
75,75,0.005427,0.000290,0.011442,0.002490,-0.296245,0.296245,0.000000
73,73,0.486233,0.921870,0.499816,0.268378,0.477183,0.477183,0.000000
...,...,...,...,...,...,...,...,...
105,105,0.000021,0.000000,0.002780,0.000000,-0.005242,0.005242,0.102471
83,83,0.000041,0.000000,0.006420,0.000000,-0.004540,0.004540,0.157301
138,138,0.500008,0.500000,0.001725,0.000000,-0.003210,0.003210,0.317315
133,133,0.000021,0.000000,0.004540,0.000000,-0.003210,0.003210,0.317315


How many features have $p<0.05$?

In [15]:
len(features[features["t-test p"] < 0.05])

70

$p<0.000000001$?

In [16]:
len(features[features["t-test p"] < 0.000000001])

51

So most of the features have a significantly different mean under the two conditions

## unique values

Some features appear to only have a few values, eg

In [17]:
len(train["133"].unique())

2

Add a column to track the number of unique values a feature takes

In [18]:
features["vals"] = features.feature.apply(lambda f: len(train[f].unique()))

In [19]:
features.sort_values(by="vals").head(40)

Unnamed: 0,feature,mu0,mu1,sigma0,sigma1,rho,absrho,t-test p,vals
129,129,6.805573e-07,0.0,3e-06,0.0,-0.14836,0.14836,0.0,2
106,106,0.001482495,1.8e-05,0.035814,0.003934,-0.028733,0.028733,3.5304649999999995e-19,2
71,71,0.004843164,0.676456,0.069425,0.467833,0.708561,0.708561,0.0,2
72,72,0.001978484,8.2e-05,0.044437,0.009079,-0.029548,0.029548,3.4194079999999997e-20,2
73,73,0.486233,0.92187,0.499816,0.268378,0.477183,0.477183,0.0,2
113,113,2.649747e-05,0.0,0.00337,0.0,-0.00556,0.00556,0.08326452,2
111,111,0.0003297473,0.000577,0.018156,0.024015,0.005809,0.005809,0.0703782,2
133,133,2.060921e-05,0.0,0.00454,0.0,-0.00321,0.00321,0.3173155,2
83,83,4.121842e-05,0.0,0.00642,0.0,-0.00454,0.00454,0.1573013,2
84,84,0.0001236552,0.0,0.011119,0.0,-0.007863,0.007863,0.01430437,2


There are a quite a few 2-value features, it might be worth looking at what proportion of the values for a feature are non-zero overall, and under each class

In [20]:
l = len(trainX)
l0 = len(trainX0)
l1 = len(trainX1)
features["nonzero"] = features.feature.apply(lambda f: len(trainX.loc[trainX[f]!=0])/l)
features["nonzero0"] = features.feature.apply(lambda f: len(trainX0.loc[trainX0[f]!=0])/l0)
features["nonzero1"] = features.feature.apply(lambda f: len(trainX1.loc[trainX1[f]!=0])/l1)

In [21]:
features.sort_values(by="nonzero").loc[features["nonzero"]<0.001]

Unnamed: 0,feature,mu0,mu1,sigma0,sigma1,rho,absrho,t-test p,vals,nonzero,nonzero0,nonzero1
97,97,2.1e-05,0.0,0.00454,0.0,-0.00321,0.00321,0.3173155,2,1e-05,2.1e-05,0.0
133,133,2.1e-05,0.0,0.00454,0.0,-0.00321,0.00321,0.3173155,2,1e-05,2.1e-05,0.0
83,83,4.1e-05,0.0,0.00642,0.0,-0.00454,0.00454,0.1573013,2,2.1e-05,4.1e-05,0.0
105,105,2.1e-05,0.0,0.00278,0.0,-0.005242,0.005242,0.102471,3,3.1e-05,6.2e-05,0.0
113,113,2.6e-05,0.0,0.00337,0.0,-0.00556,0.00556,0.08326452,2,3.1e-05,6.2e-05,0.0
88,88,2.9e-05,0.0,0.003466,0.0,-0.005912,0.005912,0.06551449,4,6.2e-05,0.000124,0.0
84,84,0.000124,0.0,0.011119,0.0,-0.007863,0.007863,0.01430437,2,6.2e-05,0.000124,0.0
86,86,0.000165,0.0,0.012839,0.0,-0.00908,0.00908,0.00467667,2,8.2e-05,0.000165,0.0
117,117,1.3e-05,0.0,0.001288,0.0,-0.007129,0.007129,0.02637066,6,8.2e-05,0.000165,0.0
111,111,0.00033,0.000577,0.018156,0.024015,0.005809,0.005809,0.0703782,2,0.000453,0.00033,0.000577


These features are extremely sparse, so may be noise.

Which features have good distribution, some correlation, and low p value?

In [22]:
features.loc[(features["nonzero"]>0.01) & (features["absrho"]>0.2) & (features["t-test p"]<0.01)].feature.values


array(['8', '9', '38', '47', '50', '51', '66', '67', '68', '70', '71',
       '73', '75', '77', '82', '110', '145', '146', '154'], dtype=object)

Parameterise this for ease of comparison later:

In [23]:
def feature_select(nonzero=0, absrho=0, ttestp=1):
    return features.loc[(features["nonzero"]>=nonzero) & (features["absrho"]>=absrho) & (features["t-test p"]<=ttestp)].feature.values

In [24]:
feature_set1 = feature_select(nonzero=0.01, absrho=0.2, ttestp=0.01)
feature_set1

array(['8', '9', '38', '47', '50', '51', '66', '67', '68', '70', '71',
       '73', '75', '77', '82', '110', '145', '146', '154'], dtype=object)

## baselining with logistic regression

Try this set of features in a logistic regression to get a baseline metric for evaluation.

Need to consider what is the most appropriate metric.

Is it more important to detect as many instrusions as possible (recall) or minimise false alarms (precision), or a balance like f1?

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

Split the training data 60/40 into training and validation data

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(trainX[feature_set1], trainY, test_size=0.4)

In [27]:
# no regularisation at this point
lr = LogisticRegression(solver="sag", penalty='none').fit(X_train,Y_train)



In [28]:
confusion_matrix(Y_test,lr.predict(X_test))

array([[19098,   496],
       [  187, 19037]])

In [29]:
def evaluate(model, ytest, xtest):
    pred = model.predict(xtest)
    p = precision_score(ytest,pred)
    r = recall_score(ytest,pred)
    f = f1_score(ytest,pred)
    return p,r,f

In [30]:
print(evaluate(lr, Y_test, X_test))

(0.9746070752060615, 0.9902725759467332, 0.9823773769899632)


In [31]:
def thresholds_to_metrics(nonzero=0, absrho=0, ttestp=1):
    selection = feature_select(nonzero=nonzero, absrho=absrho, ttestp=ttestp)
    X_train, X_test, Y_train, Y_test = train_test_split(trainX[selection], trainY, test_size=0.4)
    lr = LogisticRegression(solver="sag", penalty='none').fit(X_train,Y_train)
    p,r,f = evaluate(lr, Y_test, X_test)
    return p,r,f,len(selection)

In [32]:
# no feature selection
thresholds_to_metrics()



(0.9924512904212996, 0.9982556946439565, 0.9953450304363395, 78)

In [33]:
import matplotlib.pyplot as plt

That's 99.8% recall in a simple logistic regression on all non-constant features. This poses a challenge as it does not translate to high recall on the test data.

## PCA

Looks like some features are highly correlated, so perhaps PCA or clustering on features first.

In [35]:
from sklearn.decomposition import PCA

In [36]:
X_train, X_test, Y_train, Y_test = train_test_split(trainX, trainY, test_size=0.4)

In [37]:
pca = PCA(n_components=20)

In [38]:
pca_train = pca.fit_transform(X_train)
pca_test = pca.transform(X_test)

In [39]:
lr = LogisticRegression(solver='sag', penalty='none').fit(pca_train,Y_train)



In [40]:
recall_score(Y_test, lr.predict(pca_test))

0.997430362832768

Twenty component PCA does fairly well.

Let's look at a few more or fewer components. We can do that by taking say fifty and then keeping the first 5, 10, 20, 50 of them

In [41]:
pca = PCA(n_components=50)

In [42]:
X_train, X_test, Y_train, Y_test = train_test_split(trainX, trainY, test_size=0.4)

In [43]:
pca_train = pca.fit_transform(X_train)
pca_test = pca.transform(X_test)

In [None]:
for pcs in [5,10,20,50]:
    lr = LogisticRegression(solver='sag', penalty='none').fit(pca_train[:,:pcs],Y_train)
    score = recall_score(Y_test, lr.predict(pca_test[:,:pcs]))
    print(f"{pcs}:\t{score}")

## Mutual information

In [45]:
from sklearn.feature_selection import mutual_info_classif

Consider any feature with fewer than 10 unique values to be discrete. Is that reasonable?

In [47]:
mi = mutual_info_classif(trainX,trainY, discrete_features=(features.vals<10))
mi

array([4.47812426e-01, 4.46309082e-01, 6.32876554e-01, 6.33125299e-01,
       1.50018172e-04, 1.50018172e-04, 1.50018172e-04, 1.50018172e-04,
       1.50018172e-04, 1.50018172e-04, 1.50018172e-04, 6.48974155e-01,
       1.50018172e-04, 3.06492963e-01, 8.90199152e-03, 2.72272530e-01,
       2.72002744e-01, 1.50018172e-04, 3.93816580e-01, 1.50018172e-04,
       5.38009964e-01, 1.43635424e-01, 4.71191255e-01, 1.75160966e-01,
       5.51963961e-02, 3.11386227e-01, 5.41650770e-04, 1.23919498e-01,
       2.76640932e-01, 4.76754185e-01, 4.86403498e-01, 2.07442413e-01,
       5.25039523e-01, 1.52310994e-01, 5.81157485e-01, 1.42854269e-05,
       4.28575551e-05, 5.71442564e-05, 4.28575551e-05, 1.50018172e-04,
       5.73936203e-04, 1.48303711e-03, 1.89492770e-02, 7.14266037e-06,
       4.49655821e-04, 2.47054140e-03, 2.14282997e-05, 5.44435479e-04,
       3.00277081e-02, 3.84346475e-05, 2.09096746e-03, 4.49100997e-02,
       1.70852413e-05, 2.64472542e-03, 2.14282997e-05, 5.71442564e-05,
      

In [50]:
features["mi"] = mi
features.sort_values(by="mi", ascending=False).head(30)

Unnamed: 0,feature,mu0,mu1,sigma0,sigma1,rho,absrho,t-test p,vals,nonzero,nonzero0,nonzero1,mi
38,38,0.659198,0.768314,0.132439,0.024954,0.496848,0.496848,0.0,25023,0.999784,0.999567,1.0,0.648974
9,9,0.349147,0.038526,0.450527,0.006735,-0.438183,0.438183,0.0,399,0.868575,0.73715,1.0,0.633125
8,8,0.349147,0.038526,0.450527,0.006735,-0.438183,0.438183,0.0,399,0.868575,0.73715,1.0,0.632877
82,82,0.359749,0.091727,0.329202,0.055956,-0.493602,0.493602,0.0,4096,0.867421,0.734842,1.0,0.581157
142,142,0.23105,0.291817,0.317456,0.187557,0.115751,0.115751,3.517662e-286,20359,0.69215,0.462429,0.92187,0.573959
140,140,0.22615,0.320148,0.301992,0.199302,0.180674,0.180674,0.0,17320,0.69215,0.462429,0.92187,0.573301
64,64,0.670202,0.68383,0.273232,0.149417,0.03093,0.03093,5.596116e-22,20,0.996476,0.992952,1.0,0.53801
79,79,0.002452,0.00538,0.025173,0.001334,0.08186,0.08186,1.9143659999999998e-143,46,0.867668,0.735337,1.0,0.52504
154,154,0.33494,0.022008,0.458596,0.006407,-0.434536,0.434536,0.0,333,0.705793,0.489716,0.92187,0.523361
77,77,0.03893,0.008852,0.095921,0.038162,-0.201783,0.201783,0.0,84,0.867668,0.735337,1.0,0.486403


Some of the features with relatively high mutual information with the target have low correlation which suggests that these features have a relation with the target if not a *linear* one.

We can select features with high linear correlation *or* mutual information

In [80]:
features.loc[(features.absrho > 0.5)|(features.mi > 0.3)]

Unnamed: 0,feature,mu0,mu1,sigma0,sigma1,rho,absrho,t-test p,vals,nonzero,nonzero0,nonzero1,mi
5,5,0.005345,0.00716,0.018728,0.011431,0.05841,0.05841,4.460255e-74,8718,1.0,1.0,1.0,0.447812
6,6,0.005345,0.00716,0.018728,0.011431,0.05841,0.05841,4.460255e-74,8718,1.0,1.0,1.0,0.446309
8,8,0.349147,0.038526,0.450527,0.006735,-0.438183,0.438183,0.0,399,0.868575,0.73715,1.0,0.632877
9,9,0.349147,0.038526,0.450527,0.006735,-0.438183,0.438183,0.0,399,0.868575,0.73715,1.0,0.633125
38,38,0.659198,0.768314,0.132439,0.024954,0.496848,0.496848,0.0,25023,0.999784,0.999567,1.0,0.648974
47,47,0.501818,2.2e-05,0.450187,0.004542,-0.618997,0.618997,0.0,12,0.323863,0.647644,8.2e-05,0.306493
50,50,0.403157,0.999979,0.490537,0.00454,0.652165,0.652165,0.0,2,0.701568,0.403157,0.999979,0.272273
51,51,0.59641,2.1e-05,0.490622,0.00454,-0.651828,0.651828,0.0,2,0.298215,0.59641,2.1e-05,0.272003
61,61,0.607722,0.634935,0.14642,0.074405,0.116366,0.116366,5.360185e-289,63,0.999918,0.999918,0.999918,0.393817
64,64,0.670202,0.68383,0.273232,0.149417,0.03093,0.03093,5.596116e-22,20,0.996476,0.992952,1.0,0.53801


## Combination selection

In [153]:
feature_set3 = features.loc[(features.absrho > 0.5)|(features.mi > 0.3)].feature
len(feature_set3)

20

In [63]:
X_train, X_test, Y_train, Y_test = train_test_split(trainX[feature_set3], trainY, test_size=0.4)

In [64]:
lr = LogisticRegression(solver="sag", penalty='none').fit(X_train,Y_train)



In [65]:
confusion_matrix(Y_test,lr.predict(X_test))

array([[19146,   326],
       [  218, 19128]])

In [68]:
recall_score(Y_test, lr.predict(X_test))

0.988731520727799

Following Tim's lead:

In [69]:
from sklearn.neural_network import MLPClassifier

In [70]:
mlp = MLPClassifier(hidden_layer_sizes=(5, 5), max_iter=1000) #two layers of 5 neurons, 1000 of backprop
mlp.fit(X_train, Y_train.values.ravel()) #train the algo

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [71]:
confusion_matrix(Y_test, mlp.predict(X_test))

array([[19380,    92],
       [   69, 19277]])

So, these 20 offer good recall and precision

Being slightly more selective

In [73]:
feature_set4 = features.loc[(features.absrho > 0.6)|(features.mi > 0.5)].feature
len(feature_set4)

14

In [74]:
X_train, X_test, Y_train, Y_test = train_test_split(trainX[feature_set4], trainY, test_size=0.4)

In [75]:
mlp = MLPClassifier(hidden_layer_sizes=(5, 5), max_iter=1000) #two layers of 5 neurons, 1000 of backprop
mlp.fit(X_train, Y_train.values.ravel()) #train the algo

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [76]:
confusion_matrix(Y_test, mlp.predict(X_test))

array([[19397,    12],
       [ 1547, 17862]])

In [78]:
recall_score(Y_test, mlp.predict(X_test))

0.9202947086403215

In [79]:
precision_score(Y_test, mlp.predict(X_test))

0.9993286337697214

Good precision but recall has suffered slightly

How about a hybrid selection: the top 5 by absolute rho, top 5 by mutual information, and the first ten principle components?

In [84]:
top5rho = features.sort_values(by="absrho", ascending=False).head(5).feature
top5mi = features.sort_values(by="mi", ascending=False).head(5).feature

In [96]:
pd.concat([top5rho,top5mi]).values

array(['67', '71', '50', '51', '47', '38', '9', '8', '82', '142'],
      dtype=object)

In [87]:
pca = PCA(n_components=10)

In [102]:
pca_train = pca.fit_transform(trainX)
pca_train.shape

(97044, 10)

In [115]:
trainX20 = pd.concat([trainX[pd.concat([top5rho,top5mi]).values], pd.DataFrame(pca_train)], axis=1)

In [116]:
trainX20

Unnamed: 0,67,71,50,51,47,38,9,8,82,142,0,1,2,3,4,5,6,7,8.1,9.1
0,0.30769,1,0,1,0.88679,0.36865,0.009150,0.009150,0.946280,0.0,0.420522,0.453253,-0.547955,-0.854962,0.586509,0.763728,0.569424,-0.064718,-0.041441,-0.076069
1,1.00000,0,0,1,0.43396,0.36867,0.000000,0.000000,0.000000,0.0,0.058966,1.319915,-0.917786,-0.579474,-0.039739,-0.049850,-0.026646,0.080834,0.035814,0.052928
2,0.61538,0,1,0,0.00000,0.36871,0.070588,0.070588,0.255430,0.0,-1.473498,1.431104,1.153064,-0.331561,-0.054824,-0.045797,-0.225877,0.135514,-0.365987,-0.246504
3,0.61538,0,1,0,0.00000,0.36876,0.094771,0.094771,0.072772,0.0,-1.571292,1.509385,1.416091,-0.455870,-0.079367,-0.057258,-0.057567,0.035543,-0.038611,1.115513
4,0.61538,0,1,0,0.00000,0.36880,0.070588,0.070588,0.256900,0.0,-1.471950,1.430648,1.152154,-0.332272,-0.054604,-0.044002,-0.229213,0.138189,-0.369590,-0.256007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97039,0.61538,0,1,0,0.00000,0.93345,0.038562,0.038562,0.217830,0.0,-1.515816,1.393979,1.143533,-0.350176,-0.095881,-0.076009,-0.229036,0.148368,-0.346674,-0.348885
97040,0.61538,0,1,0,0.00000,0.93348,0.038562,0.038562,0.218320,0.0,-1.515666,1.393976,1.143294,-0.349872,-0.095791,-0.075774,-0.228730,0.148168,-0.346713,-0.348983
97041,0.61538,0,1,0,0.00000,0.93350,0.038562,0.038562,0.218560,0.0,-1.515786,1.394070,1.143764,-0.350043,-0.095825,-0.075554,-0.228661,0.148137,-0.346814,-0.348994
97042,0.61538,0,1,0,0.00000,0.93352,0.038562,0.038562,0.218800,0.0,-1.515541,1.393983,1.143125,-0.349609,-0.095713,-0.075533,-0.228439,0.147980,-0.346759,-0.349077


In [139]:
X_train, X_test, Y_train, Y_test = train_test_split(trainX20, trainY, test_size=0.4)

In [140]:
mlp = MLPClassifier(hidden_layer_sizes=(5, 5), max_iter=1000)
mlp.fit(X_train, Y_train.values.ravel())

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [141]:
confusion_matrix(Y_test, mlp.predict(X_test))

array([[19442,    80],
       [   47, 19249]])

In [142]:
recall_score(Y_test, mlp.predict(X_test))

0.9975642620232172

In [143]:
precision_score(Y_test, mlp.predict(X_test))

0.9958611412902892

So these do well on our hold out set, but on the given test set they perform really badly - ~50% recall...