In [72]:
import pandas as pd
# Load the balanced training set: every column but the last is a feature,
# and the last column is the target.
train = pd.read_csv("train_imperson_without4n7_balanced_data.csv")
X, y = train.iloc[:, :-1], train.iloc[:, -1]

# Preprocessing

Drop zero variance columns

In [31]:
# Drop constant (zero-variance) columns. Counting distinct values is exact,
# whereas comparing a floating-point standard deviation with 0 can miss a
# constant column when rounding error leaves a tiny non-zero std.
# Columns with no non-missing values at all (nunique == 0) are dropped too.
X = X.drop(columns=X.columns[X.nunique() <= 1])

Remove sparse features (fewer than 1% non-zero)

In [32]:
# Drop features that are non-zero in fewer than 1% of the rows.
X = X.drop(columns=X.columns[(X != 0).sum() < 0.01 * len(X)])

Separate into continuous and categorical features, following Tim's suggestion of a 6-value threshold: features with fewer than 6 distinct values are treated as categorical

In [33]:
# Features with fewer than 6 distinct values are treated as categorical;
# all remaining features are treated as continuous.
filt = X.nunique().lt(6)
Xcat, Xcon = X.loc[:, filt], X.loc[:, ~filt]

Identify and drop highly correlated continuous features (Pearson >0.8 or <-0.8)

In [34]:
from itertools import product

In [35]:
# For every pair of continuous features whose |Pearson r| exceeds 0.8, mark the
# higher-numbered column for removal, so the numerically first one is kept.
high_corr = {
    a
    for a, b in product(Xcon.columns, repeat=2)
    if int(a) > int(b) and abs(Xcon[a].corr(Xcon[b])) > 0.8
}
high_corr

{'154', '47', '6', '79', '9'}

In [36]:
# Remove the columns collected in high_corr from the continuous features.
Xcon = Xcon.drop(columns=list(high_corr))

In [37]:
Xcon.head()

Unnamed: 0,5,8,38,48,61,64,67,75,76,77,78,80,82,107,119,120,140,142,143
0,6.6e-05,0.00915,0.36865,0.98108,0.70423,0.81818,0.30769,0.003034,0.001127,0.011765,0.001443,0.001474,0.94628,0.0,0.0,0.0,0.0,0.0,0.0
1,1.4e-05,0.0,0.36867,0.98108,0.70423,0.65909,1.0,0.0,0.002253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.035528,0.070588,0.36871,0.98108,0.59155,0.18182,0.61538,0.0,0.000563,0.005882,0.001731,0.001474,0.25543,0.048053,0.001459,0.46154,0.0,0.0,0.0
3,0.005128,0.094771,0.36876,0.98108,0.14085,0.18182,0.61538,0.0,0.000563,0.005882,0.000866,0.000885,0.072772,0.1683,0.000875,0.46154,0.0,0.0,0.0
4,0.035116,0.070588,0.3688,0.98108,0.61972,0.18182,0.61538,0.0,0.000563,0.005882,0.001731,0.001474,0.2569,0.048054,0.001459,0.46154,0.0,0.0,0.0


Likewise remove identical categorical features

In [38]:
# Mark the higher-numbered column of every pair of categorical features whose
# values agree to within 1e-4 on every row, so the numerically first is kept.
# Series.all() is vectorised; the builtin all() would walk the rows one
# Python object at a time.
ident_cat = {
    a
    for a, b in product(Xcat.columns, repeat=2)
    if int(a) > int(b) and (Xcat[a] - Xcat[b]).abs().lt(0.0001).all()
}
ident_cat

{'128',
 '129',
 '146',
 '15',
 '16',
 '18',
 '20',
 '26',
 '29',
 '43',
 '52',
 '62',
 '89'}

In [39]:
# Remove the columns collected in ident_cat and preview what is left.
Xcat = Xcat.drop(columns=list(ident_cat))
Xcat.head()

Unnamed: 0,14,50,51,66,68,70,71,73,90,93,...,108,110,118,122,126,127,130,138,141,145
0,1,0,1,1.0,0.5,0,1,0,0,0,...,0.0,0.0,0,0.0,0.0,0.0,0,0.5,0.0,0.0
1,1,0,1,0.5,0.0,0,0,0,0,0,...,0.0,0.0,0,0.0,0.0,0.0,0,0.5,0.0,0.0
2,1,1,0,0.0,0.0,0,0,0,1,1,...,0.00153,0.0,1,0.007936,0.0,0.0,0,0.5,0.0,0.0
3,1,1,0,0.0,0.0,0,0,0,1,1,...,0.00153,0.0,1,0.003968,1.5e-05,0.007843,1,0.5,0.0,0.0
4,1,1,0,0.0,0.0,0,0,0,1,1,...,0.00153,0.0,1,0.007936,0.0,0.0,0,0.5,0.0,0.0


## Standardisation

Standardise both sets of features

For the continuous features we want standard scaling (zero mean, unit variance). For the categorical features we can just divide by the maximum absolute value.

In [40]:
from sklearn.preprocessing import StandardScaler, MaxAbsScaler

In [43]:
# The scalers return bare numpy arrays, so wrap the results back into
# DataFrames that keep the original row index and column labels.
# NOTE(review): both scalers are fitted on every row, i.e. before the
# train/test split performed later in the notebook.
Xcon = pd.DataFrame(
    StandardScaler().fit_transform(Xcon), index=Xcon.index, columns=Xcon.columns
)
Xcat = pd.DataFrame(
    MaxAbsScaler().fit_transform(Xcat), index=Xcat.index, columns=Xcat.columns
)

In [46]:
Xcon.shape, Xcat.shape

((97044, 19), (97044, 22))

So we're down to 19 continuous and 22 categorical features

# Feature selection

Put them back together into a processed dataframe to carry forward

In [74]:
# Recombine the continuous and categorical features column-wise and append
# the target as the final column "y".
processed_data = pd.concat([Xcon, Xcat], axis=1).assign(y=y)
processed_data.head()

Unnamed: 0,5,8,38,48,61,64,67,75,76,77,...,110,118,122,126,127,130,138,141,145,y
0,-0.398089,-0.521063,-3.142811,0.01475,0.708991,0.640755,-0.175252,0.020304,-0.103169,-0.162696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.568182,0.0,0.0,0
1,-0.401396,-0.546879,-3.142628,0.01475,0.708991,-0.081367,1.575793,-0.329739,-0.049529,-0.320552,...,0.0,0.0,0.0,0.0,0.0,0.0,0.568182,0.0,0.0,0
2,1.883791,-0.347726,-3.142264,0.01475,-0.254672,-2.247733,0.602982,-0.329739,-0.129992,-0.241625,...,0.0,1.0,1.0,0.0,0.0,0.0,0.568182,0.0,0.0,0
3,-0.072373,-0.279498,-3.141809,0.01475,-4.109153,-2.247733,0.602982,-0.329739,-0.129992,-0.241625,...,0.0,1.0,0.500006,1.0,0.500006,1.0,0.568182,0.0,0.0,0
4,1.85728,-0.347726,-3.141445,0.01475,-0.013756,-2.247733,0.602982,-0.329739,-0.129992,-0.241625,...,0.0,1.0,1.0,0.0,0.0,0.0,0.568182,0.0,0.0,0


Build a dataframe to capture information about the features

In [75]:
# One row per feature, flagged 0 for continuous and 1 for categorical.
# The flag counts are taken from the column indexes themselves, so the frame
# stays correct if the preprocessing above keeps a different number of columns.
features = pd.DataFrame(
    {"Categorical": [0] * len(Xcon.columns) + [1] * len(Xcat.columns)},
    index=Xcon.columns.append(Xcat.columns),
)
features.head()

Unnamed: 0,Categorical
5,0
8,0
38,0
48,0
61,0


Add a column for the mutual information measure

In [62]:
from sklearn.feature_selection import mutual_info_classif

In [79]:
# Estimate the mutual information between each feature and the target.
# Columns are selected through features.index so every score lands on the row
# of its own feature. random_state is fixed because the estimator adds small
# random noise to continuous features, which otherwise makes the scores
# change from run to run.
features["MI"] = mutual_info_classif(
    processed_data[features.index], processed_data["y"], random_state=1
)
features.head()

Unnamed: 0,Categorical,MI
5,0,0.446731
8,0,0.635839
38,0,0.648896
48,0,0.0
61,0,0.391814


And for the correlation with the target: Pearson's $\rho$ (linear) and Kendall's $\tau$ (rank-based, following Michael's work), taking the absolute value of each coefficient

In [88]:
# Absolute Pearson (rho) and Kendall (tau) correlation of each feature with y.
features["absrho"] = processed_data[features.index].apply(
    lambda col: col.corr(processed_data["y"], method="pearson")
).abs()
features["abstau"] = processed_data[features.index].apply(
    lambda col: col.corr(processed_data["y"], method="kendall")
).abs()
features.head()

Unnamed: 0,Categorical,MI,absrho,abstau
5,0,0.446731,0.05841,0.517106
8,0,0.635839,0.438183,0.119957
38,0,0.648896,0.496848,0.44858
48,0,0.0,0.014634,0.003032
61,0,0.391814,0.116366,0.106813


Which features have the highest MI?

In [96]:
features.sort_values(by="MI", ascending=False).head(15)

Unnamed: 0,Categorical,MI,absrho,abstau
38,0,0.648896,0.496848,0.44858
8,0,0.635839,0.438183,0.119957
82,0,0.582729,0.493602,0.234276
140,0,0.577228,0.180674,0.24599
142,0,0.575496,0.115751,0.230346
64,0,0.536773,0.03093,0.019666
77,0,0.487621,0.201783,0.289967
76,0,0.478165,0.09987,0.747373
67,0,0.474816,0.838937,0.783199
5,0,0.446731,0.05841,0.517106


Which features have the highest $|\tau|$?

In [97]:
features.sort_values(by="abstau", ascending=False).head(15)

Unnamed: 0,Categorical,MI,absrho,abstau
67,0,0.474816,0.838937,0.783199
76,0,0.478165,0.09987,0.747373
71,1,0.311638,0.708561,0.708561
50,1,0.280262,0.652165,0.652165
51,1,0.272402,0.651828,0.651828
75,0,0.276427,0.296245,0.620754
68,1,0.180354,0.525254,0.52784
5,0,0.446731,0.05841,0.517106
73,1,0.131098,0.477183,0.477183
145,1,0.134703,0.453058,0.453468


And the highest $|\rho|$?

In [105]:
features.sort_values(by="absrho", ascending=False).head(15)

Unnamed: 0,Categorical,MI,absrho,abstau
67,0,0.474816,0.838937,0.783199
71,1,0.311638,0.708561,0.708561
50,1,0.280262,0.652165,0.652165
51,1,0.272402,0.651828,0.651828
68,1,0.180354,0.525254,0.52784
38,0,0.648896,0.496848,0.44858
82,0,0.582729,0.493602,0.234276
73,1,0.131098,0.477183,0.477183
145,1,0.134703,0.453058,0.453468
8,0,0.635839,0.438183,0.119957


The following features are strong on one or more of these measures:

In [102]:
# Rows of `features` where at least one of MI, |rho| or |tau| exceeds 0.4.
features[features[["MI", "absrho", "abstau"]].gt(0.4).any(axis=1)]

Unnamed: 0,Categorical,MI,absrho,abstau
5,0,0.446731,0.05841,0.517106
8,0,0.635839,0.438183,0.119957
38,0,0.648896,0.496848,0.44858
64,0,0.536773,0.03093,0.019666
67,0,0.474816,0.838937,0.783199
75,0,0.276427,0.296245,0.620754
76,0,0.478165,0.09987,0.747373
77,0,0.487621,0.201783,0.289967
80,0,0.154242,0.030155,0.452988
82,0,0.582729,0.493602,0.234276


## Feature set

In [104]:
# Labels of the features where at least one of MI, |rho| or |tau| exceeds 0.4.
feature_set = features.index[
    features[["MI", "absrho", "abstau"]].gt(0.4).any(axis=1)
]
feature_set

Index(['5', '8', '38', '64', '67', '75', '76', '77', '80', '82', '140', '142',
       '50', '51', '66', '68', '71', '73', '145'],
      dtype='object')

In [109]:
len(feature_set)

19

## PCA

In [135]:
from sklearn.decomposition import PCA

In [143]:
# Seed the decomposition: with the default svd_solver="auto" scikit-learn can
# select the randomized solver, whose output depends on random_state.
pca = PCA(n_components=5, random_state=1)

In [144]:
# Fit the PCA on the feature columns only (everything except the target "y").
pcaX = pca.fit_transform(processed_data.drop(columns="y"))

Add the PCs to a dataframe

In [148]:
# Name one column per fitted component (rather than hard-coding five) and reuse
# processed_data's index so the join stays row-aligned whatever that index is.
processed_with_pcs = pd.DataFrame(
    pcaX,
    index=processed_data.index,
    columns=[f"PC{i}" for i in range(1, pcaX.shape[1] + 1)],
).join(processed_data)

And to our feature set list

In [168]:
# Append one "PC<i>" label per column of pcaX, so the list follows the number
# of components that were actually fitted instead of a hard-coded five.
feature_set_with_pcs = list(feature_set) + [
    f"PC{i}" for i in range(1, pcaX.shape[1] + 1)
]
feature_set_with_pcs

['5',
 '8',
 '38',
 '64',
 '67',
 '75',
 '76',
 '77',
 '80',
 '82',
 '140',
 '142',
 '50',
 '51',
 '66',
 '68',
 '71',
 '73',
 '145',
 'PC1',
 'PC2',
 'PC3',
 'PC4',
 'PC5']

## Feature set dataframe generator

In [189]:
def get_df(rho=0.4, tau=0.4, mi=0.4, pcs=5, random_state=1):
    """Build a frame of principal components plus the selected raw features.

    Parameters
    ----------
    rho, tau, mi : float
        Thresholds applied to ``features.absrho``, ``features.abstau`` and
        ``features.MI``. A feature is kept when it exceeds at least one of them.
    pcs : int
        ``n_components`` passed to PCA, which is fitted on every column of
        ``processed_data`` except the last one.
    random_state : int or None
        Seed passed to PCA so repeated calls give the same components even
        when scikit-learn picks a randomized solver.

    Returns
    -------
    pandas.DataFrame
        The principal-component columns ``PC1``, ``PC2``, ... followed by the
        selected feature columns and finally ``y``, indexed like
        ``processed_data``.
    """
    pca = PCA(n_components=pcs, random_state=random_state)
    pcaX = pca.fit_transform(processed_data.iloc[:, :-1])
    # Label the components from the fitted output so the names always match
    # the number of columns PCA actually returned.
    pc_names = [f"PC{i}" for i in range(1, pcaX.shape[1] + 1)]

    selected = (features.MI > mi) | (features.absrho > rho) | (features.abstau > tau)
    feature_names = list(features.index[selected])

    # Reuse processed_data's index so the join is row-aligned even when that
    # index is not the default 0..n-1 range.
    pcs_df = pd.DataFrame(pcaX, index=processed_data.index, columns=pc_names)
    return pcs_df.join(processed_data[feature_names + ["y"]])

In [190]:
get_df(rho=0.6, tau=0.6, mi=0.5, pcs=8)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,8,38,...,67,75,76,82,140,142,50,51,71,y
0,0.016421,2.250497,-0.197476,-0.011926,-1.308096,-0.322161,0.543896,0.176686,-0.521063,-3.142811,...,-0.175252,0.020304,-0.103169,2.653966,-1.050047,-0.995985,0.0,1.0,1.0,0
1,0.727002,1.726786,-0.943850,0.500531,-2.971171,0.379647,-0.144628,-1.085927,-0.546879,-3.142628,...,1.575793,-0.329739,-0.049529,-0.831461,-1.050047,-0.995985,0.0,1.0,0.0,0
2,4.784770,1.037861,0.118674,-3.735073,-0.884188,0.502046,-0.549043,-0.569640,-0.347726,-3.142264,...,0.602982,-0.329739,-0.129992,0.109363,-1.050047,-0.995985,1.0,0.0,0.0,0
3,7.355533,0.728026,0.556609,-6.344181,0.553830,-2.267821,-2.456201,-1.175151,-0.279498,-3.141809,...,0.602982,-0.329739,-0.129992,-0.563420,-1.050047,-0.995985,1.0,0.0,0.0,0
4,4.715456,1.063744,0.096733,-3.716964,-0.922074,0.562361,-0.546974,-0.582332,-0.347726,-3.141445,...,0.602982,-0.329739,-0.129992,0.114777,-1.050047,-0.995985,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97039,4.039484,-1.209542,0.052671,-1.733565,0.100919,0.636626,-0.026566,-0.264824,-0.438082,2.000711,...,0.602982,-0.329739,-0.129992,-0.029129,-1.050047,-0.995985,1.0,0.0,0.0,1
97040,3.864883,-1.136388,-0.011389,-1.287749,0.028885,0.267397,-0.257340,-0.267732,-0.438082,2.000984,...,0.602982,-0.329739,-0.129992,-0.027324,-1.050047,-0.995985,1.0,0.0,0.0,1
97041,4.097552,-1.232749,0.074325,-1.882010,0.125868,0.758387,0.050601,-0.263068,-0.438082,2.001166,...,0.602982,-0.329739,-0.129992,-0.026440,-1.050047,-0.995985,1.0,0.0,0.0,1
97042,3.729301,-1.079494,-0.061093,-0.941567,-0.026915,-0.019444,-0.436496,-0.269901,-0.438082,2.001348,...,0.602982,-0.329739,-0.129992,-0.025556,-1.050047,-0.995985,1.0,0.0,0.0,1


## Testing

In [106]:
from sklearn.model_selection import train_test_split

Train/test split on our combined feature set:

In [199]:
# Build the combined feature frame and hold out 33% of the rows for testing.
# random_state pins the shuffle so the metrics reported below can be reproduced.
df = get_df(rho=0.6, tau=0.6, mi=0.5, pcs=8)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="y"), df["y"], test_size=0.33, random_state=1
)

In [200]:
from sklearn.neural_network import MLPClassifier

In [201]:
# Multi-layer perceptron with two hidden layers (12 and 5 units) trained with
# the Adam solver; random_state is fixed to 1.
clf = MLPClassifier(solver='adam', hidden_layer_sizes=(12,5), random_state=1)

In [202]:
clf.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(12, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [180]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

In [203]:
confusion_matrix(y_test, clf.predict(X_test))

array([[16045,     6],
       [    4, 15970]])

In [204]:
recall_score(y_test, clf.predict(X_test))

0.9997495930887692

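So this looks like a pretty good feature selection strategy (with the caveat that the scaling, the PCA and the feature scores were all computed on the full dataset before the train/test split, so the test score may be optimistic):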

* First $n$ principal components of the preprocessed features, together with
* Features high in one of
  * $|\rho|$
  * $|\tau|$
  * $MI$