In [6]:
import pandas as pd
import numpy as np
import re

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [7]:
# load data

def load_data():
    tabular_data = pd.read_csv('tabular_data.csv', encoding='latin-1')
    hashed_feature = pd.read_csv('hashed_feature.csv', encoding='latin-1')
    train = pd.read_csv('train.csv', encoding='latin-1')
    test = pd.read_csv('test.csv', encoding='latin-1')
    return tabular_data, hashed_feature, train, test

In [8]:
tabular_data, hashed_feature, y_train, y_test = load_data()

In [9]:
tabular_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61272 entries, 0 to 61271
Data columns (total 52 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          61272 non-null  int64  
 1   period      61272 non-null  int64  
 2   feature_0   58460 non-null  float64
 3   feature_1   58380 non-null  float64
 4   feature_2   57795 non-null  float64
 5   feature_3   58314 non-null  float64
 6   feature_4   58425 non-null  float64
 7   feature_5   58257 non-null  float64
 8   feature_6   57814 non-null  float64
 9   feature_7   58280 non-null  float64
 10  feature_8   58338 non-null  float64
 11  feature_9   58344 non-null  float64
 12  feature_10  58318 non-null  float64
 13  feature_11  58184 non-null  float64
 14  feature_12  57847 non-null  float64
 15  feature_13  57938 non-null  float64
 16  feature_14  58366 non-null  float64
 17  feature_15  58410 non-null  float64
 18  feature_16  58193 non-null  float64
 19  feature_17  61272 non-nul

In [97]:
y_test

Unnamed: 0,id,score
0,4084,
1,4085,
2,4086,
3,4087,
4,4088,
...,...,...
1017,5101,
1018,5102,
1019,5103,
1020,5104,


In [11]:
X = tabular_data

In [12]:
pd.crosstab(X['feature_25'], X.id)

id,0,1,2,3,4,5,6,7,8,9,...,5096,5097,5098,5099,5100,5101,5102,5103,5104,5105
feature_25,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11e623a37e87cf7995c466723ec99688d55cae8c,7,0,1,9,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
390c481a740bb1c12c57f33dc5263ced2ab11796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aa2c96dacf00c451ef465f6115a45a20bccf1256,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,0,0
b204e1e3743cc314841070435211e25f4602b9fd,0,0,11,0,0,0,12,0,0,0,...,0,0,12,0,0,0,12,0,0,12
cf3cca1da7361bd988642600e76c4a3021be8ccf,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cf7413906c99be47067cb3fb6299959857710d77,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
d458ece2abeae803254c5d442b2e2a80f58a4153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
f9ae10397ed62e79c644d06842bbcd92963a5927,4,12,0,3,12,12,0,12,11,12,...,12,12,0,12,12,12,0,0,12,0


In [13]:
hashed_feature_50_uniqu = pd.crosstab(hashed_feature['feature_50'], hashed_feature.id)

In [14]:
hashed_feature_50_uniqu

id,0,1,2,3,4,5,6,7,8,9,...,5096,5097,5098,5099,5100,5101,5102,5103,5104,5105
feature_50,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000063d721c1f034be95d0208c9b695ae4c4a7ff,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0004424eef8f9bf1292d5db1b747974102dba0c3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000f3ba943b7e58a5d676205c090ef812941a160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00190bc7e4a8816794ab97d128bafd54b5fa7d98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00191d9d44fd37f680cffbb0dfc5d30ae26581ff,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffc6bd16e0f7c6f106089f3bae741006a2423afc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
ffcba4846ef0f01e3b0476d0bf0b5d058fccb9f7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffd80e85ec84a0fee0ce0af3416da95810ced13c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffdae33cc90c7a30c4f70d285c0f54fdc1ee1e26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
d = []
for hashed_feature in hashed_feature_50_uniqu.index:
    d.append(hashed_feature)

In [16]:
f_50 = dict(zip(d,range(5009)))

In [17]:
hashed_feature_50_uniqu.index = hashed_feature_50_uniqu.index.map(f_50)

In [18]:
hashed_feature_50_uniqu = hashed_feature_50_uniqu.T

In [19]:
hashed_feature_50_uniqu

feature_50,0,1,2,3,4,5,6,7,8,9,...,4999,5000,5001,5002,5003,5004,5005,5006,5007,5008
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5102,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
h_train = hashed_feature_50_uniqu.drop(index = list(range(4085,4406)) + list(range(4407,5106)))

In [102]:
h_test = hashed_feature_50_uniqu.drop(index = list(range(0,3090)) + 
                                      list(range(3091,4083)))

In [103]:
h_test

feature_50,0,1,2,3,4,5,6,7,8,9,...,4999,5000,5001,5002,5003,5004,5005,5006,5007,5008
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4084,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4085,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4086,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4087,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5102,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
h_train

feature_50,0,1,2,3,4,5,6,7,8,9,...,4999,5000,5001,5002,5003,5004,5005,5006,5007,5008
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4081,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
h_train = h_train.replace([np.inf, -np.inf], np.nan)

In [40]:
for column in h_train.columns:
    h_train[column] = h_train[column].fillna(h_train[column].mean())

In [42]:
h_train

feature_50,0,1,2,3,4,5,6,7,8,9,...,4999,5000,5001,5002,5003,5004,5005,5006,5007,5008
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4081,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [333]:
for column in normalized_x_train.columns:
    normalized_x_train[column] = normalized_x_train[column].fillna(normalized_x_train[column].mean())

In [334]:
XXX = pd.concat([normalized_x_train, h_train], axis=1) 

In [335]:
XXX.shape

(4084, 5058)

In [336]:
for column in XXX.columns:
    XXX[column] = XXX[column].fillna(XXX[column].mean())

In [1]:
first_knn = KNeighborsClassifier()

NameError: name 'KNeighborsClassifier' is not defined

In [338]:
tree_params = {'max_depth': np.arange(1, 4), 'max_features':[.8, 1]}

In [339]:
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [None]:
%%time
tree_grid.fit(XXX , y_train.drop('id',axis =1));

In [None]:
tree_grid.fit()

In [305]:
accuracy = (y_pred == y_train.target).mean()

In [306]:
print(accuracy)

0.6410381978452497


In [9]:
X['feature_25'] = X['feature_25'].map({'11e623a37e87cf7995c466723ec99688d55cae8c': 100,
                                                        '390c481a740bb1c12c57f33dc5263ced2ab11796': 200,
                                                        'aa2c96dacf00c451ef465f6115a45a20bccf1256': 300,
                                                        'b204e1e3743cc314841070435211e25f4602b9fd': 400,                                                       
                                                        'cf3cca1da7361bd988642600e76c4a3021be8ccf': 500,                                                                                                            
                                                        'cf7413906c99be47067cb3fb6299959857710d77': 600,                                                      
                                                        'd458ece2abeae803254c5d442b2e2a80f58a4153': 700,                                                      
                                                        'f9ae10397ed62e79c644d06842bbcd92963a5927': 800,                                                      
                                                      })

In [10]:
for column in X.columns:
    X[column] = X[column].fillna(X[column].mean())

In [11]:
X.feature_25

0        800
1        800
2        800
3        800
4        100
        ... 
61267    400
61268    400
61269    400
61270    400
61271    400
Name: feature_25, Length: 61272, dtype: int64

In [12]:
x_sum = X.drop(index = range(1,61272))

In [13]:
x_sum = x_sum.drop(index = 0)


In [14]:
x_sum

Unnamed: 0,id,period,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49


In [15]:
k = 0
for i in X['id']:
    if k%12==0:
        x_sum = x_sum.append(X.loc[k:k+11].sum(axis=0), ignore_index = True)
    k+=1
    

In [16]:
x_sum

Unnamed: 0,id,period,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,0.0,78.0,1320.000000,731.000000,2.866555,3523.629746,5161.610000,0.000000,5.343986,4693.534964,...,5.100000,0.0,138.431547,0.000000,718.500000,9.00000,6.009026,1714.000000,3780.140000,3.000000
1,12.0,78.0,1320.000000,1626.500000,0.725314,83286.958225,13432.230000,0.000000,4.207655,14398.120000,...,19.640000,0.0,40620.522617,1.000000,1500.000000,81.00000,4.068892,1631.000000,3466.630853,2452.009568
2,24.0,78.0,1322.000000,7.170000,8.559721,1.696433,50.020000,0.000000,10.500171,4693.534964,...,5.480000,0.0,0.000987,0.000000,7.170000,0.00000,2.566367,121.000000,29.640000,0.000000
3,36.0,78.0,1320.634434,971.781683,8.230128,31578.475518,3959.174433,13198.746579,7.032080,4013.201223,...,23.543156,0.0,4063.577415,11.833557,784.539926,116.55664,5.852904,1446.667448,2765.923140,580.688209
4,48.0,78.0,1320.000000,1512.000000,1.604314,30819.401741,5523.990000,0.000000,12.082259,6269.390000,...,40.580000,0.0,1955.666336,2.000000,1244.800000,436.00000,9.431095,2284.000000,3466.630853,2506.932045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5101,61212.0,78.0,1320.000000,1130.000000,3.507360,2161.714785,14620.660000,1178.533981,5.585183,17169.430000,...,3.230000,0.0,429.471227,0.000000,1100.010000,145.00000,5.042036,2007.000000,10300.810000,78.000000
5102,61224.0,78.0,1320.000000,968.100000,9.068844,0.000000,2585.990000,0.000000,9.272238,1737.390000,...,10.350000,0.0,0.000000,5.000000,895.000000,102.00000,5.413837,1229.000000,2250.070000,20.000000
5103,61236.0,78.0,1323.000000,270.820000,9.618391,0.000000,1742.780000,0.000000,10.213948,769.100000,...,32.030000,0.0,0.000000,4.000000,250.000000,67.00000,4.296799,207.000000,3466.630853,92.000000
5104,61248.0,78.0,1320.000000,1034.770000,0.823126,11866.732524,871.700000,0.000000,6.580039,884.640000,...,6.570000,0.0,2290.858807,5.000000,1023.770000,52.00000,8.275377,731.000000,598.450000,55.000000


In [17]:
x_train = x_sum.drop(index = range(4084,5106))

In [18]:
x_train = x_train.drop(['period'],axis =1)

In [19]:
x_train['feature_25']

0       4200.0
1       9600.0
2       4500.0
3       3300.0
4       9600.0
         ...  
4079    3600.0
4080    9600.0
4081    4800.0
4082    4800.0
4083    9600.0
Name: feature_25, Length: 4084, dtype: float64

In [20]:
x_test = x_sum.drop(index = range(4084)).drop(['id','period'],axis =1)

In [21]:
x_test[['feature_0', 'feature_1']]

Unnamed: 0,feature_0,feature_1
4084,1320.000000,997.160000
4085,1320.634434,1030.281683
4086,1320.000000,793.000000
4087,1320.000000,428.500000
4088,1320.000000,1487.460000
...,...,...
5101,1320.000000,1130.000000
5102,1320.000000,968.100000
5103,1323.000000,270.820000
5104,1320.000000,1034.770000


In [22]:
y_train = y_train.drop('id',axis = 1)

In [23]:
y_train

Unnamed: 0,target
0,0
1,0
2,1
3,0
4,1
...,...
4079,0
4080,0
4081,0
4082,0


In [43]:
x_train = x_train.drop(['feature_41'],axis = 1)

In [121]:
normalized_x_train=(x_train-x_train.mean())/x_train.std()


NameError: name 'x_train' is not defined

In [85]:
normalized_x_test=(x_test-x_test.mean())/x_test.std()

NameError: name 'x_test' is not defined

In [48]:
from sklearn.neighbors import KNeighborsClassifier, SVM

In [49]:
y_train

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,1
3,3,0
4,4,1
...,...,...
4079,4079,0
4080,4080,0
4081,4081,0
4082,4082,0


In [4]:
from sklearn.model_selection import GridSearchCV

In [30]:
first_tree = DecisionTreeClassifier()

In [81]:
tree_params = {'max_depth': range(1, 10), 'max_features':[.77]}

In [82]:
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [83]:
%%time
tree_grid.fit(h_train, y_train.drop('id',axis = 1));

CPU times: user 505 ms, sys: 81.1 ms, total: 586 ms
Wall time: 11.6 s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': range(1, 10), 'max_features': [0.77]})

In [84]:
tree_grid.best_score_, tree_grid.best_params_

(0.7392225862193101, {'max_depth': 2, 'max_features': 0.77})

In [105]:
pred = tree_grid.predict(h_test)

In [108]:
y_test['score'] = pred

In [109]:
y_test

Unnamed: 0,id,score
0,4084,0
1,4085,0
2,4086,0
3,4087,0
4,4088,1
...,...,...
1017,5101,0
1018,5102,0
1019,5103,0
1020,5104,0


In [127]:
normalized_x_train = pd.read_csv('norm_x_train.csv', encoding='latin-1')

In [128]:
h_train.shape

(4084, 5009)

In [129]:
normalized_x_train.shape

(4084, 49)

In [130]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [132]:
X_X = pd.concat([normalized_x_train, h_train], axis=1)

In [147]:
X_X = X_X.drop('Unnamed: 0',axis =1)

In [148]:
X_X

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,4999,5000,5001,5002,5003,5004,5005,5006,5007,5008
0,-0.342535,-0.264098,-0.277732,0.037294,-0.257892,-0.425304,0.002519,0.204511,-0.601818,0.096157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.606794,-0.973294,0.391613,1.580414,-0.257892,-0.730625,2.225250,-0.100530,1.530241,1.670211,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.109876,1.621522,-0.307287,-0.916418,-0.257892,0.960113,0.002519,-1.060104,-0.603483,-0.880276,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.087280,1.512358,-0.042306,-0.187054,-0.066028,0.028271,-0.153304,-0.044648,-0.139136,-0.179046,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.485411,-0.682162,-0.048676,0.104907,-0.257892,1.385205,0.363452,0.939520,-0.366024,0.057409,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4080,-0.474403,1.756625,-0.307301,-0.840832,-0.257892,-0.864568,-0.902572,-0.003028,0.009520,-0.817943,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4081,-0.632486,0.977955,-0.307301,-0.845339,-0.257892,-1.347353,-0.661854,-0.619405,-0.589160,-0.817419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4082,-0.516924,3.033507,-0.303360,8.270739,-0.257892,-1.631779,2.312812,-0.003028,-0.548511,8.482330,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4083,1.276063,-0.888655,5.067660,-0.534380,8.977186,0.714656,-0.533287,0.077283,1.583349,-0.501199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
for column in X_X.columns:
    X_X[column] = X_X[column].fillna(0) 5057

In [162]:
norm_x_test = pd.read_csv('norm_x_test.csv', encoding='latin-1')

In [176]:
norm_x_test

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_39,feature_40,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,-0.021930,0.021067,-0.244995,-0.299317,-0.281097,-1.180057,0.075182,-0.025989,-0.584394,-0.280893,...,-0.182091,0.216359,-0.220498,-0.336391,0.118497,-0.470390,-1.349952,-0.047688,-0.274613,1.361741
1,0.026602,-0.012803,-0.079410,-0.201894,-0.063217,1.320432,-0.201610,-0.170238,-0.180948,-0.192487,...,-0.023429,-0.080374,-0.069174,-0.097029,0.103483,-0.151169,0.266937,-0.169061,-0.196110,-0.154060
2,-0.321076,-0.219909,-0.399842,1.826543,-0.281097,-1.743245,3.625392,1.764583,1.450394,1.895638,...,-0.009949,-0.365173,-0.220597,-0.428060,-0.103610,-0.538115,-1.317630,2.326713,1.818809,-0.614738
3,-0.855160,-0.317617,-0.221742,-0.706918,-0.042286,1.775618,0.096160,-0.373233,1.476875,-0.684651,...,-0.226675,-0.092947,0.037636,-0.336391,-0.881925,0.256759,-0.393209,-0.195469,-0.672422,-0.585299
4,0.696484,-0.885974,-0.006293,-0.429052,-0.281097,0.252177,-0.140157,0.430713,-0.606268,-0.389665,...,-0.226566,-0.353028,-0.091988,-0.214167,0.981106,-0.384843,-0.916900,0.757491,-0.400318,-0.618944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,0.172715,-0.048350,-0.374217,1.766622,-0.261642,-0.339251,2.658794,0.173299,-0.555820,1.837459,...,-0.216256,-0.323054,-0.204592,-0.428060,0.527403,-0.042655,-0.005985,0.400988,1.721460,-0.560065
1018,-0.064510,1.794743,-0.399843,-0.451412,-0.281097,0.645203,-0.641624,0.011917,-0.606732,-0.409402,...,-0.226675,-0.231063,-0.220597,-0.275279,0.142173,-0.195927,0.108414,-0.191660,-0.322121,-0.608850
1019,-1.086201,1.976864,-0.399843,-0.606819,-0.281097,0.896642,-0.848710,-0.947695,-0.411053,-0.660090,...,-0.226675,0.049044,-0.220597,-0.305835,-1.069833,-0.320683,-0.235288,-0.970177,-0.013312,-0.548289
1020,0.033179,-0.937914,-0.259172,-0.767362,-0.281097,-0.073622,-0.824000,-0.530248,-0.587229,-0.740673,...,-0.226675,-0.279901,-0.135222,-0.275279,0.384142,-0.374150,0.988883,-0.571016,-0.741364,-0.579411


In [177]:
h_test.index = range(0,1022)

In [178]:
h_test

feature_50,0,1,2,3,4,5,6,7,8,9,...,4999,5000,5001,5002,5003,5004,5005,5006,5007,5008
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [180]:
XX_X = pd.concat([norm_x_test, h_test], axis=1)

In [181]:
XX_X

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,4999,5000,5001,5002,5003,5004,5005,5006,5007,5008
0,-0.021930,0.021067,-0.244995,-0.299317,-0.281097,-1.180057,0.075182,-0.025989,-0.584394,-0.280893,...,0,0,0,0,0,0,0,0,0,0
1,0.026602,-0.012803,-0.079410,-0.201894,-0.063217,1.320432,-0.201610,-0.170238,-0.180948,-0.192487,...,0,0,0,0,0,0,0,0,0,0
2,-0.321076,-0.219909,-0.399842,1.826543,-0.281097,-1.743245,3.625392,1.764583,1.450394,1.895638,...,0,0,0,0,0,0,0,0,0,0
3,-0.855160,-0.317617,-0.221742,-0.706918,-0.042286,1.775618,0.096160,-0.373233,1.476875,-0.684651,...,0,0,0,0,0,0,0,0,0,0
4,0.696484,-0.885974,-0.006293,-0.429052,-0.281097,0.252177,-0.140157,0.430713,-0.606268,-0.389665,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,0.172715,-0.048350,-0.374217,1.766622,-0.261642,-0.339251,2.658794,0.173299,-0.555820,1.837459,...,0,0,0,0,0,1,0,0,0,0
1018,-0.064510,1.794743,-0.399843,-0.451412,-0.281097,0.645203,-0.641624,0.011917,-0.606732,-0.409402,...,0,0,0,0,0,0,0,0,0,0
1019,-1.086201,1.976864,-0.399843,-0.606819,-0.281097,0.896642,-0.848710,-0.947695,-0.411053,-0.660090,...,0,0,0,0,0,0,0,0,0,0
1020,0.033179,-0.937914,-0.259172,-0.767362,-0.281097,-0.073622,-0.824000,-0.530248,-0.587229,-0.740673,...,0,0,0,0,0,0,0,0,0,0


In [116]:
y_test.to_csv('predic_hash.csv')

In [139]:
param = {'knn__n_neighbors': range(33, 34)}

In [151]:
knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])
grid = GridSearchCV(knn_pipe, param, cv=5, n_jobs=-1)
grid.fit(X_X, y_train.drop('id',axis = 1))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('knn',
                                        KNeighborsClassifier(n_jobs=-1))]),
             n_jobs=-1, param_grid={'knn__n_neighbors': range(33, 34)})

In [152]:
grid.best_score_, grid.best_params_

(0.7367776057791537, {'knn__n_neighbors': 33})

In [182]:
ppppp = grid.predict(XX_X)

In [186]:
ppppp.sum()

1