In [35]:
import string
from pathlib import Path

import numpy as np
import pandas as pd

from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [36]:
# Load both CSV files, using the first column of each file as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in ("train.csv", "test.csv")
)

In [None]:
# Replace the ordered string categories ord_1..ord_4 with integer ranks.
# Requires `import string` in the notebook's import cell.
ord_1_map = {"Novice": 0, "Contributor": 1, "Expert": 2, "Master": 3, "Grandmaster": 4}
ord_2_map = {"Freezing": 0, "Cold": 1, "Warm": 2, "Hot": 3, "Boiling Hot": 4, "Lava Hot": 5}
# ord_3 is looked up against lowercase letters (a=0 ... z=25),
# ord_4 against uppercase letters (A=0 ... Z=25).
ord_3_lookup = {letter: rank for rank, letter in enumerate(string.ascii_lowercase)}
ord_4_lookup = {letter: rank for rank, letter in enumerate(string.ascii_uppercase)}

ordinal_maps = {
    "ord_1": ord_1_map,
    "ord_2": ord_2_map,
    "ord_3": ord_3_lookup,
    "ord_4": ord_4_lookup,
}

# Apply the same mappings to both frames so train and test stay consistent.
# Series.map yields NaN for any value that is not a key of the mapping
# (missing values included); those NaNs are handled by the next cell.
for frame in (df_train, df_test):
    for col, mapping in ordinal_maps.items():
        frame[col] = frame[col].map(mapping)

In [37]:
# Missing values are handled differently per frame:
#   * train: every row containing at least one NaN is removed;
#   * test:  rows are kept and each NaN is replaced by 0.
# Both operations modify the existing frames in place.
df_train.dropna(axis=0, how="any", inplace=True)
df_test.fillna(value=0, inplace=True)

In [38]:
# Downcast every float64 column of the training frame to int16.
# Only df_train is converted here; df_test keeps its dtypes.
# An integer cast cannot represent NaN, so this must run after the rows
# with missing values have been dropped from df_train.
float_cols = list(df_train.select_dtypes(include=["float64"]).columns)
df_train[float_cols] = df_train[float_cols].astype(
    {col: "int16" for col in float_cols}
)

In [39]:
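# Disabled alternative: impute missing values in both frames with the
# training-set medians. `median` has to be called; the earlier draft passed
# the method object `df_train.median` itself.
# df_train.fillna(df_train.median(), inplace=True)
# df_test.fillna(df_train.median(), inplace=True)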

In [40]:
%%time
cat_cols = df_train.select_dtypes('O').columns.tolist()
encoder = TargetEncoder(cols = cat_cols,smoothing=0.2)
encoder.fit(df_train[cat_cols],df_train.target)
df_train[cat_cols] = encoder.transform(df_train[cat_cols])
df_test[cat_cols] = encoder.transform(df_test[cat_cols])


CPU times: user 6 s, sys: 962 ms, total: 6.97 s
Wall time: 6.97 s


In [41]:
# Preview the first five rows of the encoded training frame.
df_train.head(n=5)

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0.18753,0.180621,0.182689,0.223915,0.168329,0.219342,0.208428,...,0.219048,3,0.173659,0.205994,0.110363,0.222031,0.146994,6,3,0
6,0,0,0,0.18753,0.180621,0.182689,0.158584,0.168329,0.202258,0.208428,...,0.208,1,0.233281,0.15298,0.110363,0.205705,0.241023,5,6,0
7,0,0,1,0.188656,0.180621,0.182689,0.158584,0.20125,0.180852,0.208428,...,0.211111,3,0.190944,0.15298,0.105846,0.24034,0.146341,1,1,0
10,0,0,1,0.188656,0.196433,0.198048,0.158584,0.168329,0.180852,0.208428,...,0.150235,1,0.156047,0.205994,0.105846,0.194544,0.195021,7,5,1
11,0,0,1,0.18753,0.196433,0.182689,0.159744,0.168329,0.202258,0.182923,...,0.125,1,0.173659,0.205994,0.105846,0.183441,0.281621,2,8,0


In [42]:
# Hold out part of the labelled data for validation (train_test_split's
# default split size is used). Note that X_test / y_test are this hold-out
# slice of df_train, not the unlabelled df_test frame.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=["target"]),
    df_train["target"],
    random_state=0,
)

In [43]:
%%time
rf = RandomForestClassifier(n_estimators=500,max_features=0.5,n_jobs=-1,min_samples_leaf=25)
rf.fit(X_train,y_train)

CPU times: user 9min 31s, sys: 346 ms, total: 9min 31s
Wall time: 1min 20s


In [44]:
# ROC AUC on the hold-out slice. `roc_auc_score` is imported in the
# notebook's import cell rather than here, so all dependencies are visible
# in one place.
# predict_proba returns one column per class; column 1 is the probability
# of the second class in rf.classes_ (target == 1 for a 0/1 target).
preds = rf.predict_proba(X_test)
print(roc_auc_score(y_test, preds[:, 1]))

0.7855718486110482


In [45]:
# Build the submission from the provided template.
sub = pd.read_csv("sample_submission.csv")

# Predictions are written into `sub` by position, so the template rows must
# be in the same order as df_test. Fail loudly instead of silently writing
# misaligned predictions.
# Assumes test.csv's first column (used as the index) is the row id, as in
# train.csv -- TODO confirm.
assert np.array_equal(sub["id"].to_numpy(), df_test.index.to_numpy()), (
    "sample_submission.csv ids do not line up with the rows of df_test"
)

# Select the features by name, in the exact order the model was trained on,
# so a different column order in test.csv cannot shift features.
preds = rf.predict_proba(df_test[X_train.columns])
sub["target"] = preds[:, 1]  # probability of the second class (target == 1)
sub.head()

Unnamed: 0,id,target
0,600000,0.181856
1,600001,0.198051
2,600002,0.096411
3,600003,0.182646
4,600004,0.086612


In [46]:
# Write the submission file. Create the output directory first so a fresh
# checkout (without an existing "preds" folder) does not fail on to_csv.
# Requires `from pathlib import Path` in the notebook's import cell.
out_path = Path("preds/rf.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(out_path, index=False)