In [85]:
import string
from pathlib import Path

import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [86]:
# Load the labelled training table and the unlabelled test table; the first
# CSV column becomes the index of each frame.
df, df_t = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("train.csv", "test.csv")
)

In [87]:
# Integer codes for the ordered categorical features.
ord_1_map = {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4}
ord_2_map = {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5}
# Letters are encoded by alphabet position: a/A -> 0, ..., z/Z -> 25.
ord_3_lookup = {letter: rank for rank, letter in enumerate(string.ascii_lowercase)}
ord_4_lookup = {letter: rank for rank, letter in enumerate(string.ascii_uppercase)}

ords = ['ord_1', 'ord_2', 'ord_3', 'ord_4']
ord_encodings = dict(zip(ords, (ord_1_map, ord_2_map, ord_3_lookup, ord_4_lookup)))

# Apply the same encoding to train and test. Values that are absent from a
# lookup (including missing values) come out of `.map` as NaN.
for frame in (df, df_t):
    for column, lookup in ord_encodings.items():
        frame[column] = frame[column].map(lookup)

In [88]:
# Make the encoded ordinal columns numeric, impute missing codes with the
# per-column median and store them as integers.
#
# The medians are computed on the training frame only and then reused for the
# test frame, so both frames go through identical preprocessing. Previously
# only `df` was imputed/cast, leaving `df_t` with float columns containing NaN
# that the model never saw during training.
ord_train_numeric = df[ords].apply(pd.to_numeric, errors='coerce')
ord_medians = ord_train_numeric.median()

df[ords] = ord_train_numeric.fillna(ord_medians).astype("int")
df_t[ords] = (
    df_t[ords]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(ord_medians)  # train statistics, not test statistics
    .astype("int")
)

In [89]:
# Names of every column that is still object-typed in the training frame;
# these are the columns handed to the target encoder.
object_columns = df.select_dtypes(include=['O']).columns
cat_cols = list(object_columns)

In [90]:
# Fit a target encoder on the object-typed training columns against `target`.
# NOTE(review): the encoder is fitted on the full training frame, i.e. also on
# the rows that train_test_split later holds out, so the hold-out score may be
# optimistic -- consider fitting after the split.
enc = TargetEncoder(cols=cat_cols)
enc = enc.fit(df.select_dtypes('O'), df.target)

In [91]:
# Replace the categorical columns of both frames with their target-encoded
# values, using the encoder fitted on the training data.
for frame in (df, df_t):
    frame[cat_cols] = enc.transform(frame.select_dtypes('O'))

In [92]:
# Hold out part of the labelled data for validation (default split size,
# fixed seed for reproducibility).
features = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    df.target,
    random_state=0,
)

In [103]:
%%time
# Hyper-parameters for the classifier; these are the same values as the best
# parameters printed by the randomized search further down in the notebook.
estimated_params = dict(
    subsample=0.8,
    min_child_weight=5,
    max_depth=5,
    gamma=2,
    colsample_bytree=0.8,
)
# Train on the GPU histogram tree method with a fixed seed.
xgb = XGBClassifier(
    tree_method='gpu_hist',
    n_jobs=-1,
    random_state=0,
    **estimated_params,
)
xgb.fit(X_train, y_train)

CPU times: user 962 ms, sys: 516 ms, total: 1.48 s
Wall time: 1.48 s


In [104]:
# Score the held-out rows and report ROC AUC on the second probability column
# (the positive class).
preds = xgb.predict_proba(X_test)
holdout_auc = roc_auc_score(y_test, preds[:, 1])
print(holdout_auc)

0.7923426210112559


In [95]:
# Candidate values for the XGBoost hyper-parameter search.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [96]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [100]:
# %%time
# clf = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
#                     silent=True, nthread=1,tree_method='gpu_hist')

# random_search = RandomizedSearchCV(clf, 
#                                    param_distributions=params, 
#                                    n_iter=5, 
#                                    scoring='roc_auc', 
#                                    n_jobs=1, cv=3, verbose=3, random_state=0 )
# random_search.fit(X_train, y_train)

In [102]:
# Report and persist the winning configuration of the randomized search.
# `random_search` only exists if the (currently commented-out) search cell has
# been executed in this kernel, so guard against a NameError on a fresh
# "Restart & Run All" instead of crashing the notebook.
if "random_search" in globals():
    print('Best score reached: {} with params: {} '.format(random_search.best_score_, random_search.best_params_))
    # Keep a copy of the best parameters on disk for later runs.
    with open("xgb_target.txt", "w") as f:
        f.write(f"{random_search.best_params_}")
else:
    print("random_search is not defined; run the hyper-parameter search cell first.")

Best score reached: 0.7940464866063234 with params: {'subsample': 0.8, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 2, 'colsample_bytree': 0.8} 


In [None]:
# Best hyper-parameters reported by the randomized search, kept here for
# reference; this cell only displays the dict and assigns nothing.
{'subsample': 0.8, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 2, 'colsample_bytree': 0.8}

In [105]:
# Predict on the test frame and put the positive-class probability into the
# `target` column of the sample submission.
# NOTE(review): values are assigned positionally, which presumably relies on
# the sample submission rows being in the same order as test.csv -- confirm.
preds = xgb.predict_proba(df_t)
sub = pd.read_csv("sample_submission.csv").assign(target=preds[:, 1])
sub.head()

Unnamed: 0,id,target
0,600000,0.075984
1,600001,0.264474
2,600002,0.204797
3,600003,0.127666
4,600004,0.081376


In [106]:
# Write the submission file. DataFrame.to_csv does not create missing parent
# directories, so make sure the output folder exists first.
submission_path = Path("preds/xgb.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(submission_path, index=False)

In [11]:
# Reload the raw training table, using the first CSV column as the index.
# NOTE(review): this rebinds `df`, discarding the encoded frame built in the
# earlier cells, and its execution count (11) is out of sequence with them.
df = pd.read_csv("train.csv",index_col=0)
