In [1]:
import pandas as pd
import numpy as np
import re, matplotlib
import category_encoders as ce
from xgboost import XGBClassifier,plot_tree,plot_importance
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
from matplotlib import pyplot
%matplotlib inline



# <a id="3"></a> 
# 2. Reading Data
[TOC](#0)

In [3]:
# Load the competition training table. The path points at the Kaggle input mount,
# so this cell runs unchanged only where that directory exists.
df = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0


In [4]:
# Remove the row identifier so it is not used as a feature.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after "Id" is already gone no longer raises KeyError.
df = df.drop(columns=['Id'], errors='ignore')

In [5]:
# Names of the object-dtype (text-valued) columns, as a plain Python list.
categorical_columns = list(df.select_dtypes(include=['object']).columns)

In [10]:
# Every column label of the training frame, in frame order.
column_names = df.columns.tolist()

column: BQ --> 60
column: CB --> 2
column: CC --> 3
column: DU --> 1
column: EL --> 60
column: FC --> 1
column: FL --> 1
column: FS --> 2
column: GL --> 1


In [11]:
def fill_nan(column_names, frame=None):
    """Replace missing values with the column mean, column by column.

    Parameters
    ----------
    column_names : iterable of str
        Labels of the columns to inspect. Columns without missing values are
        left untouched.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used, which
        is the behaviour existing callers rely on.

    Returns
    -------
    None
        The frame is modified in place (columns are reassigned on it).

    Notes
    -----
    The mean is only computed for columns that actually contain missing
    values; such columns are assumed to be numeric -- TODO confirm for
    object-dtype columns.
    """
    target = df if frame is None else frame
    for column in column_names:
        if target[column].isnull().any():
            # Assign the filled column back instead of calling
            # fillna(..., inplace=True) on the `target[column]` selection:
            # that chained form warns on pandas 2.x and does nothing at all
            # once Copy-on-Write is enabled.
            target[column] = target[column].fillna(target[column].mean())

In [12]:
# Mean-impute every column of df that has missing values.
# NOTE(review): this runs on the full training frame before the train/test split
# further down, so rows that later land in the hold-out set contribute to the means.
fill_nan(column_names)

In [14]:
# Feature matrix: every column of df except the last one.
X = df.iloc[:, :-1]

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978


# <a id="7"></a> 
# 3. Target Variable (y)
[TOC](#0)

In [15]:
# Target vector: the last column of df (the series is named "Class" in the output below).
y = df.iloc[:,-1]

0      1
1      0
2      0
3      0
4      1
      ..
612    0
613    0
614    0
615    0
616    0
Name: Class, Length: 617, dtype: int64

In [17]:
# One-hot encode the "EJ" column. The fitted encoder is kept in `enc` so the
# same mapping can be applied to other frames later.
enc = ce.OneHotEncoder(cols=["EJ"])
encoded_x = enc.fit_transform(X)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978


# <a id="9"></a> 
# 5. Split Data into Train/Test
[TOC](#0)

In [18]:
# NOTE(review): `estimate` is not used in any visible cell -- confirm whether it is still needed.
estimate = 4.478

# Hold out 20% of the encoded rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_x,
    y,
    random_state=0,
    test_size=0.2,
)

# Evaluation pairs reported during fitting: training data first, hold-out second.
ec1_eval_set = [(X_train, y_train), (X_test, y_test)]

In [21]:
# Gradient-boosted classifier with shallow trees.
model = XGBClassifier(
    # boosting schedule and tree size
    n_estimators=100,
    learning_rate=0.3,
    max_depth=2,
    # row / column sampling
    subsample=1.0,
    colsample_bytree=1.0,
    colsample_bylevel=0.6,
    # extra weight on the positive class
    scale_pos_weight=5,
)

In [22]:
# Early stopping and the evaluation metrics are configured on the estimator
# instead of being passed to fit(): those fit() keywords were deprecated in
# XGBoost 1.6 and removed in 2.0, where passing them raises TypeError.
# (Requires XGBoost >= 1.6.)
model.set_params(early_stopping_rounds=10, eval_metric=["error", "logloss"])

# NOTE(review): the hold-out split is part of the eval set that drives early
# stopping, so scores on it are optimistic.
model.fit(X_train, y_train, eval_set=ec1_eval_set, verbose=True)

# Restore the estimator's constructor-level settings. Clones made later
# (e.g. by cross_val_score) are fitted without an eval set, and XGBoost refuses
# to train with early stopping enabled but no validation data. The fitted
# booster is not affected by this reset.
model.set_params(early_stopping_rounds=None, eval_metric=None)

[0]	validation_0-error:0.29412	validation_0-logloss:0.58808	validation_1-error:0.44355	validation_1-logloss:0.64736
[1]	validation_0-error:0.12373	validation_0-logloss:0.49848	validation_1-error:0.24194	validation_1-logloss:0.58199
[2]	validation_0-error:0.13185	validation_0-logloss:0.44767	validation_1-error:0.24194	validation_1-logloss:0.53391
[3]	validation_0-error:0.12576	validation_0-logloss:0.39804	validation_1-error:0.23387	validation_1-logloss:0.49501
[4]	validation_0-error:0.09736	validation_0-logloss:0.35618	validation_1-error:0.19355	validation_1-logloss:0.45094
[5]	validation_0-error:0.07708	validation_0-logloss:0.32159	validation_1-error:0.14516	validation_1-logloss:0.42632
[6]	validation_0-error:0.07099	validation_0-logloss:0.30280	validation_1-error:0.13710	validation_1-logloss:0.40379
[7]	validation_0-error:0.05882	validation_0-logloss:0.28120	validation_1-error:0.11290	validation_1-logloss:0.38709
[8]	validation_0-error:0.05477	validation_0-logloss:0.26229	validation_1



# <a id="13"></a> 
# 7. KFold Accuracy
[TOC](#0)

In [23]:
# 10-fold cross-validated accuracy of the configured model on the full encoded data.
# NOTE(review): plain KFold without shuffling does not preserve the class ratio
# per fold; StratifiedKFold is already imported if that is preferred.
Kfold = KFold(n_splits=10)
results_K = cross_val_score(model, encoded_x, y, cv=Kfold)

# Mean accuracy followed by its standard deviation across folds, both in percent.
# The format string previously had an unbalanced closing parenthesis.
print("KFold - Accuracy: {0:.2f}% ({1:.2f}%)".format(results_K.mean() * 100, results_K.std() * 100))

KFold - Accuracy: 93.68588048651509% (2.6552857153256966)%)


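With the new parameters, 10-fold cross-validation gives about 94% accuracy (standard deviation of roughly 2.7 percentage points). Next, let's see how the model does on the test data.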

<div class="alert alert-block alert-warning">
<b>We can encode the whole dataframe without splitting into X and y</b>
</div>

In [25]:
# One-hot encode "EJ" on the whole frame (features and the trailing target
# column together) using a separate encoder instance from `enc`.
df_enc = ce.OneHotEncoder(cols=["EJ"])
encoded_df = df_enc.fit_transform(df)
# Pairwise correlations between all encoded columns, target included.
corr = encoded_df.corr()

In [27]:
# Load the competition test table from the Kaggle input mount.
test_df = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Feature-only copy of the test table; test_df itself keeps "Id" for the submission.
new_test_df = test_df.drop(columns=['Id'])

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's encode the EJ column as well.

In [30]:
# Apply the encoder that was fitted on the training features (`enc`) rather than
# fitting a new one, so "EJ" is expanded with the same mapping the model was trained on.
encoded_test_df = enc.transform(new_test_df)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# First column of the predicted probabilities for the test rows.
# Presumably this is P(Class == 0), following model.classes_ order -- confirm.
class_0_pred = model.predict_proba(encoded_test_df)[:,0]

In [32]:
# Second column of the predicted probabilities for the test rows.
# Presumably this is P(Class == 1), following model.classes_ order -- confirm.
# Note that predict_proba is evaluated again here rather than reusing an earlier result.
class_1_pred = model.predict_proba(encoded_test_df)[:,1]

In [33]:
# Attach the two probability arrays to the test table, row for row, as new columns.
test_df["class_0"] = class_0_pred
test_df["class_1"] = class_1_pred

In [34]:
# Keep only the identifier and the two probability columns, in submission order.
submission_df = test_df.loc[:, ["Id", "class_0", "class_1"]]
submission_df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.321698,0.678302
1,010ebe33f668,0.321698,0.678302
2,02fa521e1838,0.321698,0.678302
3,040e15f562a2,0.321698,0.678302
4,046e85c7cc7f,0.321698,0.678302


In [35]:
# Write the submission file to the Kaggle working directory, leaving out the DataFrame index.
submission_df.to_csv("/kaggle/working/submission.csv", index=False)