In [10]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import model_selection
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import auc, confusion_matrix, accuracy_score, recall_score, ConfusionMatrixDisplay, roc_auc_score, precision_recall_curve
import xgboost as xgb
import lightgbm as lgb
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [11]:
# Load the train and test tables, both indexed by their 'Id' column.
train, test = (
    pd.read_csv(csv_path, index_col = 'Id')
    for csv_path in (
        '../input/icr-identify-age-related-conditions/train.csv',
        '../input/icr-identify-age-related-conditions/test.csv',
    )
)
# The sample submission keeps pandas' default integer index ('Id' stays a column).
sample_submission = pd.read_csv(
    '../input/icr-identify-age-related-conditions/sample_submission.csv'
)

In [12]:
# Preview the first five training rows (5 is the default for head()).
train.head(n=5)

Unnamed: 0_level_0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


#### The only non-numeric column is the binary categorical column `EJ`, so we encode it as 0/1 (A → 0, B → 1) in both the train and test sets:

In [13]:
# Encode the binary categorical column EJ as 0/1 in both frames.
#
# `replace` is used (rather than `map`) because it leaves values that are
# already 0/1 untouched, so re-running this cell does not corrupt the column.
# `pd.to_numeric` then makes the numeric dtype explicit instead of relying on
# `replace` silently downcasting the object column, a behaviour that recent
# pandas versions deprecate. With the default errors='raise', any value that
# is neither mapped nor numeric raises a ValueError here.
EJ_CODES = {'A': 0, 'B': 1}
for frame in (train, test):
    frame["EJ"] = pd.to_numeric(frame["EJ"].replace(EJ_CODES))

# Feature matrix (every column except the target) and the target vector.
X, y = train.drop(columns = 'Class'), train["Class"]

In [14]:
# Share of each target class in the training data (fractions, not counts).
train.loc[:, "Class"].value_counts(normalize=True)

0    0.824959
1    0.175041
Name: Class, dtype: float64

#### Since the ratio of 0s to 1s in the `Class` column is about 4.71 : 1 (0.825 / 0.175), we weight the positive class (1s) 4.71 times more heavily so that both classes carry equal total weight during training:

In [15]:
# Two boosted-tree classifiers are trained on the full training set and their
# predicted probabilities are averaged.
#
# The inspiration for using these two models together is that each of them
# performed decently by itself, but the sets of incorrect CV answers produced
# by each of them were very different, suggesting that they could potentially
# compensate for each other's mistakes.
#
# The hyperparameters of these two models were determined by a grid search.

# Select the test columns by name, in the exact order of the training
# features, so prediction does not depend on the column order of test.csv
# (and fails loudly with a KeyError if a training feature is missing).
feature_columns = list(X.columns)
test_features = test[feature_columns]

# scale_pos_weight = 4.71 is the negative:positive class ratio of the
# training target, up-weighting the minority (positive) class.
clf1 = xgb.XGBClassifier(max_depth = 3, n_estimators = 140,
                         random_state = 737, subsample = 0.75,
                         learning_rate = 0.1, scale_pos_weight = 4.71)
clf1.fit(X, y)
predictions1 = pd.DataFrame(clf1.predict_proba(test_features))

# class_weight = 'balanced' handles the same class imbalance on the LightGBM side.
clf2 = lgb.LGBMClassifier(max_depth = 2, n_estimators = 110,
                          random_state = 737, subsample = 1.0,
                          num_leaves = 10, class_weight = 'balanced')
clf2.fit(X, y)
predictions2 = pd.DataFrame(clf2.predict_proba(test_features))


# Averaging the predictions: both frames share the default integer index and
# the integer column labels 0 and 1, so this is an element-wise mean.
blend = (predictions1 + predictions2) / 2


In [16]:
# Creating submission

# Build the submission directly from the test index and the blended
# probabilities. Values are taken positionally (`to_numpy`) so the result
# does not depend on index alignment between `sample_submission`, `test` and
# `blend`; a length mismatch would otherwise silently yield NaN or dropped rows.
assert len(blend) == len(test), "expected one blended prediction row per test sample"

sample_submission = pd.DataFrame({
    'Id': test.index,
    "class_0": blend[0].to_numpy(),
    "class_1": blend[1].to_numpy(),
})
print(sample_submission)
# 'Id' becomes the index so it is written as the first CSV column.
sample_submission.set_index('Id').to_csv('submission.csv')

             Id   class_0   class_1
0  00eed32682bb  0.761501  0.238499
1  010ebe33f668  0.761501  0.238499
2  02fa521e1838  0.761501  0.238499
3  040e15f562a2  0.761501  0.238499
4  046e85c7cc7f  0.761501  0.238499


In [17]:
! head 'submission.csv'

Id,class_0,class_1
00eed32682bb,0.7615009769677634,0.23849903048281723
010ebe33f668,0.7615009769677634,0.23849903048281723
02fa521e1838,0.7615009769677634,0.23849903048281723
040e15f562a2,0.7615009769677634,0.23849903048281723
046e85c7cc7f,0.7615009769677634,0.23849903048281723
