Import Libararies 

# Import Libraries

In [1]:
#Installation of catboot libraries
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [2]:
import numpy as np
import pandas as pd
# pd.set_option('max_colwidth', 500)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import auc, classification_report, roc_auc_score

import lightgbm as lgb
import xgboost as xgb

# Data

Loading Data

In [3]:
trainData = pd.read_csv("/content/drive/MyDrive/Train.csv")
testData = pd.read_csv("/content/drive/MyDrive/Test.csv")

Data Description

In [None]:
#Checking NullValues & Data type of Features
trainData.info()
testData.info()
trainData.describe()
testData.describe()

In [None]:
testData.columns

**Visualizing and Understanding the Data**

In [None]:
#Target Histogram
sns.set()
sns.histplot(trainData['target'])

# ImBalanced Data

In [None]:
# Relation between Features to see if I should drop any feature
feat_re = list(trainData.select_dtypes(include=['int64', 'float64', 'int32']).columns)
trainData[feat_re].hist(figsize=(20,15));

In [None]:
feat_re = list(testData.select_dtypes(include=['int64', 'float64', 'int32']).columns)
testData[feat_re].hist(figsize=(20,15));

In [None]:
# Visualize the Relations between features

attri = ['population', 'Q9']
sns.pairplot(trainData.reset_index(drop=True)[attri], diag_kind='kde')
plt.gcf().set_size_inches(15, 10)

# Data Preprocessing

Turns out that the region column is considered to have numerical values in the train data while been strings in the test data, so that i used "astype(str)" in Label Encoder for the region for test data

In [4]:
# Encoding gategorical data [country, region] using LabelEncoder
# 1st Country for the train and test dataframes
label_Encode = LabelEncoder()
trainData['countryLabel'] = label_Encode.fit_transform(trainData['country'])

testData['countryLabel'] = label_Encode.transform(testData['country'])

# 2nd region for the train and test datraframes
label_Encode2 = LabelEncoder()
trainData['regionLabel'] = label_Encode2.fit_transform(trainData['region'].astype(str))

testData['regionLabel'] = label_Encode2.transform(testData['region'].astype(str))

# Modeling And Tuning

Setting Features for Models

In [5]:
drop_features = ['ID', 'country', 'region','target']
features = [feat for feat in trainData.columns if feat not in drop_features]
categ_features = ['countryLabel','regionLabel']
x,y   = trainData[features], trainData['target']

**Note** that I decided to use **CatBoost** and **Lightgbm**
models since they handle missing values by default

**CatBoost** Validation

In [6]:
score = 0
testData['target'] = 0
Skfold = StratifiedKFold(n_splits=5,shuffle=True, random_state=195024)
catOOF = np.zeros((trainData.shape[0],))
cat_predictions= []
for loop, (idx, vidx) in enumerate(Skfold.split(x, trainData.country)):
    print('Fold:',loop+1)
        
    # Creating lgb train/valid data
    x_train, y_train = x.iloc[idx,:], y[idx] 
    x_test, y_test = x.iloc[vidx,:], y[vidx] 
       
    estimator = CatBoostClassifier(learning_rate=0.05,
                                task_type="GPU",
                                devices='0:1',
                                iterations=10000,eval_metric='AUC',
                              use_best_model =True,
                              verbose=100,
                             random_seed= 0)
    estimator.fit(Pool(x_train,y_train,cat_features = categ_features),
                  eval_set = Pool(x_test,y_test,cat_features = categ_features),early_stopping_rounds=200)
    
    y_pval = estimator.predict_proba(x_test)[:,1]
    catOOF[vidx] = y_pval
    score = score + roc_auc_score(y_test, y_pval)
    y_pred_test = estimator.predict_proba(testData[features])[:,1]

    cat_predictions.append(y_pred_test)
print(f'Score: {roc_auc_score(y, catOOF)}')

Fold: 1


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7483366	best: 0.7483366 (0)	total: 29.3ms	remaining: 4m 52s
100:	test: 0.7891449	best: 0.7891449 (100)	total: 1.99s	remaining: 3m 14s
200:	test: 0.7946036	best: 0.7946036 (200)	total: 3.81s	remaining: 3m 5s
300:	test: 0.7967973	best: 0.7967981 (299)	total: 6.39s	remaining: 3m 25s
400:	test: 0.7979766	best: 0.7979872 (396)	total: 10.4s	remaining: 4m 8s
500:	test: 0.7988911	best: 0.7989006 (498)	total: 12.2s	remaining: 3m 50s
600:	test: 0.7993228	best: 0.7993228 (600)	total: 14s	remaining: 3m 39s
700:	test: 0.7999439	best: 0.7999439 (700)	total: 15.8s	remaining: 3m 30s
800:	test: 0.8001121	best: 0.8001305 (782)	total: 17.7s	remaining: 3m 23s
900:	test: 0.8004383	best: 0.8004383 (900)	total: 24.2s	remaining: 4m 4s
1000:	test: 0.8005434	best: 0.8005569 (988)	total: 26.7s	remaining: 3m 59s
1100:	test: 0.8006924	best: 0.8006983 (1086)	total: 28.6s	remaining: 3m 50s
1200:	test: 0.8008091	best: 0.8008091 (1200)	total: 30.3s	remaining: 3m 42s
1300:	test: 0.8008456	best: 0.8008855 (12

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7433428	best: 0.7433428 (0)	total: 43.5ms	remaining: 7m 14s
100:	test: 0.7864985	best: 0.7864985 (100)	total: 3.43s	remaining: 5m 36s
200:	test: 0.7915487	best: 0.7915487 (200)	total: 5.21s	remaining: 4m 14s
300:	test: 0.7938608	best: 0.7938608 (300)	total: 7.04s	remaining: 3m 46s
400:	test: 0.7954642	best: 0.7954642 (400)	total: 8.9s	remaining: 3m 33s
500:	test: 0.7963537	best: 0.7963608 (496)	total: 10.7s	remaining: 3m 23s
600:	test: 0.7968423	best: 0.7968423 (600)	total: 13.8s	remaining: 3m 35s
700:	test: 0.7972618	best: 0.7972618 (700)	total: 19.6s	remaining: 4m 20s
800:	test: 0.7977447	best: 0.7977448 (799)	total: 21.5s	remaining: 4m 6s
900:	test: 0.7981596	best: 0.7981596 (900)	total: 23.3s	remaining: 3m 55s
1000:	test: 0.7983208	best: 0.7983208 (1000)	total: 25.1s	remaining: 3m 46s
1100:	test: 0.7983911	best: 0.7984069 (1050)	total: 27s	remaining: 3m 38s
1200:	test: 0.7986981	best: 0.7987061 (1198)	total: 31.7s	remaining: 3m 52s
1300:	test: 0.7989664	best: 0.7989759 (

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7502691	best: 0.7502691 (0)	total: 18.2ms	remaining: 3m 2s
100:	test: 0.7916954	best: 0.7916954 (100)	total: 1.91s	remaining: 3m 6s
200:	test: 0.7962402	best: 0.7962493 (198)	total: 3.74s	remaining: 3m 2s
300:	test: 0.7982052	best: 0.7982052 (300)	total: 8.45s	remaining: 4m 32s
400:	test: 0.7993231	best: 0.7993231 (400)	total: 12.6s	remaining: 5m 2s
500:	test: 0.8001410	best: 0.8001410 (500)	total: 15.3s	remaining: 4m 49s
600:	test: 0.8005633	best: 0.8005781 (599)	total: 17s	remaining: 4m 26s
700:	test: 0.8009736	best: 0.8009736 (700)	total: 20.8s	remaining: 4m 36s
800:	test: 0.8013784	best: 0.8013784 (800)	total: 23.8s	remaining: 4m 32s
900:	test: 0.8014784	best: 0.8014784 (900)	total: 25.6s	remaining: 4m 18s
1000:	test: 0.8015923	best: 0.8016190 (990)	total: 27.4s	remaining: 4m 6s
1100:	test: 0.8018444	best: 0.8018444 (1100)	total: 29.2s	remaining: 3m 56s
1200:	test: 0.8020352	best: 0.8020352 (1200)	total: 31.1s	remaining: 3m 47s
1300:	test: 0.8021413	best: 0.8021413 (1300

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7495573	best: 0.7495573 (0)	total: 18.1ms	remaining: 3m
100:	test: 0.7916778	best: 0.7916778 (100)	total: 1.95s	remaining: 3m 11s
200:	test: 0.7959687	best: 0.7959687 (200)	total: 7.73s	remaining: 6m 16s
300:	test: 0.7983443	best: 0.7983443 (300)	total: 10.9s	remaining: 5m 51s
400:	test: 0.7996026	best: 0.7996026 (400)	total: 12.7s	remaining: 5m 4s
500:	test: 0.8003146	best: 0.8003217 (496)	total: 14.5s	remaining: 4m 35s
600:	test: 0.8009000	best: 0.8009000 (600)	total: 16.4s	remaining: 4m 15s
700:	test: 0.8012978	best: 0.8013133 (698)	total: 18.2s	remaining: 4m 1s
800:	test: 0.8017952	best: 0.8018018 (794)	total: 21.5s	remaining: 4m 7s
900:	test: 0.8019845	best: 0.8019856 (898)	total: 24.9s	remaining: 4m 11s
1000:	test: 0.8021215	best: 0.8021215 (1000)	total: 26.7s	remaining: 4m
1100:	test: 0.8022086	best: 0.8022106 (1029)	total: 28.6s	remaining: 3m 50s
1200:	test: 0.8023459	best: 0.8023459 (1200)	total: 30.4s	remaining: 3m 42s
1300:	test: 0.8025490	best: 0.8025741 (1298)	t

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7500506	best: 0.7500506 (0)	total: 110ms	remaining: 18m 15s
100:	test: 0.7931401	best: 0.7931401 (100)	total: 3.39s	remaining: 5m 32s
200:	test: 0.7988527	best: 0.7988527 (200)	total: 5.21s	remaining: 4m 13s
300:	test: 0.8014819	best: 0.8014819 (300)	total: 6.96s	remaining: 3m 44s
400:	test: 0.8030519	best: 0.8030519 (400)	total: 8.77s	remaining: 3m 29s
500:	test: 0.8041289	best: 0.8041399 (499)	total: 10.6s	remaining: 3m 20s
600:	test: 0.8046815	best: 0.8046824 (599)	total: 14.5s	remaining: 3m 46s
700:	test: 0.8050360	best: 0.8050360 (700)	total: 17.1s	remaining: 3m 47s
800:	test: 0.8052927	best: 0.8052927 (800)	total: 18.9s	remaining: 3m 37s
900:	test: 0.8056577	best: 0.8056715 (899)	total: 20.7s	remaining: 3m 29s
1000:	test: 0.8058572	best: 0.8058572 (999)	total: 22.6s	remaining: 3m 23s
1100:	test: 0.8060140	best: 0.8060277 (1082)	total: 24.4s	remaining: 3m 17s
1200:	test: 0.8060978	best: 0.8061405 (1157)	total: 29.1s	remaining: 3m 33s
1300:	test: 0.8061673	best: 0.806187

In [7]:
catboostpred = np.mean(cat_predictions,axis=0)

**lightGBM** Validation

In [8]:
#Tuning Parameters
lgb_params = {'boosting_type': 'gbdt','objective': 'binary','metric': 'auc',
              'n_estimators': 500,'early_stopping_rounds': 100,'sub_sample' : 0.7,
              'colsample_bytree' : 0.6,'seed': 19,'silent':False
            }

In [9]:
score = 0
testData['target'] = 0
Skfold = StratifiedKFold(n_splits=5,shuffle=True, random_state=19)
lgbOOF = np.zeros((trainData.shape[0],))
lgb_preds = []
for loop, (idx, vidx) in enumerate(Skfold.split(x, trainData.country)):
    print('Fold:',loop+1)
        
    trx, trY = x.iloc[idx,:], y[idx] 
    vlx, vly = x.iloc[vidx,:], y[vidx] 
        
    train_data = lgb.Dataset(trx, label=trY,categorical_feature=categ_features)
    valid_data = lgb.Dataset(vlx, label=vly,categorical_feature=categ_features)
    
    estimator = lgb.train(
                          lgb_params,
                          train_data,
                          valid_sets = [train_data,valid_data],
                          verbose_eval = 100,
                        )

    y_pred_val = estimator.predict(vlx,num_iteration=estimator.best_iteration)
    lgbOOF[vidx] = y_pred_val
    score = score + roc_auc_score(vly, y_pred_val)
    y_pred_test = estimator.predict(testData[features],num_iteration=estimator.best_iteration)
    lgb_preds.append(y_pred_test)

print(f'Score : {roc_auc_score(y, lgbOOF)}',)

Fold: 1


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.825269	valid_1's auc: 0.799104
[200]	training's auc: 0.839055	valid_1's auc: 0.799764
[300]	training's auc: 0.848961	valid_1's auc: 0.799272
Early stopping, best iteration is:
[238]	training's auc: 0.842781	valid_1's auc: 0.800124
Fold: 2


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.824337	valid_1's auc: 0.807446
[200]	training's auc: 0.838049	valid_1's auc: 0.808424
[300]	training's auc: 0.849121	valid_1's auc: 0.80808
Early stopping, best iteration is:
[218]	training's auc: 0.839885	valid_1's auc: 0.808457
Fold: 3


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.825113	valid_1's auc: 0.803023
[200]	training's auc: 0.838989	valid_1's auc: 0.803369
Early stopping, best iteration is:
[196]	training's auc: 0.838559	valid_1's auc: 0.803437
Fold: 4


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.824215	valid_1's auc: 0.804654
[200]	training's auc: 0.837535	valid_1's auc: 0.805165
Early stopping, best iteration is:
[166]	training's auc: 0.833951	valid_1's auc: 0.805339
Fold: 5


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.825473	valid_1's auc: 0.799403
[200]	training's auc: 0.839297	valid_1's auc: 0.799748
Early stopping, best iteration is:
[149]	training's auc: 0.833045	valid_1's auc: 0.800032
Score : 0.8034417209147834


In [10]:
lightgbm_preds = np.mean(lgb_preds,axis=0)

# Ensembling

In [15]:
Ensemble_score = catOOF*0.55 + lgbOOF*0.45
roc_auc_score(y,Ensemble_score )
#  N=3 >>> 0.8040931624186454 Lightgbm & CatBoost
#  N=5 >>> 0.8046957301409114 Lightgbm & CatBoost

0.8049145750587461

# Submission

In [13]:
testData['target'] = lightgbm_preds #*0.45 + catboostpred*0.55
submission = testData[['ID', 'target']]
submission.to_csv('do-not-over-fit-FMSI-ensemble_fi.csv',index = False)

**Note**:  During the competition i forget to ensemble the predictions

References:

[StratifiedKFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html)

[Speeding up the training](https://catboost.ai/en/docs/concepts/speed-up-training) "CatBoost"

[CatBoostClassifier Parameters](https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier)

[Lightgbm Tuning](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)

[XGBoost hyperparameters Tuning](https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook)