# **0-Setup**

In [1]:
# Shell escape: print the status table of the GPU(s) attached to this runtime.
!nvidia-smi

Mon Mar 29 07:20:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from zipfile import ZipFile
# Location of the competition archive on the mounted Drive.
path = '/content/drive/MyDrive/UHFinancialResilience.zip'
# NOTE(review): the archive password is hardcoded below; consider reading it from an
# environment variable or getpass before sharing this notebook.
with ZipFile(path) as zf:
    # No target directory is given, so members are extracted into the current working directory.
    zf.extractall(pwd=b'lpdsv')

In [8]:
# Use the %pip magic so the pinned CatBoost build is installed into the environment of the
# running kernel (a `!pip` subshell may resolve to a different interpreter).
%pip install catboost==0.22 --quiet

[K     |████████████████████████████████| 64.4MB 46kB/s 
[?25h

In [2]:
import os, sys, gc, warnings, random
import numpy as np 
import pandas as pd 
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
import lightgbm as lgb  
from catboost import CatBoostClassifier ,Pool

from sklearn.metrics import auc, classification_report, roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Import libraries
import pandas as pd
# Raise the column-width display limit to 500 so long text values are not cut off
# when a frame is displayed.
pd.set_option('max_colwidth', 500)
import numpy as np



In [4]:
class CFG :
  """Central configuration: seed, CV layout, model hyper-parameters and column roles."""
  SEED = 42        # shared seed (fold shuffling, Python/NumPy RNGs, LightGBM)
  n_splits = 5     # number of cross-validation folds

  # CatBoost settings: AUC as eval metric, keep the best iteration, train on GPU.
  # NOTE(review): CatBoost is seeded with 0 rather than SEED; left as-is so that the
  # fold scores recorded in the notebook stay reproducible.
  catboost_params = {'learning_rate':0.05,'iterations':10000,'eval_metric':'AUC',
                      'use_best_model' :True,'verbose':100,'random_seed': 0,
                      'devices':'0:1','task_type':"GPU",}

  # LightGBM settings for a binary objective scored with AUC.
  # The row-subsampling key was previously spelled 'sub_sample', which LightGBM does not
  # recognise (it was ignored with an "unknown parameter" warning); 'subsample' is the
  # documented alias of `bagging_fraction`.
  # NOTE(review): bagging only becomes active when 'subsample_freq' (bagging_freq) is > 0.
  # It is intentionally not set here so training behaves as in the recorded run; add
  # 'subsample_freq': 1 to actually enable the 0.7 row subsampling.
  lgb_params = {'boosting_type': 'gbdt','objective': 'binary','metric': 'auc',
                'n_estimators': 500,'subsample' : 0.7,'colsample_bytree' : 0.6,
                'seed': SEED,'silent':False,'early_stopping_rounds': 100,
               }
  # Columns that must never be fed to the models.
  remove_features = ['ID', 'country', 'region','target']
  # Encoded columns that both models treat as categorical.
  categ_features = ['country_code','region_code']
  TARGET_COL = 'target'

# **1-Pre-Processing**

## **1.1 Utils**

In [5]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG with the same value.

    Parameters
    ----------
    seed : int
        Value handed to both ``random.seed`` and ``np.random.seed``.
    """
    # Same order as before: the stdlib generator first, then NumPy's.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [6]:
def Encode(train,test) :
  """Add integer-coded `country_code` and `region_code` columns to both frames, in place.

  Each encoder learns its classes from `train` and is then applied to `test`.
  `region` values are cast to `str` before encoding; `country` is encoded as-is.
  Nothing is returned; the two frames passed in are modified.

  NOTE(review): `LabelEncoder.transform` is expected to raise on labels it did not see
  during fitting, so this presumably relies on `test` containing no new countries or
  regions; confirm against the data.
  """
  # country -> country_code
  country_encoder = preprocessing.LabelEncoder()
  train['country_code'] = country_encoder.fit_transform(train['country'])
  test['country_code'] = country_encoder.transform(test['country'])

  # region -> region_code (values are stringified first)
  region_encoder = preprocessing.LabelEncoder()
  train_regions = train['region'].astype(str)
  train['region_code'] = region_encoder.fit_transform(train_regions)
  test_regions = test['region'].astype(str)
  test['region_code'] = region_encoder.transform(test_regions)

## **1.2 Process**

In [7]:
# Load files
# Paths are relative: the FinancialResilience/ folder is expected in the current working directory.
train = pd.read_csv('FinancialResilience/Train.csv')
test = pd.read_csv('FinancialResilience/Test.csv')
# Submission template and the data dictionary shipped with the dataset.
samplesubmission = pd.read_csv('FinancialResilience/SampleSubmission.csv')
variable_definations = pd.read_csv('FinancialResilience/VariableDefinitions.csv')

In [8]:
# Seed Python's `random` and NumPy's global RNG with the configured seed.
seed_everything(CFG.SEED)

In [9]:
# Adds the `country_code` and `region_code` columns to train and test in place (returns nothing).
Encode(train,test)

# **2-Modeling**

## **2.1 Catboost**

In [10]:
# Model inputs: every column of train that is not listed in CFG.remove_features.
features_columns = [col for col in train.columns if col not in CFG.remove_features]
# Trailing expression: display how many feature columns remain.
len(features_columns)

34

In [11]:
# Cross-validated CatBoost: one model per fold, out-of-fold predictions for scoring and
# per-fold test predictions that are averaged afterwards.
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True, random_state=CFG.SEED)

X , y   = train[features_columns] , train[CFG.TARGET_COL]

oof_cat = np.zeros((train.shape[0],))  # out-of-fold probability for every training row
test['target'] = 0
cat_preds= []  # one array of test probabilities per fold

# Folds are stratified on `country` rather than on the target.
for fold_, (trn_idx, val_idx) in enumerate(skf.split(X, train.country)):
    print(50*'-')
    print('Fold:',fold_+1)
    # trn_idx / val_idx are positions, so select positionally on y as well as on X.
    # Plain y[idx] is a label lookup and only matches positions while the index is the
    # default 0..n-1 range.
    X_train, y_train = X.iloc[trn_idx,:], y.iloc[trn_idx]
    X_test, y_test = X.iloc[val_idx,:], y.iloc[val_idx]

    estimator = CatBoostClassifier(**CFG.catboost_params)
    # The held-out fold serves as eval set; stop after 200 rounds without improvement.
    estimator.fit(Pool(X_train,y_train,cat_features = CFG.categ_features),
                  eval_set = Pool(X_test,y_test,cat_features = CFG.categ_features),
                  early_stopping_rounds=200)

    # Positive-class probability for the held-out rows of this fold.
    y_pred_val = estimator.predict_proba(X_test)[:,1]
    oof_cat[val_idx] = y_pred_val
    # Positive-class probability for the test set from this fold's model.
    y_pred_test = estimator.predict_proba(test[features_columns])[:,1]
    cat_preds.append(y_pred_test)
    print(50*'-')
    print()
# AUC over all out-of-fold predictions.
print('OOF score :',roc_auc_score(y, oof_cat))

--------------------------------------------------
Fold: 1
0:	learn: 0.7462481	test: 0.7486294	best: 0.7486294 (0)	total: 39ms	remaining: 6m 29s
100:	learn: 0.7948141	test: 0.7951517	best: 0.7951517 (100)	total: 2.52s	remaining: 4m 6s
200:	learn: 0.8018715	test: 0.7992485	best: 0.7992485 (200)	total: 4.7s	remaining: 3m 49s
300:	learn: 0.8065882	test: 0.8011629	best: 0.8011629 (300)	total: 6.88s	remaining: 3m 41s
400:	learn: 0.8104564	test: 0.8023694	best: 0.8023694 (400)	total: 9.01s	remaining: 3m 35s
500:	learn: 0.8136609	test: 0.8030018	best: 0.8030077 (497)	total: 11.2s	remaining: 3m 32s
600:	learn: 0.8164283	test: 0.8034565	best: 0.8034609 (598)	total: 13.3s	remaining: 3m 28s
700:	learn: 0.8189570	test: 0.8037618	best: 0.8037728 (694)	total: 15.6s	remaining: 3m 26s
800:	learn: 0.8213631	test: 0.8040206	best: 0.8040414 (794)	total: 17.8s	remaining: 3m 24s
900:	learn: 0.8237404	test: 0.8043837	best: 0.8043870 (899)	total: 19.9s	remaining: 3m 21s
1000:	learn: 0.8258016	test: 0.8045661

In [12]:
# Average the per-fold test predictions element-wise into one CatBoost prediction per test row.
catboost_preds = np.mean(cat_preds,axis=0)

## **2.2 LGBM**

In [15]:
# Cross-validated LightGBM: one booster per fold, out-of-fold predictions for scoring and
# per-fold test predictions that are averaged afterwards.
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True, random_state=CFG.SEED)

X , y   = train[features_columns] , train[CFG.TARGET_COL]

oof_lgb = np.zeros((train.shape[0],))  # out-of-fold prediction for every training row
test['target'] = 0
lgb_preds = []  # one array of test predictions per fold

# Folds are stratified on `country` rather than on the target.
for fold_, (trn_idx, val_idx) in enumerate(skf.split(X, train.country)):
    print(50*'-')
    print('Fold:',fold_+1)

    # trn_idx / val_idx are positions, so select positionally on y as well as on X.
    # Plain y[idx] is a label lookup and only matches positions while the index is the
    # default 0..n-1 range.
    tr_x, tr_y = X.iloc[trn_idx,:], y.iloc[trn_idx]
    vl_x, vl_y = X.iloc[val_idx,:], y.iloc[val_idx]

    train_data = lgb.Dataset(tr_x, label=tr_y,categorical_feature=CFG.categ_features)
    valid_data = lgb.Dataset(vl_x, label=vl_y,categorical_feature=CFG.categ_features)

    # Both sets are passed as valid_sets so the log reports training and validation AUC
    # every 100 rounds.
    estimator = lgb.train(CFG.lgb_params,train_data,valid_sets = [train_data,valid_data],verbose_eval = 100)

    # Predict with the best iteration recorded on the booster.
    y_pred_val = estimator.predict(vl_x,num_iteration=estimator.best_iteration)
    oof_lgb[val_idx] = y_pred_val

    y_pred_test = estimator.predict(test[features_columns],num_iteration=estimator.best_iteration)
    lgb_preds.append(y_pred_test)
    print(50*'-')

# AUC over all out-of-fold predictions.
print('OOF score :',roc_auc_score(y, oof_lgb))

--------------------------------------------------
Fold: 1
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.826008	valid_1's auc: 0.804411
[200]	training's auc: 0.840477	valid_1's auc: 0.804745
Early stopping, best iteration is:
[148]	training's auc: 0.833437	valid_1's auc: 0.804972
--------------------------------------------------
--------------------------------------------------
Fold: 2
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.827208	valid_1's auc: 0.802313
[200]	training's auc: 0.841627	valid_1's auc: 0.802318
Early stopping, best iteration is:
[162]	training's auc: 0.836818	valid_1's auc: 0.802506
--------------------------------------------------
--------------------------------------------------
Fold: 3
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.827657	valid_1's auc: 0.799289
[200]	training's auc: 0.841385	valid_1's auc: 0.798763
Early stopping, best i

In [16]:
# Average the per-fold test predictions element-wise into one LightGBM prediction per test row.
lightgbm_preds = np.mean(lgb_preds,axis=0)

# **3-Submission**

In [17]:
# Blend: equal-weight average of the CatBoost and LightGBM test predictions.
test['target'] =catboost_preds*0.5 +  lightgbm_preds*0.5 
# Keep only the ID and target columns and write them without the index column.
submission = test[['ID', 'target']]
submission.to_csv('UmojaHack-Challenge#3-Top3-Solution.csv',index = False)