# **Setup**

In [4]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Copy the four competition CSVs from the mounted Drive folder into the current
# working directory, so later cells can open them by bare file name.
!cp "/content/drive/MyDrive/iX_Mobile_Banking_Prediction_Challenge/Data/Train.csv" .
!cp "/content/drive/MyDrive/iX_Mobile_Banking_Prediction_Challenge/Data/Test.csv" .
!cp "/content/drive/MyDrive/iX_Mobile_Banking_Prediction_Challenge/Data/SampleSubmission.csv" .
!cp "/content/drive/MyDrive/iX_Mobile_Banking_Prediction_Challenge/Data/VariableDefinitions.csv" .

In [47]:
import os, sys, gc, warnings, random
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
import lightgbm as lgb  
from tqdm import tqdm_notebook
from sklearn.metrics import auc, classification_report, roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

# **Load Data - Processing**

In [7]:
# Load the raw train and test tables from the CSV copies in the working directory.
train = pd.read_csv(filepath_or_buffer='Train.csv')
test = pd.read_csv(filepath_or_buffer='Test.csv')

In [9]:
def remove_nans(train, thresh):
  """Return a copy of `train` without the columns whose NaN share is >= `thresh`.

  Args:
    train: DataFrame to filter. It is not modified; the result is a new frame.
    thresh: NaN fraction (NaN count divided by row count) at or above which a
      column is removed.

  Returns:
    A new DataFrame holding only the retained columns, in their original order.
  """
  # Per-column NaN fraction, computed for all columns in one pass.
  nan_share = train.isna().mean()
  # Negating `>=` (instead of testing `<`) keeps a column whose share is itself
  # NaN, which is what a frame with zero rows produces.
  keep_mask = ~(nan_share >= thresh)
  # Positional boolean mask, so selection does not depend on label uniqueness.
  return train.loc[:, keep_mask.to_numpy()].copy()

In [10]:
# Drop every training column whose NaN share is 0.8 or higher.
train = remove_nans(train,thresh=0.8)
# Restrict the test frame to the surviving training columns. The label is
# excluded by name rather than by position, so the selection stays correct even
# when 'Target' is not the last column of the training table.
test = test[[col for col in train.columns if col != 'Target']]

In [11]:
def process(train, test):
  """Impute missing values on train and test together, then split them back.

  The two frames are stacked (train rows first) so that imputation uses a
  single statistic:
    * NaN in `age` is replaced by the mean age of the stacked frame, i.e. the
      mean is computed over train and test rows together.
    * NaN in every column whose name contains 'FQ' is replaced by -1, and those
      columns are cast to int.

  Args:
    train: training frame; must contain `age` and `Target`.
    test: test frame; must contain `age`.

  Returns:
    (train, test): new frames with the imputations applied. `Target` in the
    returned train frame is cast to int. Any column present only in `train`
    (such as `Target`) also appears in the returned test frame, filled with NaN
    by the concatenation. Neither input frame is modified.
  """
  n_train = len(train)
  data = pd.concat([train, test])

  # Assign the filled column back instead of calling fillna(inplace=True) on a
  # column selection: the chained in-place form is deprecated and leaves `data`
  # unchanged when pandas copy-on-write is active.
  data['age'] = data['age'].fillna(data['age'].mean())

  FQ = data.filter(like='FQ').columns
  if len(FQ):
    data[FQ] = data[FQ].fillna(-1).astype('int')

  # Split by position rather than by ID membership: concat keeps the train rows
  # first, so this is exact even if an ID occurs in both inputs. The copies make
  # the outputs independent frames, so the column assignment below is safe.
  train_out = data.iloc[:n_train].copy()
  train_out['Target'] = train_out['Target'].astype('int')
  test_out = data.iloc[n_train:].copy()

  return train_out, test_out

In [12]:
# Impute missing values on both tables and rebind `train` / `test` to the results.
train , test = process(train , test)

In [13]:
# Preview the first rows of the processed training table.
train.head()

Unnamed: 0,ID,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ6,FQ7,FQ8,FQ9,FQ10,FQ11,FQ12,FQ13,FQ14,FQ15,FQ16,FQ18,FQ19,FQ20,FQ21,FQ22,FQ23,FQ24,FQ35,FQ25,FQ26,FQ29,FQ32,FQ33,FQ34,FQ37,Target
0,ID_000J8GTZ,1,6,35.0,2,-1,-1,2,-1,1,2,1,1,2,2,2,2,2,2,2,4,2,1,1,2,-1,1,2,2,1,-1,1,1,0,0
1,ID_000QLXZM,32,7,70.0,2,-1,-1,2,-1,1,2,2,2,2,2,2,2,2,2,1,3,2,1,1,2,-1,-1,1,1,2,-1,1,2,0,0
2,ID_001728I2,71,7,22.0,2,1,-1,2,-1,1,2,2,2,4,2,2,1,2,1,1,-1,2,2,1,2,-1,-1,2,1,2,-1,2,1,1,0
3,ID_001R7IDN,48,3,27.0,1,-1,-1,2,2,-1,2,1,1,2,2,2,2,2,2,1,1,1,-1,1,2,-1,-1,2,2,-1,2,1,1,1,0
4,ID_0029QKF8,25,0,79.0,2,-1,-1,2,-1,-1,2,2,1,2,2,2,2,2,1,1,-1,2,2,1,1,2,-1,2,2,2,2,1,1,1,0


# **Modeling**

In [15]:
class CFG:
  """Experiment configuration read by the modeling cells."""

  # Seed passed to seed_everything and to the fold splitter.
  SEED = 42
  # Number of cross-validation folds per region.
  n_splits = 5

  # Name of the label column.
  TARGET_COL = 'Target'
  # Columns that are excluded when the model's feature list is built.
  remove_features = ['ID', 'country', 'Target']
  # Columns handed to LightGBM as categorical features.
  categ_features = ['country_code', 'region']

  # Parameter dictionary forwarded unchanged to lgb.train.
  lgb_params = dict(
      boosting_type='gbdt',
      objective='binary',
      metric='auc',
      n_estimators=500,
      colsample_bytree=0.8,
      seed=SEED,  # same value (42) as the global seed above
      silent=False,
      early_stopping_rounds=100,
      learning_rate=0.1,
  )

In [16]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global generator with `seed`."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [17]:
# Seed the `random` and NumPy global generators with the configured seed.
seed_everything(CFG.SEED)

In [18]:
# Model inputs: every column of `train`, in order, that is not listed in
# CFG.remove_features.
features_columns = train.columns[~train.columns.isin(CFG.remove_features)].tolist()
len(features_columns)

33

## **1. LightGBM**

In [20]:
def train_one_region(X, y, Test, skf, reg):
  """Cross-validate LightGBM on one region's rows and predict that region's test rows.

  One model is trained per fold. Each model predicts its held-out fold (filling
  the out-of-fold vector) and the whole of `Test`; the test predictions are
  then averaged over the folds. The out-of-fold AUC is printed.

  Args:
    X: feature frame of the region's training rows. Must contain
      `country_code` and the columns named in CFG.categ_features.
    y: labels aligned row-for-row with `X`. Rows are selected by position, so
      any index (or a plain array) is accepted.
    Test: the region's test rows. Must contain every column of `X`; it is not
      modified.
    skf: splitter object exposing `split(X, values_to_stratify_on)`.
    reg: region identifier, used only in the printed report.

  Returns:
    (test_pred, oof_pred): the fold-averaged test predictions and the
    out-of-fold predictions (one value per row of `X`).
  """
  print(30*'-')
  # Predict the test rows with exactly the columns the models are trained on,
  # instead of relying on a module-level feature list staying in sync with X.
  feature_names = list(X.columns)
  # Positional array of labels: `y[idx]` on a Series is label-based and only
  # matches fold positions when the index happens to be 0..n-1.
  labels = np.asarray(y)
  oof_lgb = np.zeros((X.shape[0],))
  lgb_preds = []

  # Folds are stratified on country_code, not on the label.
  for trn_idx, val_idx in skf.split(X, X.country_code):
    tr_x, tr_y = X.iloc[trn_idx, :], labels[trn_idx]
    vl_x, vl_y = X.iloc[val_idx, :], labels[val_idx]

    train_data = lgb.Dataset(tr_x, label=tr_y, categorical_feature=CFG.categ_features)
    valid_data = lgb.Dataset(vl_x, label=vl_y, categorical_feature=CFG.categ_features)

    # NOTE(review): `verbose_eval` is passed as a keyword here; confirm the
    # installed LightGBM version still accepts it.
    estimator = lgb.train(CFG.lgb_params, train_data,
                          valid_sets=[train_data, valid_data], verbose_eval=0)

    oof_lgb[val_idx] = estimator.predict(vl_x, num_iteration=estimator.best_iteration)
    lgb_preds.append(
        estimator.predict(Test[feature_names], num_iteration=estimator.best_iteration))

  print(f'Region[{reg}] AUC : ', roc_auc_score(labels, oof_lgb))
  print(30*'-')
  return np.mean(lgb_preds, axis=0), oof_lgb

In [22]:
def Custom_training() :
  """Run the per-region cross-validated training and collect every prediction.

  Reads the module-level `train`, `test`, `features_columns` and `CFG`. Regions
  are taken from `train` and processed in ascending order; for each one the
  matching train and test rows are handed to `train_one_region`.

  Returns:
    (train_ids, oof_preds, train_target, test_ids, test_preds): flat lists
    concatenated region by region. `train_ids`, `oof_preds` and `train_target`
    are aligned with each other, as are `test_ids` and `test_preds`.
  """
  train_ids, train_target, oof_preds = [], [], []
  test_ids, test_preds = [], []

  train_regions = np.sort(train.region.unique())
  for reg in tqdm_notebook(train_regions) :
    skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True, random_state=CFG.SEED)
    # reset_index gives each regional slice its own 0..n-1 index.
    train_ = train[train['region']==reg].reset_index(drop=True)
    Test = test[test['region']==reg].reset_index(drop=True)
    train_ids.extend(train_['ID'].values.tolist())
    test_ids.extend(Test['ID'].values.tolist())

    X , y = train_[features_columns] , train_[CFG.TARGET_COL]
    test_pred , oof_pred = train_one_region(X,y,Test,skf,reg=reg)
    train_target.extend(y)
    oof_preds.extend(oof_pred)
    test_preds.extend(test_pred)

  # The loop only visits regions that occur in train, so test rows from any
  # other region are never predicted. Report them instead of letting them
  # vanish silently from the returned lists.
  unseen_rows = int((~test['region'].isin(train_regions)).sum())
  if unseen_rows :
    print(f'WARNING : {unseen_rows} test rows belong to regions absent from train and got no prediction')

  return train_ids , oof_preds ,train_target ,test_ids, test_preds

In [23]:
# Train the per-region models and unpack the collected IDs, labels and predictions.
train_ids , oof_preds ,train_target ,test_ids, test_preds = Custom_training()

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

------------------------------
Region[-1] AUC :  0.5709805990270207
------------------------------
------------------------------
Region[0] AUC :  0.5052724314901191
------------------------------
------------------------------
Region[1] AUC :  0.5101629580425635
------------------------------
------------------------------
Region[2] AUC :  0.5188523908229331
------------------------------
------------------------------
Region[3] AUC :  0.5034877545592946
------------------------------
------------------------------
Region[4] AUC :  0.5124508713138678
------------------------------
------------------------------
Region[5] AUC :  0.5076364002508089
------------------------------
------------------------------
Region[6] AUC :  0.5059167181698214
------------------------------
------------------------------
Region[7] AUC :  0.5053698903094281
------------------------------



In [24]:
# Put each out-of-fold prediction next to its row ID and true label.
oof_data_lgbm = pd.DataFrame(dict(ID=train_ids, OOF_lgbm=oof_preds, Target=train_target))

In [25]:
# AUC of the out-of-fold predictions pooled over all regions.
print(f'AUC : ',roc_auc_score(oof_data_lgbm['Target'], oof_data_lgbm['OOF_lgbm'])) # a previous run printed 0.5100533751060792

AUC :  0.5107955042799776


In [26]:
# Show how many test IDs and test predictions were collected; the two counts
# should match.
tuple(len(collected) for collected in (test_ids, test_preds))

(46477, 46477)

In [27]:
# Build the submission table: one predicted value per test ID.
Submission_lgbm = pd.DataFrame(data=dict(ID=test_ids, Target=test_preds))

In [28]:
# Summary statistics of the submission's numeric column(s), as a sanity check.
Submission_lgbm.describe()

Unnamed: 0,Target
count,46477.0
mean,0.272474
std,0.034909
min,0.122583
25%,0.254157
50%,0.272696
75%,0.287921
max,0.535878


In [None]:
# Write the submission to the working directory without the DataFrame index.
Submission_lgbm.to_csv('Unofficial_Winning_Solution.csv',index=False)