# <h1><center> *ML-2 Mid Project Prediction Notebook* </center></h1>

<div style="width:100%;text-align: center;"> <img align=middle src="https://miro.medium.com/max/1400/1*QJZ6W-Pck_W7RlIDwUIN9Q.jpeg" alt="Heat beating" style="height:600px;margin-top:3rem;"> </div>

### Loading necessary libraries

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import scipy.stats

import missingno
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import BaggingClassifier
import lightgbm as lgb
from sklearn.utils import estimator_html_repr
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from pipelinehelper import PipelineHelper
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report,confusion_matrix
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import optuna

## Loading data

In [6]:
train = pd.read_csv(r"C:\Users\admin\trainEDA.csv")
test = pd.read_csv(r"C:\Users\admin\testEDA.csv")

### Defining the predictors and target

In [7]:
X = train.drop(['Id','defaulted_on_loan'], axis=1)
y = train[['defaulted_on_loan']]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle = True)
print(X_train.shape, y_test.shape)

(57728, 10) (14433, 1)


### Starting with simplest

I will start with simply LogisticRegression, whom is the king of classification for some banking problems!

In [19]:
lr_pipeline=Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('impute', SimpleImputer(strategy='median')),
    ('classifier', LogisticRegression())
]) 

In [20]:
lr_pipeline.fit(X_train,y_train)
pred1 = lr_pipeline.predict_proba(X_test)
print(roc_auc_score(y_test, pred1[:,1]))

0.6776597252268353


Oops, it seems this problem is 'too heavy' for LR, at least with imbalanced data and non-optimized hyperparameters.

### Let's try LightGBM:

In [8]:
model_pipeline2=Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('impute', SimpleImputer()),
    ('classifier', PipelineHelper([
        
        
    
        ('lgb', lgb.LGBMClassifier())
    ])), 
])

In [9]:
params = {
    'classifier__selected_model': model_pipeline2.named_steps['classifier'].generate({
    'lgb__boosting_type' : ['gbdt','dart'],
     'lgb__num_leaves' : [2,3,4,5,6,7,8,9,10,15,20],
    'lgb__n_estimators' : [80,85,90,100,101]
        
    
        })
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# GridSearch is also doing cross-validation of data, but we can also add kfold
grid = GridSearchCV(model_pipeline2, params, cv=3, scoring='roc_auc')
result = grid.fit(X_train,y_train)
print('Best Score: ',  round((result.best_score_),4))
print('--------------------------------')
print(grid.best_params_)

Best Score:  0.8442
--------------------------------
{'classifier__selected_model': ('lgb', {'boosting_type': 'dart', 'n_estimators': 101, 'num_leaves': 15})}


Nice score! I tried my submission and got score around 0.82. Let's improve our score.

## *Yandex's CatBoostClassifier:*

In [34]:
ctb_pipeline=Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('impute', SimpleImputer(strategy='median')),
    ('classifier', CatBoostClassifier(silent=True,depth=9, iterations=1700, learning_rate=0.01, l2_leaf_reg=3,use_best_model=True))
]) 

In [42]:
cate_features_index = np.where(X.dtypes != float)[0]
ctb_pipeline.fit(X_train, y_train,classifier__cat_features=cate_features_index, classifier__eval_set=(X_test,y_test))

Pipeline(steps=[('scaler', StandardScaler()),
                ('impute', SimpleImputer(strategy='median')),
                ('classifier',
                 <catboost.core.CatBoostClassifier object at 0x000001AF22F84A88>)])

In [13]:
pred3 = ctb_pipeline.predict_proba(X_test)
print(roc_auc_score(y_test, pred3[:,1]))

0.8557596232537847


### Simple XGBoost

In [26]:
xgb_pipeline1 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('impute', SimpleImputer()),
    ('classification', XGBClassifier(max_depth = 7))  
])

In [27]:
xgb_pipeline1.fit(X_train, y_train)
pred4 = xgb_pipeline1.predict_proba(X_test)[:, 1]


print(roc_auc_score(y_test,pred4))

0.8339803015224403


### Without parameter optimization, XGBoost is giving good results and it is fast, let's find best parameters for it and give it a shot 

#### Used Optuna for determining best hyperparameters, it tooks a bit long to compile, so I commented it out

In [23]:
# %%time
#def objective(trial):
#    params = {
#
     

#        'max_depth': trial.suggest_int('max_depth', 1, 10),
#
#        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#
#        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
#        
#       'reg_alpha':trial.suggest_uniform('reg_alpha',0,15),
#       
#       'reg_lambda':trial.suggest_uniform('reg_lambda',0,15),
#        
#         'gamma':trial.suggest_uniform('gamma', 0, 15),
        
#        'min_child_weight':trial.suggest_int('min_child_weight',0,15),    
        
#        'colsample_bytree':trial.suggest_uniform('colsample_bytree',0,1),
#        'subsample':trial.suggest_uniform('subsample',0,1),
        
#        'scale_pos_weight':trial.suggest_int('scale_pos_weight',1,15), 
        
        

#    }
    
#    piplin = Pipeline(steps=[
#    ('scaler', StandardScaler()),
#    ('impute', SimpleImputer()),
#    ('classification', xgb.XGBClassifier(**params))  
#])

#    piplin.fit(X_train, y_train)
#    rocaucscore = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
#
#    return (rocaucscore)   

### Prediction with optimized values:

In [39]:
xgb_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('impute', SimpleImputer()),
    ('classification', XGBClassifier(max_depth = 8,  n_estimators = 380, learning_rate = 0.014, reg_alpha = 3.499, reg_lambda = 0.998,gamma = 1.658, min_child_weight = 1, colsample_bytree = 0.564, subsample = 0.634, scale_pos_weight = 1))  
])

In [40]:
xgb_pipeline.fit(X_train, y_train)
pred5 = xgb_pipeline.predict_proba(X_test)[:, 1]


print(roc_auc_score(y_test,pred5))

0.8590046822380071


### Stacking Models`

I used Voting Classifier initiallly but got better results in test dataset by simplying averaging the results of my 2 best predictors:

In [43]:
y_pred_final = ctb_pipeline.predict_proba(test.drop(['Id'],axis=1))
y_pred_final2 = xgb_pipeline.predict_proba(test.drop(['Id'],axis=1))

#Creating prediction column
prediction = test[['Id']]

#Adding average prediction values as new column
prediction['Predicted'] = (y_pred_final[:,1]+y_pred_final2[:,1])/2

#checking the shape for consistency
prediction.shape 

(48108, 2)

In [30]:
prediction.head(3)

Unnamed: 0,Id,Predicted
0,1,0.134209
1,2,0.559777
2,3,0.017864


In [31]:
#prediction.to_csv("ML2submission16.csv", index=False)
print("Submission was successfully saved!")

Submission was successfully saved!
