In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler,  FunctionTransformer
from sklearn.pipeline import Pipeline

import statistics
import optuna
import logging
optuna.logging.set_verbosity(optuna.logging.WARNING)


from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier, RidgeClassifier, ElasticNet, LogisticRegression, LinearRegression
from catboost import CatBoostClassifier


from sklearn.model_selection import train_test_split, cross_val_score \
,GridSearchCV, KFold, RepeatedKFold, StratifiedKFold, ShuffleSplit, StratifiedShuffleSplit \
, RandomizedSearchCV

from sklearn.metrics import roc_auc_score, roc_curve

import warnings
warnings.filterwarnings('ignore')

In [3]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')
credit_risk_dataset = pd.read_csv('./data/redit_risk_dataset.csv')

In [4]:
df_test_copy = df_test.copy()

In [5]:
credit_risk_dataset['id'] = list(range(0,len(credit_risk_dataset)))

In [6]:
full_train_df = pd.concat([df_train,credit_risk_dataset], axis=0)

In [7]:
full_train_df.shape

(91226, 13)

In [8]:
cleaned_full_train_df = full_train_df[full_train_df['person_age']<100]

In [9]:
cleaned_full_train_df.shape

(91220, 13)

In [10]:
cleaned_full_train_df[cleaned_full_train_df['person_emp_length']>cleaned_full_train_df['person_age']]

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
41079,41079,28,60350,MORTGAGE,123.0,MEDICAL,D,25000,15.95,0.35,Y,6,1
49252,49252,21,192000,MORTGAGE,123.0,VENTURE,B,20000,11.49,0.1,N,2,0
0,0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
210,210,21,192000,MORTGAGE,123.0,VENTURE,A,20000,6.54,0.1,N,4,0


In [11]:
cleaned_full_train_df[cleaned_full_train_df['person_emp_length']<cleaned_full_train_df['person_age']]

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.90,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.10,N,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
32577,32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
32578,32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
32579,32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0


In [12]:
cleaned_full_train_df = cleaned_full_train_df.reset_index(drop=True)

In [13]:
cleaned_full_train_df['id'] = list(range(0,len(cleaned_full_train_df)))

In [14]:
cleaned_full_train_df

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.90,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.10,N,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91215,91215,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30,0
91216,91216,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19,0
91217,91217,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28,1
91218,91218,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26,0


# Take care of missing values

In [15]:
cleaned_full_train_df.isna().sum()

id                               0
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3115
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
loan_status                      0
dtype: int64

In [16]:
#person_emp_length              895
#loan_int_rate                 3115

In [17]:
person_emp_length_missing = cleaned_full_train_df[cleaned_full_train_df['person_emp_length'].isna()]
person_emp_length_missing

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
58748,58748,22,12600,MORTGAGE,,PERSONAL,A,2000,5.42,0.16,N,4,1
58864,58864,24,185000,MORTGAGE,,EDUCATION,B,35000,12.42,0.19,N,2,0
59021,59021,24,16800,MORTGAGE,,DEBTCONSOLIDATION,A,3900,,0.23,N,3,1
59049,59049,25,52000,RENT,,PERSONAL,B,24000,10.74,0.46,N,2,1
59050,59050,22,17352,MORTGAGE,,EDUCATION,C,2250,15.27,0.13,Y,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90925,90925,38,12000,OWN,,EDUCATION,A,4800,7.29,0.40,N,12,1
90967,90967,51,18408,RENT,,PERSONAL,C,1000,14.65,0.05,Y,20,1
90999,90999,70,39996,RENT,,MEDICAL,C,3600,15.23,0.09,Y,19,0
91092,91092,56,32400,RENT,,MEDICAL,A,8575,7.51,0.26,N,18,0


In [18]:
loan_int_rate_missing = cleaned_full_train_df[cleaned_full_train_df['loan_int_rate'].isna()]
loan_int_rate_missing

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
58683,58683,23,71500,RENT,3.0,DEBTCONSOLIDATION,D,30000,,0.42,N,4,1
58694,58694,24,78000,RENT,4.0,DEBTCONSOLIDATION,D,30000,,0.38,Y,4,1
58701,58701,23,277000,OWN,3.0,PERSONAL,A,35000,,0.13,N,4,0
58703,58703,24,12000,OWN,2.0,VENTURE,E,1750,,0.15,Y,3,0
58706,58706,26,263000,MORTGAGE,0.0,EDUCATION,B,10000,,0.04,N,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91186,91186,53,4888,OWN,0.0,VENTURE,C,1400,,0.29,Y,28,1
91191,91191,65,45900,RENT,2.0,EDUCATION,C,10000,,0.22,Y,19,0
91192,91192,54,20000,RENT,2.0,MEDICAL,C,5000,,0.25,N,28,0
91208,91208,51,60000,MORTGAGE,1.0,PERSONAL,A,7500,,0.13,N,23,0


***person_emp_length handling***

In [19]:
person_emp_length_mask = ~cleaned_full_train_df.isin(person_emp_length_missing.to_dict(orient='list')).all(axis=1)
person_emp_length_result = cleaned_full_train_df[person_emp_length_mask]

person_emp_length_result.isna().sum()

id                               0
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3047
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
loan_status                      0
dtype: int64

In [20]:
person_emp_length_features_train = person_emp_length_result[['person_age']]
person_emp_length_target_train = person_emp_length_result[['person_emp_length']]

person_emp_length_lin_reg = LinearRegression()
person_emp_length_lin_reg.fit(person_emp_length_features_train, person_emp_length_target_train)

In [21]:
person_emp_length_predictions = person_emp_length_lin_reg.predict(person_emp_length_missing[['person_age']])
person_emp_length_predicted_data = person_emp_length_missing.copy()
person_emp_length_predicted_data['person_emp_length']=person_emp_length_predictions.astype('int')
person_emp_length_predicted_data

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
58748,58748,22,12600,MORTGAGE,4,PERSONAL,A,2000,5.42,0.16,N,4,1
58864,58864,24,185000,MORTGAGE,4,EDUCATION,B,35000,12.42,0.19,N,2,0
59021,59021,24,16800,MORTGAGE,4,DEBTCONSOLIDATION,A,3900,,0.23,N,3,1
59049,59049,25,52000,RENT,4,PERSONAL,B,24000,10.74,0.46,N,2,1
59050,59050,22,17352,MORTGAGE,4,EDUCATION,C,2250,15.27,0.13,Y,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90925,90925,38,12000,OWN,5,EDUCATION,A,4800,7.29,0.40,N,12,1
90967,90967,51,18408,RENT,6,PERSONAL,C,1000,14.65,0.05,Y,20,1
90999,90999,70,39996,RENT,8,MEDICAL,C,3600,15.23,0.09,Y,19,0
91092,91092,56,32400,RENT,7,MEDICAL,A,8575,7.51,0.26,N,18,0


In [22]:
cleaned_full_train_df.loc[cleaned_full_train_df['person_emp_length'].isna(),'person_emp_length'] = person_emp_length_predicted_data

In [23]:
cleaned_full_train_df.isna().sum()

id                               0
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3115
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
loan_status                      0
dtype: int64

***loan_int_rate handling***

In [24]:
loan_int_rate_mask = ~cleaned_full_train_df.isin(loan_int_rate_missing.to_dict(orient='list')).all(axis=1)
loan_int_rate_result = cleaned_full_train_df[loan_int_rate_mask]

loan_int_rate_result.isna().sum()

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [25]:
loan_int_rate_features_train = loan_int_rate_result[['loan_amnt']]
loan_int_rate_target_train = loan_int_rate_result[['loan_int_rate']]

loan_int_rate_lin_reg = LinearRegression()
loan_int_rate_lin_reg.fit(loan_int_rate_features_train, loan_int_rate_target_train)

In [26]:
loan_int_rate_predictions = loan_int_rate_lin_reg.predict(loan_int_rate_missing[['loan_amnt']])
loan_int_rate_predicted_data = loan_int_rate_missing.copy()
loan_int_rate_predicted_data['loan_int_rate']=loan_int_rate_predictions
loan_int_rate_predicted_data

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
58683,58683,23,71500,RENT,3.0,DEBTCONSOLIDATION,D,30000,12.193553,0.42,N,4,1
58694,58694,24,78000,RENT,4.0,DEBTCONSOLIDATION,D,30000,12.193553,0.38,Y,4,1
58701,58701,23,277000,OWN,3.0,PERSONAL,A,35000,12.533372,0.13,N,4,0
58703,58703,24,12000,OWN,2.0,VENTURE,E,1750,10.273577,0.15,Y,3,0
58706,58706,26,263000,MORTGAGE,0.0,EDUCATION,B,10000,10.834278,0.04,N,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91186,91186,53,4888,OWN,0.0,VENTURE,C,1400,10.249790,0.29,Y,28,1
91191,91191,65,45900,RENT,2.0,EDUCATION,C,10000,10.834278,0.22,Y,19,0
91192,91192,54,20000,RENT,2.0,MEDICAL,C,5000,10.494459,0.25,N,28,0
91208,91208,51,60000,MORTGAGE,1.0,PERSONAL,A,7500,10.664369,0.13,N,23,0


In [27]:
cleaned_full_train_df.loc[cleaned_full_train_df['loan_int_rate'].isna(),'loan_int_rate'] = loan_int_rate_predicted_data

In [28]:
cleaned_full_train_df.isna().sum()

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

# Missing data handled now some wrangling

In [29]:
bad_data = cleaned_full_train_df[cleaned_full_train_df['person_emp_length']>cleaned_full_train_df['person_age']]
bad_data

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
41079,41079,28,60350,MORTGAGE,123.0,MEDICAL,D,25000,15.95,0.35,Y,6,1
49251,49251,21,192000,MORTGAGE,123.0,VENTURE,B,20000,11.49,0.1,N,2,0
58644,58644,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
58852,58852,21,192000,MORTGAGE,123.0,VENTURE,A,20000,6.54,0.1,N,4,0


In [30]:
bad_data['person_emp_length'] = np.nan

In [31]:
bad_data_predictions = person_emp_length_lin_reg.predict(bad_data[['person_age']])
bad_data_predicted_data = bad_data.copy()
bad_data_predicted_data['person_emp_length']=bad_data_predictions.astype('int')
bad_data_predicted_data

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
41079,41079,28,60350,MORTGAGE,4,MEDICAL,D,25000,15.95,0.35,Y,6,1
49251,49251,21,192000,MORTGAGE,4,VENTURE,B,20000,11.49,0.1,N,2,0
58644,58644,22,59000,RENT,4,PERSONAL,D,35000,16.02,0.59,Y,3,1
58852,58852,21,192000,MORTGAGE,4,VENTURE,A,20000,6.54,0.1,N,4,0


In [32]:
cleaned_full_train_df.loc[bad_data['id'],'person_emp_length'] = bad_data_predicted_data

In [33]:
cleaned_full_train_df.loc[bad_data['id']]

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
41079,41079,28,60350,MORTGAGE,4.0,MEDICAL,D,25000,15.95,0.35,Y,6,1
49251,49251,21,192000,MORTGAGE,4.0,VENTURE,B,20000,11.49,0.1,N,2,0
58644,58644,22,59000,RENT,4.0,PERSONAL,D,35000,16.02,0.59,Y,3,1
58852,58852,21,192000,MORTGAGE,4.0,VENTURE,A,20000,6.54,0.1,N,4,0


In [34]:
cleaned_full_train_df[cleaned_full_train_df['person_emp_length']>cleaned_full_train_df['person_age']]

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status


# Data prepared, time to train

In [35]:
prepared_train_df = cleaned_full_train_df.copy().reset_index(drop=True)

In [36]:
prepared_train_df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [37]:
num_cols_train = cleaned_full_train_df[['cb_person_cred_hist_length','person_age','person_emp_length','loan_percent_income','person_income','loan_amnt','loan_int_rate']]
num_cols_test = df_test[['cb_person_cred_hist_length','person_age','person_emp_length','loan_percent_income','person_income','loan_amnt','loan_int_rate']]
both_cols = pd.concat([num_cols_train,num_cols_test]).reset_index(drop=True)

In [38]:
def data_wrangler1(df):
    df['person_age'] = df['person_age'].astype('str').astype('category')
    df['person_income'] = df['person_income'].astype('str').astype('category')
    df['person_home_ownership'] = df['person_home_ownership'].astype('str').astype('category')
    df['person_emp_length'] = df['person_emp_length'].astype('str').astype('category')
    df['loan_intent'] = df['loan_intent'].astype('str').astype('category')
    df['loan_grade'] = df['loan_grade'].astype('str').astype('category')
    df['loan_amnt'] = df['loan_amnt'].astype('str').astype('category')
    df['loan_int_rate'] = df['loan_int_rate'].astype('str').astype('category')
    df['loan_percent_income'] = df['loan_percent_income'].astype('str').astype('category')
    df['cb_person_default_on_file'] = df['cb_person_default_on_file'].astype('str').astype('category')
    df['cb_person_cred_hist_length'] = df['cb_person_cred_hist_length'].astype('str').astype('category')    

    return df

In [39]:
def data_wrangler2(df):
    ord_encoder = OrdinalEncoder()
    df['cb_person_cred_hist_length'] = df['cb_person_cred_hist_length'].astype('category')
    df['person_age'] = df['person_age'].astype('category')
    df['person_emp_length'] = df['person_emp_length'].astype('category')
    df['loan_percent_income'] = df['loan_percent_income'].astype('category')

    df['person_income'] = df['person_income'].astype('category')
    df['loan_amnt'] = df['loan_amnt'].astype('category')
    df['loan_int_rate'] = df['loan_int_rate'].astype('category')
    
    df['person_home_ownership'] = df['person_home_ownership'].astype('category')
    df['loan_intent'] = df['loan_intent'].astype('category')
    df['loan_grade'] = df['loan_grade'].astype('category')

    df['loan_grade'] = ord_encoder.fit_transform(df[['loan_grade']])
    
    df['cb_person_default_on_file'] = df['cb_person_default_on_file'].map({'N':0,'Y':1}).astype('bool')

    return df

In [40]:
def data_wrangler3(df):

    df['nc_cb_person_cred_hist_length'] = df['cb_person_cred_hist_length']
    df['nc_person_age'] = df['person_age']
    df['nc_person_emp_length'] = df['person_emp_length']
    df['nc_loan_percent_income'] = df['loan_percent_income']
    df['nc_person_income'] = df['person_income']
    df['nc_loan_amnt'] = df['loan_amnt']
    df['nc_loan_int_rate'] = df['loan_int_rate']

    df['person_age'] = df['person_age'].astype('str').astype('category')
    df['person_income'] = df['person_income'].astype('str').astype('category')
    df['person_home_ownership'] = df['person_home_ownership'].astype('str').astype('category')
    df['person_emp_length'] = df['person_emp_length'].astype('str').astype('category')
    df['loan_intent'] = df['loan_intent'].astype('str').astype('category')
    df['loan_grade'] = df['loan_grade'].astype('str').astype('category')
    df['loan_amnt'] = df['loan_amnt'].astype('str').astype('category')
    df['loan_int_rate'] = df['loan_int_rate'].astype('str').astype('category')
    df['loan_percent_income'] = df['loan_percent_income'].astype('str').astype('category')
    df['cb_person_default_on_file'] = df['cb_person_default_on_file'].astype('str').astype('category')
    df['cb_person_cred_hist_length'] = df['cb_person_cred_hist_length'].astype('str').astype('category')    

    return df

In [41]:
def final_standard_scaler(df):
    std_scl = StandardScaler()
    categorical = df.select_dtypes(['object','category','bool'])
    numerical = df.select_dtypes('number')
    numerical_scaled = std_scl.fit_transform(numerical)
    df_numerical_scaled = pd.DataFrame(numerical_scaled, columns=numerical.columns)
    df_final = pd.concat([df_numerical_scaled.reset_index(drop=True),categorical.reset_index(drop=True)], axis=1)
    return df_final

In [42]:
def final_get_dummies(df):
    df_categorical_columns = df.select_dtypes(include=['object','category'])
    cat_columns = list(df_categorical_columns)
    for col in df_categorical_columns:
        get_dummy = pd.get_dummies(df_categorical_columns[col], prefix=col, drop_first=True, dtype='bool')
        df_categorical_columns = pd.concat([df_categorical_columns, get_dummy], axis=1)
    df_dummies_columns = df_categorical_columns.drop(cat_columns, axis=1)
    df_final = pd.concat([df,df_dummies_columns], axis=1)
    return df_final.drop(cat_columns, axis=1)

In [43]:
concatenated_df = pd.concat([prepared_train_df.drop(['loan_status'],axis=1),df_test_copy], axis=0).reset_index(drop=True)
concatenated_df

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.90,0.21,N,10
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.10,N,3
...,...,...,...,...,...,...,...,...,...,...,...,...
130313,97738,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.10,N,4
130314,97739,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.15,N,3
130315,97740,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.25,N,25
130316,97741,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.39,Y,4


In [44]:
cat_data_prep = Pipeline([('final_data_wrangler3', FunctionTransformer(data_wrangler3)),
                          ('final_standard_scaler', FunctionTransformer(final_standard_scaler))])

In [45]:
final_data_prep = Pipeline([('final_data_wrangler2', FunctionTransformer(data_wrangler2)),
                            ('final_get_dummies', FunctionTransformer(final_get_dummies)),
                            ('final_standard_scaler', FunctionTransformer(final_standard_scaler))])

In [46]:
# data for models
final_data = final_data_prep.transform(concatenated_df.drop('id', axis=1))

both_cols = final_standard_scaler(both_cols)
final_data = pd.concat([final_data, both_cols],axis=1)

df_train_final = final_data[:91220]
df_train_final = pd.concat([df_train_final, prepared_train_df['loan_status']],axis=1)

df_test_final = final_data[91220:].reset_index(drop=True)

In [47]:
# data for catbooster
cat_final_data = cat_data_prep.transform(concatenated_df.drop('id', axis=1))

cat_df_train_final = cat_final_data[:91220]
cat_df_train_final = pd.concat([cat_df_train_final, prepared_train_df['loan_status']],axis=1)

cat_df_test_final = cat_final_data[91220:].reset_index(drop=True)

# Train Basic Models

In [48]:
knn_class = KNeighborsClassifier()
tree_class = DecisionTreeClassifier()
forest_class = RandomForestClassifier()
svc_class = SVC()
ada_class = AdaBoostClassifier()
sgd_class = SGDClassifier()
ridge_class = RidgeClassifier()
log_class = LogisticRegression()
linSVC_class = LinearSVC()
gbc_class = GradientBoostingClassifier()
xgb_class = XGBClassifier()
lgbm_class = LGBMClassifier(verbosity=-1)
cat_class = CatBoostClassifier(verbose=0)
models_list = {'CatBoost_Classifier':cat_class,
               'KNeighbors_Regression':knn_class,
               'Decision_Tree':tree_class, 
               'Random_Forest':forest_class,
               'AdaBoost_Classifier': ada_class,
               'SVC_Classifier':svc_class,
               'SGD_Classifier':sgd_class, 
               'Ridge_Classification':ridge_class,
               'Logistic_Regression':log_class, 
               'LinearSVC':linSVC_class,
               'GradientBoosting_Classification':gbc_class, 
               'XGBoost_Classification':xgb_class, 
               'LightGBM_Classification':lgbm_class}

In [49]:
df_results = pd.DataFrame(columns=['model','cv_method','basic_roc_auc','roc_auc_mean','roc_auc_std','parameters'])

eval_methods = ['cross_val_score',
                'kfold',
                'repeatedkfold',
                'stratifiedkfold',
                'shufflesplit']

dataframes_dict = {'df_train_final': df_train_final}                 

In [50]:
def basic_models(df_results,datasetname ,models, cv_method, features_train, target_train):
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import roc_auc_score, roc_curve
    
    scoring = 'roc_auc'
    cv = 10
    df_final = df_results.copy()
    for key, model in models.items():
        
        if cv_method == 'cross_val_score':
            train_X, test_X, train_y, test_y = train_test_split(features_train, target_train,test_size=0.2 ,random_state=1)
            tm = model.fit(train_X, train_y)
            tm_predict = tm.predict(test_X)
            prediction = roc_auc_score(test_y, tm_predict)
            
            model_cv = cross_val_score(model, train_X, train_y,
                                       scoring=scoring, cv=cv)
            score = model_cv
            df_created = pd.DataFrame(
            {'datasetname': datasetname,
             'model': key,
             'cv_method': cv_method,
             'basic_roc_auc': [prediction],
             'roc_auc_mean': score.mean(),
             'roc_auc_std': score.std(),
             'parameters': 'None'})
            
            df_full_results = pd.concat([df_results,df_created], axis=0, ignore_index=True)
            
        if cv_method == 'kfold':
            predictions = []
            accu = []
            kf = KFold(n_splits=10,shuffle=True ,random_state=12)
            for train_index, test_index in kf.split(features_train):
                train_X, test_X = features_train.iloc[train_index], features_train.iloc[test_index]
                train_y, test_y = target_train.iloc[train_index], target_train.iloc[test_index]
                
                tm = model.fit(train_X, train_y)
                tm_predict = tm.predict(test_X)
                prediction = roc_auc_score(test_y, tm_predict)
                accu.append(prediction)
                predictions.append(prediction)
                
                df_created = pd.DataFrame(
                {'datasetname': datasetname,
                 'model': key,
                 'cv_method': cv_method,
                 'basic_roc_auc': [np.max(predictions)],
                 'roc_auc_mean': np.mean(accu),
                 'roc_auc_std': np.std(accu),
                 'parameters': 'None'})

                df_full_results = pd.concat([df_results,df_created], axis=0, ignore_index=True)
            
        if cv_method == 'repeatedkfold':
            predictions = []
            accu = []
            rkf = RepeatedKFold(n_splits=10, random_state=12)
            for train_index, test_index in rkf.split(features_train):
                train_X, test_X = features_train.iloc[train_index], features_train.iloc[test_index]
                train_y, test_y = target_train.iloc[train_index], target_train.iloc[test_index]
                
                tm = model.fit(train_X, train_y)
                tm_predict = tm.predict(test_X)
                prediction = roc_auc_score(test_y, tm_predict)
                accu.append(prediction)
                predictions.append(prediction)
                
                df_created = pd.DataFrame(
                {'datasetname': datasetname,
                 'model': key,
                 'cv_method': cv_method,
                 'basic_roc_auc': [np.max(predictions)],
                 'roc_auc_mean': np.mean(accu),
                 'roc_auc_std': np.std(accu),
                 'parameters': 'None'})

                df_full_results = pd.concat([df_results,df_created], axis=0, ignore_index=True)
        
        if cv_method == 'stratifiedkfold':
            predictions = []
            accu = []
            skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
            for train_index, test_index in skf.split(features_train, target_train):
                train_X, test_X = features_train.iloc[train_index], features_train.iloc[test_index]
                train_y, test_y = target_train.iloc[train_index], target_train.iloc[test_index]
                
                tm = model.fit(train_X, train_y)
                tm_predict = tm.predict(test_X)
                prediction = roc_auc_score(test_y, tm_predict)
                accu.append(prediction)
                predictions.append(prediction)
                
                df_created = pd.DataFrame(
                {'datasetname': datasetname,
                 'model': key,
                 'cv_method': cv_method,
                 'basic_roc_auc': [np.max(predictions)],
                 'roc_auc_mean': np.mean(accu),
                 'roc_auc_std': np.std(accu),
                 'parameters': 'None'})

                df_full_results = pd.concat([df_results,df_created], axis=0, ignore_index=True)
        
        if cv_method == 'shufflesplit':
            predictions = []
            accu = []
            shs = ShuffleSplit(n_splits=10, test_size=0.2, random_state=12)
            for train_index, test_index in shs.split(features_train):
                train_X, test_X = features_train.iloc[train_index], features_train.iloc[test_index]
                train_y, test_y = target_train.iloc[train_index], target_train.iloc[test_index]
                
                tm = model.fit(train_X, train_y)
                tm_predict = tm.predict(test_X)
                prediction = roc_auc_score(test_y, tm_predict)
                accu.append(prediction)
                predictions.append(prediction)
                
                df_created = pd.DataFrame(
                {'datasetname': datasetname,
                 'model': key,
                 'cv_method': cv_method,
                 'basic_roc_auc': [np.max(predictions)],
                 'roc_auc_mean': np.mean(accu),
                 'roc_auc_std': np.std(accu),
                 'parameters': 'None'})

                df_full_results = pd.concat([df_results,df_created], axis=0, ignore_index=True)
        
        
        
        df_row = pd.DataFrame(
            {'datasetname': [df_full_results.loc[0]['datasetname']] ,
             'model': [df_full_results.loc[0]['model']] ,
             'cv_method': [df_full_results.loc[0]['cv_method']],
             'basic_roc_auc': [df_full_results['basic_roc_auc'].mean()],
             'roc_auc_mean': [df_full_results.loc[0]['roc_auc_mean']],
             'roc_auc_std': [df_full_results.loc[0]['roc_auc_std']],
             'parameters': 'None'})
        
        
        df_final = pd.concat([df_final,df_row], axis=0, ignore_index=True)
    return df_final

In [51]:
def train_models(df_results, dataframes_dict, eval_methods, models_list):

    final_result = []

    for key,value in dataframes_dict.items():
        features_train = value.drop('loan_status', axis=1)
        target_train = value['loan_status']
        print(f'training data: {key}')
        for method in eval_methods:
            print(f'training method: {method}')
            result = basic_models(df_results,key, models_list, method, features_train, target_train)
            final_result.append(result)
            print('done method')
        print('done data')

    final_data_of_trained_models = pd.concat(final_result, axis=0)
    return final_data_of_trained_models

In [52]:
#df_models_trained = train_models(df_results,dataframes_dict,eval_methods,models_list)

#NOT ENOUGH COMPUTING POWER

# Train selected models

In [53]:
final_train_features = df_train_final.drop(['loan_status'], axis=1)
final_train_target = df_train_final['loan_status']

cat_final_train_features = cat_df_train_final.drop(['loan_status'], axis=1)
cat_final_train_target = cat_df_train_final['loan_status']

In [54]:
xgb_class.fit(final_train_features,final_train_target)
predictions_xgb_class = xgb_class.predict_proba(df_test_final)[:, 1]
predictions_xgb_class

array([0.9986112 , 0.03655212, 0.45694333, ..., 0.0125963 , 0.42500862,
       0.93934363], dtype=float32)

In [55]:
lgbm_class.fit(final_train_features,final_train_target)
predictions_lgbm_class = lgbm_class.predict_proba(df_test_final)[:, 1]
predictions_lgbm_class

array([0.99351893, 0.03121272, 0.42811739, ..., 0.01271658, 0.38909161,
       0.96507323])

In [56]:
cat_class.fit(cat_final_train_features,cat_final_train_target, cat_features=list(cat_final_train_features.select_dtypes('category')))
predictions_cat_class = cat_class.predict_proba(cat_df_test_final)[:, 1]
predictions_cat_class

array([0.99966393, 0.0152749 , 0.38255706, ..., 0.00542058, 0.20198333,
       0.96933459])

In [57]:
my_models = pd.DataFrame({'xgb_class':predictions_xgb_class,'lgbm_class':predictions_lgbm_class,'cat_class':predictions_cat_class})
my_models

Unnamed: 0,xgb_class,lgbm_class,cat_class
0,0.998611,0.993519,0.999664
1,0.036552,0.031213,0.015275
2,0.456943,0.428117,0.382557
3,0.008553,0.006953,0.005536
4,0.151990,0.148465,0.020876
...,...,...,...
39093,0.089830,0.112511,0.076184
39094,0.008733,0.009856,0.002900
39095,0.012596,0.012717,0.005421
39096,0.425009,0.389092,0.201983


# Tune selected models with optuna

***lgbm***

In [58]:
def objective_lgbm_class(trial):
    n_estimators = trial.suggest_int('n_estimators',100,1000)
    learning_rate = trial.suggest_uniform('learning_rate',0.01,0.1)
    max_depth = trial.suggest_int('max_depth',3,15)
    num_leaves = trial.suggest_int('num_leaves',31,255)
    min_child_samples = trial.suggest_int('min_child_samples',10,100)
    subsample  = trial.suggest_uniform('subsample',0.5,1.0)
    colsample_bytree  = trial.suggest_uniform('colsample_bytree',0.5,1.0)
    lambda_l1   = trial.suggest_uniform('lambda_l1',0.0,10.0)
    lambda_l2  = trial.suggest_uniform('lambda_l2',0.0,10.0)
    min_split_gain   = trial.suggest_uniform('min_split_gain',0.0,1.0)
    boosting_type  = trial.suggest_categorical('boosting_type', ["gbdt", "dart", "goss"])
    early_stopping_rounds  = trial.suggest_int('early_stopping_rounds',10,100)

    model = LGBMClassifier(n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              max_depth=max_depth,
                              num_leaves=num_leaves,
                              min_child_samples=min_child_samples,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              lambda_l1=lambda_l1,
                              lambda_l2=lambda_l2,
                              min_split_gain=min_split_gain,
                              boosting_type=boosting_type,
                              early_stopping_rounds=early_stopping_rounds,
                              max_bin= 5000,
                              verbosity=-1)

    X_train, X_val, y_train, y_val = train_test_split(final_train_features, final_train_target, test_size=0.4, random_state=42)

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    y_pred = model.predict_proba(X_val)[:,1]
    score = roc_auc_score(y_val, y_pred)
    
    return score

In [59]:
study_lgbm_class = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=12))

In [60]:
study_lgbm_class.optimize(objective_lgbm_class, n_trials=10)

In [61]:
params_lgbm_class = study_lgbm_class.best_params
params_lgbm_class

{'n_estimators': 238,
 'learning_rate': 0.07660447268638643,
 'max_depth': 6,
 'num_leaves': 151,
 'min_child_samples': 11,
 'subsample': 0.9593735040499425,
 'colsample_bytree': 0.9503574270585061,
 'lambda_l1': 0.3342142762634459,
 'lambda_l2': 9.569493362751167,
 'min_split_gain': 0.13720932135607644,
 'boosting_type': 'goss',
 'early_stopping_rounds': 87}

In [62]:
# {'n_estimators': 259,
#  'learning_rate': 0.09663372127250654,
#  'max_depth': 8,
#  'num_leaves': 126,
#  'min_child_samples': 52,
#  'subsample': 0.6868615744119577,
#  'colsample_bytree': 0.7327540500862555,
#  'lambda_l1': 0.3516826147080476,
#  'lambda_l2': 0.8427266973184566,
#  'min_split_gain': 0.7325206981419501,
#  'boosting_type': 'gbdt',
#  'early_stopping_rounds': 30}

***xgboost***

In [63]:
def objective_xgb_class(trial):
    max_depth = trial.suggest_int('max_depth',3,10)
    min_child_weight = trial.suggest_int('min_child_weight',1,7)
    gamma = trial.suggest_uniform('gamma',0.0,1.0)
    subsample = trial.suggest_uniform('subsample',0.5,1.0)
    reg_alpha = trial.suggest_uniform('reg_alpha',0.0,1.0)
    reg_lambda = trial.suggest_uniform('reg_lambda',1,5)
    learning_rate = trial.suggest_uniform('learning_rate',0.01,0.3)
    colsample_bytree = trial.suggest_uniform('colsample_bytree',0.3,1.0)
    colsample_bylevel = trial.suggest_uniform('colsample_bylevel',0.5,1.0)
    colsample_bynode = trial.suggest_uniform('colsample_bynode',0.5,1.0)
    n_estimators   = trial.suggest_int('n_estimators',100,1000)
    verbosity = trial.suggest_int('verbosity',0,1)
    tree_method  = trial.suggest_categorical('tree_method', ['auto', 'exact', 'approx', 'hist'])
    early_stopping_rounds  = trial.suggest_int('early_stopping_rounds',10,100)

    model = XGBClassifier(max_depth=max_depth,
                              min_child_weight=min_child_weight,
                              gamma=gamma,
                              subsample=subsample,
                              reg_alpha=reg_alpha,
                              reg_lambda=reg_lambda,
                              learning_rate=learning_rate,
                              colsample_bytree=colsample_bytree,
                              colsample_bylevel=colsample_bylevel,
                              colsample_bynode=colsample_bynode,
                              n_estimators=n_estimators,
                              tree_method=tree_method,
                              early_stopping_rounds=early_stopping_rounds,
                              #max_bin= 5000,
                              verbosity=verbosity)

    X_train, X_val, y_train, y_val = train_test_split(final_train_features, final_train_target, test_size=0.4, random_state=42)

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    y_pred = model.predict_proba(X_val)[:,1]
    score = roc_auc_score(y_val, y_pred)
    
    return score

In [64]:
study_xgb_class = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=12))

In [65]:
study_xgb_class.optimize(objective_xgb_class, n_trials=10)

[0]	validation_0-logloss:0.40168
[1]	validation_0-logloss:0.34802
[2]	validation_0-logloss:0.32542
[3]	validation_0-logloss:0.31320
[4]	validation_0-logloss:0.30107
[5]	validation_0-logloss:0.27531
[6]	validation_0-logloss:0.26759
[7]	validation_0-logloss:0.26329
[8]	validation_0-logloss:0.25547
[9]	validation_0-logloss:0.25388
[10]	validation_0-logloss:0.24900
[11]	validation_0-logloss:0.23955
[12]	validation_0-logloss:0.23801
[13]	validation_0-logloss:0.23563
[14]	validation_0-logloss:0.23360
[15]	validation_0-logloss:0.23119
[16]	validation_0-logloss:0.22953
[17]	validation_0-logloss:0.22677
[18]	validation_0-logloss:0.22565
[19]	validation_0-logloss:0.22420
[20]	validation_0-logloss:0.22092
[21]	validation_0-logloss:0.21938
[22]	validation_0-logloss:0.21551
[23]	validation_0-logloss:0.21351
[24]	validation_0-logloss:0.21264
[25]	validation_0-logloss:0.21166
[26]	validation_0-logloss:0.21020
[27]	validation_0-logloss:0.20865
[28]	validation_0-logloss:0.20751
[29]	validation_0-loglos

In [66]:
params_xgb_class = study_xgb_class.best_params
params_xgb_class

{'max_depth': 4,
 'min_child_weight': 6,
 'gamma': 0.26331501518513467,
 'subsample': 0.7668696966901489,
 'reg_alpha': 0.014574962485419674,
 'reg_lambda': 4.67498803239954,
 'learning_rate': 0.27120730769393353,
 'colsample_bytree': 0.3233949993384412,
 'colsample_bylevel': 0.9784746681375585,
 'colsample_bynode': 0.5686046606780382,
 'n_estimators': 355,
 'verbosity': 1,
 'tree_method': 'auto',
 'early_stopping_rounds': 60}

In [67]:
# {'max_depth': 4,
#  'min_child_weight': 2,
#  'gamma': 0.4025891478959648,
#  'subsample': 0.8934222326238623,
#  'reg_alpha': 0.2538081939276894,
#  'reg_lambda': 1.4379416853260754,
#  'learning_rate': 0.27989604417074704,
#  'colsample_bytree': 0.3884710986422182,
#  'colsample_bylevel': 0.8025026951817817,
#  'colsample_bynode': 0.6450609682939136,
#  'n_estimators': 664,
#  'verbosity': 1,
#  'tree_method': 'exact',
#  'early_stopping_rounds': 10}

***catboost***

In [68]:
def objective_cat_class(trial):
    depth  = trial.suggest_int('depth',4,10)
    iterations  = trial.suggest_int('iterations',500,2000)
    learning_rate  = trial.suggest_uniform('learning_rate',0.01,0.2)
    l2_leaf_reg  = trial.suggest_int('l2_leaf_reg',3,10)
    random_strength = trial.suggest_int('random_strength',1,5)
    border_count = trial.suggest_int('border_count',32,255)
    leaf_estimation_iterations = trial.suggest_int('leaf_estimation_iterations',1,10)
   

    
    bootstrap_type  = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
    early_stopping_rounds  = trial.suggest_int('early_stopping_rounds',10,100)

    model = CatBoostClassifier(depth=depth,
                              iterations=iterations,
                              learning_rate=learning_rate,
                              l2_leaf_reg=l2_leaf_reg,
                              random_strength=random_strength,
                              border_count=border_count,
                              leaf_estimation_iterations=leaf_estimation_iterations,
                              bootstrap_type=bootstrap_type,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose=0,
                              cat_features=list(cat_final_train_features.select_dtypes('category')))

    X_train, X_val, y_train, y_val = train_test_split(cat_final_train_features, cat_final_train_target, test_size=0.4, random_state=12)

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    y_pred = model.predict_proba(X_val)[:,1]
    score = roc_auc_score(y_val, y_pred)
    
    return score

In [69]:
study_cat_class = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=12))

In [70]:
study_cat_class.optimize(objective_cat_class, n_trials=10)

In [71]:
params_cat_class = study_cat_class.best_params
params_cat_class

{'depth': 5,
 'iterations': 1610,
 'learning_rate': 0.06002985288517559,
 'l2_leaf_reg': 7,
 'random_strength': 1,
 'border_count': 237,
 'leaf_estimation_iterations': 10,
 'bootstrap_type': 'Bernoulli',
 'early_stopping_rounds': 35}

In [72]:
# {'depth': 5,
#  'iterations': 1610,
#  'learning_rate': 0.06002985288517559,
#  'l2_leaf_reg': 7,
#  'random_strength': 1,
#  'border_count': 237,
#  'leaf_estimation_iterations': 10,
#  'bootstrap_type': 'Bernoulli',
#  'early_stopping_rounds': 35}

In [73]:
params_lgbm_class = {'n_estimators': 259,
 'learning_rate': 0.09663372127250654,
 'max_depth': 8,
 'num_leaves': 126,
 'min_child_samples': 52,
 'subsample': 0.6868615744119577,
 'colsample_bytree': 0.7327540500862555,
 'lambda_l1': 0.3516826147080476,
 'lambda_l2': 0.8427266973184566,
 'min_split_gain': 0.7325206981419501,
 'boosting_type': 'gbdt'}

params_xgb_class = {'max_depth': 4,
 'min_child_weight': 2,
 'gamma': 0.4025891478959648,
 'subsample': 0.8934222326238623,
 'reg_alpha': 0.2538081939276894,
 'reg_lambda': 1.4379416853260754,
 'learning_rate': 0.27989604417074704,
 'colsample_bytree': 0.3884710986422182,
 'colsample_bylevel': 0.8025026951817817,
 'colsample_bynode': 0.6450609682939136,
 'n_estimators': 664,
 'verbosity': 1,
 'tree_method': 'exact'}

params_cat_class = {'depth': 5,
 'iterations': 1610,
 'learning_rate': 0.06002985288517559,
 'l2_leaf_reg': 7,
 'random_strength': 1,
 'border_count': 237,
 'leaf_estimation_iterations': 10,
 'bootstrap_type': 'Bernoulli'}

# Train models with params

In [74]:
lgbm_class_model_params = LGBMClassifier(**params_lgbm_class)
xgboost_model_params = XGBClassifier(**params_xgb_class)
catboost_model_params = CatBoostClassifier(**params_cat_class,cat_features=list(cat_final_train_features.select_dtypes('category')))

***lgbm***

In [75]:
lgbm_class_model_params.fit(final_train_features,final_train_target)

[LightGBM] [Info] Number of positive: 15458, number of negative: 75762
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004834 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2162
[LightGBM] [Info] Number of data points in the train set: 91220, number of used features: 618
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169458 -> initscore=-1.589470
[LightGBM] [Info] Start training from score -1.589470


In [76]:
predictions_lgbm_class_params = lgbm_class_model_params.predict_proba(df_test_final)[:, 1]
predictions_lgbm_class_params



array([0.99543733, 0.02220545, 0.43566846, ..., 0.01446737, 0.39537547,
       0.92813912])

***xgboost***

In [77]:
xgboost_model_params.fit(final_train_features,final_train_target)

In [78]:
predictions_xgb_class_params = xgboost_model_params.predict_proba(df_test_final)[:, 1]
predictions_xgb_class_params

array([0.99957937, 0.019469  , 0.51344955, ..., 0.01065552, 0.311786  ,
       0.95761555], dtype=float32)

***catboost***

In [79]:
catboost_model_params.fit(cat_final_train_features,cat_final_train_target)

0:	learn: 0.6227284	total: 125ms	remaining: 3m 20s
1:	learn: 0.5630826	total: 219ms	remaining: 2m 55s
2:	learn: 0.5127332	total: 299ms	remaining: 2m 40s
3:	learn: 0.4704488	total: 384ms	remaining: 2m 34s
4:	learn: 0.4342798	total: 442ms	remaining: 2m 21s
5:	learn: 0.4031842	total: 539ms	remaining: 2m 24s
6:	learn: 0.3770664	total: 609ms	remaining: 2m 19s
7:	learn: 0.3554534	total: 688ms	remaining: 2m 17s
8:	learn: 0.3364519	total: 772ms	remaining: 2m 17s
9:	learn: 0.3210229	total: 881ms	remaining: 2m 20s
10:	learn: 0.3072631	total: 1.02s	remaining: 2m 27s
11:	learn: 0.2948518	total: 1.12s	remaining: 2m 29s
12:	learn: 0.2839665	total: 1.22s	remaining: 2m 30s
13:	learn: 0.2710974	total: 1.32s	remaining: 2m 30s
14:	learn: 0.2592439	total: 1.42s	remaining: 2m 31s
15:	learn: 0.2495575	total: 1.5s	remaining: 2m 29s
16:	learn: 0.2429824	total: 1.6s	remaining: 2m 30s
17:	learn: 0.2365178	total: 1.71s	remaining: 2m 31s
18:	learn: 0.2293663	total: 1.78s	remaining: 2m 29s
19:	learn: 0.2247036	tot

<catboost.core.CatBoostClassifier at 0x7d020b02d300>

In [80]:
predictions_catboost_class_params = catboost_model_params.predict_proba(cat_df_test_final)[:, 1]
predictions_catboost_class_params

array([0.9999007 , 0.01717652, 0.37767127, ..., 0.00435729, 0.2064066 ,
       0.97182201])

***results***

In [81]:
my_models_params = pd.DataFrame({'xgb_class_params':predictions_xgb_class_params,
                                 'lgbm_class_params':predictions_lgbm_class_params,
                                 'cat_class_params':predictions_catboost_class_params})


In [82]:
partly_trained_models = pd.concat([my_models, my_models_params],axis=1)
partly_trained_models

Unnamed: 0,xgb_class,lgbm_class,cat_class,xgb_class_params,lgbm_class_params,cat_class_params
0,0.998611,0.993519,0.999664,0.999579,0.995437,0.999901
1,0.036552,0.031213,0.015275,0.019469,0.022205,0.017177
2,0.456943,0.428117,0.382557,0.513450,0.435668,0.377671
3,0.008553,0.006953,0.005536,0.007010,0.010026,0.006227
4,0.151990,0.148465,0.020876,0.024929,0.165164,0.024564
...,...,...,...,...,...,...
39093,0.089830,0.112511,0.076184,0.024759,0.087275,0.067064
39094,0.008733,0.009856,0.002900,0.003413,0.008231,0.003590
39095,0.012596,0.012717,0.005421,0.010656,0.014467,0.004357
39096,0.425009,0.389092,0.201983,0.311786,0.395375,0.206407


# catboost each model with baseline

In [83]:
catboost_with_baseline = CatBoostClassifier(cat_features=list(cat_final_train_features.select_dtypes('category')))

***lgbm***

In [84]:
baseline_predictions_lgbm = lgbm_class.predict_proba(final_train_features)[:, 1]
catboost_with_baseline_lgbm = catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=baseline_predictions_lgbm)

Learning rate set to 0.070777
0:	learn: 0.5868056	total: 125ms	remaining: 2m 4s
1:	learn: 0.5210682	total: 214ms	remaining: 1m 46s
2:	learn: 0.4664002	total: 317ms	remaining: 1m 45s
3:	learn: 0.4199010	total: 415ms	remaining: 1m 43s
4:	learn: 0.3826679	total: 522ms	remaining: 1m 43s
5:	learn: 0.3516500	total: 608ms	remaining: 1m 40s
6:	learn: 0.3269262	total: 693ms	remaining: 1m 38s
7:	learn: 0.3070780	total: 783ms	remaining: 1m 37s
8:	learn: 0.2894697	total: 886ms	remaining: 1m 37s
9:	learn: 0.2754609	total: 975ms	remaining: 1m 36s
10:	learn: 0.2637295	total: 1.05s	remaining: 1m 34s
11:	learn: 0.2482373	total: 1.15s	remaining: 1m 34s
12:	learn: 0.2343566	total: 1.22s	remaining: 1m 32s
13:	learn: 0.2253236	total: 1.3s	remaining: 1m 31s
14:	learn: 0.2180090	total: 1.39s	remaining: 1m 31s
15:	learn: 0.2106839	total: 1.49s	remaining: 1m 31s
16:	learn: 0.2036062	total: 1.57s	remaining: 1m 31s
17:	learn: 0.1973473	total: 1.65s	remaining: 1m 30s
18:	learn: 0.1935836	total: 1.72s	remaining: 1

In [85]:
catboost_with_baseline_lgbm_pred = catboost_with_baseline_lgbm.predict_proba(cat_df_test_final)[:,1]

In [86]:
baseline_predictions_lgbm_params = lgbm_class_model_params.predict_proba(final_train_features)[:, 1]
catboost_with_baseline_lgbm_params=catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=baseline_predictions_lgbm_params)

Learning rate set to 0.070777
0:	learn: 0.5863429	total: 128ms	remaining: 2m 7s
1:	learn: 0.5205402	total: 218ms	remaining: 1m 48s
2:	learn: 0.4678439	total: 328ms	remaining: 1m 49s
3:	learn: 0.4201298	total: 452ms	remaining: 1m 52s
4:	learn: 0.3820475	total: 553ms	remaining: 1m 50s
5:	learn: 0.3527767	total: 647ms	remaining: 1m 47s
6:	learn: 0.3272251	total: 755ms	remaining: 1m 47s
7:	learn: 0.3075453	total: 851ms	remaining: 1m 45s
8:	learn: 0.2892200	total: 962ms	remaining: 1m 45s
9:	learn: 0.2754186	total: 1.05s	remaining: 1m 44s
10:	learn: 0.2625942	total: 1.17s	remaining: 1m 45s
11:	learn: 0.2523953	total: 1.26s	remaining: 1m 43s
12:	learn: 0.2399087	total: 1.35s	remaining: 1m 42s
13:	learn: 0.2283408	total: 1.46s	remaining: 1m 42s
14:	learn: 0.2184714	total: 1.55s	remaining: 1m 41s
15:	learn: 0.2115745	total: 1.66s	remaining: 1m 42s
16:	learn: 0.2044701	total: 1.75s	remaining: 1m 41s
17:	learn: 0.1993081	total: 1.85s	remaining: 1m 40s
18:	learn: 0.1950287	total: 1.93s	remaining: 

In [87]:
catboost_with_baseline_lgbm_params_pred = catboost_with_baseline_lgbm_params.predict_proba(cat_df_test_final)[:,1]

***xgboost***

In [88]:
baseline_predictions_xgb = xgb_class.predict_proba(final_train_features)[:, 1]
catboost_with_baseline_xgb = catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=baseline_predictions_xgb)

Learning rate set to 0.070777
0:	learn: 0.5853931	total: 119ms	remaining: 1m 58s
1:	learn: 0.5196463	total: 204ms	remaining: 1m 41s
2:	learn: 0.4669905	total: 308ms	remaining: 1m 42s
3:	learn: 0.4193153	total: 417ms	remaining: 1m 43s
4:	learn: 0.3812667	total: 512ms	remaining: 1m 41s
5:	learn: 0.3520168	total: 601ms	remaining: 1m 39s
6:	learn: 0.3264824	total: 711ms	remaining: 1m 40s
7:	learn: 0.3060393	total: 807ms	remaining: 1m 40s
8:	learn: 0.2898957	total: 888ms	remaining: 1m 37s
9:	learn: 0.2756990	total: 998ms	remaining: 1m 38s
10:	learn: 0.2627415	total: 1.1s	remaining: 1m 39s
11:	learn: 0.2527135	total: 1.19s	remaining: 1m 37s
12:	learn: 0.2394456	total: 1.27s	remaining: 1m 36s
13:	learn: 0.2290725	total: 1.37s	remaining: 1m 36s
14:	learn: 0.2203220	total: 1.45s	remaining: 1m 35s
15:	learn: 0.2131480	total: 1.53s	remaining: 1m 33s
16:	learn: 0.2068530	total: 1.61s	remaining: 1m 32s
17:	learn: 0.2010585	total: 1.69s	remaining: 1m 32s
18:	learn: 0.1963025	total: 1.79s	remaining: 

In [89]:
catboost_with_baseline_xgb_pred = catboost_with_baseline_xgb.predict_proba(cat_df_test_final)[:,1]

In [90]:
baseline_predictions_xgb_params = xgboost_model_params.predict_proba(final_train_features)[:, 1]
catboost_with_baseline_xgb_params=catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=baseline_predictions_xgb_params)

Learning rate set to 0.070777
0:	learn: 0.5842244	total: 128ms	remaining: 2m 8s
1:	learn: 0.5183095	total: 219ms	remaining: 1m 49s
2:	learn: 0.4635050	total: 331ms	remaining: 1m 49s
3:	learn: 0.4168973	total: 442ms	remaining: 1m 50s
4:	learn: 0.3795617	total: 564ms	remaining: 1m 52s
5:	learn: 0.3484538	total: 654ms	remaining: 1m 48s
6:	learn: 0.3236613	total: 739ms	remaining: 1m 44s
7:	learn: 0.3038224	total: 829ms	remaining: 1m 42s
8:	learn: 0.2862384	total: 938ms	remaining: 1m 43s
9:	learn: 0.2722547	total: 1.03s	remaining: 1m 41s
10:	learn: 0.2604493	total: 1.1s	remaining: 1m 39s
11:	learn: 0.2504349	total: 1.2s	remaining: 1m 38s
12:	learn: 0.2398593	total: 1.3s	remaining: 1m 38s
13:	learn: 0.2294044	total: 1.4s	remaining: 1m 38s
14:	learn: 0.2191289	total: 1.47s	remaining: 1m 36s
15:	learn: 0.2114675	total: 1.56s	remaining: 1m 36s
16:	learn: 0.2044098	total: 1.67s	remaining: 1m 36s
17:	learn: 0.1990265	total: 1.74s	remaining: 1m 34s
18:	learn: 0.1951259	total: 1.84s	remaining: 1m 3

In [91]:
catboost_with_baseline_xgb_params_pred = catboost_with_baseline_xgb_params.predict_proba(cat_df_test_final)[:,1]

***catboost***

In [92]:
baseline_predictions_cat = cat_class.predict_proba(cat_final_train_features)[:, 1]
catboost_with_baseline_cat = catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=baseline_predictions_cat)

Learning rate set to 0.070777
0:	learn: 0.5670876	total: 124ms	remaining: 2m 3s
1:	learn: 0.5010865	total: 213ms	remaining: 1m 46s
2:	learn: 0.4482089	total: 324ms	remaining: 1m 47s
3:	learn: 0.4026781	total: 442ms	remaining: 1m 50s
4:	learn: 0.3656149	total: 539ms	remaining: 1m 47s
5:	learn: 0.3354040	total: 633ms	remaining: 1m 44s
6:	learn: 0.3110916	total: 752ms	remaining: 1m 46s
7:	learn: 0.2884164	total: 853ms	remaining: 1m 45s
8:	learn: 0.2706223	total: 956ms	remaining: 1m 45s
9:	learn: 0.2568703	total: 1.06s	remaining: 1m 45s
10:	learn: 0.2438232	total: 1.16s	remaining: 1m 44s
11:	learn: 0.2337293	total: 1.27s	remaining: 1m 44s
12:	learn: 0.2188176	total: 1.36s	remaining: 1m 43s
13:	learn: 0.2095318	total: 1.46s	remaining: 1m 42s
14:	learn: 0.2016891	total: 1.56s	remaining: 1m 42s
15:	learn: 0.1937544	total: 1.66s	remaining: 1m 42s
16:	learn: 0.1870764	total: 1.76s	remaining: 1m 41s
17:	learn: 0.1809568	total: 1.83s	remaining: 1m 39s
18:	learn: 0.1765323	total: 1.93s	remaining: 

In [93]:
catboost_with_baseline_cat_pred = catboost_with_baseline_cat.predict_proba(cat_df_test_final)[:,1]

In [94]:
baseline_predictions_cat_params = catboost_model_params.predict_proba(cat_final_train_features)[:, 1]
catboost_with_baseline_cat_params=catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=baseline_predictions_cat_params)

Learning rate set to 0.070777
0:	learn: 0.5684809	total: 116ms	remaining: 1m 55s
1:	learn: 0.5026614	total: 200ms	remaining: 1m 39s
2:	learn: 0.4499284	total: 303ms	remaining: 1m 40s
3:	learn: 0.4045233	total: 410ms	remaining: 1m 42s
4:	learn: 0.3675664	total: 499ms	remaining: 1m 39s
5:	learn: 0.3374395	total: 587ms	remaining: 1m 37s
6:	learn: 0.3131932	total: 700ms	remaining: 1m 39s
7:	learn: 0.2905742	total: 792ms	remaining: 1m 38s
8:	learn: 0.2728256	total: 894ms	remaining: 1m 38s
9:	learn: 0.2591111	total: 1s	remaining: 1m 39s
10:	learn: 0.2460927	total: 1.1s	remaining: 1m 38s
11:	learn: 0.2360236	total: 1.21s	remaining: 1m 39s
12:	learn: 0.2210859	total: 1.29s	remaining: 1m 38s
13:	learn: 0.2117832	total: 1.37s	remaining: 1m 36s
14:	learn: 0.2039301	total: 1.45s	remaining: 1m 35s
15:	learn: 0.1955069	total: 1.55s	remaining: 1m 35s
16:	learn: 0.1894280	total: 1.64s	remaining: 1m 35s
17:	learn: 0.1843225	total: 1.74s	remaining: 1m 34s
18:	learn: 0.1796060	total: 1.81s	remaining: 1m 

In [95]:
catboost_with_baseline_cat_params_pred = catboost_with_baseline_cat_params.predict_proba(cat_df_test_final)[:,1]

***results***

In [96]:
catboosted_models = pd.DataFrame({'cat_xgb_class':catboost_with_baseline_xgb_pred,
                                  'cat_lgbm_class':catboost_with_baseline_lgbm_pred,
                                  'cat_cat_class':catboost_with_baseline_cat_pred,
                                  'cat_xgb_class_params':catboost_with_baseline_xgb_params_pred,
                                  'cat_lgbm_class_params':catboost_with_baseline_lgbm_params_pred,
                                  'cat_cat_class_params':catboost_with_baseline_cat_params_pred})

In [97]:
boosted_models = pd.concat([partly_trained_models, catboosted_models],axis=1)
boosted_models

Unnamed: 0,xgb_class,lgbm_class,cat_class,xgb_class_params,lgbm_class_params,cat_class_params,cat_xgb_class,cat_lgbm_class,cat_cat_class,cat_xgb_class_params,cat_lgbm_class_params,cat_cat_class_params
0,0.998611,0.993519,0.999664,0.999579,0.995437,0.999901,0.998844,0.998449,0.999556,0.999309,0.999496,0.999264
1,0.036552,0.031213,0.015275,0.019469,0.022205,0.017177,0.014703,0.013141,0.013991,0.015739,0.015061,0.013776
2,0.456943,0.428117,0.382557,0.513450,0.435668,0.377671,0.297554,0.287620,0.262277,0.341478,0.319253,0.302078
3,0.008553,0.006953,0.005536,0.007010,0.010026,0.006227,0.005675,0.005907,0.006016,0.006035,0.006510,0.004214
4,0.151990,0.148465,0.020876,0.024929,0.165164,0.024564,0.017807,0.026222,0.024408,0.025311,0.025898,0.023118
...,...,...,...,...,...,...,...,...,...,...,...,...
39093,0.089830,0.112511,0.076184,0.024759,0.087275,0.067064,0.067318,0.059949,0.088703,0.073463,0.082457,0.064209
39094,0.008733,0.009856,0.002900,0.003413,0.008231,0.003590,0.003841,0.004047,0.003856,0.001962,0.003773,0.002586
39095,0.012596,0.012717,0.005421,0.010656,0.014467,0.004357,0.003937,0.005201,0.006378,0.005134,0.004306,0.005787
39096,0.425009,0.389092,0.201983,0.311786,0.395375,0.206407,0.178890,0.171125,0.192882,0.182411,0.163478,0.158552


# catboost means of models as baseline

***lgbm***

In [98]:
l1 = lgbm_class.predict_proba(final_train_features)[:, 1]
l2 = lgbm_class_model_params.predict_proba(final_train_features)[:, 1]
l3 = catboost_with_baseline_lgbm.predict_proba(cat_final_train_features)[:, 1]
l4 = catboost_with_baseline_lgbm_params.predict_proba(cat_final_train_features)[:, 1]



In [99]:
l_df = pd.DataFrame({'l1': l1, 'l2': l2, 'l3': l3, 'l4': l4})
l_average = l_df.mean(axis=1)
l_average

0        0.026585
1        0.004255
2        0.005034
3        0.004367
4        0.004754
           ...   
91215    0.037857
91216    0.002736
91217    0.986947
91218    0.005842
91219    0.018714
Length: 91220, dtype: float64

***xgboost***

In [100]:
x1 = xgb_class.predict_proba(final_train_features)[:, 1]
x2 = xgboost_model_params.predict_proba(final_train_features)[:, 1]
x3 = catboost_with_baseline_xgb.predict_proba(cat_final_train_features)[:, 1]
x4 = catboost_with_baseline_xgb_params.predict_proba(cat_final_train_features)[:, 1]

In [101]:
x_df = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'l4': x4})
x_average = x_df.mean(axis=1)
x_average

0        0.024080
1        0.004091
2        0.004895
3        0.003591
4        0.003575
           ...   
91215    0.037003
91216    0.002320
91217    0.967836
91218    0.004406
91219    0.018313
Length: 91220, dtype: float64

***catboost***

In [102]:
c1 = cat_class.predict_proba(cat_final_train_features)[:, 1]
c2 = catboost_model_params.predict_proba(cat_final_train_features)[:, 1]
c3 = catboost_with_baseline_cat.predict_proba(cat_final_train_features)[:, 1]
c4 = catboost_with_baseline_cat_params.predict_proba(cat_final_train_features)[:, 1]

In [103]:
c_df = pd.DataFrame({'c1': c1, 'c2': c2, 'c3': c3, 'c4': l4})
c_average = c_df.mean(axis=1)
c_average

0        0.017142
1        0.003107
2        0.003297
3        0.002936
4        0.001886
           ...   
91215    0.031924
91216    0.000989
91217    0.998379
91218    0.003146
91219    0.011830
Length: 91220, dtype: float64

# Train on average (catboost no params)

***lgbm***

In [104]:
catboost_with_baseline_lgbm_mean = catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=l_average)
lgbm_mean_pred = catboost_with_baseline_lgbm_mean.predict_proba(cat_df_test_final)[:,1]

Learning rate set to 0.070777
0:	learn: 0.5767442	total: 128ms	remaining: 2m 8s
1:	learn: 0.5111028	total: 221ms	remaining: 1m 50s
2:	learn: 0.4565239	total: 334ms	remaining: 1m 50s
3:	learn: 0.4126988	total: 448ms	remaining: 1m 51s
4:	learn: 0.3744274	total: 551ms	remaining: 1m 49s
5:	learn: 0.3446997	total: 644ms	remaining: 1m 46s
6:	learn: 0.3207822	total: 768ms	remaining: 1m 48s
7:	learn: 0.2994337	total: 871ms	remaining: 1m 48s
8:	learn: 0.2816686	total: 984ms	remaining: 1m 48s
9:	learn: 0.2679244	total: 1.1s	remaining: 1m 49s
10:	learn: 0.2549010	total: 1.21s	remaining: 1m 48s
11:	learn: 0.2448835	total: 1.31s	remaining: 1m 48s
12:	learn: 0.2300651	total: 1.4s	remaining: 1m 46s
13:	learn: 0.2188427	total: 1.48s	remaining: 1m 44s
14:	learn: 0.2107913	total: 1.57s	remaining: 1m 43s
15:	learn: 0.2033803	total: 1.65s	remaining: 1m 41s
16:	learn: 0.1972448	total: 1.74s	remaining: 1m 40s
17:	learn: 0.1911338	total: 1.84s	remaining: 1m 40s
18:	learn: 0.1871629	total: 1.91s	remaining: 1m

***xgboost***

In [105]:
catboost_with_baseline_xgb_mean = catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=x_average)
xgb_mean_pred = catboost_with_baseline_xgb_mean.predict_proba(cat_df_test_final)[:,1]

Learning rate set to 0.070777
0:	learn: 0.5758682	total: 129ms	remaining: 2m 8s
1:	learn: 0.5101962	total: 219ms	remaining: 1m 49s
2:	learn: 0.4555941	total: 330ms	remaining: 1m 49s
3:	learn: 0.4117497	total: 444ms	remaining: 1m 50s
4:	learn: 0.3734692	total: 545ms	remaining: 1m 48s
5:	learn: 0.3437244	total: 629ms	remaining: 1m 44s
6:	learn: 0.3197915	total: 741ms	remaining: 1m 45s
7:	learn: 0.2984348	total: 833ms	remaining: 1m 43s
8:	learn: 0.2805162	total: 937ms	remaining: 1m 43s
9:	learn: 0.2659323	total: 1.04s	remaining: 1m 42s
10:	learn: 0.2542361	total: 1.14s	remaining: 1m 42s
11:	learn: 0.2440546	total: 1.24s	remaining: 1m 42s
12:	learn: 0.2347922	total: 1.35s	remaining: 1m 42s
13:	learn: 0.2230630	total: 1.46s	remaining: 1m 42s
14:	learn: 0.2125923	total: 1.56s	remaining: 1m 42s
15:	learn: 0.2057756	total: 1.65s	remaining: 1m 41s
16:	learn: 0.1990191	total: 1.73s	remaining: 1m 40s
17:	learn: 0.1937478	total: 1.8s	remaining: 1m 38s
18:	learn: 0.1882307	total: 1.9s	remaining: 1m

***catboost***

In [106]:
catboost_with_baseline_cat_mean = catboost_with_baseline.fit(cat_final_train_features, cat_final_train_target, baseline=c_average)
cat_mean_pred = catboost_with_baseline_cat_mean.predict_proba(cat_df_test_final)[:,1]

Learning rate set to 0.070777
0:	learn: 0.5676171	total: 161ms	remaining: 2m 41s
1:	learn: 0.5018826	total: 283ms	remaining: 2m 21s
2:	learn: 0.4492293	total: 422ms	remaining: 2m 20s
3:	learn: 0.4038811	total: 586ms	remaining: 2m 25s
4:	learn: 0.3669666	total: 712ms	remaining: 2m 21s
5:	learn: 0.3368844	total: 830ms	remaining: 2m 17s
6:	learn: 0.3126786	total: 977ms	remaining: 2m 18s
7:	learn: 0.2900829	total: 1.1s	remaining: 2m 16s
8:	learn: 0.2723586	total: 1.23s	remaining: 2m 15s
9:	learn: 0.2586628	total: 1.38s	remaining: 2m 16s
10:	learn: 0.2456517	total: 1.5s	remaining: 2m 15s
11:	learn: 0.2355976	total: 1.64s	remaining: 2m 15s
12:	learn: 0.2206262	total: 1.75s	remaining: 2m 13s
13:	learn: 0.2113043	total: 1.85s	remaining: 2m 10s
14:	learn: 0.2034341	total: 1.96s	remaining: 2m 8s
15:	learn: 0.1949826	total: 2.1s	remaining: 2m 9s
16:	learn: 0.1888902	total: 2.22s	remaining: 2m 8s
17:	learn: 0.1837753	total: 2.32s	remaining: 2m 6s
18:	learn: 0.1790453	total: 2.39s	remaining: 2m 3s


In [107]:
mean_models = pd.DataFrame({'cat_xgb_mean':lgbm_mean_pred,
                                  'cat_lgbm_mean':xgb_mean_pred,
                                  'cat_cat_mean':cat_mean_pred})
mean_models

Unnamed: 0,cat_xgb_mean,cat_lgbm_mean,cat_cat_mean
0,0.999508,0.999450,0.998804
1,0.016488,0.011795,0.015110
2,0.348570,0.349828,0.310768
3,0.003961,0.005648,0.005578
4,0.023644,0.024880,0.024214
...,...,...,...
39093,0.071830,0.077528,0.060567
39094,0.003369,0.003210,0.003233
39095,0.004676,0.003536,0.005532
39096,0.182624,0.191434,0.180328


# Train on average (catboost with params)

***lgbm***

In [108]:
catboost_params_with_baseline_lgbm_mean = catboost_with_baseline_cat_params.fit(cat_final_train_features, cat_final_train_target, baseline=l_average)
params_lgbm_mean_pred = catboost_params_with_baseline_lgbm_mean.predict_proba(cat_df_test_final)[:,1]

Learning rate set to 0.070777
0:	learn: 0.5767442	total: 129ms	remaining: 2m 8s
1:	learn: 0.5111028	total: 221ms	remaining: 1m 50s
2:	learn: 0.4565239	total: 335ms	remaining: 1m 51s
3:	learn: 0.4126988	total: 451ms	remaining: 1m 52s
4:	learn: 0.3744274	total: 555ms	remaining: 1m 50s
5:	learn: 0.3446997	total: 644ms	remaining: 1m 46s
6:	learn: 0.3207822	total: 758ms	remaining: 1m 47s
7:	learn: 0.2994337	total: 851ms	remaining: 1m 45s
8:	learn: 0.2816686	total: 955ms	remaining: 1m 45s
9:	learn: 0.2679244	total: 1.07s	remaining: 1m 46s
10:	learn: 0.2549010	total: 1.17s	remaining: 1m 44s
11:	learn: 0.2448835	total: 1.31s	remaining: 1m 47s
12:	learn: 0.2300651	total: 1.41s	remaining: 1m 47s
13:	learn: 0.2188427	total: 1.49s	remaining: 1m 44s
14:	learn: 0.2107913	total: 1.58s	remaining: 1m 43s
15:	learn: 0.2033803	total: 1.66s	remaining: 1m 41s
16:	learn: 0.1972448	total: 1.75s	remaining: 1m 41s
17:	learn: 0.1911338	total: 1.85s	remaining: 1m 40s
18:	learn: 0.1871629	total: 1.92s	remaining: 

***xgboost***

In [109]:
catboost_params_with_baseline_xgb_mean = catboost_with_baseline_cat_params.fit(cat_final_train_features, cat_final_train_target, baseline=x_average)
params_xgb_mean_pred = catboost_params_with_baseline_xgb_mean.predict_proba(cat_df_test_final)[:,1]

Learning rate set to 0.070777
0:	learn: 0.5758682	total: 117ms	remaining: 1m 56s
1:	learn: 0.5101962	total: 210ms	remaining: 1m 44s
2:	learn: 0.4555941	total: 311ms	remaining: 1m 43s
3:	learn: 0.4117497	total: 465ms	remaining: 1m 55s
4:	learn: 0.3734692	total: 559ms	remaining: 1m 51s
5:	learn: 0.3437244	total: 640ms	remaining: 1m 46s
6:	learn: 0.3197915	total: 751ms	remaining: 1m 46s
7:	learn: 0.2984348	total: 841ms	remaining: 1m 44s
8:	learn: 0.2805162	total: 946ms	remaining: 1m 44s
9:	learn: 0.2659323	total: 1.05s	remaining: 1m 44s
10:	learn: 0.2542361	total: 1.16s	remaining: 1m 43s
11:	learn: 0.2440546	total: 1.26s	remaining: 1m 43s
12:	learn: 0.2347922	total: 1.36s	remaining: 1m 43s
13:	learn: 0.2230630	total: 1.47s	remaining: 1m 43s
14:	learn: 0.2125923	total: 1.57s	remaining: 1m 43s
15:	learn: 0.2057756	total: 1.66s	remaining: 1m 42s
16:	learn: 0.1990191	total: 1.75s	remaining: 1m 41s
17:	learn: 0.1937478	total: 1.82s	remaining: 1m 39s
18:	learn: 0.1882307	total: 1.92s	remaining:

***catboost***

In [110]:
catboost_params_with_baseline_cat_mean = catboost_with_baseline_cat_params.fit(cat_final_train_features, cat_final_train_target, baseline=c_average)
params_cat_mean_pred = catboost_params_with_baseline_cat_mean.predict_proba(cat_df_test_final)[:,1]

Learning rate set to 0.070777
0:	learn: 0.5676171	total: 129ms	remaining: 2m 8s
1:	learn: 0.5018826	total: 220ms	remaining: 1m 49s
2:	learn: 0.4492293	total: 331ms	remaining: 1m 50s
3:	learn: 0.4038811	total: 450ms	remaining: 1m 52s
4:	learn: 0.3669666	total: 550ms	remaining: 1m 49s
5:	learn: 0.3368844	total: 647ms	remaining: 1m 47s
6:	learn: 0.3126786	total: 768ms	remaining: 1m 48s
7:	learn: 0.2900829	total: 872ms	remaining: 1m 48s
8:	learn: 0.2723586	total: 980ms	remaining: 1m 47s
9:	learn: 0.2586628	total: 1.09s	remaining: 1m 47s
10:	learn: 0.2456517	total: 1.18s	remaining: 1m 46s
11:	learn: 0.2355976	total: 1.29s	remaining: 1m 46s
12:	learn: 0.2206262	total: 1.38s	remaining: 1m 44s
13:	learn: 0.2113043	total: 1.45s	remaining: 1m 42s
14:	learn: 0.2034341	total: 1.54s	remaining: 1m 40s
15:	learn: 0.1949826	total: 1.63s	remaining: 1m 40s
16:	learn: 0.1888902	total: 1.73s	remaining: 1m 39s
17:	learn: 0.1837753	total: 1.82s	remaining: 1m 39s
18:	learn: 0.1790453	total: 1.9s	remaining: 1

In [111]:
params_mean_models = pd.DataFrame({'cat_params_xgb_mean':params_lgbm_mean_pred,
                                  'cat_params_lgbm_mean':params_xgb_mean_pred,
                                  'cat_params_cat_mean':params_cat_mean_pred})
params_mean_models

Unnamed: 0,cat_params_xgb_mean,cat_params_lgbm_mean,cat_params_cat_mean
0,0.999508,0.999450,0.998804
1,0.016488,0.011795,0.015110
2,0.348570,0.349828,0.310768
3,0.003961,0.005648,0.005578
4,0.023644,0.024880,0.024214
...,...,...,...
39093,0.071830,0.077528,0.060567
39094,0.003369,0.003210,0.003233
39095,0.004676,0.003536,0.005532
39096,0.182624,0.191434,0.180328


# FINAL PREDICTIONS DATASET

In [112]:
final_predictions_df = pd.concat([boosted_models, mean_models, params_mean_models],axis=1)

In [113]:
final_predictions_df

Unnamed: 0,xgb_class,lgbm_class,cat_class,xgb_class_params,lgbm_class_params,cat_class_params,cat_xgb_class,cat_lgbm_class,cat_cat_class,cat_xgb_class_params,cat_lgbm_class_params,cat_cat_class_params,cat_xgb_mean,cat_lgbm_mean,cat_cat_mean,cat_params_xgb_mean,cat_params_lgbm_mean,cat_params_cat_mean
0,0.998611,0.993519,0.999664,0.999579,0.995437,0.999901,0.998844,0.998449,0.999556,0.999309,0.999496,0.999264,0.999508,0.999450,0.998804,0.999508,0.999450,0.998804
1,0.036552,0.031213,0.015275,0.019469,0.022205,0.017177,0.014703,0.013141,0.013991,0.015739,0.015061,0.013776,0.016488,0.011795,0.015110,0.016488,0.011795,0.015110
2,0.456943,0.428117,0.382557,0.513450,0.435668,0.377671,0.297554,0.287620,0.262277,0.341478,0.319253,0.302078,0.348570,0.349828,0.310768,0.348570,0.349828,0.310768
3,0.008553,0.006953,0.005536,0.007010,0.010026,0.006227,0.005675,0.005907,0.006016,0.006035,0.006510,0.004214,0.003961,0.005648,0.005578,0.003961,0.005648,0.005578
4,0.151990,0.148465,0.020876,0.024929,0.165164,0.024564,0.017807,0.026222,0.024408,0.025311,0.025898,0.023118,0.023644,0.024880,0.024214,0.023644,0.024880,0.024214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39093,0.089830,0.112511,0.076184,0.024759,0.087275,0.067064,0.067318,0.059949,0.088703,0.073463,0.082457,0.064209,0.071830,0.077528,0.060567,0.071830,0.077528,0.060567
39094,0.008733,0.009856,0.002900,0.003413,0.008231,0.003590,0.003841,0.004047,0.003856,0.001962,0.003773,0.002586,0.003369,0.003210,0.003233,0.003369,0.003210,0.003233
39095,0.012596,0.012717,0.005421,0.010656,0.014467,0.004357,0.003937,0.005201,0.006378,0.005134,0.004306,0.005787,0.004676,0.003536,0.005532,0.004676,0.003536,0.005532
39096,0.425009,0.389092,0.201983,0.311786,0.395375,0.206407,0.178890,0.171125,0.192882,0.182411,0.163478,0.158552,0.182624,0.191434,0.180328,0.182624,0.191434,0.180328


In [114]:
final_predictions_df.to_csv('./final_predictions_df.csv')
#df_models_trained = pd.read_csv('./df_models_trained.csv',index_col=0)

In [115]:
for column in final_predictions_df.columns:

    output = pd.DataFrame({'id': df_test.id,
                           'loan_status': final_predictions_df[column].astype('float')})
    output.to_csv(f'{final_predictions_df[column].name}_prediction.csv', index=False)