In [102]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
import gc
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier

In [103]:
data_train = pd.read_csv("application_train.csv")
data_test = pd.read_csv("application_test.csv")

In [104]:
print(data_train.shape, data_test.shape)

(291131, 122) (48744, 121)


In [105]:
data_train["source"] = "train"
data_test["source"] = "test"

# **Append train and test data**

In [106]:
append = pd.concat([data_train, data_test], axis=0, ignore_index=True)
append.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,source
0,100002,1.0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,train
1,100003,0.0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
2,100004,0.0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
3,100006,0.0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,,,,,,,train
4,100007,0.0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train


# **Find columns with a lot of missing data and remove some of them**

In [107]:
missing_cols_prcnt = data_train.isnull().sum()/data_train.shape[0] * 100
high_missing_values = missing_cols_prcnt[missing_cols_prcnt > 50]
high_missing_values_index = high_missing_values.index.tolist()
high_missing_values[:5]

OWN_CAR_AGE         66.006025
EXT_SOURCE_1        56.371530
APARTMENTS_AVG      50.754471
BASEMENTAREA_AVG    58.527948
YEARS_BUILD_AVG     66.490343
dtype: float64

In [108]:
correlations = data_train.corr()['TARGET'].sort_values()
corr_missing_cols = correlations.reindex(high_missing_values_index).sort_values()
missing_cols_to_be_dropped = corr_missing_cols.index.difference(corr_missing_cols[(corr_missing_cols > 0.02) | (corr_missing_cols < -0.02)].index).tolist()
len(missing_cols_to_be_dropped)

  correlations = data_train.corr()['TARGET'].sort_values()


18

In [109]:
missing_cols_to_be_dropped

['COMMONAREA_AVG',
 'COMMONAREA_MEDI',
 'COMMONAREA_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MEDI',
 'ENTRANCES_MODE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'LANDAREA_AVG',
 'LANDAREA_MEDI',
 'LANDAREA_MODE',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_AVG',
 'NONLIVINGAREA_MEDI',
 'NONLIVINGAREA_MODE',
 'WALLSMATERIAL_MODE']

In [110]:
app = append.drop(missing_cols_to_be_dropped, axis=1)
app.shape

(339875, 105)

# **Separate numerical and categorical data**

In [111]:
ctg_vars = [f for f in app.columns if f!='TARGET' and f!='source' and app[f].dtype=="object"]
ctg_vars

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'EMERGENCYSTATE_MODE']

In [112]:
app[ctg_vars].head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,EMERGENCYSTATE_MODE
0,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,WEDNESDAY,Business Entity Type 3,No
1,Cash loans,F,N,N,Family,State servant,Higher education,Married,House / apartment,Core staff,MONDAY,School,No
2,Revolving loans,M,Y,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,MONDAY,Government,
3,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,Laborers,WEDNESDAY,Business Entity Type 3,
4,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Core staff,THURSDAY,Religion,


In [113]:
numerical_vars = [f for f in app.columns if f!='TARGET' and f!='source' and app[f].dtype!="object"]
numerical_vars

['SK_ID_CURR',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'ELEVATORS_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'ELEVATORS_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_M

# **DAYS_EMPLOYED discrepancy**

In [114]:
app["DAYS_EMPLOYED_ANOMALY"] = 0
app["DAYS_EMPLOYED_ANOMALY"].loc[app[app["DAYS_EMPLOYED"] > 0]["DAYS_EMPLOYED"].index] = 1
app["DAYS_EMPLOYED"].loc[app[app["DAYS_EMPLOYED"] > 0]["DAYS_EMPLOYED"].index] = 0
app[["DAYS_EMPLOYED", "DAYS_EMPLOYED_ANOMALY"]].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app["DAYS_EMPLOYED_ANOMALY"].loc[app[app["DAYS_EMPLOYED"] > 0]["DAYS_EMPLOYED"].index] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app["DAYS_EMPLOYED"].loc[app[app["DAYS_EMPLOYED"] > 0]["DAYS_EMPLOYED"].index] = 0


Unnamed: 0,DAYS_EMPLOYED,DAYS_EMPLOYED_ANOMALY
0,-637,0
1,-1188,0
2,-225,0
3,-3039,0
4,-3038,0


# **XNA represents null in categiorical cols**

In [115]:
xna_cols = []
for c in ctg_vars:
    if app[c][app[c]=='XNA'].shape[0] > 0:
        xna_cols.append(c)
        
for c in xna_cols:
    app[c] = app[c].replace('XNA', np.nan)

# **Amt credit missing values imputation**

In [116]:
app[["AMT_CREDIT", "AMT_GOODS_PRICE"]].describe()

Unnamed: 0,AMT_CREDIT,AMT_GOODS_PRICE
count,339875.0,339614.0
mean,587311.0,527611.1
std,398609.1,366089.7
min,45000.0,45000.0
25%,270000.0,234000.0
50%,500211.0,450000.0
75%,794173.5,675000.0
max,4050000.0,4050000.0


In [117]:
app[["AMT_CREDIT", "AMT_GOODS_PRICE"]].isnull().sum()

AMT_CREDIT           0
AMT_GOODS_PRICE    261
dtype: int64

In [118]:
missing = app[app["AMT_GOODS_PRICE"].isnull()].index #missing goods price indices
nf = np.mean(app["AMT_GOODS_PRICE"])/np.mean(app["AMT_CREDIT"]) #norm factor

In [119]:
#Replacing the missing values of AMT_GOODS_PRICE
app.loc[missing, "AMT_GOODS_PRICE"] = app.loc[missing, "AMT_CREDIT"] * nf

# **For other numerical columns impute missing values by median(including ext_source_3)**

In [120]:
numeric = app[numerical_vars].isnull().sum()[app[numerical_vars].isnull().sum() > 0].index.tolist() #numeric cols with missing vals
ctg_cols_with_missing_vals = app[ctg_vars].isnull().sum()[app[ctg_vars].isnull().sum() > 0].index.tolist() #categorical cols with missing vals
numeric[:5]

['AMT_ANNUITY',
 'OWN_CAR_AGE',
 'CNT_FAM_MEMBERS',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2']

In [121]:
#np.median(application["AMT_ANNUITY"].dropna())
for f in numeric:
    app[f] = app[f].replace(np.nan, np.median(app[f].dropna()))

In [122]:
app[numerical_vars].isnull().sum()[app[numerical_vars].isnull().sum() > 0]

Series([], dtype: int64)

# **For categorical columns, impute missing values by mode**

In [123]:
ctg_cols_with_missing_vals

['CODE_GENDER',
 'NAME_TYPE_SUITE',
 'OCCUPATION_TYPE',
 'ORGANIZATION_TYPE',
 'EMERGENCYSTATE_MODE']

In [124]:
# application["CODE_GENDER"].value_counts().idxmax()
for f in ctg_cols_with_missing_vals:
    max_count = app[f].value_counts().idxmax()
    app[f] = app[f].fillna(max_count)
app[ctg_vars].isnull().sum()[app[ctg_vars].isnull().sum() > 0]

Series([], dtype: int64)

In [125]:
app['CREDIT_INCOME_PERCENT'] = app['AMT_CREDIT'] / app['AMT_INCOME_TOTAL']
app['ANNUITY_INCOME_PERCENT'] = app['AMT_ANNUITY'] / app['AMT_INCOME_TOTAL']
app['CREDIT_TERM'] = app['AMT_ANNUITY'] / app['AMT_CREDIT']
app['DAYS_EMPLOYED_PERCENT'] = app['DAYS_EMPLOYED'] / app['DAYS_BIRTH']

# **One hot encoding of categorical variables**

In [126]:
dms = pd.get_dummies(app[ctg_vars])
application = pd.concat([app, dms], axis=1)
application = application.drop(ctg_vars, axis=1)
application.shape

application.head()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100002,1.0,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,...,0,0,0,0,0,0,0,0,1,0
1,100003,0.0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,...,0,0,0,0,0,0,0,0,1,0
2,100004,0.0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,...,0,0,0,0,0,0,0,0,1,0
3,100006,0.0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,...,0,0,0,0,0,0,0,0,1,0
4,100007,0.0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,...,0,0,0,0,0,0,0,0,1,0


In [127]:
application[application.TARGET == 0].shape[0]/ application[application.TARGET == 1].shape[0]

11.372758181045473

# **Imbalanced classes**

In [128]:
data_train = application[application["source"] == 'train']
data_test = application[application["source"] == 'test']

In [129]:
X_train = data_train.drop(["source", "TARGET"], axis=1)
Y_train = data_train["TARGET"]
X_test = data_test.drop(["source", "TARGET"], axis=1)

X_test.shape, X_train.shape

((48744, 219), (291131, 219))

In [130]:
di = np.array(data_train[data_train.TARGET == 1].index) #default index
ri = np.array(data_train[data_train.TARGET == 0].index) #repaid index
rp = np.random.permutation(ri)[:di.shape[0]*3] #repaid permuted index

bi = np.concatenate((rp, di)) #balanced indices
bp = np.random.permutation(bi) #balanced permuted indices

bp

Y_balanced = data_train.TARGET[bp].values
X_balanced_data = data_train.iloc[bp, :].drop(['TARGET', 'source'], axis=1)

X_balanced_data.head()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
40725,147164,0,90000.0,225000.0,17775.0,225000.0,0.018634,-10445,-2828,-6058.0,...,0,0,0,0,0,0,0,0,1,0
266448,408681,2,180000.0,508495.5,22527.0,454500.0,0.00702,-13803,-3249,-553.0,...,0,0,0,0,0,0,0,0,1,0
262242,403573,0,130500.0,202500.0,10125.0,202500.0,0.035792,-7933,-253,-6862.0,...,0,0,0,0,0,0,0,0,1,0
29216,133933,0,135000.0,135000.0,6493.5,135000.0,0.00963,-18318,-4979,-1585.0,...,0,0,0,0,0,0,0,0,1,0
137404,259361,0,135000.0,888840.0,32053.5,675000.0,0.01885,-15552,-1364,-5586.0,...,0,0,0,0,0,0,0,0,1,0


# **K Fold**

In [131]:
k_fold = KFold(n_splits = 5, shuffle = True, random_state = 50)
x_train = np.array(X_train.values)
y_train = np.array(Y_train.values)
x_test = np.array(X_test.values)
test_predictions = np.zeros(x_test.shape[0])
train_auc = []
valid_auc = []
for train_indices, valid_indices in k_fold.split(x_train):
    train_data, train_target = x_train[train_indices], y_train[train_indices]
    valid_data, valid_target = x_train[valid_indices], y_train[valid_indices]
    clf = LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        num_leaves=30,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=7,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2,
        silent=-1,
        verbose=-1,
    )
        
    clf.fit(train_data, train_target, 
        eval_set= [(train_data, train_target), (valid_data, valid_target)], 
        eval_names = ['train', 'valid'],
        eval_metric='auc', verbose=100, early_stopping_rounds=100  #30
   )
    
    #best_iteration = clf.best_iteration_
    
    valid_score = clf.best_score_['valid']['auc']
    train_score = clf.best_score_['train']['auc']
    valid_auc.append(valid_score)
    train_auc.append(train_score)
    
    test_predictions += clf.predict_proba(x_test, num_iteration=clf.best_iteration_)[:,1]/k_fold.n_splits
    
    gc.enable()
    del clf, train_data, valid_data
    gc.disable()
        



[100]	train's auc: 0.766126	train's binary_logloss: 0.24595	valid's auc: 0.748471	valid's binary_logloss: 0.248209
[200]	train's auc: 0.78456	train's binary_logloss: 0.239302	valid's auc: 0.757569	valid's binary_logloss: 0.244991
[300]	train's auc: 0.796858	train's binary_logloss: 0.235028	valid's auc: 0.761139	valid's binary_logloss: 0.243881
[400]	train's auc: 0.806817	train's binary_logloss: 0.231738	valid's auc: 0.762074	valid's binary_logloss: 0.243578
[500]	train's auc: 0.815325	train's binary_logloss: 0.228831	valid's auc: 0.762446	valid's binary_logloss: 0.24346
[600]	train's auc: 0.823338	train's binary_logloss: 0.226043	valid's auc: 0.763227	valid's binary_logloss: 0.243276
[700]	train's auc: 0.83102	train's binary_logloss: 0.223329	valid's auc: 0.763762	valid's binary_logloss: 0.243149
[800]	train's auc: 0.8383	train's binary_logloss: 0.220723	valid's auc: 0.764069	valid's binary_logloss: 0.24305
[900]	train's auc: 0.844954	train's binary_logloss: 0.21824	valid's auc: 0.7641



[100]	train's auc: 0.765626	train's binary_logloss: 0.244897	valid's auc: 0.751304	valid's binary_logloss: 0.25246
[200]	train's auc: 0.783441	train's binary_logloss: 0.238388	valid's auc: 0.759847	valid's binary_logloss: 0.249213
[300]	train's auc: 0.795718	train's binary_logloss: 0.234173	valid's auc: 0.763842	valid's binary_logloss: 0.247963
[400]	train's auc: 0.8058	train's binary_logloss: 0.230882	valid's auc: 0.765478	valid's binary_logloss: 0.24745
[500]	train's auc: 0.814761	train's binary_logloss: 0.227913	valid's auc: 0.766258	valid's binary_logloss: 0.247161
[600]	train's auc: 0.823292	train's binary_logloss: 0.224991	valid's auc: 0.766578	valid's binary_logloss: 0.247009
[700]	train's auc: 0.831181	train's binary_logloss: 0.222244	valid's auc: 0.766984	valid's binary_logloss: 0.246889
[800]	train's auc: 0.838186	train's binary_logloss: 0.219741	valid's auc: 0.766905	valid's binary_logloss: 0.246903




[100]	train's auc: 0.764325	train's binary_logloss: 0.246069	valid's auc: 0.76058	valid's binary_logloss: 0.247799
[200]	train's auc: 0.782179	train's binary_logloss: 0.239564	valid's auc: 0.768544	valid's binary_logloss: 0.244325
[300]	train's auc: 0.794129	train's binary_logloss: 0.235521	valid's auc: 0.771887	valid's binary_logloss: 0.243067
[400]	train's auc: 0.804042	train's binary_logloss: 0.232232	valid's auc: 0.772932	valid's binary_logloss: 0.242654
[500]	train's auc: 0.813294	train's binary_logloss: 0.229211	valid's auc: 0.773489	valid's binary_logloss: 0.242441
[600]	train's auc: 0.82131	train's binary_logloss: 0.226459	valid's auc: 0.773824	valid's binary_logloss: 0.242297
[700]	train's auc: 0.829405	train's binary_logloss: 0.223626	valid's auc: 0.774011	valid's binary_logloss: 0.242167
[800]	train's auc: 0.837007	train's binary_logloss: 0.220995	valid's auc: 0.773868	valid's binary_logloss: 0.242169




[100]	train's auc: 0.765741	train's binary_logloss: 0.245667	valid's auc: 0.750756	valid's binary_logloss: 0.249587
[200]	train's auc: 0.784277	train's binary_logloss: 0.238968	valid's auc: 0.761218	valid's binary_logloss: 0.245901
[300]	train's auc: 0.796446	train's binary_logloss: 0.234818	valid's auc: 0.764834	valid's binary_logloss: 0.24467
[400]	train's auc: 0.806098	train's binary_logloss: 0.231567	valid's auc: 0.766091	valid's binary_logloss: 0.244183
[500]	train's auc: 0.815058	train's binary_logloss: 0.228508	valid's auc: 0.766776	valid's binary_logloss: 0.243953
[600]	train's auc: 0.823365	train's binary_logloss: 0.225667	valid's auc: 0.767095	valid's binary_logloss: 0.243817
[700]	train's auc: 0.830581	train's binary_logloss: 0.223095	valid's auc: 0.767306	valid's binary_logloss: 0.243748
[800]	train's auc: 0.83735	train's binary_logloss: 0.220592	valid's auc: 0.767451	valid's binary_logloss: 0.243694
[900]	train's auc: 0.844501	train's binary_logloss: 0.217978	valid's auc: 



[100]	train's auc: 0.766796	train's binary_logloss: 0.245913	valid's auc: 0.748773	valid's binary_logloss: 0.247917
[200]	train's auc: 0.78479	train's binary_logloss: 0.239233	valid's auc: 0.758057	valid's binary_logloss: 0.244679
[300]	train's auc: 0.797126	train's binary_logloss: 0.235027	valid's auc: 0.760937	valid's binary_logloss: 0.243744
[400]	train's auc: 0.807265	train's binary_logloss: 0.231646	valid's auc: 0.762456	valid's binary_logloss: 0.243255
[500]	train's auc: 0.816179	train's binary_logloss: 0.228582	valid's auc: 0.763554	valid's binary_logloss: 0.242927
[600]	train's auc: 0.824436	train's binary_logloss: 0.225714	valid's auc: 0.76368	valid's binary_logloss: 0.24291
[700]	train's auc: 0.831831	train's binary_logloss: 0.223091	valid's auc: 0.763681	valid's binary_logloss: 0.242921


In [132]:
kfold_auc = pd.DataFrame({'train_auc': train_auc, 
                          'valid_auc': valid_auc})
kfold_auc

Unnamed: 0,train_auc,valid_auc
0,0.843668,0.764284
1,0.835831,0.767023
2,0.830682,0.774126
3,0.850847,0.767631
4,0.828365,0.763771


In [133]:
test_predictions.shape

(48744,)

# **Logistic Regression**

In [134]:
%%time
lr_cv = cross_val_score(LogisticRegression(), X_balanced_data, Y_balanced, cv=5, scoring='roc_auc')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 34 s, sys: 4.23 s, total: 38.3 s
Wall time: 22.6 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [135]:
np.mean(lr_cv)

0.6335614392612277

In [136]:
%%time
hyperparameters = {
    'penalty': ['l1', 'l2'], 
    'C': np.logspace(0,1,5)}

clf = GridSearchCV(LogisticRegression(), hyperparameters, cv=5, verbose=0)
best_fit = clf.fit(X_balanced_data, Y_balanced)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 3min 5s, sys: 23.1 s, total: 3min 28s
Wall time: 2min 6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [137]:
best_fit.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [138]:
best_fit.best_score_

0.7499043773905653

In [139]:
lr_regularized_cv = cross_val_score(LogisticRegression(penalty='l2', C=1), X_balanced_data, Y_balanced, cv=5, scoring='roc_auc')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [140]:
np.mean(lr_regularized_cv)

0.6335614392612277

In [141]:
%%time
lr_regularized = LogisticRegression(penalty='l2', C=1).fit(X_balanced_data, Y_balanced)

CPU times: user 8.63 s, sys: 1.03 s, total: 9.66 s
Wall time: 6.36 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [142]:
%%time
lr_regularized = LogisticRegression(penalty='l2', C=1).fit(X_balanced_data, Y_balanced)

CPU times: user 8.78 s, sys: 960 ms, total: 9.74 s
Wall time: 5.1 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [143]:
Y_pred_test = lr_regularized.predict_proba(X_test)[:, 1]
test_output = pd.DataFrame({'SK_ID_CURR':data_test.SK_ID_CURR.values, 'TARGET':Y_pred_test})

test_output.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.253562
1,100005,0.235285
2,100013,0.207327
3,100028,0.096142
4,100038,0.188022


In [144]:
test_output.to_csv('regularized_logistic_output.csv', index = False)

# **Random Forest**

In [145]:
%%time
random_forest_cv = cross_val_score(RandomForestClassifier(n_estimators=600, min_samples_split=30, min_samples_leaf = 10, max_features="auto"), 
                                   X_balanced_data, 
                                   Y_balanced, 
                                   cv=5, 
                                   scoring='roc_auc')

  warn(
  warn(
  warn(
  warn(
  warn(


CPU times: user 15min 20s, sys: 1.93 s, total: 15min 22s
Wall time: 15min 22s


In [146]:
np.mean(random_forest_cv)

0.750096304434718

In [147]:
%%time
rf_model = RandomForestClassifier(n_estimators=600, min_samples_split=30, min_samples_leaf = 10, max_features="auto")
rf_model = rf_model.fit(X_balanced_data, Y_balanced)
Y_predicted = rf_model.predict_proba(X_train)[:, 1]
print(roc_auc_score(Y_train, Y_predicted))

  warn(


0.8996084159988443
CPU times: user 4min 54s, sys: 892 ms, total: 4min 55s
Wall time: 4min 55s


In [148]:
Y_pred_test = rf_model.predict_proba(X_test)[:, 1]
test_output = pd.DataFrame({'SK_ID_CURR':data_test.SK_ID_CURR.values, 'TARGET':Y_pred_test})
test_output.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.204445
1,100005,0.307304
2,100013,0.126012
3,100028,0.149077
4,100038,0.381756


In [149]:
test_output.to_csv('random_forest_output.csv', index = False)