In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-credit-default-risk/sample_submission.csv
/kaggle/input/home-credit-default-risk/bureau_balance.csv
/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv
/kaggle/input/home-credit-default-risk/application_train.csv
/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv
/kaggle/input/home-credit-default-risk/application_test.csv
/kaggle/input/home-credit-default-risk/previous_application.csv
/kaggle/input/home-credit-default-risk/credit_card_balance.csv
/kaggle/input/home-credit-default-risk/installments_payments.csv
/kaggle/input/home-credit-default-risk/bureau.csv
/kaggle/input/bureau-kaggle/bureau_kaggle.csv
/kaggle/input/previous-kaggle/previous_loans_int.csv


# Importing datasets and downcasting types to reduce memory usage :

In [3]:
df_train = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv")
df_test = pd.read_csv("/kaggle/input/home-credit-default-risk/application_test.csv")
df_bureau = pd.read_csv("/kaggle/input/bureau-kaggle/bureau_kaggle.csv")
df_previous = pd.read_csv("/kaggle/input/previous-kaggle/previous_loans_int.csv")

In [4]:
def memory_usage(df):
    print('Memory Usage Before :',round(df.memory_usage(index=True).sum()*10e-6,2))
    df = df.replace([np.inf, -np.inf], np.nan)
    for col in df.columns:
        if df[col].dtype == int:
            if df[col].dtype =='int64' and max(df[col])<127 and min(df[col])>-128:
                df[col]=df[col].astype('int8')
            elif df[col].dtype =='int64' and max(df[col])<32767 and min(df[col])>-32767 :
                df[col] = df[col].astype('int16')
            elif df[col].dtype =='int64' and max(df[col])<2147483648 and min(df[col])>-2147483648:
                df[col] = df[col].astype('int32')
        elif df[col].dtype == float :
            if df[col].dtype =='float64' and max(df[col])<6.55e+4 and min(df[col])>-6.55e+4 :
                df[col] = df[col].astype('float16')
            elif df[col].dtype =='float64' and max(df[col])<3.4+38 and min(df[col])>-3.4e+4:
                df[col] = df[col].astype('float32')
    print('Memory Usage After :',round(df.memory_usage(index=True).sum()*10e-6,2))
    print('-----------------------------------------------')
    return df

In [5]:
df_train = memory_usage(df_train)
df_test = memory_usage(df_test)
df_bureau = memory_usage(df_bureau)
df_previous = memory_usage(df_previous)

Memory Usage Before : 3001.31
Memory Usage After : 1036.31
-----------------------------------------------
Memory Usage Before : 471.84
Memory Usage After : 233.97
-----------------------------------------------
Memory Usage Before : 4477.07
Memory Usage After : 1467.89
-----------------------------------------------
Memory Usage Before : 23015.17
Memory Usage After : 5608.08
-----------------------------------------------


# Functions to encode the datasets, impute them and merge them :

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import re

In [7]:
def encode_impute(df,df_test):
    # Using ordinal encoder to encode categorical values with 2 different values and one hot encoder for the rest
    categorical_columns = df.select_dtypes(exclude='number').columns
    categorical_df = df[categorical_columns]
    values = categorical_df.nunique()
    label = []
    one_hot = []
    for i in range(0,len(values)) :
        if values.iloc[i] > 2:
            one_hot.append(values.index[i])
        else :
            label.append(values.index[i])
    hot = OneHotEncoder(sparse_output = False,handle_unknown = 'ignore')
    ordinal = OrdinalEncoder(handle_unknown = 'use_encoded_value',unknown_value = -1)
    
    df_one_hot = pd.DataFrame(hot.fit_transform(df[one_hot]),columns=hot.get_feature_names_out(one_hot))
    df_ordinal = pd.DataFrame(ordinal.fit_transform(df[label]),columns=label)
    
    df_one_hot_test = pd.DataFrame(hot.transform(df_test[one_hot]),columns=hot.get_feature_names_out(one_hot))
    df_ordinal_test = pd.DataFrame(ordinal.transform(df_test[label]),columns=label)
    
    categorical_df = pd.concat([df_one_hot,df_ordinal],axis=1)
    categorical_df_test = pd.concat([df_one_hot_test,df_ordinal_test],axis=1)

    print(" Shape of categorical datasets :", categorical_df.shape)
    
    numerical_columns = df.select_dtypes(include='number').columns
    numerical_df = df[numerical_columns]
    numerical_df_test = df_test[numerical_columns]

    
    df = pd.concat([numerical_df,categorical_df],axis=1)
    df_test = pd.concat([numerical_df_test,categorical_df_test],axis=1)
    print(" Shape of final datasets :", df.shape)
    
    imputer = SimpleImputer(missing_values=np.nan,strategy='constant',fill_value=0)
    df = pd.DataFrame(imputer.fit_transform(df),columns=df.columns)
    df_test = pd.DataFrame(imputer.transform(df_test),columns=df_test.columns)

    return df,df_test
    

    

In [8]:
def merge_datasets(df1,df2,df3,on='SK_ID_CURR'):
    df1 = pd.merge(df1,df2,how='left',on=on)
    print(" Shape of datasets :", df1.shape)
    df1 = pd.merge(df1,df3,how='left',on=on,suffixes=('_x1', '_x2'))
    print(" Shape of datasets :", df1.shape)
    imputer = SimpleImputer(strategy='constant',fill_value= 0 ,missing_values=np.nan)
    df1.replace([np.inf, -np.inf], np.nan, inplace=True)
    df1 = pd.DataFrame(imputer.fit_transform(df1),columns=df1.columns)
    df1.index = df1[on]
    df1 = df1.drop(on,axis=1)
    print(" Shape of merged datasets :", df1.shape)
    return df1

In [9]:
y = df_train['TARGET']
y=y.astype(int)
df_train = df_train.drop('TARGET',axis=1)

In [10]:
df_train,df_test = encode_impute(df_train,df_test)

 Shape of categorical datasets : (307511, 141)
 Shape of final datasets : (307511, 246)


In [11]:
# Feature engineering manual features based on dataset columns and credit eligibility found on internet
df_train['MONTHLY_INCOME'] = df_train['AMT_INCOME_TOTAL']/12
df_train['MONTHLY_CREDIT'] = df_train['AMT_ANNUITY']/12
df_train['LENGTH_CREDIT'] = df_train['AMT_CREDIT']/df_train['AMT_ANNUITY']
df_train['PERCENTAGE_SALARY'] = df_train['MONTHLY_CREDIT']/df_train['MONTHLY_INCOME']
df_train['DELTA_LOAN_GOOD'] = df_train['AMT_CREDIT']-df_train['AMT_GOODS_PRICE']
df_train['HOUSE_INCOME'] = df_train.apply(
    lambda row: row['MONTHLY_INCOME'] / (row['CNT_CHILDREN'] + 1) if row['CNT_CHILDREN'] > 0 else row['MONTHLY_INCOME'],
    axis=1
)
df_train['AGE'] = df_train['DAYS_BIRTH'].abs()/365
df_train['YEARS_EMPLOYED'] = df_train['DAYS_EMPLOYED'].abs()/365
df_train

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,FLAG_OWN_REALTY,EMERGENCYSTATE_MODE,MONTHLY_INCOME,MONTHLY_CREDIT,LENGTH_CREDIT,PERCENTAGE_SALARY,DELTA_LOAN_GOOD,HOUSE_INCOME,AGE,YEARS_EMPLOYED
0,100002.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018799,-9461.0,-637.0,-3648.0,...,1.0,0.0,16875.0,2058.375,16.461104,0.121978,55597.5,16875.0,25.920548,1.745205
1,100003.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003542,-16765.0,-1188.0,-1186.0,...,0.0,0.0,22500.0,2974.875,36.234085,0.132217,164002.5,22500.0,45.931507,3.254795
2,100004.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010033,-19046.0,-225.0,-4260.0,...,1.0,0.0,5625.0,562.500,20.000000,0.100000,0.0,5625.0,52.180822,0.616438
3,100006.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008018,-19005.0,-3039.0,-9832.0,...,1.0,0.0,11250.0,2473.875,10.532818,0.219900,15682.5,11250.0,52.068493,8.326027
4,100007.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028656,-19932.0,-3038.0,-4312.0,...,1.0,0.0,10125.0,1822.125,23.461618,0.179963,0.0,10125.0,54.608219,8.323288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251.0,0.0,157500.0,254700.0,27558.0,225000.0,0.032562,-9327.0,-236.0,-8456.0,...,0.0,0.0,13125.0,2296.500,9.242325,0.174971,29700.0,13125.0,25.553425,0.646575
307507,456252.0,0.0,72000.0,269550.0,12001.5,225000.0,0.025162,-20775.0,365243.0,-4388.0,...,1.0,0.0,6000.0,1000.125,22.459693,0.166687,44550.0,6000.0,56.917808,1000.665753
307508,456253.0,0.0,153000.0,677664.0,29979.0,585000.0,0.005001,-14966.0,-7921.0,-6736.0,...,1.0,0.0,12750.0,2498.250,22.604623,0.195941,92664.0,12750.0,41.002740,21.701370
307509,456254.0,0.0,171000.0,370107.0,20205.0,319500.0,0.005314,-11961.0,-4786.0,-2562.0,...,1.0,0.0,14250.0,1683.750,18.317595,0.118158,50607.0,14250.0,32.769863,13.112329


In [12]:
df_train = df_train.groupby(['SK_ID_CURR'],as_index=False).agg(['mean','sum','median','max','min'])


In [13]:
df_train.columns = ['_'.join(map(str, filter(None, col))) for col in df_train.columns.values]


In [None]:
df_train = memory_usage(df_train)

In [14]:
# Feature engineering manual features based on dataset columns and credit eligibility found on internet
df_test['MONTHLY_INCOME'] = df_test['AMT_INCOME_TOTAL']/12
df_test['MONTHLY_CREDIT'] = df_test['AMT_ANNUITY']/12
df_test['LENGTH_CREDIT'] = df_test['AMT_CREDIT']/df_test['AMT_ANNUITY']
df_test['PERCENTAGE_SALARY'] = df_test['MONTHLY_CREDIT']/df_test['MONTHLY_INCOME']
df_test['DELTA_LOAN_GOOD'] = df_test['AMT_CREDIT']-df_test['AMT_GOODS_PRICE']
df_test['HOUSE_INCOME'] = df_test.apply(
    lambda row: row['MONTHLY_INCOME'] / (row['CNT_CHILDREN'] + 1) if row['CNT_CHILDREN'] > 0 else row['MONTHLY_INCOME'],
    axis=1
)
df_test['AGE'] = df_test['DAYS_BIRTH'].abs()/365
df_test['YEARS_EMPLOYED'] = df_test['DAYS_EMPLOYED'].abs()/365
df_test

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,FLAG_OWN_REALTY,EMERGENCYSTATE_MODE,MONTHLY_INCOME,MONTHLY_CREDIT,LENGTH_CREDIT,PERCENTAGE_SALARY,DELTA_LOAN_GOOD,HOUSE_INCOME,AGE,YEARS_EMPLOYED
0,100001.0,0.0,135000.0,568800.0,20560.5,450000.0,0.018845,-19241.0,-2329.0,-5168.0,...,1.0,0.0,11250.0,1713.375,27.664697,0.152300,118800.0,11250.0,52.715068,6.380822
1,100005.0,0.0,99000.0,222768.0,17370.0,180000.0,0.035797,-18064.0,-4469.0,-9120.0,...,1.0,0.0,8250.0,1447.500,12.824870,0.175455,42768.0,8250.0,49.490411,12.243836
2,100013.0,0.0,202500.0,663264.0,69777.0,630000.0,0.019104,-20038.0,-4458.0,-2176.0,...,1.0,0.0,16875.0,5814.750,9.505482,0.344578,33264.0,16875.0,54.898630,12.213699
3,100028.0,2.0,315000.0,1575000.0,49018.5,1575000.0,0.026398,-13976.0,-1866.0,-2000.0,...,1.0,0.0,26250.0,4084.875,32.130726,0.155614,0.0,8750.0,38.290411,5.112329
4,100038.0,1.0,180000.0,625500.0,32067.0,625500.0,0.010033,-13040.0,-2191.0,-4000.0,...,0.0,0.0,15000.0,2672.250,19.506034,0.178150,0.0,7500.0,35.726027,6.002740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221.0,0.0,121500.0,412560.0,17473.5,270000.0,0.002043,-19970.0,-5169.0,-9096.0,...,1.0,0.0,10125.0,1456.125,23.610610,0.143815,142560.0,10125.0,54.712329,14.161644
48740,456222.0,2.0,157500.0,622413.0,31909.5,495000.0,0.035797,-11186.0,-1149.0,-3016.0,...,0.0,0.0,13125.0,2659.125,19.505570,0.202600,127413.0,4375.0,30.646575,3.147945
48741,456223.0,1.0,202500.0,315000.0,33205.5,315000.0,0.026398,-15922.0,-3037.0,-2680.0,...,1.0,0.0,16875.0,2767.125,9.486380,0.163978,0.0,8437.5,43.621918,8.320548
48742,456224.0,0.0,225000.0,450000.0,25128.0,450000.0,0.018845,-13968.0,-2731.0,-1461.0,...,0.0,0.0,18750.0,2094.000,17.908309,0.111680,0.0,18750.0,38.268493,7.482192


In [15]:
df_test = df_test.groupby(['SK_ID_CURR'],as_index=False).agg(['mean','sum','median','max','min'])


In [16]:
df_test.columns = ['_'.join(map(str, filter(None, col))) for col in df_test.columns.values]


In [19]:
df_test = memory_usage(df_test)

Memory Usage Before : 4936.79
Memory Usage After : 1354.11
-----------------------------------------------


In [20]:
df_train = merge_datasets(df_train,df_bureau,df_previous)
df_test = merge_datasets(df_test,df_bureau,df_previous)

 Shape of datasets : (307511, 1448)
 Shape of datasets : (307511, 2296)
 Shape of merged datasets : (307511, 2295)
 Shape of datasets : (48744, 1448)
 Shape of datasets : (48744, 2296)
 Shape of merged datasets : (48744, 2295)


In [None]:
df_train = memory_usage(df_train)
df_test = memory_usage(df_test)

In [None]:
df_train

In [None]:
from sklearn.model_selection import train_test_split
import re
df_train=df_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df_test=df_test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

X_train,X_test,y_train,y_test = train_test_split(df_train,y,test_size=0.2)

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score


In [None]:
model_sk = lgb.LGBMClassifier(boosting_type='gbdt', max_depth=7, learning_rate=0.01, n_estimators= 2000, 
                 class_weight='balanced', subsample=0.9, colsample_bytree= 0.8, n_jobs=-1)
train_features, valid_features, train_y, valid_y = train_test_split(X_train, y_train.values.ravel(), test_size = 0.15, random_state = 33)
model_sk.fit(train_features, train_y, callbacks=[
        lgb.early_stopping(stopping_rounds=200)],eval_set = [(valid_features, valid_y)], eval_metric = 'auc')


feature_imp = pd.DataFrame(sorted(zip(model_sk.feature_importances_, X_train.columns)), columns=['Value','Feature'])
features_df = feature_imp.sort_values(by="Value", ascending=False)
selected_features = list(features_df[features_df['Value']>=50]['Feature'])

In [None]:
with open('select_features.txt','wb') as fp:
    pickle.dump(selected_features, fp)
print('The no. of features selected:',len(selected_features))

In [None]:
weight = np.ones((len(X_train),), dtype=int)
for i in range(len(X_train)):
    if int(y_train.iloc[i])== 0:
        weight[i]=1
    else:
        weight[i]=11

train_data=lgb.Dataset(X_train[selected_features], label = y_train, weight= weight )
valid_data=lgb.Dataset(X_test[selected_features], label = y_test)
cv_auc_score = []
max_depth = [3, 5, 7, 10]
for i in max_depth:
    
    params = {'boosting_type': 'gbdt',
          'max_depth' : i,
          'objective': 'binary',
          'nthread': 5,
          'num_leaves': 32,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 0.7,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 20,
          'reg_lambda': 20,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 10,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'auc'
          }
lgbm = lgb.train(params,
                 train_data,
                 2500,
                 valid_sets=valid_data,
                  callbacks=[lgb.early_stopping(stopping_rounds=100)],
                 )
y_pred_prob = lgbm.predict(X_test[selected_features])
cv_auc_score.append(roc_auc_score(y_test,y_pred_prob))
print('For  max_depth {0} and some other parameters, cross validation AUC score {1}'.format(i,roc_auc_score(y_test,y_pred_prob)))
print('The optimal  max_depth: ', max_depth[np.argmax(cv_auc_score)])
params = {'boosting_type': 'gbdt',
          'max_depth' : max_depth[np.argmax(cv_auc_score)],
          'objective': 'binary',
          'nthread': 5,
          'num_leaves': 32,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 0.7,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 20,
          'reg_lambda': 20,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 10,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'auc'
          }
lgbm = lgb.train(params,
                 train_data,
                 2500,
                 valid_sets=valid_data,
                 callbacks=[lgb.early_stopping(stopping_rounds=100)],
                 )
y_pred_prob = lgbm.predict(X_train[selected_features])
print('For best max_depth {0}, The Train AUC score is {1}'.format(max_depth[np.argmax(cv_auc_score)], 
                                                                  roc_auc_score(y_train,y_pred_prob) ))    

y_pred_prob = lgbm.predict(X_test[selected_features])
print('For best max_depth {0}, The Test AUC score is {1}'.format(max_depth[np.argmax(cv_auc_score)], 
                                                                 roc_auc_score(y_test,y_pred_prob) ))
y_pred = np.ones((len(X_test),), dtype=int)
for i in range(len(y_pred_prob)):
    if y_pred_prob[i]<=0.5:
        y_pred[i]=0
    else:
        y_pred[i]=1

In [None]:
 
df_test = df_test[selected_features]
df_test

In [None]:

y_test_predicted = lgbm.predict(df_test)

In [None]:
from pathlib import Path  
# Chemin complet vers le fichier
filepath = Path('/kaggle/working/submission.csv')

columns = df_test.index.astype(str).str.replace('.0', '').astype('Int32')
df_output = pd.DataFrame({
    'SK_ID_CURR': columns,  # REQUIRED column
    'TARGET': y_test_predicted             # Your predictions
})

# Sauvegarde en CSV
df_output.to_csv(filepath, index=False)

import os
print(os.listdir('/kaggle/working/')) 

In [None]:
df_output