In [None]:
pip install xgboost lightgbm

In [None]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve
from sklearn.impute import KNNImputer

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Necesito-un-credito/train.csv'
file_key_2 = 'Necesito-un-credito/test.csv'

## Opening a streaming body for each CSV object in the bucket
bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading data-files
## 'age' arrives as text with one trailing non-numeric character. The vectorised
## .str slice removes it for the whole column at once and leaves a missing age as
## NaN, where a per-row Python loop would raise on the first missing value.
train = pd.read_csv(file_content_stream_1)
train['age'] = pd.to_numeric(train['age'].str[:-1])

test = pd.read_csv(file_content_stream_2)
test['age'] = pd.to_numeric(test['age'].str[:-1])

# Basic Exploration

In [None]:
## First rows of the training frame
train.head()

In [None]:
## Share of each target class among the training rows
train['SeriousDlqin2yrs'].value_counts().div(len(train))

In [None]:
## Summary statistics of the numeric training columns
train.describe()

In [None]:
## Exploratory features, added to `train` in place so the plots below can use them
train['TotalLoans'] = train['NumberOfOpenCreditLinesAndLoans'] + train['NumberRealEstateLoansOrLines']
train['Late90days'] = np.where(train['NumberOfTimes90DaysLate'] >= 1, 1, 0)
train['Late60_90days'] = np.where(train['NumberOfTime60-89DaysPastDueNotWorse'] >= 1, 1, 0)

## The 30-59 day flag has to read the 30-59 day counter. It used to re-use the
## 60-89 day column with a >= 2 threshold, so it never described 30-59 day lateness.
## NOTE(review): assumes the frame carries 'NumberOfTime30-59DaysPastDueNotWorse' - confirm.
train['Late30_50days'] = np.where(train['NumberOfTime30-59DaysPastDueNotWorse'] >= 1, 1, 0)

train.head()

In [None]:
## Histogram of revolving-credit utilisation
fig, ax = plt.subplots(figsize = (10, 8))
sns.histplot(data = train, x = 'RevolvingUtilizationOfUnsecuredLines', ax = ax)
plt.show()

In [None]:
## Utilisation split by the 90-days-late flag, drawn on a log-scaled y axis
fig, ax = plt.subplots(figsize = (10, 8))
sns.boxplot(data = train, x = 'Late90days', y = 'RevolvingUtilizationOfUnsecuredLines', hue = 'Late90days', ax = ax)
ax.set_yscale('log')
plt.show()

In [None]:
## Utilisation against the 90-days-late flag, coloured by the target
fig, ax = plt.subplots(figsize = (10, 8))
sns.scatterplot(data = train, x = 'RevolvingUtilizationOfUnsecuredLines', y = 'Late90days', hue = 'SeriousDlqin2yrs', ax = ax)
plt.show()

In [None]:
## Defining the input and target variables
train_clean = train.dropna()
X = train_clean.drop(columns = ['Id', 'SeriousDlqin2yrs'], axis = 1)
Y = train_clean['SeriousDlqin2yrs']

## Spliting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Building the decision tree on the train data-frame
tree_md = DecisionTreeClassifier(max_depth = 3).fit(X_train, Y_train)

## Visualizing the decision-tree model 
fig = plt.figure(figsize = (25, 15))
plot_tree(tree_md, feature_names = X_train.columns, filled = True);

In [None]:
## Total loans against debt ratio, coloured by the target
fig, ax = plt.subplots(figsize = (10, 8))
sns.scatterplot(data = train, x = 'TotalLoans', y = 'DebtRatio', hue = 'SeriousDlqin2yrs', ax = ax)
plt.show()

In [None]:
## Distribution of total loans for each target class
fig, ax = plt.subplots(figsize = (10, 8))
sns.boxplot(data = train, x = 'SeriousDlqin2yrs', y = 'TotalLoans', hue = 'SeriousDlqin2yrs', ax = ax)
plt.show()

In [None]:
## Open credit lines against real-estate loans, coloured by the target
fig, ax = plt.subplots(figsize = (10, 8))
sns.scatterplot(data = train, x = 'NumberOfOpenCreditLinesAndLoans', y = 'NumberRealEstateLoansOrLines', hue = 'SeriousDlqin2yrs', ax = ax)
plt.show()

In [None]:
## Open credit lines against the 90-days-late count, coloured by the target
fig, ax = plt.subplots(figsize = (10, 8))
sns.scatterplot(data = train, x = 'NumberOfOpenCreditLinesAndLoans', y = 'NumberOfTimes90DaysLate', hue = 'SeriousDlqin2yrs', ax = ax)
plt.show()

In [None]:
## Age against debt ratio, coloured by the target
fig, ax = plt.subplots(figsize = (10, 8))
sns.scatterplot(data = train, x = 'age', y = 'DebtRatio', hue = 'SeriousDlqin2yrs', ax = ax)
plt.show()

In [None]:
## Age against monthly income, coloured by the target
fig, ax = plt.subplots(figsize = (10, 8))
sns.scatterplot(data = train, x = 'age', y = 'MonthlyIncome', hue = 'SeriousDlqin2yrs', ax = ax)
plt.show()

# Feature Engineering

In [None]:
## The delinquency flags are derived on BOTH frames: `test` is scored by the fitted
## model later on, so a feature that only exists in `train` cannot be supplied then.
for frame in (train, test):
    frame['Late90days'] = np.where(frame['NumberOfTimes90DaysLate'] >= 1, 1, 0)
    frame['Late60_90days'] = np.where(frame['NumberOfTime60-89DaysPastDueNotWorse'] >= 1, 1, 0)



# Baseline Model: XGBoost

In [None]:
## Defining the input and target variables
## Only predictors present in BOTH frames are used, kept in `test` column order.
## Columns that exist only in `train` would otherwise make predict_proba on `test`
## fail, because the model would have been fitted on a different feature set.
train_predictors = train.drop(columns = ['Id', 'SeriousDlqin2yrs'])
feature_cols = [col for col in test.columns if col in train_predictors.columns]
X = train_predictors[feature_cols]
Y = train['SeriousDlqin2yrs']

## Defining the hyper-parameter grid
XGBoost_param_grid = {'n_estimators': [100],
                      'max_depth': [5, 7],
                      'min_child_weight': [5, 7, 10],
                      'learning_rate': [0.01, 0.001],
                      'gamma': [0.3, 0.1],
                      'subsample': [0.8, 1],
                      'colsample_bytree': [0.8, 1]}

## Performing grid search with 5 folds
XGBoost_grid_search = GridSearchCV(XGBClassifier(), XGBoost_param_grid, cv = 5, scoring = 'roc_auc', n_jobs = -1, verbose = 3).fit(X, Y)

## Extracting the best score
best_score = XGBoost_grid_search.best_score_
print('The best area under the ROC curve is:', best_score)

## Extracting the best model
XGBoost_md = XGBoost_grid_search.best_estimator_

## Predicting on test with best xgboost model (same columns, same order as X)
xgb_pred = XGBoost_md.predict_proba(test[feature_cols])[:, 1]
xgb_pred

In [None]:
from sklearn.metrics import roc_curve

def roc_auc_cutoff(Y_test, Y_pred):
    """Return the score threshold whose ROC point is closest to the ideal corner.

    Parameters
    ----------
    Y_test : array-like
        True binary labels, forwarded to ``roc_curve`` as the ground truth.
    Y_pred : array-like
        Predicted scores for the positive class, forwarded to ``roc_curve``.

    Returns
    -------
    The entry of ``roc_curve``'s thresholds whose (false-positive rate,
    true-positive rate) pair has the smallest Euclidean distance to (0, 1).
    When several thresholds tie, the first one returned by ``roc_curve`` wins.
    """
    ## Computing the ROC curve
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)

    ## Euclidean distance of every ROC point to the perfect classifier at (0, 1)
    dist_to_corner = np.sqrt(fpr**2 + (tpr - 1)**2)

    ## argmin finds the closest point directly: no DataFrame or full sort is
    ## needed, and ties are resolved deterministically (first occurrence)
    return thresholds[np.argmin(dist_to_corner)]


## Positive-class probabilities on the training rows drive the cutoff search
xgb_pred_train = XGBoost_md.predict_proba(X)[:, 1]
opt_cutoff = roc_auc_cutoff(Y_test = Y, Y_pred = xgb_pred_train)
print('The optimal cutoff is', opt_cutoff)

In [None]:
## Scoring the test rows (every column except the identifier), then labelling
## a row 0 when its probability is below the cutoff and 1 otherwise
test_features = test.drop(columns = ['Id'])
xgb_pred_test = XGBoost_md.predict_proba(test_features)[:, 1]
xgb_label_test = np.where(xgb_pred_test < opt_cutoff, 0, 1)

In [None]:
## Fraction of test rows labelled 1. The vectorised mean replaces the built-in
## sum, which walks the array one element at a time in Python.
xgb_label_test.mean()

In [None]:
## Number of predicted labels (the array is named xgb_label_test; `xgb_label`
## is never defined, so the old reference raised NameError)
xgb_label_test.shape

In [None]:
## Dimensions of the test frame, for comparison with the number of predicted labels
test.shape

In [None]:
## Submission frame: the test identifiers next to their predicted label
data_out = test[['Id']].assign(SeriousDlqin2yrs = xgb_label_test)
data_out.head()

In [None]:
## Share of each predicted class in the submission
data_out['SeriousDlqin2yrs'].value_counts().div(len(data_out))

In [None]:
## Writing the submission to disk without the row index
data_out.to_csv(path_or_buf = 'xgb_submission.csv', index = False)

# Baseline Model: LightGBM

In [None]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve
from sklearn.impute import KNNImputer

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Necesito-un-credito/train.csv'
file_key_2 = 'Necesito-un-credito/test.csv'

## Opening a streaming body for each CSV object in the bucket
bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading data-files
## 'age' arrives as text with one trailing non-numeric character. The vectorised
## .str slice removes it for the whole column at once and leaves a missing age as
## NaN, where a per-row Python loop would raise on the first missing value.
train = pd.read_csv(file_content_stream_1)
train['age'] = pd.to_numeric(train['age'].str[:-1])

test = pd.read_csv(file_content_stream_2)
test['age'] = pd.to_numeric(test['age'].str[:-1])

## Separating the target from the predictors (identifier excluded)
Y = train['SeriousDlqin2yrs']
X = train.drop(columns = ['Id', 'SeriousDlqin2yrs'])

## Defining the hyper-parameter grid
## (2 * 3 * 3 * 3 * 2 * 3 * 3 * 3 = 2916 candidates, each fitted on 5 folds)
LightGBM_param_grid = dict(n_estimators = [100, 300],
                           max_depth = [3, 5, 7],
                           num_leaves = [20, 25, 30],
                           min_data_in_leaf = [10, 15, 20],
                           learning_rate = [0.01, 0.001],
                           feature_fraction = [0.8, 0.9, 1],
                           lambda_l1 = [0, 10, 100],
                           lambda_l2 = [0, 10, 100])

## Performing grid search with 5 folds, scored by area under the ROC curve
LightGBM_grid_search = GridSearchCV(estimator = LGBMClassifier(),
                                    param_grid = LightGBM_param_grid,
                                    scoring = 'roc_auc',
                                    cv = 5,
                                    n_jobs = -1,
                                    verbose = 3)
LightGBM_grid_search.fit(X, Y)

## Extracting the best model
LightGBM_md = LightGBM_grid_search.best_estimator_

def roc_auc_cutoff(Y_test, Y_pred):
    """Return the score threshold whose ROC point is closest to the ideal corner.

    Parameters
    ----------
    Y_test : array-like
        True binary labels, forwarded to ``roc_curve`` as the ground truth.
    Y_pred : array-like
        Predicted scores for the positive class, forwarded to ``roc_curve``.

    Returns
    -------
    The entry of ``roc_curve``'s thresholds whose (false-positive rate,
    true-positive rate) pair has the smallest Euclidean distance to (0, 1).
    When several thresholds tie, the first one returned by ``roc_curve`` wins.
    """
    ## Computing the ROC curve
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)

    ## Euclidean distance of every ROC point to the perfect classifier at (0, 1)
    dist_to_corner = np.sqrt(fpr**2 + (tpr - 1)**2)

    ## argmin finds the closest point directly: no DataFrame or full sort is
    ## needed, and ties are resolved deterministically (first occurrence)
    return thresholds[np.argmin(dist_to_corner)]

## Positive-class probabilities on the training rows drive the cutoff search
lightgbm_pred_train = LightGBM_md.predict_proba(X)[:, 1]
opt_cutoff = roc_auc_cutoff(Y_test = Y, Y_pred = lightgbm_pred_train)
print('The optimal cutoff is', opt_cutoff)

## Scoring the test rows (every column except the identifier), then labelling
## a row 0 when its probability is below the cutoff and 1 otherwise
test_features = test.drop(columns = ['Id'])
lightgbm_pred_test = LightGBM_md.predict_proba(test_features)[:, 1]
lightgbm_label_test = np.where(lightgbm_pred_test < opt_cutoff, 0, 1)

## Submission frame: the test identifiers next to their predicted label
data_out = test[['Id']].assign(SeriousDlqin2yrs = lightgbm_label_test)
print(data_out.head())

## Writing the submission to disk without the row index
data_out.to_csv(path_or_buf = 'lightgbm_submission.csv', index = False)