In [1]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import KNNImputer

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Necesito-un-credito/train.csv'
file_key_2 = 'Necesito-un-credito/test.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')


def strip_age_suffix(age):
    """Drop the last character of every value in ``age`` and return numbers.

    Parameters
    ----------
    age : pandas.Series
        String-valued series whose entries end in one non-numeric character.

    Returns
    -------
    pandas.Series
        Numeric series with the same index as ``age``. Missing entries stay
        missing instead of raising.
    """
    ## Vectorized slice: no per-row Python loop and no reliance on the index
    ## labels being 0..n-1
    return pd.to_numeric(age.str[:-1])


## Reading data-files
train = pd.read_csv(file_content_stream_1)
train['age'] = strip_age_suffix(train['age'])

test = pd.read_csv(file_content_stream_2)
test['age'] = strip_age_suffix(test['age'])

# Basic Exploration

In [3]:
## Preview the first rows of the training data
train.head()

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,30894,0,0.155646,59,0,0.596508,7330.0,12,0,3,0,0.0
1,59640,0,0.925853,42,0,0.51376,6249.0,6,0,2,0,4.0
2,83465,0,0.066454,62,1,0.346516,8166.0,17,0,2,0,1.0
3,144373,0,0.04492,49,1,838.0,,9,0,1,0,3.0
4,98153,0,0.429878,51,2,10046.0,,11,0,3,0,1.0


In [4]:
## Share of training rows that fall in each target class
train['SeriousDlqin2yrs'].value_counts().div(len(train))

0    0.932773
1    0.067227
Name: SeriousDlqin2yrs, dtype: float64

In [5]:
## Summary statistics of the numeric training columns
train.describe()

Unnamed: 0,Id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,73363.0,73363.0,73363.0,73363.0,73363.0,73363.0,58844.0,73363.0,73363.0,73363.0,73363.0,71431.0
mean,75067.06855,0.067227,4.992171,52.321743,0.423074,350.029652,6660.668,8.465316,0.266933,1.020623,0.241811,0.757066
std,43338.771548,0.250417,188.133666,14.737304,4.215878,2023.108132,12687.67,5.138928,4.192838,1.138891,4.178167,1.110684
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37429.0,0.0,0.029743,41.0,0.0,0.175275,3380.0,5.0,0.0,0.0,0.0,0.0
50%,75093.0,0.0,0.153792,52.0,0.0,0.366823,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112696.0,0.0,0.559024,63.0,0.0,0.869059,8278.5,11.0,0.0,2.0,0.0,1.0
max,149998.0,1.0,20514.0,109.0,98.0,329664.0,1794060.0,58.0,98.0,54.0,98.0,13.0


In [11]:
## Preview the first rows of the test data
test.head()

Unnamed: 0,Id,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,57463,0.043638,78,0,50.0,,4,0,0,0,0.0
1,19175,0.026696,31,0,2.28266,420.0,9,0,1,0,0.0
2,57764,0.101431,64,0,0.332949,4333.0,8,0,2,0,1.0
3,89649,0.028622,62,0,0.258538,10833.0,23,0,1,0,2.0
4,124065,0.602949,35,0,0.312312,5327.0,10,0,0,0,1.0


# Baseline Model

In [None]:
## Features are every training column except 'Id' and the target
X = train.drop(columns = ['Id', 'SeriousDlqin2yrs'])
Y = train['SeriousDlqin2yrs']

## Defining the hyper-parameter grid
XGBoost_param_grid = {'n_estimators': [100],
                      'max_depth': [5, 7],
                      'min_child_weight': [5, 7, 10],
                      'learning_rate': [0.01, 0.001],
                      'gamma': [0.3, 0.1],
                      'subsample': [0.8, 1],
                      'colsample_bytree': [0.8, 1]}

## Performing grid search with 5 folds
## The seed is pinned explicitly because the grid includes row/column
## subsampling below 1, which makes the fits stochastic
XGBoost_grid_search = GridSearchCV(XGBClassifier(random_state = 0),
                                   XGBoost_param_grid,
                                   cv = 5,
                                   scoring = 'roc_auc',
                                   n_jobs = -1,
                                   verbose = 3)
XGBoost_grid_search.fit(X, Y)

## Extracting the best score (mean cross-validated ROC AUC of the best candidate)
best_score = XGBoost_grid_search.best_score_
print('The best area under the ROC curve is:', best_score)

## Extracting the best model
XGBoost_md = XGBoost_grid_search.best_estimator_

## Predicting on test with best xgboost model
## Selecting by X.columns guarantees the same features, in the same order,
## as the model was trained on; column 1 is the positive-class probability
xgb_pred = XGBoost_md.predict_proba(test[X.columns])[:, 1]
xgb_pred

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [22]:
## Summary statistics of the feature matrix
## NOTE(review): the saved output below has no 'age' column, although X is built
## from train without dropping it -- presumably produced from an older kernel
## state; confirm by re-running after Restart & Run All
X.describe()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,73363.0,73363.0,73363.0,58844.0,73363.0,73363.0,73363.0,73363.0,71431.0
mean,4.992171,0.423074,350.029652,6660.668,8.465316,0.266933,1.020623,0.241811,0.757066
std,188.133666,4.215878,2023.108132,12687.67,5.138928,4.192838,1.138891,4.178167,1.110684
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.029743,0.0,0.175275,3380.0,5.0,0.0,0.0,0.0,0.0
50%,0.153792,0.0,0.366823,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.559024,0.0,0.869059,8278.5,11.0,0.0,2.0,0.0,1.0
max,20514.0,98.0,329664.0,1794060.0,58.0,98.0,54.0,98.0,13.0
