In [8]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_curve
from sklearn.impute import KNNImputer

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Necesito-un-credito/train.csv'
file_key_2 = 'Necesito-un-credito/test.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
train['age'] = [train['age'][i][:-1] for i in range(0, train.shape[0])]
train['age'] = pd.to_numeric(train['age'])

test = pd.read_csv(file_content_stream_2)
test['age'] = [test['age'][i][:-1] for i in range(0, test.shape[0])]
test['age'] = pd.to_numeric(test['age'])

test_id = test['Id']
test = test.drop(columns = ['Id'], axis = 1)

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
train['missing_income'] = np.where(np.isnan(train['MonthlyIncome']) == True, 1, 0)
train['missing_dependents'] = np.where(np.isnan(train['NumberOfDependents']) == True, 1, 0)
train.head()

In [None]:
plt.figure(figsize = (10, 8))
sns.scatterplot(x = 'RevolvingUtilizationOfUnsecuredLines', y = 'age', data = train, hue = 'SeriousDlqin2yrs')
plt.show();

In [None]:
pd.crosstab(train['SeriousDlqin2yrs'], train['missing_income'])

In [None]:
4112 / (4112 + 54732)

In [None]:
820 / (13699 + 820)

In [None]:
pd.crosstab(train['SeriousDlqin2yrs'], train['missing_dependents'])

In [None]:
4852 / (4852 + 66579)

In [None]:
80 / (80 + 1852)

In [None]:
plt.figure(figsize = (10, 8))


# Feature Engineering

In [11]:
train['TotalLoans'] = train['NumberOfOpenCreditLinesAndLoans'] + train['NumberRealEstateLoansOrLines']
train['Late90days'] = np.where(train['NumberOfTimes90DaysLate'] >= 1, 1, 0)
train['Late60_90days'] = np.where(train['NumberOfTime60-89DaysPastDueNotWorse'] >= 1, 1, 0)
train['Late30_50days'] = np.where(train['NumberOfTime60-89DaysPastDueNotWorse'] >= 2, 1, 0)
train['missing_income'] = np.where(np.isnan(train['MonthlyIncome']) == True, 1, 0)
train['missing_dependents'] = np.where(np.isnan(train['NumberOfDependents']) == True, 1, 0)

test['TotalLoans'] = test['NumberOfOpenCreditLinesAndLoans'] + test['NumberRealEstateLoansOrLines']
test['Late90days'] = np.where(test['NumberOfTimes90DaysLate'] >= 1, 1, 0)
test['Late60_90days'] = np.where(test['NumberOfTime60-89DaysPastDueNotWorse'] >= 1, 1, 0)
test['Late30_50days'] = np.where(test['NumberOfTime60-89DaysPastDueNotWorse'] >= 2, 1, 0)
test['missing_income'] = np.where(np.isnan(test['MonthlyIncome']) == True, 1, 0)
test['missing_dependents'] = np.where(np.isnan(test['NumberOfDependents']) == True, 1, 0)

## Filling missing values with kNN
knn_imputer = KNNImputer(n_neighbors = 5, weights = 'distance')
train = pd.DataFrame(knn_imputer.fit_transform(train), columns = train.columns)
test = pd.DataFrame(knn_imputer.fit_transform(test), columns = test.columns)

In [12]:
## Defining input and target variables
X = train.drop(columns = ['Id', 'SeriousDlqin2yrs'], axis = 1)
Y = train['SeriousDlqin2yrs']

## Scaling inputs to 0-1
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
test = pd.DataFrame(scaler.fit_transform(test), columns = test.columns)

## Defining the hyper-parameter grid
logistic_param_grid = {'penalty': ['l1', 'l2', 'elasticnet'],
                       'C': [0.001, 0.01, 0.1, 1, 10, 100],
                       'solver': ['liblinear', 'sag', 'saga']}
#                        'class_weight': ['balanced']}

## Performing grid search with 5 folds
logistic_grid_search = GridSearchCV(LogisticRegression(), logistic_param_grid, cv = 5, scoring = 'roc_auc', n_jobs = -1, verbose = 1).fit(X, Y)

## Extracting the best parameters
best_params = logistic_grid_search.best_params_
print('The optimal hyper-parameters are:', best_params)

## Extracting the best score
best_score = logistic_grid_search.best_score_
print('The best area under the ROC cure is:', best_score)

## Extracting the best model
logistic_md = logistic_grid_search.best_estimator_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


120 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l

The optimal hyper-parameters are: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
The best area under the ROC cure is: 0.8139352954601824


In [13]:
# logit_md = LogisticRegression(C = 10, penalty = 'l1', solver = 'liblinear').fit(X, Y)
## Extracting the best model and its estimated parameters
logistic_md = logistic_grid_search.best_estimator_
coefs =  pd.DataFrame({'feature': X.columns, 'est_coef': abs(logistic_md.coef_.flatten())})
coefs = coefs.sort_values(by = 'est_coef', ascending = False).reset_index(drop = True)
coefs

Unnamed: 0,feature,est_coef
0,NumberOfTime60-89DaysPastDueNotWorse,66.161312
1,NumberOfTime30-59DaysPastDueNotWorse,46.607448
2,MonthlyIncome,40.08823
3,NumberOfTimes90DaysLate,17.755669
4,DebtRatio,17.041987
5,NumberRealEstateLoansOrLines,4.605962
6,age,3.012354
7,Late60_90days,1.877804
8,Late90days,1.663801
9,Late30_50days,1.370461


In [14]:
def roc_auc_cutoff(Y_test, Y_pred):
    
    ## Computing the precision recall curve
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
    
    cutoffs = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'cutoff': thresholds})

    ## Finding the optimal cut-off
    cutoffs['True_Positive_minus_1'] = cutoffs['True_Positive'] - 1
    cutoffs['Euclidean_dist'] = np.sqrt(cutoffs['False_Positive']**2 + cutoffs['True_Positive_minus_1']**2)

    ## Sorting based on the Euclidean distance
    cutoffs = cutoffs.sort_values(by = 'Euclidean_dist').reset_index(drop = True)
        
    return cutoffs['cutoff'][0]

## Predicting on train to estimate cutoff
logit_pred_train = logistic_md.predict_proba(X)[:, 1]
opt_cutoff = roc_auc_cutoff(Y, logit_pred_train)
print('The optimal cutoff is', opt_cutoff)

The optimal cutoff is 0.05577720438895364


In [15]:
# Predicting on test with best RF model 
logit_pred_test = logistic_md.predict_proba(test)[:, 1]
logit_pred_label = np.where(logit_pred_test < opt_cutoff, 0, 1)

## Defining data-frame to be exported
data_out = pd.DataFrame({'id': test_id, 'SeriousDlqin2yrs': logit_pred_label})
data_out.to_csv('Logistic_submission.csv', index = False)

In [17]:
data_out['SeriousDlqin2yrs'].value_counts() / data_out.shape[0]

1    0.592488
0    0.407512
Name: SeriousDlqin2yrs, dtype: float64



In [None]:
# Predicting on test with best RF model 
logistic_pred = logistic_md.predict_proba(test)[:, 1] 

## Defining data-frame to be exported
data_out = pd.DataFrame({'id': test_id, 'failure': logistic_pred})
data_out.to_csv('Logistic_submission.csv', index = False)