In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(context='paper', style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.model_selection import GridSearchCV

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
df_2 = pd.read_csv('Cleaned_data_w_dummies.csv', index_col = 0)

In [3]:
df_2

Unnamed: 0,LIMIT_BAL,SEX,AGE,September_status,August_status,July_status,June_status,May_status,April_status,Default,...,May_status_7,May_status_8,April_status_0,April_status_2,April_status_3,April_status_4,April_status_5,April_status_6,April_status_7,April_status_8
0,200000,0,30,2,2,2,2,2,2,0,...,0,0,0,1,0,0,0,0,0,0
1,200000,0,27,-1,-1,-1,-1,-1,-1,0,...,0,0,0,0,0,0,0,0,0,0
2,20000,0,28,0,0,2,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,50000,1,23,0,0,0,-1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,20000,1,47,-1,-1,-1,-1,-1,-1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22495,50000,0,50,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
22496,80000,0,53,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
22497,50000,0,50,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
22498,220000,1,32,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [4]:
X = df_2.drop('Default', axis = 1)
y = df_2['Default']
feature_cols = X.columns

In [5]:
feature_cols = X.columns

In [None]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
vif.round(5)

## Creating all polynomial features 

In [None]:
poly = PolynomialFeatures(2)

In [None]:
polynomial_features_2 = PolynomialFeatures(degree=2, include_bias=False)

#create polynomials and interactions

In [None]:
X_poly = polynomial_features_2.fit_transform(X)
poly_columns = polynomial_features_2.get_feature_names(X.columns)

In [None]:
X_poly_df = pd.DataFrame(X_poly, columns = poly_columns)

In [None]:
# Create correlation matrix
corr_matrix = X_poly_df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [None]:
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

In [None]:
len(to_drop)

In [None]:
X_poly_df.drop(columns = to_drop, inplace = True)

In [None]:
X_poly_df.shape

___

## Logistic regression with balanced weighting

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [7]:
scaler = StandardScaler()
final_scaler = scaler.fit(X_train)
X_train = pd.DataFrame(data=scaler.transform(X_train), columns = X.columns)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns = X.columns)

In [8]:
lr_clf_weighted = LogisticRegression(penalty='l1', tol = .01, max_iter = 5000, 
                                     solver='saga', class_weight='balanced')

lr_clf_weighted.fit(X_train, y_train)

y_weighted_test = lr_clf_weighted.predict(X_test)

In [9]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, y_weighted_test))

# checking precision
print('Precision score: ', metrics.precision_score(y_test, y_weighted_test))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test, y_weighted_test))

# checking accuracy
print('Test F1 score: ', f1_score(y_test, y_weighted_test))

Test Accuracy score:  0.7754666666666666
Precision score:  0.5090027700831025
Recall score:  0.5702094647013188
Test F1 score:  0.5378704720087815


## Gridsearch CV with logistic regression

In [47]:
# penalty_value = ['l1', 'l2']
# tol_value = [.1, .01, .001]
# solver_values = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
max_iter_value = [100, 1000, 5000, 10000]

In [49]:
param_grid = {'max_iter': max_iter_value}

In [51]:
lr_cv = LogisticRegression(penalty='l1', tol = .01, solver='saga', class_weight='balanced')

In [52]:
grid = GridSearchCV(lr_cv, param_grid)

In [53]:
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l1',
                                          random_state=None, solver='saga',
                                          tol=0.01, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'max_iter': [100, 1000, 5000, 10000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [54]:
grid.cv_results_

{'mean_fit_time': array([0.38921313, 0.3860764 , 0.38317685, 0.38290706]),
 'std_fit_time': array([0.03598365, 0.03202418, 0.0326983 , 0.02895091]),
 'mean_score_time': array([0.00136356, 0.00125117, 0.00128717, 0.00129147]),
 'std_score_time': array([2.07804156e-04, 6.58073977e-05, 1.39910425e-04, 9.67957586e-05]),
 'param_max_iter': masked_array(data=[100, 1000, 5000, 10000],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_iter': 100},
  {'max_iter': 1000},
  {'max_iter': 5000},
  {'max_iter': 10000}],
 'split0_test_score': array([0.79022222, 0.79111111, 0.79081481, 0.79051852]),
 'split1_test_score': array([0.76503704, 0.76444444, 0.76474074, 0.76474074]),
 'split2_test_score': array([0.76977778, 0.76977778, 0.76977778, 0.76977778]),
 'split3_test_score': array([0.76562963, 0.76562963, 0.76622222, 0.76592593]),
 'split4_test_score': array([0.77303704, 0.77303704, 0.77333333, 0.77333333]),
 'mean_test_score': arra

In [None]:
# grid.cv_results_['mean_test_score'].max()