In [None]:
# This code was prepared by Orhan Erdem
# Please email orhanerdem at gmail.com to report errors or make suggestions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import statsmodels.formula.api as smf

In [None]:
# Load data: read CC_Default.xlsx into a DataFrame and preview the first rows
default_df = pd.read_excel('CC_Default.xlsx')
default_df.head()

In [None]:
# One-hot encode the categorical columns as floats. drop_first=True drops the
# first level of each category, leaving indicator columns such as default_Yes
# and student_Yes, which the later cells refer to by name.
default_df = pd.get_dummies(
    default_df,
    drop_first=True,
    prefix_sep='_',
    dtype=float,
)
default_df.head()

In [None]:
# Pairwise correlations between all columns, rounded to three decimals
default_df.corr().round(decimals=3)

In [None]:
# Default indicator (0/1) plotted against account balance
sns.scatterplot(data=default_df, x='balance', y='default_Yes')

In [None]:
# Default indicator (0/1) plotted against income
sns.scatterplot(data=default_df, x='income', y='default_Yes')

In [None]:
# Box plot 1: income distribution, grouped by default status
ax = default_df.boxplot(by='default_Yes', column='income')
ax.set_ylabel('income')
# Blank out the figure-level super-title so only the axes title is shown
ax.figure.suptitle('')
ax.set_title('Income')
plt.show()

In [None]:
# Box plot 2: balance distribution, grouped by default status
ax = default_df.boxplot(by='default_Yes', column='balance')
ax.set_ylabel('balance')
# Blank out the figure-level super-title so only the axes title is shown
ax.figure.suptitle('')
ax.set_title('Balance')
plt.show()

In [None]:
# Summary statistics for the columns of the encoded frame
default_df.describe()

In [None]:
# Logistic regression of the default indicator on balance (with intercept),
# fitted on the full data set. disp=0 silences the optimizer's convergence
# printout. NOTE: the same model is fitted again, verbatim, in the next code
# cell under the "without Train and Test Split" heading.
ret_logit = smf.logit(
    formula='default_Yes~1+balance',
    data=default_df,
)
results_logit = ret_logit.fit(disp=0)
print(results_logit.summary())

# Logistic Regression without Train and Test Split

In [None]:
# Single-predictor model: default indicator ~ intercept + balance, fitted on
# every row (no hold-out). disp=0 silences the optimizer's convergence printout.
ret_logit = smf.logit(
    formula='default_Yes~1+balance',
    data=default_df,
)
results_logit = ret_logit.fit(disp=0)
print(results_logit.summary())

In [None]:
# In-sample fitted probabilities of default. This relies on results_logit
# still being the balance-only fit: the name is reassigned by later cells,
# so run this cell right after that fit.
pred = results_logit.predict(exog=default_df['balance'])

In [None]:
# Observed 0/1 outcomes, with the fitted probabilities overlaid as red '+' marks
sns.scatterplot(data=default_df, x='balance', y='default_Yes')
sns.scatterplot(data=default_df, x='balance', y=pred, marker='+', color='red')

In [None]:
# Single-predictor model: default indicator ~ intercept + income.
# Note that this overwrites ret_logit / results_logit from the balance model.
ret_logit = smf.logit(
    formula='default_Yes~1+income',
    data=default_df,
)
results_logit = ret_logit.fit(disp=0)
print(results_logit.summary())

In [None]:
# Multi-predictor model: default indicator ~ intercept + income + balance +
# student indicator. Overwrites ret_logit / results_logit once more.
ret_logit = smf.logit(
    formula='default_Yes~1+income+balance+student_Yes',
    data=default_df,
)
results_logit = ret_logit.fit(disp=0)
print(results_logit.summary())

# Logistic Regression with Train and Test Split

In [None]:
# Feature matrix: every column except the target; target: the default indicator
X = default_df.drop(columns='default_Yes')
y = default_df['default_Yes']

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split reproducible. train_test_split and LogisticRegression are already
# imported in the notebook's import cell, so they are not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=101)

In [None]:
# Fit scikit-learn's logistic regression on the training split (fit returns
# the estimator itself, so the calls can be chained).
# NOTE(review): LogisticRegression applies an L2 penalty by default, so these
# coefficients are not directly comparable to the statsmodels fits above.
logmodel = LogisticRegression(fit_intercept=True).fit(X_train, y_train)

# Hard class predictions for the held-out rows; reused by the report cells below
predictions = logmodel.predict(X_test)

print('Intercept is', logmodel.intercept_)
print('Coefficients are', logmodel.coef_)

In [None]:
# Couple of interesting cases
# Collect, for every test row, the true label, both class probabilities and
# the predicted label in one frame. The frame takes its index from y_test,
# i.e. the original row labels of the held-out rows.
logit_reg_proba = logmodel.predict_proba(X_test)
logit_reg_pred = logmodel.predict(X_test)
logit_result = pd.DataFrame({
    'actual': y_test,
    'p(0)': logit_reg_proba[:, 0],
    'p(1)': logit_reg_proba[:, 1],
    'predicted': logit_reg_pred,
})

In [None]:
# Hand-picked row labels to inspect. logit_result is indexed by the original
# row labels of the test split, so a label that landed in the training split
# is absent, and .loc with a list raises KeyError if any label is missing.
# Look up only the labels that are present and report the rest.
interestingCases = [6, 728, 2721, 4808]
availableCases = [case for case in interestingCases if case in logit_result.index]
missingCases = [case for case in interestingCases if case not in logit_result.index]
if missingCases:
    print('Not in the test split:', missingCases)
print(logit_result.loc[availableCases])

In [None]:
# Per-class precision, recall and F1 on the held-out test set
print(classification_report(y_test, predictions))

In [None]:
# Raw confusion matrix for the test set (rows: actual class, columns: predicted class)
print(confusion_matrix(y_test, predictions))

In [None]:
# Draw the test-set confusion matrix as a 2x2 heat map with the count written
# in each cell (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, logmodel.predict(X_test))

fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
# Inverted limits (bottom=1.5, top=-0.5) keep row 0 at the top and show both
# rows at full height.
ax.set_ylim(1.5, -0.5)
# Visit the four cells in row-major order; text is placed at (x=column, y=row)
for i, j in np.ndindex(2, 2):
    ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
plt.show()