In [1]:
%load_ext lab_black

## Load Packages and Data

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv("clean_default.csv")

In [4]:
df

Unnamed: 0,default,student,balance,income
0,False,False,729.526495,44361.625074
1,False,True,817.180407,12106.134700
2,False,False,1073.549164,31767.138947
3,False,False,529.250605,35704.493935
4,False,False,785.655883,38463.495879
...,...,...,...,...
9995,False,False,711.555020,52992.378914
9996,False,False,757.962918,19660.721768
9997,False,False,845.411989,58636.156984
9998,False,False,1569.009053,36669.112365


## EDA

In [5]:
df["default"].value_counts()

False    9667
True      333
Name: default, dtype: int64

In [6]:
9667 / (9667 + 333)

0.9667

In [7]:
df.dtypes

default       bool
student       bool
balance    float64
income     float64
dtype: object

## Preprocess

In [8]:
df.columns

Index(['default', 'student', 'balance', 'income'], dtype='object')

In [9]:
X = df[["student", "balance", "income"]].values

In [10]:
y = df["default"].values

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Dummy Classifier

In [12]:
dc = DummyClassifier(strategy="most_frequent")

In [13]:
dc.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [14]:
y_pred = dc.predict(X_test)

In [15]:
y_pred_proba = dc.predict_proba(X_test)

In [16]:
y_pred_proba

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.97      1.00      0.98      2419
        True       0.00      0.00      0.00        81

    accuracy                           0.97      2500
   macro avg       0.48      0.50      0.49      2500
weighted avg       0.94      0.97      0.95      2500



  _warn_prf(average, modifier, msg_start, len(result))


## Decision Tree

In [18]:
dt = DecisionTreeClassifier()

In [19]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [20]:
y_pred = dt.predict(X_test)

In [21]:
y_pred_proba = dt.predict_proba(X_test)
y_pred_proba

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.98      0.98      0.98      2419
        True       0.35      0.33      0.34        81

    accuracy                           0.96      2500
   macro avg       0.66      0.66      0.66      2500
weighted avg       0.96      0.96      0.96      2500



## Random Forest

In [23]:
rf = RandomForestClassifier()

In [24]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
y_pred = rf.predict(X_test)
y_pred

array([False, False, False, ..., False, False, False])

In [26]:
y_pred_proba = rf.predict_proba(X_test)
y_pred_proba

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       ...,
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01]])

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.98      0.99      0.98      2419
        True       0.50      0.28      0.36        81

    accuracy                           0.97      2500
   macro avg       0.74      0.64      0.67      2500
weighted avg       0.96      0.97      0.96      2500



## Naive Bayes

In [28]:
nb = GaussianNB()

In [29]:
nb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [30]:
y_pred = nb.predict(X_test)

In [31]:
y_pred_proba = nb.predict_proba(X_test)
y_pred_proba

array([[9.39134715e-01, 6.08652848e-02],
       [9.99327840e-01, 6.72160247e-04],
       [9.99999190e-01, 8.09730698e-07],
       ...,
       [9.99999314e-01, 6.85764937e-07],
       [9.99887839e-01, 1.12161030e-04],
       [8.07991411e-01, 1.92008589e-01]])

In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.97      0.99      0.98      2419
        True       0.52      0.20      0.29        81

    accuracy                           0.97      2500
   macro avg       0.74      0.60      0.63      2500
weighted avg       0.96      0.97      0.96      2500



## Logistic Regression

In [33]:
lr = LogisticRegression()

In [34]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
y_pred = lr.predict(X_test)

In [36]:
y_pred_proba = lr.predict_proba(X_test)
y_pred_proba

array([[7.87225095e-01, 2.12774905e-01],
       [9.95695330e-01, 4.30466989e-03],
       [9.97125099e-01, 2.87490142e-03],
       ...,
       [9.99424467e-01, 5.75532940e-04],
       [9.99020655e-01, 9.79344774e-04],
       [9.49649113e-01, 5.03508872e-02]])

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.97      0.99      0.98      2419
        True       0.48      0.20      0.28        81

    accuracy                           0.97      2500
   macro avg       0.73      0.60      0.63      2500
weighted avg       0.96      0.97      0.96      2500

