In [1]:
# Installing the needed dependencies
import sys
!{sys.executable} -m pip install imblearn



In [2]:
# Importing the Needed Dependencies
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.utils.validation import check_array
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from pprint import pprint
import csv
from imblearn.over_sampling import RandomOverSampler

## Loading "ETL_credit_data.csv" Into Dataframe

In [3]:
# Create reference to CSV file
csv_path = Path("../output_data/ETL/csv/ETL_credit_data.csv")

# Import the CSV into a pandas DataFrame
credit_data_df = pd.read_csv(csv_path)

# Display dataframe
credit_data_df.head()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio,Bankruptcies
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276,0.0
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373,0.0
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993,0.0
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376,1.0
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861,0.0


## Split the Data into Training and Testing Sets

### Step 1: Create the labels set (y) from the “Bankruptcies” column, and then create the features (X) DataFrame from the remaining columns

In [4]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = credit_data_df['Bankruptcies']

# Separate the X variable, the features
X = credit_data_df.drop(columns='Bankruptcies')

In [5]:
# Review the y variable Series
y.head()

0    0.0
1    0.0
2    0.0
3    1.0
4    0.0
Name: Bankruptcies, dtype: float64

In [6]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861


### Step 2: Check the balance of the labels variable (y) by using the value_counts function

In [7]:
# Check the balance of our target values
y.value_counts()

Bankruptcies
0.0    8911
1.0    1082
Name: count, dtype: int64

In [8]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(7494, 13)

## Create a Logistic Regression Model with the Original Data

### Step 1: Fit a logistic regression model by using the training data (X_train and y_train)

In [9]:
# Instantiate the Logistic Regression model
Log_Reg_Model = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
Log_Reg_Model

In [10]:
# Fit the model using training data
Log_Reg_Model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.

In [11]:
# Make a prediction using the testing data
testing_data_predictions = Log_Reg_Model.predict(X_test)
testing_data_results = pd.DataFrame({"Prediction": testing_data_predictions, "Actual": y_test}).reset_index(drop=True)
testing_data_results.head(10)

Unnamed: 0,Prediction,Actual
0,0.0,1.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,1.0
6,0.0,0.0
7,0.0,1.0
8,0.0,0.0
9,0.0,0.0


### Step 3: Evaluate the model’s performance by doing the following:
Calculate the accuracy score of the model.

Generate a confusion matrix.

Print the classification report.

In [12]:
# Grabbing the accuracy score for the test dataset.
testing_data_accuracy_score = accuracy_score(y_test, testing_data_predictions)

# Displaying the accuracy score for the test dataset.
print(f"Testing Data Accuracy Score: {testing_data_accuracy_score}")

Testing Data Accuracy Score: 0.8915566226490597


In [13]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, testing_data_predictions)

array([[2228,    0],
       [ 271,    0]], dtype=int64)

In [14]:
# Printing Classification Report
target_names = ["Loan Defaulted", "Loan Not Defaulted"]
print(classification_report(y_test, testing_data_predictions, target_names=target_names))

                    precision    recall  f1-score   support

    Loan Defaulted       0.89      1.00      0.94      2228
Loan Not Defaulted       0.00      0.00      0.00       271

          accuracy                           0.89      2499
         macro avg       0.45      0.50      0.47      2499
      weighted avg       0.79      0.89      0.84      2499



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
