<a href="https://colab.research.google.com/github/CJO100293/Project-4/blob/main/ML/ML_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing the Needed Dependencies
import warnings
from warnings import simplefilter
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.utils.validation import check_array
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from pprint import pprint
import csv

# Creating Model To Train Data Using Logistic Regression Algorithm

## Loading "ETL_credit_data.csv" Into Dataframe

In [None]:
# URL the ETL_credit_data.csv file is downloaded from
LR_url = "https://nextcloud.unknowntunnel.com/s/ybcswYZDTBWi3Nx/download/ETL_credit_data.csv"

# Read the remote CSV straight into a DataFrame for the Logistic Regression section
LR_credit_data_df = pd.read_csv(filepath_or_buffer=LR_url)

# Preview the first five rows
LR_credit_data_df.head(n=5)

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio,Bankruptcies
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276,0.0
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373,0.0
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993,0.0
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376,1.0
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861,0.0


### Create the labels set (`y`) from the `Bankruptcies` column, and then create the features (`X`) DataFrame from the remaining columns.

In [None]:
# Features (X): every column except the 'Bankruptcies' target
LR_X = LR_credit_data_df.drop('Bankruptcies', axis=1)

# Labels (y): the 'Bankruptcies' column on its own
LR_y = LR_credit_data_df['Bankruptcies']

In [None]:
# Show the first five labels
LR_y.head(n=5)

0    0.0
1    0.0
2    0.0
3    1.0
4    0.0
Name: Bankruptcies, dtype: float64

In [None]:
# Show the first five feature rows
LR_X.head(n=5)

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861


### Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [None]:
# Count how many rows fall into each 'Bankruptcies' class to see how
# balanced (or imbalanced) the target is before splitting
LR_y.value_counts()

Bankruptcies
0.0    8911
1.0    1082
Name: count, dtype: int64

In [None]:
### Split the data into training and testing datasets by using `train_test_split`.

In [None]:
# Hold out a test set. random_state=1 makes the split reproducible and
# stratify=LR_y keeps the class proportions the same in both splits.
LR_X_train, LR_X_test, LR_y_train, LR_y_test = train_test_split(
    LR_X,
    LR_y,
    stratify=LR_y,
    random_state=1,
)

# Rows and columns in the training features
LR_X_train.shape

(7494, 13)

## Create a Logistic Regression Model with the Original Data

### Fit a logistic regression model by using the training data (`LR_X_train` and `LR_y_train`).

In [None]:
# Baseline classifier trained on the original (non-resampled) data.
# NOTE(review): the features are passed in unscaled; confirm that is intended.
Log_Reg_Model = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')

# Display the configured estimator
Log_Reg_Model

In [None]:
# Train the baseline model on the training split
Log_Reg_Model.fit(X=LR_X_train, y=LR_y_train)

### Save the predictions on the testing data labels by using the testing feature data (`LR_X_test`) and the fitted model.

In [None]:
# Predict a label for every row of the held-out test features
Log_Reg_Model_Predictions = Log_Reg_Model.predict(X=LR_X_test)

### Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Confusion matrix for the baseline model: true labels first, predictions second
confusion_matrix(y_true=LR_y_test, y_pred=Log_Reg_Model_Predictions)

array([[2228,    0],
       [ 271,    0]], dtype=int64)

In [None]:
# Fraction of test rows the baseline Logistic Regression model labelled correctly
LR_acc_score = accuracy_score(y_true=LR_y_test, y_pred=Log_Reg_Model_Predictions)

# Report the score
print(f"Testing Data Accuracy Score: {LR_acc_score}")

Testing Data Accuracy Score: 0.8915566226490597


In [None]:
# Printing Classification Report for Logistic Regression Algorithm
# NOTE(review): the target column is 'Bankruptcies', but the display names
# below talk about loan default - confirm the intended wording.
target_names = ["Loan Not Defaulted", "Loan Defaulted"]

# When a model never predicts a class, that class's precision is undefined
# (0 / 0). zero_division=0 reports it as 0.0 explicitly instead of emitting an
# UndefinedMetricWarning for every averaged metric.
print(
    classification_report(
        y_true=LR_y_test,
        y_pred=Log_Reg_Model_Predictions,
        target_names=target_names,
        zero_division=0,
    )
)

                    precision    recall  f1-score   support

Loan Not Defaulted       0.89      1.00      0.94      2228
    Loan Defaulted       0.00      0.00      0.00       271

          accuracy                           0.89      2499
         macro avg       0.45      0.50      0.47      2499
      weighted avg       0.79      0.89      0.84      2499



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Fit a Logistic Regression Model with Resampled Training Data

### Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points.

In [None]:
# Random oversampler; random_state=1 keeps the resampling reproducible
ROS = RandomOverSampler(random_state=1)

# Resample only the training split so the test split is left untouched
X_ROS, y_ROS = ROS.fit_resample(X=LR_X_train, y=LR_y_train)

In [None]:
# Count the rows per class in the resampled labels to confirm that the
# oversampler produced an equal number of samples for each class
y_ROS.value_counts()

Bankruptcies
0.0    6683
1.0    6683
Name: count, dtype: int64

In [None]:
# Same Logistic Regression configuration as the baseline model,
# this time trained on the oversampled training data
Log_Reg_ROS = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')
Log_Reg_ROS.fit(X=X_ROS, y=y_ROS)

# Predict on the original (non-resampled) test features
Log_Reg_ROS_Predictions = Log_Reg_ROS.predict(X=LR_X_test)

### Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Confusion matrix for the resampled model: true labels first, predictions second
confusion_matrix(y_true=LR_y_test, y_pred=Log_Reg_ROS_Predictions)

array([[1564,  664],
       [ 141,  130]], dtype=int64)

In [None]:
# Fraction of test rows the resampled Logistic Regression model labelled correctly
LR_testing_acc_score = accuracy_score(y_true=LR_y_test, y_pred=Log_Reg_ROS_Predictions)

# Report the score
print(f"Testing Data Accuracy Score: {LR_testing_acc_score}")

Testing Data Accuracy Score: 0.6778711484593838


In [None]:
# Per-class precision on the test split; average=None returns one score
# per class instead of a single averaged number
LR_testing_prec_score = precision_score(
    y_true=LR_y_test,
    y_pred=Log_Reg_ROS_Predictions,
    average=None,
)

# Report the per-class scores
print(f"Testing Data Precision Score Averages: {LR_testing_prec_score}")

Testing Data Precision Score Averages: [0.91730205 0.16372796]


In [None]:
# Per-class recall on the test split; average=None returns one score
# per class instead of a single averaged number
LR_testing_rec_score = recall_score(
    y_true=LR_y_test,
    y_pred=Log_Reg_ROS_Predictions,
    average=None,
)

# Report the per-class scores
print(f"Testing Data Recall Score Averages: {LR_testing_rec_score}")

Testing Data Recall Score Averages: [0.70197487 0.4797048 ]


In [None]:
# Display names for the two target classes, in label order
# NOTE(review): the target column is 'Bankruptcies' - confirm this wording.
LR_target_names = ["Loan Not Defaulted", "Loan Defaulted"]

# Precision / recall / F1 summary for the resampled Logistic Regression model
print(
    classification_report(
        y_true=LR_y_test,
        y_pred=Log_Reg_ROS_Predictions,
        target_names=LR_target_names,
    )
)

                    precision    recall  f1-score   support

Loan Not Defaulted       0.92      0.70      0.80      2228
    Loan Defaulted       0.16      0.48      0.24       271

          accuracy                           0.68      2499
         macro avg       0.54      0.59      0.52      2499
      weighted avg       0.84      0.68      0.74      2499



# Creating Model To Train Data Using KNeighbors Algorithm

## Reloading "ETL_credit_data.csv" Into Dataframe

In [None]:
# URL the ETL_credit_data.csv file is downloaded from
KN_url = "https://nextcloud.unknowntunnel.com/s/ybcswYZDTBWi3Nx/download/ETL_credit_data.csv"

# Read the remote CSV into a separate DataFrame for the KNeighbors section
KN_credit_data_df = pd.read_csv(filepath_or_buffer=KN_url)

# Preview the first five rows
KN_credit_data_df.head(n=5)

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio,Bankruptcies
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276,0.0
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373,0.0
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993,0.0
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376,1.0
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861,0.0


In [None]:
# Features: every column except the 'Bankruptcies' target
KN_X = KN_credit_data_df.drop('Bankruptcies', axis=1)
# Target: the 'Bankruptcies' column on its own
KN_y = KN_credit_data_df['Bankruptcies']

In [None]:
# First five feature rows
KN_X.head(5)

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861


In [None]:
# First five target values
KN_y.head(5)

0    0.0
1    0.0
2    0.0
3    1.0
4    0.0
Name: Bankruptcies, dtype: float64

In [None]:
# One-hot encode any categorical feature columns.
# NOTE(review): the previews before and after this cell show the same columns,
# so this step appears to leave the data unchanged - confirm it is still needed.
KN_X = pd.get_dummies(data=KN_X)

In [None]:
# First five rows of the encoded features
KN_X.head(n=5)

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861


In [None]:
# Hold out a test set; random_state=1 makes the split reproducible.
# NOTE(review): unlike the Logistic Regression split, no stratify= argument is
# passed here - confirm that is intended for this imbalanced target.
KN_X_train, KN_X_test, KN_y_train, KN_y_test = train_test_split(
    KN_X,
    KN_y,
    random_state=1,
)

In [None]:
# Fit the scaler on the training split only, so nothing from the test split
# influences the scaling parameters
KN_scaler = StandardScaler()
KN_X_scaler = KN_scaler.fit(X=KN_X_train)

# Apply the fitted scaler to both the training and the testing features
KN_X_train_scaled, KN_X_test_scaled = (
    KN_X_scaler.transform(split) for split in (KN_X_train, KN_X_test)
)

In [None]:
# Create the K-nearest-neighbours classifier; each prediction is based on
# the 3 closest training samples (k = 3)
KN_model = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the classifier on the scaled training features
KN_model.fit(X=KN_X_train_scaled, y=KN_y_train)

In [None]:
# Predict a label for every row of the scaled test features
KN_testing_predictions = KN_model.predict(X=KN_X_test_scaled)

# Display the predicted labels
KN_testing_predictions

array([0., 0., 0., ..., 1., 1., 0.])

In [None]:
# Print confusion matrix
# confusion_matrix expects (y_true, y_pred). The arguments were previously
# passed the other way round, which transposed the matrix and swapped the
# false-positive and false-negative counts. With the correct order, rows are
# the actual classes and columns are the predicted classes, matching the
# confusion matrices of the other models in this notebook.
confusion_matrix(y_true=KN_y_test, y_pred=KN_testing_predictions)

array([[2186,   19],
       [  66,  228]], dtype=int64)

In [None]:
# Fraction of test rows the KNeighbors model labelled correctly
KN_testing_acc_score = accuracy_score(y_true=KN_y_test, y_pred=KN_testing_predictions)

# Report the score
print(f"Testing Data Accuracy Score: {KN_testing_acc_score}")

Testing Data Accuracy Score: 0.9659863945578231


In [None]:
# Per-class precision on the test split; average=None returns one score
# per class instead of a single averaged number
KN_testing_prec_score = precision_score(
    y_true=KN_y_test,
    y_pred=KN_testing_predictions,
    average=None,
)

# Report the per-class scores
print(f"Testing Data Precision Score Averages: {KN_testing_prec_score}")

Testing Data Precision Score Averages: [0.99138322 0.7755102 ]


In [None]:
# Per-class recall on the test split; average=None returns one score
# per class instead of a single averaged number
KN_testing_rec_score = recall_score(
    y_true=KN_y_test,
    y_pred=KN_testing_predictions,
    average=None,
)

# Report the per-class scores
print(f"Testing Data Recall Score Averages: {KN_testing_rec_score}")

Testing Data Recall Score Averages: [0.97069272 0.92307692]


In [None]:
# Display names for the two target classes, in label order
# NOTE(review): the target column is 'Bankruptcies' - confirm this wording.
KN_target_names = ["Loan Not Defaulted", "Loan Defaulted"]

# Precision / recall / F1 summary for the KNeighbors model
print(
    classification_report(
        y_true=KN_y_test,
        y_pred=KN_testing_predictions,
        target_names=KN_target_names,
    )
)

                    precision    recall  f1-score   support

Loan Not Defaulted       0.99      0.97      0.98      2252
    Loan Defaulted       0.78      0.92      0.84       247

          accuracy                           0.97      2499
         macro avg       0.88      0.95      0.91      2499
      weighted avg       0.97      0.97      0.97      2499



# Creating Model To Train Data Using Random Forest Algorithm

## Reloading "ETL_credit_data.csv" Into Dataframe One Last Time

In [None]:
# URL the ETL_credit_data.csv file is downloaded from
RF_url = "https://nextcloud.unknowntunnel.com/s/ybcswYZDTBWi3Nx/download/ETL_credit_data.csv"

# Read the remote CSV into a separate DataFrame for the Random Forest section
RF_credit_data_df = pd.read_csv(filepath_or_buffer=RF_url)

# Preview the first five rows
RF_credit_data_df.head(n=5)

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio,Bankruptcies
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276,0.0
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373,0.0
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993,0.0
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376,1.0
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861,0.0


## Loading and Preprocessing Data

In [None]:
# Features: a new DataFrame holding every column except the 'Bankruptcies'
# target (the loaded DataFrame itself is left untouched)
RF_X = RF_credit_data_df.drop(columns="Bankruptcies")

# Preview the first five feature rows
RF_X.head(n=5)

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Income,Monthly Debt,Debt / Income Ratio,Credit History(Years),Months Delinquent,Open Accounts,Credit Problems,Credit Balance,Total Credit,Credit Usage Ratio
0,611314,747.0,2074116.0,172843.0,42000.83,0.243,21.8,0.0,9,0,621908,1058970,0.587276
1,266662,734.0,1919190.0,159932.5,36624.4,0.228999,19.4,0.0,11,0,679573,904442,0.751373
2,153494,709.0,871112.0,72592.67,8391.73,0.1156,12.5,10.0,10,0,38532,388036,0.0993
3,176242,727.0,780083.0,65006.92,16771.87,0.258001,16.5,27.0,16,1,156940,531322,0.295376
4,321992,744.0,1761148.0,146762.33,39478.77,0.268998,26.0,44.0,14,0,359765,468072,0.76861


In [None]:
# Define target vector
# Series.ravel() is deprecated in recent pandas releases (and the notebook's
# FutureWarning filter hides the deprecation message). A Series is already
# one-dimensional, so to_numpy() yields the same 1-D NumPy array.
RF_y = RF_credit_data_df["Bankruptcies"].to_numpy()

# Preview the first five target values
RF_y[:5]

array([0., 0., 0., 1., 0.])

In [None]:
# Hold out a test set; random_state=1 makes the split reproducible.
# NOTE(review): unlike the Logistic Regression split, no stratify= argument is
# passed here - confirm that is intended for this imbalanced target.
RF_X_train, RF_X_test, RF_y_train, RF_y_test = train_test_split(
    RF_X,
    RF_y,
    random_state=1,
)

In [None]:
# Create the StandardScaler instance used to scale the Random Forest
# features (it is fitted on the training split in a later cell)
RF_scaler = StandardScaler()

In [None]:
# Fit the StandardScaler on the training features only
RF_X_scaler = RF_scaler.fit(X=RF_X_train)

In [None]:
# Apply the fitted scaler to both the training and the testing features
RF_X_train_scaled, RF_X_test_scaled = (
    RF_X_scaler.transform(split) for split in (RF_X_train, RF_X_test)
)

## Fitting the Random Forest Model

In [None]:
# Random forest of 500 trees; random_state=78 makes training reproducible
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=500,
)

In [None]:
# Train the forest on the scaled training features (fit returns the estimator)
rf_model = rf_model.fit(X=RF_X_train_scaled, y=RF_y_train)

## Making Predictions Using the Random Forest Model

In [None]:
# Predict a label for every row of the scaled test features
RF_testing_predictions = rf_model.predict(X=RF_X_test_scaled)

# Display the predicted labels
RF_testing_predictions

array([0., 0., 0., ..., 1., 1., 0.])

## Model Evaluation

In [None]:
# Confusion matrix for the Random Forest model: true labels first, predictions second
confusion_matrix(y_true=RF_y_test, y_pred=RF_testing_predictions)

array([[2180,   72],
       [   8,  239]], dtype=int64)

In [None]:
# Fraction of test rows the Random Forest model labelled correctly
RF_testing_acc_score = accuracy_score(y_true=RF_y_test, y_pred=RF_testing_predictions)

# Report the score
print(f"Testing Data Accuracy Score: {RF_testing_acc_score}")

Testing Data Accuracy Score: 0.9679871948779512


In [None]:
# Per-class precision on the test split; average=None returns one score
# per class instead of a single averaged number
RF_testing_prec_score = precision_score(
    y_true=RF_y_test,
    y_pred=RF_testing_predictions,
    average=None,
)

# Report the per-class scores
print(f"Testing Data Precision Score Averages: {RF_testing_prec_score}")

Testing Data Precision Score Averages: [0.99634369 0.76848875]


In [None]:
# Per-class recall on the test split; average=None returns one score
# per class instead of a single averaged number
RF_testing_rec_score = recall_score(
    y_true=RF_y_test,
    y_pred=RF_testing_predictions,
    average=None,
)

# Report the per-class scores
print(f"Testing Data Recall Score Averages: {RF_testing_rec_score}")

Testing Data Recall Score Averages: [0.96802842 0.96761134]


In [None]:
# Display names for the two target classes, in label order
# NOTE(review): the target column is 'Bankruptcies' - confirm this wording.
RF_target_names = ["Loan Not Defaulted", "Loan Defaulted"]

# Precision / recall / F1 summary for the Random Forest model
print(
    classification_report(
        y_true=RF_y_test,
        y_pred=RF_testing_predictions,
        target_names=RF_target_names,
    )
)

                    precision    recall  f1-score   support

Loan Not Defaulted       1.00      0.97      0.98      2252
    Loan Defaulted       0.77      0.97      0.86       247

          accuracy                           0.97      2499
         macro avg       0.88      0.97      0.92      2499
      weighted avg       0.97      0.97      0.97      2499

