In [None]:
# Importing the Needed Dependencies
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.utils.validation import check_array
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from pprint import pprint
import csv

# Creating Model To Train Data Using Logistic Regression Algorithm

## Loading "ETL_credit_data.csv" Into Dataframe

In [None]:
# URL of the ETL_credit_data.csv file used for the logistic regression section
RL_url = "https://nextcloud.unknowntunnel.com/s/ybcswYZDTBWi3Nx/download/ETL_credit_data.csv"

# Read the CSV directly from the URL into a pandas DataFrame
LR_credit_data_df = pd.read_csv(filepath_or_buffer=RL_url)

# Preview the first rows to confirm the load worked
LR_credit_data_df.head()

### Create the labels set (`y`) from the “Bankruptcies” column, and then create the features (`X`) DataFrame from the remaining columns.

In [None]:
# Split the DataFrame into the target (labels) and the predictors (features)
# Labels: the "Bankruptcies" column is what the model learns to predict
LR_y = LR_credit_data_df["Bankruptcies"]

# Features: every remaining column (drop returns a new DataFrame)
LR_X = LR_credit_data_df.drop("Bankruptcies", axis=1)

In [None]:
# Preview the first rows of the label Series (the "Bankruptcies" column)
LR_y.head()

In [None]:
# Preview the first rows of the feature DataFrame (all columns except "Bankruptcies")
LR_X.head()

### Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [None]:
# Count how many rows fall into each label value to see how balanced the classes are
LR_y.value_counts()

### Split the data into training and testing datasets by using `train_test_split`.

In [None]:
# Hold out a test set with train_test_split (default split size).
# stratify=LR_y keeps the class proportions the same in both partitions;
# random_state=1 makes the split repeatable.
LR_X_train, LR_X_test, LR_y_train, LR_y_test = train_test_split(
    LR_X, LR_y, stratify=LR_y, random_state=1
)

# Show the dimensions of the training features
LR_X_train.shape

## Create a Logistic Regression Model with the Original Data

###  Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [None]:
# Baseline classifier: logistic regression with the lbfgs solver, at most
# 200 iterations, and a fixed random_state
Log_Reg_Model = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')

# Display the configured (not yet fitted) estimator
Log_Reg_Model

In [None]:
# Fit the baseline model on the original (not resampled) training split
Log_Reg_Model.fit(LR_X_train, LR_y_train)

### Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
# Predict labels for the held-out test features with the fitted baseline model
Log_Reg_Model_Predictions = Log_Reg_Model.predict(LR_X_test)

### Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Confusion matrix for the baseline model: true test labels vs. predicted labels
confusion_matrix(y_true=LR_y_test, y_pred=Log_Reg_Model_Predictions)

In [None]:
# Accuracy of the baseline model on the test split
LR_acc_score = accuracy_score(y_true=LR_y_test, y_pred=Log_Reg_Model_Predictions)

# Report the score
print(f"Testing Data Accuracy Score: {LR_acc_score}")

In [None]:
# Per-class precision / recall / f1 report for the baseline model.
# NOTE(review): the target is the "Bankruptcies" column, but these display
# names describe loan default - confirm the intended labels and that the
# target has exactly two classes.
target_names = ["Loan Not Defaulted", "Loan Defaulted"]
print(
    classification_report(
        y_true=LR_y_test,
        y_pred=Log_Reg_Model_Predictions,
        target_names=target_names,
    )
)

## Fit a Logistic Regression Model with Resampled Training Data

### Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [None]:
# Random oversampler with a fixed random_state of 1
ROS = RandomOverSampler(random_state=1)

# Resample the training split only; the test split is left as-is
X_ROS, y_ROS = ROS.fit_resample(LR_X_train, LR_y_train)

In [None]:
# Count rows per label in the resampled training labels to confirm the classes are now balanced
y_ROS.value_counts()

In [None]:
# Same classifier configuration as the baseline model, kept as a separate
# estimator so the two can be compared
Log_Reg_ROS = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')

# Train on the oversampled training data
Log_Reg_ROS.fit(X_ROS, y_ROS)

# Predict on the original, untouched test features
Log_Reg_ROS_Predictions = Log_Reg_ROS.predict(LR_X_test)

### Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Confusion matrix for the model trained on resampled data: true vs. predicted test labels
confusion_matrix(y_true=LR_y_test, y_pred=Log_Reg_ROS_Predictions)

In [None]:
# Accuracy of the resampled-data model on the test split
LR_testing_acc_score = accuracy_score(y_true=LR_y_test, y_pred=Log_Reg_ROS_Predictions)

# Report the score
print(f"Testing Data Accuracy Score: {LR_testing_acc_score}")

In [None]:
# Precision for each class separately (average=None returns one score per class)
LR_testing_prec_score = precision_score(
    y_true=LR_y_test,
    y_pred=Log_Reg_ROS_Predictions,
    average=None,
)

# Report the per-class precision scores
print(f"Testing Data Precision Score Averages: {LR_testing_prec_score}")

In [None]:
# Recall for each class separately (average=None returns one score per class)
LR_testing_rec_score = recall_score(
    y_true=LR_y_test,
    y_pred=Log_Reg_ROS_Predictions,
    average=None,
)

# Report the per-class recall scores
print(f"Testing Data Recall Score Averages: {LR_testing_rec_score}")

In [None]:
# Per-class precision / recall / f1 report for the resampled-data model.
# NOTE(review): the target is the "Bankruptcies" column, but these display
# names describe loan default - confirm the intended labels and that the
# target has exactly two classes.
LR_target_names = ["Loan Not Defaulted", "Loan Defaulted"]
print(
    classification_report(
        y_true=LR_y_test,
        y_pred=Log_Reg_ROS_Predictions,
        target_names=LR_target_names,
    )
)

# Creating Model To Train Data Using Random Forest Algorithm

## Reloading "ETL_credit_data.csv" Into Dataframe

In [None]:
# URL of the ETL_credit_data.csv file used for the random forest section
RF_url = "https://nextcloud.unknowntunnel.com/s/ybcswYZDTBWi3Nx/download/ETL_credit_data.csv"

# Read the CSV directly from the URL into a separate DataFrame for this section
RF_credit_data_df = pd.read_csv(filepath_or_buffer=RF_url)

# Preview the first rows to confirm the load worked
RF_credit_data_df.head()

## Loading and Preprocessing Data

In [None]:
# Define features set: every column except the "Bankruptcies" target.
# drop() returns a new DataFrame, so RF_credit_data_df is left untouched and
# no explicit copy or in-place mutation is needed.
RF_X = RF_credit_data_df.drop(columns="Bankruptcies")

# Preview the first rows of the features
RF_X.head()

In [None]:
# Define target vector as a 1-D NumPy array taken from the "Bankruptcies" column.
# Series.ravel() is deprecated in recent pandas releases (2.2+); because this
# notebook silences FutureWarning in the import cell, the deprecation would go
# unnoticed. to_numpy() is the supported way to get the underlying array.
RF_y = RF_credit_data_df["Bankruptcies"].to_numpy()

# Preview the first five target values
RF_y[:5]

In [None]:
# Splitting into Train and Test sets.
# stratify=RF_y keeps the class proportions of the target the same in both
# partitions, matching the stratified split used for the logistic regression
# models so the two sections are evaluated on comparable test sets.
# random_state=1 makes the split repeatable.
RF_X_train, RF_X_test, RF_y_train, RF_y_test = train_test_split(
    RF_X, RF_y, random_state=1, stratify=RF_y
)

In [None]:
# Create the StandardScaler instance used to standardize the random forest features
scaler = StandardScaler()

In [None]:
# Fit the StandardScaler on the training features only, so the test split
# does not influence the scaling statistics
X_scaler = scaler.fit(RF_X_train)

In [None]:
# Apply the scaler fitted on the training data to both the training and test features
X_train_scaled = X_scaler.transform(RF_X_train)
X_test_scaled = X_scaler.transform(RF_X_test)

## Fitting the Random Forest Model

In [None]:
# Create a random forest classifier with 500 trees and a fixed seed (random_state=78)
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Fit the random forest on the scaled training features; the result is
# assigned back to rf_model, so the cell displays nothing
rf_model = rf_model.fit(X_train_scaled, RF_y_train)

## Making Predictions Using the Random Forest Model

In [None]:
# Predict labels for the scaled test features with the fitted random forest
RF_testing_predictions = rf_model.predict(X_test_scaled)
# Display the predicted labels
RF_testing_predictions

## Model Evaluation

In [None]:
# Confusion matrix for the random forest: true test labels vs. predicted labels
confusion_matrix(y_true=RF_y_test, y_pred=RF_testing_predictions)

In [None]:
# Accuracy of the random forest on the test split
RF_testing_acc_score = accuracy_score(y_true=RF_y_test, y_pred=RF_testing_predictions)

# Report the score
print(f"Testing Data Accuracy Score: {RF_testing_acc_score}")

In [None]:
# Precision for each class separately (average=None returns one score per class)
RF_testing_prec_score = precision_score(
    y_true=RF_y_test,
    y_pred=RF_testing_predictions,
    average=None,
)

# Report the per-class precision scores
print(f"Testing Data Precision Score Averages: {RF_testing_prec_score}")

In [None]:
# Recall for each class separately (average=None returns one score per class)
RF_testing_rec_score = recall_score(
    y_true=RF_y_test,
    y_pred=RF_testing_predictions,
    average=None,
)

# Report the per-class recall scores
print(f"Testing Data Recall Score Averages: {RF_testing_rec_score}")

In [None]:
# Per-class precision / recall / f1 report for the random forest.
# NOTE(review): the target is the "Bankruptcies" column, but these display
# names describe loan default - confirm the intended labels and that the
# target has exactly two classes.
RF_target_names = ["Loan Not Defaulted", "Loan Defaulted"]
print(
    classification_report(
        y_true=RF_y_test,
        y_pred=RF_testing_predictions,
        target_names=RF_target_names,
    )
)