## EEE485 - Project Final

### Imports

In [None]:
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import numpy.linalg as LA
import smote
import random
import kNN
import fcn
import PCA

### Load Data

In [None]:
raw_data = pd.read_csv("data.csv")
bankrupt_pd = raw_data["Bankrupt?"]
features_pd = raw_data.drop(["Bankrupt?"], axis=1)
raw_data

### Check for NAN and Duplicate Values


In [None]:
print("NAN values:", [col for col in features_pd if features_pd[col].isna().sum() > 0])
print("Duplicates:", features_pd.duplicated().sum())

We now know that we do not have any missing or duplicate data

### Evaluate Data Imbalance

In [None]:
unstable_initial = (raw_data["Bankrupt?"] == 1).sum()
stable_initial = (raw_data["Bankrupt?"] == 0).sum()
print("Data Size:", raw_data.shape[0])
print("# of stable companies:", stable_initial )
print("# of unstable companies:", unstable_initial )
print("Unstable to Stable Ratio: ", unstable_initial/stable_initial)

### PLOTS

In [None]:
raw_data.hist(figsize = (50,40), bins = 50)
plt.show()

In [None]:
f, axes = plt.subplots(ncols=4, figsize = (24,6) )

sns.boxplot(x="Bankrupt?", y=" Cash/Total Assets", data=raw_data, ax = axes[0] )
axes[0].set_title("Bankrupt vs Cash/Total Assets")

sns.boxplot(x="Bankrupt?", y=" Current Assets/Total Assets", data=raw_data, ax = axes[1] )
axes[1].set_title("Bankrupt vs Current Assets/Total Assets")

sns.boxplot(x="Bankrupt?", y=" Net worth/Assets", data=raw_data, ax = axes[2] )
axes[2].set_title("Bankrupt vs Net worth/Assets")

sns.boxplot(x="Bankrupt?", y=" Cash/Current Liability", data=raw_data, ax = axes[3] )
axes[3].set_title("Bankrupt vs Cash/Current Liability")

plt.show()

### Outlier Removal Using IQR

In [None]:
clean_data = raw_data.copy(deep=True)
for col in features_pd:
    clean_data = fcn.remove_outlier(raw_data[col], str(col), raw_data)
clean_data = clean_data.reset_index(drop=True)
clean_data

### Plots with Outliers Removed

In [None]:
clean_data.hist(figsize = (50,40), bins = 50)
plt.show()

In [None]:
f, axes = plt.subplots(ncols=4, figsize = (24,6) )

sns.boxplot(x="Bankrupt?", y=" Cash/Total Assets", data=clean_data, ax = axes[0] )
axes[0].set_title("Bankrupt vs Cash/Total Assets")

sns.boxplot(x="Bankrupt?", y=" Current Assets/Total Assets", data=clean_data, ax = axes[1] )
axes[1].set_title("Bankrupt vs Current Assets/Total Assets")

sns.boxplot(x="Bankrupt?", y=" Net worth/Assets", data=clean_data, ax = axes[2] )
axes[2].set_title("Bankrupt vs Net worth/Assets")

sns.boxplot(x="Bankrupt?", y=" Cash/Current Liability", data=clean_data, ax = axes[3] )
axes[3].set_title("Bankrupt vs Cash/Current Liability")

plt.show()

### PCA

In [None]:
import numpy as np
import numpy.linalg as LA

class PCAnalyser():

    def __init__(self, X: np.ndarray, data_centered=False) -> None:
        if data_centered:
            self.X = X
        else:
            self.X = X - np.mean(X, axis=0)
        self.Sigma = self.X.T @ self.X
        self.eigs = np.array([])
        return

    def analyse(self, k = 10):
        if k > self.Sigma.shape[0]: return

        eig_vals, eig_vecs = LA.eigh(self.Sigma)
        idx = np.argsort(eig_vals)[::-1]
        self.eigs = eig_vals[idx]
        eig_vecs = eig_vecs[:,idx]
        PCs = eig_vecs[:,0:k]

        return self.eigs, PCs

    def calc_PVE(self, m=10, individual=False):
        m = np.clip(m, 0, len(self.eigs))
        if individual:
            return self.eigs[m] / sum(self.eigs) # PVE(m)
        return sum(self.eigs[:m+1]) / sum(self.eigs) # PVE(first m)

In [None]:
clean_X = clean_data.drop(["Bankrupt?"], axis=1)
clean_Y = clean_data["Bankrupt?"]
centered_data = clean_X - np.mean(clean_X, axis=0)
pc_analyser = PCAnalyser(centered_data, data_centered=True)
eigen_vals, PCs = pc_analyser.analyse(k=8)

In [None]:
plt.figure(figsize=(6,4))
plt.plot(eigen_vals)
plt.xlabel("Index")
plt.ylabel("Eigen Value")
plt.title("Eigen Values of the Principal Components")
plt.xlim( (0, 30) )

In [None]:
PCA_data = centered_data @ PCs

In [None]:
PCA_data.columns = ("PC"+str(i) for i in range(1,9))
print("Shape of the Feature Matrix after PCA is:", PCA_data.shape)
PCA_data = pd.concat([clean_Y, PCA_data], axis=1)

### SMOTE

In [None]:
minority = PCA_data[PCA_data["Bankrupt?"] == 1] # Extract minority samples from data
smt = smote.Smote( minority.to_numpy() ) # Initialize the SMOTE class
oversamples = smt.oversample(N=2600) # Employ SMOTE oversampling


In [None]:
smote_data = PCA_data.copy(deep=True) # Cleared from outliers and dim reduced by PCA. Now oversample
oversamples_pd = pd.DataFrame(oversamples, columns = PCA_data.columns)
smote_data = smote_data.append(oversamples_pd)
smote_data = smote_data.reset_index(drop=True)

In [None]:
unstable_smote = (smote_data["Bankrupt?"] == 1).sum()
stable_smote = (smote_data["Bankrupt?"] == 0).sum()
print("Oversampled Data Size:", smote_data.shape[0])
print("Number of Stable Companies:", stable_smote)
print("Number of Unstable Companies (with SMOTE):", unstable_smote)
print("unstable to Stable Ratio: ", unstable_smote/stable_smote, sep="")

In [None]:
smote_data["Bankrupt?"].hist()
plt.show()

In [None]:
f, axes = plt.subplots(ncols=4, figsize = (24,6) )

sns.boxplot(x="Bankrupt?", y=" Cash/Total Assets", data=smote_data, ax = axes[0] )
axes[0].set_title("Bankrupt vs Cash/Total Assets")

sns.boxplot(x="Bankrupt?", y=" Current Assets/Total Assets", data=smote_data, ax = axes[1] )
axes[1].set_title("Bankrupt vs Current Assets/Total Assets")

sns.boxplot(x="Bankrupt?", y=" Net worth/Assets", data=smote_data, ax = axes[2] )
axes[2].set_title("Bankrupt vs Net worth/Assets")

sns.boxplot(x="Bankrupt?", y=" Cash/Current Liability", data=smote_data, ax = axes[3] )
axes[3].set_title("Bankrupt vs Cash/Current Liability")

plt.show()

### Test Train Split

In [128]:
test_ratio = 0.1
#Smote
train_sm, test_sm = fcn.test_train_split(smote_data, test_ratio )
X_train_sm = train_sm.drop(["Bankrupt?"], axis=1)
Y_train_sm = train_sm["Bankrupt?"]
X_test_sm = test_sm.drop(["Bankrupt?"], axis=1)
Y_test_sm = test_sm["Bankrupt?"]
#No Smote
train, test = fcn.test_train_split(clean_data, test_ratio )
X_train = train.drop(["Bankrupt?"], axis=1)
Y_train = train["Bankrupt?"]
X_test = test.drop(["Bankrupt?"], axis=1)
Y_test = test["Bankrupt?"]


### k-Nearest Neighbors Classifier (with and without SMOTE)

In [131]:
knn_classifier = kNN.k_NN_classifier(X_train.to_numpy(), Y_train.to_numpy() )
Y_test_np = Y_test.to_numpy()
X_test_np = X_test.to_numpy()
knn_preds = np.zeros_like(Y_test_np)
for idx, test in enumerate(X_test_np):
    knn_preds[idx] = knn_classifier.classify(test)

knn_classifier_sm = kNN.k_NN_classifier(X_train_sm.to_numpy(), Y_train_sm.to_numpy() )
Y_test_sm_np = Y_test_sm.to_numpy()
X_test_sm_np = X_test_sm.to_numpy()
knn_sm_preds = np.zeros_like(Y_test_sm_np)
for idx, test in enumerate(X_test_sm_np):
    knn_sm_preds[idx] = knn_classifier_sm.classify(test)


In [132]:
print("Confusion Matrix Without SMOTE")
conf_matrix = fcn.confusion_matrix(Y_test_np, knn_preds, ret = True)

print("Recall:", conf_matrix[0,0]/(conf_matrix[0,0]+conf_matrix[1,0]) * 100 )
print("Precision:", conf_matrix[0,0]/(conf_matrix[0,0]+conf_matrix[0,1]) * 100 )
print()
print("Confusion Matrix With SMOTE")
conf_matrix_sm = fcn.confusion_matrix(Y_test_sm_np, knn_sm_preds, ret = True)
print("Recall:", conf_matrix_sm[0,0]/(conf_matrix_sm[0,0]+conf_matrix_sm[1,0]) * 100 )
print("Precision:", conf_matrix_sm[0,0]/(conf_matrix_sm[0,0]+conf_matrix_sm[0,1]) *100 )

Confusion Matrix Without SMOTE
 	1	0 (prediction)
1	0	20
0	0	607
Recall: nan
Precision: 0.0

Confusion Matrix With SMOTE
 	1	0 (prediction)
1	552	30
0	87	524
Recall: 86.3849765258216
Precision: 94.84536082474226
  print("Recall:", conf_matrix[0,0]/(conf_matrix[0,0]+conf_matrix[1,0]) * 100 )


### Logistic Regression

In [134]:
class CrossEntropy():
    def __init__(self):
         pass

    def loss(self, y, p):
        # Avoid division by zero
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return - y * np.log(p) - (1 - y) * np.log(1 - p)

    def acc(self, y, p):
        return accuracy_score(np.argmax(y, axis=1), np.argmax(p, axis=1))

    def gradient(self, y, p):
        # Avoid division by zero
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return - (y / p) + (1 - y) / (1 - p)

In [None]:
class GradientBoostClassify():

    def __init__(self, tree_count, lr, tree_min_split, tree_min_impurity, tree_max_depth):
        self.tree_count = tree_count
        self.lr = lr
        self.tree_min_split = tree_min_split
        self.tree_min_impurity = tree_min_impurity
        self.tree_max_depth = tree_max_depth
        self.loss = CrossEntropy()

        self.trees = []
        for i in range(n)


### Gradient Boosting Classifier

In [None]:
from __future__ import division, print_function
import numpy as np
import progressbar

# Import helper functions
from mlfromscratch.utils import train_test_split, standardize, to_categorical
from mlfromscratch.utils import mean_squared_error, accuracy_score
from mlfromscratch.deep_learning.loss_functions import SquareLoss, CrossEntropy
from mlfromscratch.supervised_learning.decision_tree import RegressionTree
from mlfromscratch.utils.misc import bar_widgets


class GradientBoosting(object):
    """Super class of GradientBoostingClassifier and GradientBoostinRegressor. 
    Uses a collection of regression trees that trains on predicting the gradient
    of the loss function. 
    Parameters:
    -----------
    n_estimators: int
        The number of classification trees that are used.
    learning_rate: float
        The step length that will be taken when following the negative gradient during
        training.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a tree.
    min_impurity: float
        The minimum impurity required to split the tree further. 
    max_depth: int
        The maximum depth of a tree.
    regression: boolean
        True or false depending on if we're doing regression or classification.
    """
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        self.regression = regression
        self.bar = progressbar.ProgressBar(widgets=bar_widgets)
        
        # Square loss for regression
        # Log loss for classification
        self.loss = SquareLoss()
        if not self.regression:
            self.loss = CrossEntropy()

        # Initialize regression trees
        self.trees = []
        for _ in range(n_estimators):
            tree = RegressionTree(
                    min_samples_split=self.min_samples_split,
                    min_impurity=min_impurity,
                    max_depth=self.max_depth)
            self.trees.append(tree)


    def fit(self, X, y):
        y_pred = np.full(np.shape(y), np.mean(y, axis=0))
        for i in self.bar(range(self.n_estimators)):
            gradient = self.loss.gradient(y, y_pred)
            self.trees[i].fit(X, gradient)
            update = self.trees[i].predict(X)
            # Update y prediction
            y_pred -= np.multiply(self.learning_rate, update)


    def predict(self, X):
        y_pred = np.array([])
        # Make predictions
        for tree in self.trees:
            update = tree.predict(X)
            update = np.multiply(self.learning_rate, update)
            y_pred = -update if not y_pred.any() else y_pred - update

        if not self.regression:
            # Turn into probability distribution
            y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
            # Set label to the value that maximizes probability
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred


class GradientBoostingRegressor(GradientBoosting):
    def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
                 min_var_red=1e-7, max_depth=4, debug=False):
        super(GradientBoostingRegressor, self).__init__(n_estimators=n_estimators, 
            learning_rate=learning_rate, 
            min_samples_split=min_samples_split, 
            min_impurity=min_var_red,
            max_depth=max_depth,
            regression=True)

class GradientBoostingClassifier(GradientBoosting):
    def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
                 min_info_gain=1e-7, max_depth=2, debug=False):
        super(GradientBoostingClassifier, self).__init__(n_estimators=n_estimators, 
            learning_rate=learning_rate, 
            min_samples_split=min_samples_split, 
            min_impurity=min_info_gain,
            max_depth=max_depth,
            regression=False)

    def fit(self, X, y):
        y = to_categorical(y)
        super(GradientBoostingClassifier, self).fit(X, y)