In [1]:
import numpy as np
import pandas as pd

import sklearn.linear_model as sk
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


from sklearn.decomposition import PCA


# Show every column when displaying wide frames. The fully qualified key is
# used because the short pattern 'max_columns' can match more than one pandas
# option, which set_option rejects as ambiguous.
pd.set_option('display.max_columns', None)


# Read the first 66 columns of the dataset (relative path: the CSV must sit in
# the notebook's working directory).
data = pd.read_csv('FinancialDataSet.csv', usecols=np.r_[0:66])

# Coerce every column to a numeric dtype; entries that cannot be parsed
# become NaN instead of raising.
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Fill missing values with the number 0 rather than the string "0", so the
# columns stay numeric instead of degrading to mixed-type object columns.
data = data.fillna(0)

data


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\studies\\first sem docs\\BIGDATA\\bankruptcy_prediction\\FinancialDataSet.csv'

In [23]:
def preprocess_inputs(df, target='class', train_size=0.7, random_state=1):
    """Split ``df`` into train/test sets and standardize the feature columns.

    The scaler is fitted on the training features only and then applied to
    both partitions, so test-set statistics never influence the scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus the ``target`` column. It is
        not modified.
    target : str, default 'class'
        Name of the label column to separate from the features.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int or None, default 1
        Seed forwarded to ``train_test_split`` for a repeatable shuffle.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features, keeping the original row index and column names.
    y_train, y_test : pandas.Series
        Labels aligned with the corresponding feature frames.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # No defensive copy is needed: ``drop`` returns a new frame and nothing
    # below writes back into ``df``.
    y = df[target]
    X = df.drop(columns=[target])

    # Shuffled hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Learn the scaling statistics from the training partition only.
    scaler = StandardScaler().fit(X_train)

    def _as_frame(part):
        # transform() returns a bare array; restore the row index and column names.
        return pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)

    return _as_frame(X_train), _as_frame(X_test), y_train, y_test
   


In [24]:
# Build the scaled train/test feature frames and the matching label series
# from the loaded data.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [52]:
# Baseline classifiers trained on the scaled, full-dimensional features.
original_models = {
    # The default max_iter=100 was exhausted (recorded lbfgs convergence
    # warning), so give the solver more room to converge.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seed the tree ensembles so scores are repeatable from run to run.
    "RandomForestClassifier": RandomForestClassifier(random_state=1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=1),
}

# Fit each model in place; the fitted estimators are scored in a later cell.
for name, model in original_models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Logistic Regression trained.
RandomForestClassifier trained.
Gradient Boosting trained.


In [54]:
# Test-set accuracy of every baseline model, in the dictionary's insertion order.
original_results = [
    estimator.score(X_test, y_test) for estimator in original_models.values()
]

# Report each accuracy as a percentage with two decimals.
for label, accuracy in zip(original_models, original_results):
    print(label + ": {:.2f}%".format(accuracy * 100))

Logistic Regression: 95.14%
RandomForestClassifier: 97.05%
Gradient Boosting: 99.94%


In [55]:
# Relative frequency of each label in the test split; this is the accuracy a
# majority-class predictor would reach, so read model scores against it.
# NOTE(review): the recorded output lists a third label, 11, with a tiny
# share - possibly a data-entry error for 1; verify against the source data.
y_test.value_counts(normalize=True)

0     0.952385
1     0.047462
11    0.000154
Name: class, dtype: float64

In [75]:
# Number of principal components to keep.
n_components = 35
# Column names PC1..PC<n_components>, shared by both projected frames.
pc_labels = ["PC" + str(i) for i in range(1, n_components + 1)]

# Fit on the training features only so the projection never sees test data.
# random_state pins the result in case PCA selects a randomized SVD solver.
pca = PCA(n_components=n_components, random_state=1)
pca.fit(X_train)

# Project both partitions, keeping their original row indices.
X_train_reduced = pd.DataFrame(pca.transform(X_train), index=X_train.index, columns=pc_labels)
X_test_reduced = pd.DataFrame(pca.transform(X_test), index=X_test.index, columns=pc_labels)

In [76]:
# Display the PCA-projected training frame (one PC column per retained component).
X_train_reduced

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35
39337,-0.018207,-0.091816,-0.043341,0.022086,-0.044058,-0.000357,0.042621,-0.031325,0.015961,0.008983,0.005934,-0.368680,0.096706,0.119721,0.782559,-0.592727,0.029059,-0.176308,0.846919,0.241247,-0.026397,0.184041,-0.309659,-0.063043,0.021112,-0.050029,-0.037522,0.004109,-0.008812,-0.004033,-0.003679,-0.004416,0.001717,-0.009222,-0.004598
32733,-0.006043,-0.006677,-0.004025,-0.101030,0.003125,0.026960,0.046211,-0.037370,0.091142,0.029256,-0.009276,-0.844516,0.046562,0.063104,0.444674,-0.416484,-0.009398,-0.081558,0.539132,0.345869,0.006707,0.139160,-0.211719,0.631417,0.015894,-0.136361,-0.005624,-0.000303,0.006517,0.017899,0.034604,-0.003083,-0.003383,-0.020467,-0.013847
35630,-0.023577,-0.254631,0.003733,0.224228,0.025463,0.013601,0.052387,-0.020120,-0.005670,-0.021638,0.000522,0.170128,0.052526,0.025809,0.567441,-0.480920,0.032738,-0.154169,0.683449,0.051981,0.029812,0.131271,-0.264927,-0.526987,0.200743,0.257548,-0.021835,0.007567,0.070764,-0.035419,0.127159,0.005126,0.024709,0.010061,0.017138
32427,-0.015426,-0.004466,-0.058430,-0.083580,-0.091469,0.009724,0.035080,-0.029485,0.043164,0.007909,-0.004000,-0.350565,0.049441,0.064746,0.479180,-0.367811,0.015818,-0.116359,0.506987,0.166061,-0.005591,0.119081,-0.185134,0.034936,-0.037305,-0.104775,-0.028781,-0.001358,0.016829,-0.004267,-0.049061,-0.005440,-0.022623,-0.007731,0.018305
21921,-0.022380,-0.150626,-0.071662,-0.082820,-0.098052,0.005512,-0.003085,-0.003262,0.003377,-0.021986,-0.012498,0.509784,0.003131,0.098337,-0.044446,-0.053117,0.004519,-0.029356,0.038576,-0.185862,-0.109099,-0.017077,0.034681,-0.363205,0.140289,0.208859,-0.005559,-0.000342,0.017545,-0.018679,0.071571,-0.004649,0.018105,0.010242,0.000331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7813,-0.018176,-0.119934,-0.073907,-0.022652,-0.058623,-0.024219,-0.032955,0.004253,-0.038382,-0.029574,-0.022004,0.223008,-0.081750,-0.152589,-0.686362,0.409931,-0.019491,0.097038,-0.684267,-0.223331,0.053888,-0.169792,0.235405,-0.162377,0.032445,0.075852,0.011840,-0.011481,-0.029223,-0.007757,0.039409,0.005289,0.009776,0.003295,0.001338
32511,-0.014696,-0.015155,-0.060266,-0.108249,-0.187219,-0.002114,-0.066185,-0.034095,0.025006,-0.032777,-0.001374,0.655120,0.041967,0.082947,0.455010,-0.408804,0.026984,-0.126631,0.584225,-0.139419,-0.023357,0.110257,-0.153473,-0.480559,0.175355,0.284344,-0.015565,0.008100,-0.012301,-0.021572,-0.034457,0.001768,0.010200,0.010559,0.003751
5192,-0.013065,0.072967,-0.050757,0.074531,-0.049593,-0.004070,-0.017805,-0.007037,0.008099,-0.043889,-0.033323,0.359815,-0.142682,-0.141411,-0.861890,0.492450,-0.029518,0.124792,-0.809595,-0.254295,0.039917,-0.160544,0.259495,-0.282258,0.159232,0.231631,0.018067,-0.011459,0.008873,-0.012937,-0.042811,0.006621,0.005334,0.008089,-0.030521
12172,-0.013805,-0.040027,-0.059342,0.034084,-0.095904,-0.037198,-0.081596,-0.010236,-0.038844,-0.046189,-0.017524,0.521747,-0.092814,-0.086666,-0.505362,0.270265,-0.008446,0.048843,-0.452804,-0.277668,0.007442,-0.098138,0.157944,-0.412348,0.132631,0.237533,0.007614,-0.004890,-0.026129,-0.012780,-0.025739,0.007299,-0.014866,0.015166,-0.004720


In [77]:
# Summary statistics for each principal-component column of the training projection.
X_train_reduced.describe()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35
count,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0,30380.0
mean,4.6777010000000004e-18,-1.87108e-18,9.355401e-19,9.355401e-18,7.016550999999999e-19,-2.33885e-18,-1.87108e-18,2.3388499999999997e-19,2.33885e-18,-1.637195e-18,-2.3388499999999997e-19,-7.952091000000001e-18,2.33885e-18,-7.016550999999999e-19,-6.548781e-18,-9.355401e-19,7.016550999999999e-19,-9.355401e-18,1.7775260000000002e-17,9.589286e-18,0.0,6.548781e-18,-7.016551e-18,-1.379922e-17,3.27439e-18,3.742161e-18,1.169425e-18,9.355401e-19,-8.185975999999999e-19,-1.637195e-18,-1.87108e-18,1.520253e-18,-2.455793e-18,-3.508276e-19,-5.8471259999999995e-19
std,3.334941,3.29884,2.21667,2.148089,1.802736,1.648744,1.540915,1.493108,1.386848,1.299108,1.233987,1.098409,1.027376,1.021177,1.009374,1.001917,1.000064,0.9996581,0.9963366,0.9867328,0.979587,0.9707967,0.9667075,0.9059941,0.8876284,0.8669272,0.78362,0.6804673,0.6315545,0.6202831,0.566204,0.5595335,0.4147146,0.3538724,0.2815011
min,-54.2512,-176.9364,-25.26207,-91.84092,-23.70756,-97.53875,-84.3576,-59.08563,-81.88391,-2.228986,-26.42795,-13.91913,-31.60745,-60.12955,-32.69279,-24.57679,-20.71078,-49.03698,-48.78407,-29.87395,-63.985748,-69.39957,-45.51679,-22.52404,-61.28162,-57.49838,-48.05139,-17.06337,-41.2501,-14.69148,-38.83726,-36.80663,-35.50694,-5.485385,-18.59767
25%,-0.01670564,-0.1036426,-0.06263856,-0.05654191,-0.1108568,-0.02609797,-0.05575304,-0.03125645,-0.03416832,-0.03197118,-0.02252192,-0.4868247,-0.08143005,-0.101486,-0.5050545,-0.3740067,-0.02575278,-0.1193592,-0.5463885,-0.1735993,-0.019737,-0.1224642,-0.1911097,-0.3485367,-0.1370283,-0.2236025,-0.02717968,-0.01375735,-0.01723946,-0.01755595,-0.0157751,-0.003568936,-0.02506179,-0.01517617,-0.008621185
50%,-0.01305871,-0.05914673,-0.05207079,-0.01102797,-0.05826783,-0.01275411,-0.01112464,-0.02099583,-0.01052535,-0.01206288,-0.0111893,-0.05799707,-0.01362257,-0.01367051,-0.00473502,-0.01684481,-0.00376069,-0.01889332,-0.01251269,-0.004316616,-0.000651,-0.0002124446,-0.01046267,-0.03325429,0.002576643,0.002579669,-0.01169803,-0.005538303,0.0004818038,-0.006450773,0.002372827,0.0004795922,-0.004778332,-0.00235549,-0.0003078354
75%,-0.009320174,0.001362774,-0.03806431,0.03793784,-0.0009362588,0.004179651,0.04019635,-0.01055744,0.01894798,0.008438778,0.0004244033,0.3631733,0.05273083,0.07534419,0.4933845,0.3352918,0.01986603,0.0785978,0.5283291,0.1626946,0.017417,0.1221295,0.171052,0.3144925,0.1350938,0.2145251,0.004471327,0.003469702,0.02106782,0.004312265,0.02019711,0.004383797,0.01570597,0.009545325,0.0066572
max,569.7776,429.3517,269.6642,156.7306,165.3531,156.3989,137.2117,169.7716,94.44763,207.2238,183.9796,56.70828,123.6411,115.5124,96.07916,78.63389,161.1561,77.73624,67.78283,69.72168,106.989457,109.9025,133.9622,58.29705,76.03761,70.68179,84.09973,105.9589,43.58034,68.53062,39.46139,88.78602,38.08033,43.32773,29.27866


In [78]:
# Fraction of the total variance captured by each retained principal component.
pca.explained_variance_ratio_

array([0.17109944, 0.16741514, 0.07559177, 0.0709867 , 0.04999618,
       0.04181951, 0.03652834, 0.0342969 , 0.02958897, 0.02596346,
       0.02342575, 0.01856098, 0.01623794, 0.01604257, 0.01567389,
       0.01544315, 0.01538607, 0.01537359, 0.0152716 , 0.01497861,
       0.01476246, 0.01449869, 0.01437681, 0.01262767, 0.0121209 ,
       0.01156212, 0.00944677, 0.00712339, 0.00613612, 0.00591905,
       0.00493195, 0.00481642, 0.00264589, 0.00192649, 0.00121908])

In [79]:
# Bar chart of the share of variance captured by each principal component.
component_names = ["PC" + str(k) for k in range(1, n_components + 1)]
variance_share = pca.explained_variance_ratio_

fig = px.bar(
    x=component_names,
    y=variance_share,
    color=variance_share,  # shade each bar by its own height
    color_continuous_scale=[(0, 'lightblue'), (1, 'darkblue')],
    labels={'x': "Principal Component", 'y': "Variance Ratio"},
    title="Proportion of Variance in Principal Components"
)

fig.show()

In [80]:
# The same three classifiers, retrained on the PCA-reduced features.
reduced_models = {
    # Key no longer carries a stray leading space, so printed names line up
    # with the baseline models. max_iter is raised because the default of 100
    # was exhausted (recorded lbfgs convergence warning).
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seed the tree ensembles so scores are repeatable from run to run.
    "RandomForestClassifier": RandomForestClassifier(random_state=1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=1),
}

# Fit each model in place on the reduced training features.
for name, model in reduced_models.items():
    model.fit(X_train_reduced, y_train)
    print(name + " trained.")


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

 Logistic Regression trained.
RandomForestClassifier trained.
Gradient Boosting trained.


In [82]:
# Test-set accuracy of every PCA-trained model, in the dictionary's insertion order.
reduced_results = [
    estimator.score(X_test_reduced, y_test) for estimator in reduced_models.values()
]

# Report each accuracy as a percentage with two decimals.
for label, accuracy in zip(reduced_models, reduced_results):
    print(label + ": {:.2f}%".format(accuracy * 100))

 Logistic Regression: 95.22%
RandomForestClassifier: 96.30%
Gradient Boosting: 95.98%


In [83]:
# Accuracy change per model after PCA (reduced minus original); a negative
# value means the reduced feature set scored lower.
performance_change = np.subtract(reduced_results, original_results)
model_names = list(original_models)

fig = px.bar(
    x=performance_change,
    y=model_names,
    color=performance_change,
    color_continuous_scale=[(0, 'red'), (1, 'blue')],
    labels={'x': "Change in Performance", 'y': "Model"},
    title="Change in Model Performance After Dimensionality Reduction"
)

fig.show()
