# Financial Distress Forecast

By analyzing clients' financial information, my aim is to determine whether a given client is likely to face bankruptcy.

I'll train several types of classification models and compare their forecasts.

In [None]:
# Data Analysis
import numpy as np
import pandas as pd

#Visualization
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Reducing Dimensionality using PCA
from sklearn.decomposition import PCA

In [None]:
# Location of the input CSV, kept in one named constant so it is easy to
# change when the file lives somewhere else.
DATA_PATH = '/content/data final.csv'

df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1.0,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1.0,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1.0,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1.0,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1.0,0.03549


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2503, 96)

In [None]:
# Per-column summary: name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4377 entries, 0 to 4376
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 4377 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  4377 non-null   float64
 2    ROA(A) before interest and % after tax                   4377 non-null   float64
 3    ROA(B) before interest and depreciation after tax        4377 non-null   float64
 4    Operating Gross Margin                                   4377 non-null   float64
 5    Realized Sales Gross Margin                              4377 non-null   float64
 6    Operating Profit Rate                                    4377 non-null   float64
 7    Pre-tax net Interest Rate                                4377 non-null   float64
 8    After-tax net Int

In [None]:
# Total number of missing cells in the whole frame
# (per-column null counts, summed once more across columns).
df.isnull().sum().sum()

38

In [None]:
# dropna() returns a new DataFrame and leaves `df` untouched, so the result
# has to be assigned back; otherwise the rows with missing values remain in
# the data that is later split, scaled and passed to the models.
df = df.dropna()
df

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.405750,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.827890,0.290202,0.026601,0.564050,1.0,0.016469
1,1,0.464291,0.538214,0.516730,0.610235,0.610235,0.998946,0.797380,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1.0,0.020794
2,1,0.426071,0.499019,0.472295,0.601450,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.774670,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1.0,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.998700,0.796967,0.808966,0.303350,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1.0,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1.0,0.035490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4371,0,0.478770,0.540776,0.530596,0.605558,0.605385,0.998993,0.797390,0.809320,0.303476,...,0.799734,0.000993,0.623730,0.605553,0.840351,0.279890,0.027905,0.567744,1.0,0.028817
4372,0,0.490713,0.555059,0.538787,0.598243,0.598243,0.998995,0.797440,0.809351,0.303558,...,0.809580,0.002229,0.623806,0.598238,0.841298,0.281362,0.026847,0.565413,1.0,0.024722
4373,0,0.588310,0.665613,0.648911,0.602956,0.602956,0.999068,0.797502,0.809419,0.303516,...,0.855218,0.005578,0.624031,0.602951,0.845571,0.281514,0.026792,0.565161,1.0,0.024399
4374,0,0.473017,0.535107,0.525349,0.596924,0.596924,0.998972,0.797382,0.809308,0.303504,...,0.796539,0.001135,0.623362,0.596922,0.840113,0.285798,0.029853,0.568904,1.0,0.018940


## Preprocessing

In [None]:
def preprocess_inputs(df, test_size=0.3, random_state=1):
    """Split a frame into standardized train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the 'Bankrupt?' target column; every other column is
        used as a feature. The caller's frame is not modified.
    test_size : float, default 0.3
        Forwarded to ``train_test_split`` as the held-out share.
    random_state : int, default 1
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features that keep the original row index and column
        names.
    y_train, y_test : pandas.Series
        The matching 'Bankrupt?' values.
    """
    df = df.copy()

    # Separate the target from the features
    X = df.drop(columns='Bankrupt?')
    y = df['Bankrupt?']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=random_state
    )

    # The scaler is fitted on the training rows only, so the test rows are
    # transformed with training statistics and never influence them.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test feature frames and their targets from `df`.
X_train, X_test, y_train, y_test = preprocess_inputs(df)

In [None]:
# Inspect the training targets (index = original row labels from `df`).
y_train

1396    0
3979    0
11      0
3530    0
2662    0
       ..
2895    0
2763    0
905     0
3980    0
235     1
Name: Bankrupt?, Length: 3063, dtype: int64

## Training Original Data

In [None]:
# Seed passed to every estimator that accepts one, so that a fresh
# Restart & Run All reproduces the same fitted models and scores.
RANDOM_STATE = 1

# Keys are left-padded to a common width so the printed report lines up.
# LinearSVC and MLPClassifier get a higher iteration cap than their defaults
# to give the iterative solvers room to converge.
original_models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Support Vector Machine (Linear Kernel)": LinearSVC(max_iter=10000, random_state=RANDOM_STATE),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
    "                         Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Fit every model on the full (scaled) feature set.
for name, model in original_models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

In [None]:
# Score each fitted model on the held-out test set, in dictionary order,
# then print one aligned line per model.
original_results = [
    estimator.score(X_test, y_test) for estimator in original_models.values()
]

for label, score in zip(original_models, original_results):
    print(f"{label}: {score * 100:.2f}%")

## Dimensionality Reduction (PCA)

In [None]:
n_components = 10

# A fixed seed keeps the components repeatable: depending on the
# scikit-learn version and the data shape, PCA's default solver choice can
# be a randomized SVD.
pca = PCA(n_components=n_components, random_state=1)
# Fitted on the training features only; the test features are projected
# with the same components.
pca.fit(X_train)

# Column labels PC1..PCn, built once and shared by both reduced frames.
pc_columns = ["PC" + str(i) for i in range(1, n_components + 1)]

X_train_reduced = pd.DataFrame(pca.transform(X_train), index=X_train.index, columns=pc_columns)
X_test_reduced = pd.DataFrame(pca.transform(X_test), index=X_test.index, columns=pc_columns)

In [None]:
# Inspect the training features after projection onto the principal components.
X_train_reduced

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
3392,21.396995,-7.427095,-10.383105,16.384730,8.413782,14.831841,-1.160673,0.405999,1.807298,-2.694874
2755,-0.537260,-0.632612,1.521338,0.059506,-0.896009,0.101743,0.173111,-0.408775,-0.434334,-0.448402
4442,-3.777735,-0.482879,1.126984,0.334118,1.276318,0.353064,0.874252,-0.664209,-1.483721,-1.109124
4267,3.117690,-0.669761,1.191446,-0.645607,-1.864045,0.641606,0.852507,-0.837668,-0.099363,-1.009757
4912,0.849585,0.693085,-1.479360,-0.478790,-0.759344,0.084552,-0.798800,0.899314,0.886556,0.631542
...,...,...,...,...,...,...,...,...,...,...
905,0.566128,-0.528644,0.117022,-0.415300,0.396887,0.989461,-0.118342,-0.664049,-0.820065,-0.265597
5192,-2.253184,3.498930,-4.647734,-0.693304,-0.645256,1.186563,-0.233703,0.482092,0.636065,0.496909
3980,-5.630031,0.346380,-0.505307,1.038408,2.013045,0.596092,-1.493156,-2.639506,0.489577,0.483566
235,7.567254,-0.172185,0.268191,-0.819649,-0.230050,-0.923717,2.073517,0.030414,-0.451870,0.052151


In [None]:
# Bar chart of the explained-variance ratio of each principal component;
# bar colour follows the same ratio.
pc_labels = [f"PC{i}" for i in range(1, n_components + 1)]
variance_ratio = pca.explained_variance_ratio_

fig = px.bar(
    x=pc_labels,
    y=variance_ratio,
    labels=dict(x="Principal Component", y="Variance Ratio"),
    color=variance_ratio,
    color_continuous_scale=[(0, 'lightblue'), (1, 'darkblue')],
    title="Proportion of Variance in Principal Components",
)

fig.show()

## Training the Reduced Data

In [None]:
# Seed passed to every estimator that accepts one, so that a fresh
# Restart & Run All reproduces the same fitted models and scores.
RANDOM_STATE = 1

# Keys are left-padded to a common width so the printed report lines up.
# With default settings LinearSVC and MLPClassifier stopped at their
# iteration caps with convergence warnings on these features, so both caps
# are raised.
reduced_models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Support Vector Machine (Linear Kernel)": LinearSVC(max_iter=10000, random_state=RANDOM_STATE),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
    "                         Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Fit every model on the PCA-reduced training features.
for name, model in reduced_models.items():
    model.fit(X_train_reduced, y_train)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.



Liblinear failed to converge, increase the number of iterations.



Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.



Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.


In [None]:
# Score each model trained on the reduced features against the reduced
# test set, in dictionary order, then print one aligned line per model.
reduced_results = [
    estimator.score(X_test_reduced, y_test) for estimator in reduced_models.values()
]

for label, score in zip(reduced_models, reduced_results):
    print(f"{label}: {score * 100:.2f}%")

                   Logistic Regression: 96.24%
                   K-Nearest Neighbors: 96.33%
                         Decision Tree: 94.72%
Support Vector Machine (Linear Kernel): 96.43%
   Support Vector Machine (RBF Kernel): 96.63%
                        Neural Network: 96.43%
                         Random Forest: 96.48%
                     Gradient Boosting: 96.24%


## Performance Change After PCA

In [None]:
# Per-model accuracy difference (reduced minus original): positive values
# mean the model scored higher on the PCA-reduced features.
performance_change = np.subtract(reduced_results, original_results)

# The dictionary keys carry left padding for aligned printing; strip it so
# the axis shows clean model names, and pass a plain list rather than a
# dict view.
model_names = [name.strip() for name in original_models]

fig = px.bar(
    x=performance_change,
    y=model_names,
    orientation='h',
    labels={'x': "Change in Performance", 'y': "Model"},
    color=performance_change,
    color_continuous_scale=[(0, 'red'), (1, 'blue')],
    # Centre the colour scale on zero so that losses lean red and gains lean
    # blue, instead of the colours depending on the observed min and max.
    color_continuous_midpoint=0,
    title="Change in Model Performance After Dimensionality Reduction"
)

fig.show()

Based on the test-set results obtained on the PCA-reduced data, the Support Vector Machine (RBF kernel) has the best performance, with an accuracy of 96.63%.

The other models such as Logistic Regression, K-Nearest Neighbors, Support Vector Machine (Linear kernel), and Neural Network also perform well with an accuracy of 96.24-96.43%.

The Decision Tree model performs comparatively poorly with an accuracy of 94.72%.

The Random Forest and Gradient Boosting models also have similar performance with an accuracy of 96.48% and 96.24%, respectively.

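#### These results suggest that machine learning algorithms can be effective for the task of bankruptcy prediction.

Note that accuracy is the only metric reported here. If bankrupt cases are rare in the data, a high accuracy can be reached without detecting them, so precision and recall on the bankrupt class should be checked before relying on these models.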