# Clustering

In [1]:
import numpy as np
from sklearn.cluster import KMeans

In [2]:
import pandas as pd

# Load the training set; 'Bankrupt?' is the target and 'Index' is a row
# identifier, so neither is used as an input feature.
df = pd.read_csv('train_data.csv')
labels = df['Bankrupt?']
feats = df.drop(['Bankrupt?', 'Index'], axis=1)

# Summary statistics of the feature columns (displayed as the cell output).
feats.describe()

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
count,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,...,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0
mean,0.505416,0.558893,0.553852,0.607958,0.607938,0.998726,0.797192,0.809081,0.303688,0.781369,...,0.807956,18035130.0,0.62403,0.607956,0.840407,0.280324,0.027446,0.565267,1.0,0.047736
std,0.060808,0.06565,0.061613,0.017499,0.01748,0.014087,0.013731,0.014553,0.011666,0.013697,...,0.039062,370601300.0,0.009625,0.017499,0.015447,0.014511,0.011134,0.011678,0.0,0.049046
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.224792,0.000101556,0.419045,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.476673,0.535679,0.527437,0.600448,0.600427,0.998969,0.797386,0.809312,0.303466,0.781567,...,0.796843,0.0009038045,0.623634,0.600448,0.840121,0.276916,0.026791,0.565158,1.0,0.024395
50%,0.503096,0.559911,0.552492,0.605969,0.605933,0.999022,0.797464,0.809376,0.303525,0.781635,...,0.81071,0.002108343,0.623879,0.605967,0.841176,0.278801,0.026808,0.565253,1.0,0.033687
75%,0.535417,0.589212,0.583998,0.613856,0.61373,0.999094,0.797579,0.809469,0.303585,0.781734,...,0.826544,0.005328774,0.624169,0.613857,0.842352,0.281487,0.026914,0.565729,1.0,0.053393
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,9820000000.0,1.0,1.0,1.0,1.0,0.540672,0.736985,1.0,0.920638


## Preprocessing
Most features are already floating-point values between 0 and 1 (although a few, such as `Total assets to GNP price`, span a much larger range), but it is best to center them to be certain that PCA will work well.

In [3]:
from sklearn.preprocessing import StandardScaler

# Subtract each column's mean but leave its variance untouched.
# NOTE(review): the describe() output above shows at least one column with a
# standard deviation around 3.7e8, so unscaled columns may dominate the PCA
# that follows -- consider with_std=True and confirm the effect.
scaler = StandardScaler(with_std=False, with_mean=True)
scaler.fit(feats)
feats_centered = scaler.transform(feats)

### PCA

In [4]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance (here 95%).
pca = PCA(0.95)
linear_pca_result = pca.fit_transform(feats_centered)

n_kept = pca.n_components_
print(f'Number of components output: {n_kept}')

# Wrap the projected data in a DataFrame (columns are 0..n_kept-1) for the
# clustering cells below.
red_feats = pd.DataFrame(linear_pca_result)

Number of components output: 8


After performing PCA on the original dataset, we now have 8 principal components that retain 95% of the variance of the 95 original features. This dimensionality reduction gives a large boost to the performance of the clustering.

## KMeans Clustering

In [5]:
# 4 team members = 4 clusters
n_clust = 4
# Fixed seed so the cluster assignments are reproducible across runs.
km = KMeans(n_clusters=n_clust, random_state=67)

# Cluster on the raw PCA output rather than `red_feats`: a later cell appends
# the 'Bankrupt?' and 'Index' columns to `red_feats`, so re-running this cell
# afterwards would otherwise cluster on the target label as well. On a fresh
# run both hold the same values, so the assignments are unchanged.
out_cl = km.fit_predict(linear_pca_result)

In [None]:
# Attach the label and the company index to the PCA features *after* clustering
# so each subgroup can be inspected/trained separately. `assign` overwrites the
# columns when they already exist, so re-running this cell does not pile up
# duplicate 'Bankrupt?' / 'Index' columns the way a repeated concat would.
red_feats = red_feats.assign(**{'Bankrupt?': labels, 'Index': df['Index']})

# Partition the data into separate dataframes based on cluster labels in out_cl
for i in range(n_clust):
    print(f"Cluster {i}")
    # Boolean row mask for the current cluster; out_cl is in the same row
    # order as both red_feats and df.
    in_cluster = out_cl == i
    curr_cl = red_feats[in_cluster]
    print("Num bankrupt companies:", curr_cl['Bankrupt?'].sum())
    # Pull the matching rows of the original (un-reduced) data directly with
    # the mask instead of round-tripping through index values.
    cl_orig_data = df[in_cluster]
    # info() prints its own report and returns None, so wrapping it in print()
    # would emit a stray "None" line after every summary.
    cl_orig_data.info()
    print('-'*100)
    cl_orig_data.to_csv(f'cluster{i}.csv')

cluster 0
Num bankrupt companies: 27
<class 'pandas.core.frame.DataFrame'>
Index: 912 entries, 3 to 5797
Data columns (total 97 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Index                                                     912 non-null    int64  
 1   Bankrupt?                                                 912 non-null    int64  
 2    ROA(C) before interest and depreciation before interest  912 non-null    float64
 3    ROA(A) before interest and % after tax                   912 non-null    float64
 4    ROA(B) before interest and depreciation after tax        912 non-null    float64
 5    Operating Gross Margin                                   912 non-null    float64
 6    Realized Sales Gross Margin                              912 non-null    float64
 7    Operating Profit Rate                                    912 non-null    