# Clustering

In [1]:
import numpy as np
from sklearn.cluster import KMeans

In [2]:
import pandas as pd

# Load the training set; the later cells all work from this frame.
df = pd.read_csv('train_data.csv')

# Target column, kept apart so it is never fed to the clustering step.
labels = df['Bankrupt?']

# Clustering inputs: every column except the target and the 'Index' column
# (presumably a row identifier — confirm against the data dictionary).
feats = df.drop(['Bankrupt?', 'Index'], axis=1)

# Summary statistics of the inputs, displayed as the cell output.
feats.describe()

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
count,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,...,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0,5807.0
mean,0.505416,0.558893,0.553852,0.607958,0.607938,0.998726,0.797192,0.809081,0.303688,0.781369,...,0.807956,18035130.0,0.62403,0.607956,0.840407,0.280324,0.027446,0.565267,1.0,0.047736
std,0.060808,0.06565,0.061613,0.017499,0.01748,0.014087,0.013731,0.014553,0.011666,0.013697,...,0.039062,370601300.0,0.009625,0.017499,0.015447,0.014511,0.011134,0.011678,0.0,0.049046
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.224792,0.000101556,0.419045,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.476673,0.535679,0.527437,0.600448,0.600427,0.998969,0.797386,0.809312,0.303466,0.781567,...,0.796843,0.0009038045,0.623634,0.600448,0.840121,0.276916,0.026791,0.565158,1.0,0.024395
50%,0.503096,0.559911,0.552492,0.605969,0.605933,0.999022,0.797464,0.809376,0.303525,0.781635,...,0.81071,0.002108343,0.623879,0.605967,0.841176,0.278801,0.026808,0.565253,1.0,0.033687
75%,0.535417,0.589212,0.583998,0.613856,0.61373,0.999094,0.797579,0.809469,0.303585,0.781734,...,0.826544,0.005328774,0.624169,0.613857,0.842352,0.281487,0.026914,0.565729,1.0,0.053393
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,9820000000.0,1.0,1.0,1.0,1.0,0.540672,0.736985,1.0,0.920638


## Preprocessing
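Most features are already floating-point values between 0 and 1, but a few (e.g. `Total assets to GNP price`, whose maximum is about 9.8e9) are on a much larger scale. Because PCA is sensitive to feature scale, the features are standardized (centered and scaled to unit variance) before PCA is applied.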

In [3]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def preprocess_for_kmeans(features, n_components=0.90, random_state=67, preprocessor=None, fit_transform=True, save_path=None):
    """
    Standardize features and reduce them with PCA ahead of K-Means clustering.

    Parameters:
    -----------
    features : pandas.DataFrame or numpy.array
        Input features to preprocess
    n_components : int or float, default 0.90
        Forwarded to PCA when a new preprocessor is built here. Ignored when
        `preprocessor` is supplied.
    random_state : int, default 67
        Forwarded to PCA when a new preprocessor is built here. Ignored when
        `preprocessor` is supplied.
    preprocessor : sklearn.pipeline.Pipeline, optional
        Existing preprocessor. If None, a StandardScaler -> PCA pipeline is created.
        Note that with fit_transform=True (the default) a supplied preprocessor
        is RE-FITTED on `features`; pass fit_transform=False to reuse it as-is.
    fit_transform : bool
        If True, fits the preprocessor and transforms data. If False, only transforms.
    save_path : str, optional
        Path to save the preprocessor using joblib. The dump happens whenever
        this is set, whether or not the preprocessor was fitted in this call.

    Returns:
    --------
    tuple: (transformed_features, fitted_preprocessor)

    Raises:
    -------
    sklearn.exceptions.NotFittedError
        If fit_transform is False and no preprocessor is supplied (there is
        nothing fitted to transform with).
    """
    if preprocessor is None:
        if not fit_transform:
            # A freshly built pipeline cannot transform anything. Fail here with a
            # clear message instead of deep inside sklearn; the exception type is
            # the one sklearn itself raises for an unfitted estimator.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "fit_transform=False requires an already-fitted preprocessor, "
                "but preprocessor=None was given."
            )
        preprocessor = Pipeline([
            ('scaler', StandardScaler()),  # Both center AND scale (with_mean=True, with_std=True by default)
            ('pca', PCA(n_components=n_components, random_state=random_state))
        ])

    if fit_transform:
        transformed_features = preprocessor.fit_transform(features)
        print("Fitted preprocessor:")
        print("  - Scaler: centers and scales features")
        # A caller-supplied preprocessor is not guaranteed to expose a step named
        # 'pca'; skip the PCA summary rather than crash after a successful fit.
        try:
            pca_step = preprocessor['pca']
        except (KeyError, TypeError):
            pca_step = None
        if pca_step is not None:
            print(f"  - PCA: {pca_step.n_components_} components explaining "
                  f"{pca_step.explained_variance_ratio_.sum():.4f} of variance")
    else:
        transformed_features = preprocessor.transform(features)
        print("Applied existing preprocessor to new data")

    if save_path:
        joblib.dump(preprocessor, save_path)
        print(f"Preprocessor saved to {save_path}")

    return transformed_features, preprocessor

## KMeans Clustering

In [4]:
# Number of K-Means clusters.
# NOTE(review): this was annotated "4 team members = 4 clusters", but the value
# actually used is 5 — confirm which one is intended.
n_clust = 5

# Standardize and PCA-reduce the features, saving the fitted pipeline to disk.
red_feats, preproc = preprocess_for_kmeans(
    feats,
    n_components=0.90,
    random_state=67,
    fit_transform=True,
    save_path='kmeans_preproc.joblib',
)

# Fit K-Means on the reduced features and keep the cluster id assigned to each row.
km = KMeans(n_clusters=n_clust, random_state=67)
out_cl = km.fit_predict(red_feats)

Fitted preprocessor:
  - Scaler: centers and scales features
  - PCA: 45 components explaining 0.9078 of variance
Preprocessor saved to kmeans_preproc.joblib


In [5]:
# Convert red_feats to a DataFrame and add back the label column and cluster assignments.
# The frame is given df's index explicitly: pd.concat(axis=1) aligns on index labels,
# so a default 0..n-1 index would misalign rows whenever df's index is anything else.
red_feats_df = pd.DataFrame(red_feats, index=df.index)
red_feats_with_info = pd.concat([red_feats_df, labels, df['Index']], axis=1)

# Add cluster assignments to both the original and reduced dataframes
# (out_cl is assigned positionally, one entry per row).
red_feats_with_info['Cluster'] = out_cl
df['ClusterID'] = out_cl

# For each cluster: report its size and bankruptcy rate, then save its original rows
for i in range(n_clust):
    curr_cl = red_feats_with_info[red_feats_with_info['Cluster'] == i]
    bankrupt_count = np.sum(curr_cl['Bankrupt?'])
    total_count = len(curr_cl)
    bankruptcy_rate = bankrupt_count / total_count * 100

    print(f"Cluster {i}: {total_count} companies, {bankrupt_count} bankrupt ({bankruptcy_rate:.1f}%)")

    # Get original data for this cluster and save.
    # Rows are selected with a boolean mask on the positionally-assigned ClusterID
    # column rather than feeding index labels to .iloc, which is only correct when
    # the labels happen to equal row positions.
    cl_orig_data = df.loc[df['ClusterID'] == i].copy()
    # 'Cluster' duplicates 'ClusterID' in the saved file; both are kept so the CSV
    # layout stays the same for anything that already reads these files.
    cl_orig_data['Cluster'] = i  # Add cluster label to original data
    cl_orig_data.to_csv(f'cluster{i}.csv', index=False)

Cluster 0: 108 companies, 0 bankrupt (0.0%)
Cluster 1: 1153 companies, 84 bankrupt (7.3%)
Cluster 2: 319 companies, 0 bankrupt (0.0%)
Cluster 3: 2090 companies, 5 bankrupt (0.2%)
Cluster 4: 2137 companies, 109 bankrupt (5.1%)
Cluster 4: 2137 companies, 109 bankrupt (5.1%)
