# Generalization

In [18]:
import numpy as np
import pandas as pd
from joblib import load
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn.decomposition import PCA

In [19]:
# Define PreprocessingBundle class again so we can load the saved bundle
class PreprocessingBundle:
    """
    Fitted preprocessing pipelines plus the metadata needed to reuse them.

    Two scikit-learn pipelines are built and fitted on the training frame:

    * ``core_preprocess``    - features for the supervised cluster-id classifier.
    * ``cluster_preprocess`` - features for clustering (Liability-Assets flag
      removed, PCA appended).

    Instances can be saved/loaded with joblib. The custom transformers are
    nested inside this class, so unpickling only needs ``PreprocessingBundle``
    to be resolvable under the module it was pickled from.
    """

    # Nested transformer classes
    class ColumnNameStripper(BaseEstimator, TransformerMixin):
        """Ensures any incoming DataFrame has stripped column names."""

        def fit(self, X, y=None):
            """Stateless transformer: nothing is learned from ``X``."""
            return self

        def transform(self, X):
            """Return a copy of ``X`` whose labels are ``str(label).strip()``."""
            X = X.copy()
            X.columns = [str(c).strip() for c in X.columns]
            return X

    class ColumnDropper(BaseEstimator, TransformerMixin):
        """Drops specified columns if they exist."""

        def __init__(self, drop_cols=None):
            # Stored untouched (no ``or []`` here): scikit-learn's ``clone``
            # requires ``__init__`` to keep the exact object it was given.
            self.drop_cols = drop_cols

        def fit(self, X, y=None):
            """Stateless transformer: nothing is learned from ``X``."""
            return self

        def transform(self, X):
            """Return a copy of ``X`` without the requested columns that are present."""
            X = X.copy()
            # ``None`` (the default) means "drop nothing".
            requested = self.drop_cols or []
            cols = [c for c in requested if c in X.columns]
            return X.drop(columns=cols)

    class HighCorrelationFilter(BaseEstimator, TransformerMixin):
        """
        Drops features with absolute correlation above threshold.
        Keeps the first feature in each correlated group.

        Fitted attributes:
            keep_cols_: columns retained, in input order.
            dropped_cols_: columns removed because of high correlation.
        """

        def __init__(self, threshold=0.98):
            # Kept as passed; converted to float in ``fit`` so ``clone`` works.
            self.threshold = threshold

        def fit(self, X, y=None):
            """Learn which columns to keep from the correlation matrix of ``X``."""
            Xdf = pd.DataFrame(X)
            threshold = float(self.threshold)

            corr = Xdf.corr(numeric_only=True).abs()
            # Strict upper triangle: each pair is inspected once and a column
            # is only compared against the columns that precede it.
            upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
            to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]

            dropped = set(to_drop)
            self.keep_cols_ = [c for c in Xdf.columns if c not in dropped]
            self.dropped_cols_ = to_drop
            return self

        def transform(self, X):
            """Return only the columns selected during ``fit``."""
            Xdf = pd.DataFrame(X).copy()
            return Xdf[self.keep_cols_]

    class SkewHandler(BaseEstimator, TransformerMixin):
        """
        Handle very large-scale nonnegative columns by clipping and log1p transforming.

        A column is treated as "huge" when, on the fitting data, its maximum
        exceeds ``huge_max_threshold`` and its minimum is >= 0. Such columns
        are capped at ``sentinel_fraction * max`` and then passed through
        ``log1p``; all other columns are returned unchanged.

        Fitted attributes:
            cols_in_: column labels seen during ``fit``.
            huge_cols_: labels of the columns that get clipped and logged.
            clip_: mapping of huge column label -> upper clip value.
        """

        def __init__(self, huge_max_threshold=1e6, sentinel_fraction=0.95):
            # Kept as passed; converted to float in ``fit`` so ``clone`` works.
            self.huge_max_threshold = huge_max_threshold
            self.sentinel_fraction = sentinel_fraction

        def fit(self, X, y=None):
            """Record the input columns and the clip value of every huge column."""
            Xdf = pd.DataFrame(X)
            huge_max_threshold = float(self.huge_max_threshold)
            sentinel_fraction = float(self.sentinel_fraction)

            self.cols_in_ = list(Xdf.columns)
            self.huge_cols_ = []
            self.clip_ = {}

            for c in self.cols_in_:
                s = Xdf[c]
                mx, mn = float(s.max()), float(s.min())
                if mx > huge_max_threshold and mn >= 0:
                    self.huge_cols_.append(c)
                    self.clip_[c] = sentinel_fraction * mx

            return self

        def transform(self, X):
            """Clip and ``log1p`` the huge columns; return a DataFrame with ``cols_in_``."""
            Xdf = pd.DataFrame(X, columns=self.cols_in_).copy()

            for c in self.huge_cols_:
                clip_val = self.clip_[c]
                # Only an upper clip is applied here; the >= 0 check was made
                # on the fitting data only.
                Xdf[c] = np.log1p(np.minimum(Xdf[c].values, clip_val))

            return Xdf

        def get_feature_names_out(self, input_features=None):
            """Return the column labels seen during ``fit`` (``input_features`` is ignored)."""
            return np.array(self.cols_in_)

    class ConstantColumnDropper(BaseEstimator, TransformerMixin):
        """
        Drops columns with <=1 unique value (constant).

        Fitted attributes:
            drop_cols_: constant columns found during ``fit``.
            keep_cols_: remaining columns, in input order.
        """

        def fit(self, X, y=None):
            """Find the constant columns of ``X`` (NaN counts as a value)."""
            Xdf = pd.DataFrame(X)
            nunique = Xdf.nunique(dropna=False)
            self.drop_cols_ = nunique[nunique <= 1].index.tolist()
            constant = set(self.drop_cols_)
            self.keep_cols_ = [c for c in Xdf.columns if c not in constant]
            return self

        def transform(self, X):
            """Return only the non-constant columns selected during ``fit``."""
            Xdf = pd.DataFrame(X).copy()
            return Xdf[self.keep_cols_]

    def __init__(self, train_df, target_col, index_col=None, random_state=67):
        """
        Initialize and fit preprocessing pipelines.

        Args:
            train_df: training DataFrame containing features and ``target_col``.
            target_col: name of the label column; cast to ``int`` for ``y_train``.
            index_col: optional identifier column excluded from the features.
            random_state: seed passed to the quantile transformers and PCA.
        """
        self.train_df = train_df
        self.target_col = target_col
        self.index_col = index_col
        self.random_state = random_state

        # Build raw X / y for training
        drop_cols = [self.target_col]
        if self.index_col is not None:
            drop_cols.append(self.index_col)

        self.X_train_raw = train_df.drop(columns=drop_cols).copy()
        self.y_train = train_df[self.target_col].astype(int).values

        # Detect Liability-Assets flag column (if present). The raw label is
        # kept as-is; ``str()`` guards against non-string column labels.
        self.liab_asset_flag = next(
            (
                c for c in self.X_train_raw.columns
                if "liability-assets flag" in str(c).lower()
            ),
            None,
        )

        # Create preprocessing pipelines
        self._create_pipelines()

        # Fit preprocessors on full training data
        self.core_preprocess.fit(self.X_train_raw)
        self.cluster_preprocess.fit(self.X_train_raw)

    def _create_pipelines(self):
        """Create the core and cluster preprocessing pipelines (unfitted)."""
        # Core preprocessing for later supervised models (cluster-id classifier)
        self.core_preprocess = Pipeline(steps=[
            ("strip_names", self.ColumnNameStripper()),
            ("drop_constant", self.ConstantColumnDropper()),
            ("corr_filter", self.HighCorrelationFilter(threshold=0.95)),
            ("skew", self.SkewHandler(huge_max_threshold=1e6, sentinel_fraction=0.9)),
            ("robust_scale", RobustScaler(with_centering=True, with_scaling=True)),
            ("quantile_gauss", QuantileTransformer(
                output_distribution="normal",
                n_quantiles=2000,
                random_state=self.random_state,
            )),
        ])

        # "drop_cols" runs after "strip_names", so it only ever sees stripped
        # labels. The flag was detected on the raw labels, so strip it here;
        # otherwise a label with surrounding whitespace would never match and
        # the flag column would silently stay in the clustering features.
        if self.liab_asset_flag is not None:
            cluster_drop_cols = [str(self.liab_asset_flag).strip()]
        else:
            cluster_drop_cols = []

        # Preprocessing specifically for clustering (drop Liability-Assets flag for clustering)
        self.cluster_preprocess = Pipeline(steps=[
            ("strip_names", self.ColumnNameStripper()),
            ("drop_cols", self.ColumnDropper(drop_cols=cluster_drop_cols)),
            ("drop_constant", self.ConstantColumnDropper()),
            ("corr_filter", self.HighCorrelationFilter(threshold=0.95)),
            ("skew", self.SkewHandler(huge_max_threshold=1e6, sentinel_fraction=0.95)),
            ("robust_scale", RobustScaler(with_centering=True, with_scaling=True)),
            ("quantile_gauss", QuantileTransformer(
                output_distribution="normal",
                n_quantiles=2000,
                random_state=self.random_state,
            )),
            ("pca", PCA(n_components=50, random_state=self.random_state)),
        ])

    def transform_core(self, X):
        """Transform data using core preprocessing pipeline."""
        return self.core_preprocess.transform(X)

    def transform_cluster(self, X):
        """Transform data using cluster preprocessing pipeline."""
        return self.cluster_preprocess.transform(X)

    def get_info(self):
        """
        Get information about the preprocessing.

        Returns:
            dict with the configured target/index column names, the detected
            Liability-Assets flag label (raw, or ``None``), the constant
            columns dropped by the core pipeline, and how many columns its
            correlation filter dropped.
        """
        core_steps = self.core_preprocess.named_steps
        return {
            "TARGET_COL": self.target_col,
            "INDEX_COL": self.index_col,
            "LIAB_ASSET_FLAG": self.liab_asset_flag,
            "dropped_constant_cols": core_steps["drop_constant"].drop_cols_,
            "corr_filter_dropped_count": len(core_steps["corr_filter"].dropped_cols_),
        }

# Pickle/joblib resolve classes by module path. Bundles saved while the class
# lived in ``__main__`` look it up there, so expose it on ``__main__`` to keep
# those files loadable.
import __main__
setattr(__main__, "PreprocessingBundle", PreprocessingBundle)

In [20]:
# Read the held-out test set and preview its first rows.
test_df = pd.read_csv(filepath_or_buffer="test_data.csv")
test_df.head(n=5)

Unnamed: 0,Index,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0,0.414323,0.481029,0.46828,0.609514,0.609514,0.998889,0.797159,0.809132,0.30329,...,0.761704,0.001404,0.623973,0.609512,0.838286,0.27545,0.026749,0.56495,1,0.136203
1,1,0.497441,0.560892,0.546603,0.61066,0.61066,0.999108,0.797545,0.809431,0.303506,...,0.815244,0.004466,0.623724,0.610658,0.842427,0.285886,0.026965,0.56587,1,0.018871
2,2,0.501584,0.548899,0.556721,0.606134,0.606134,0.999034,0.797427,0.80937,0.303453,...,0.806318,0.000684,0.625387,0.606132,0.840598,0.275816,0.026793,0.565165,1,0.095511
3,3,0.574465,0.637375,0.61968,0.600376,0.600376,0.99903,0.797528,0.809426,0.30364,...,0.852655,0.001718,0.624151,0.600375,0.844727,0.279977,0.026795,0.565178,1,0.028513
4,4,0.39336,0.456444,0.440334,0.600009,0.600009,0.9988,0.797025,0.809,0.30324,...,0.741604,0.002545,0.623612,0.600009,0.835578,0.279901,0.026623,0.564204,1,0.028779


# Cluster ID Predictions

In [21]:
# Load the fitted preprocessing bundle. joblib unpickles the file, which can
# execute arbitrary code, so only load artifacts from a trusted source.
# NOTE(review): presumably this file holds a PreprocessingBundle pickled under
# ``__main__`` (hence the registration above) - confirm against the training notebook.
clust_preproc_bundle = load('artifacts/preprocess_bundle.joblib')

In [22]:
# Run the raw test frame through the core pipeline and preview the result.
test_X = clust_preproc_bundle.transform_core(test_df)
pd.DataFrame(data=test_X).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
0,-1.638053,-1.414283,0.340176,-1.187328,-1.305165,-1.387131,0.118943,-0.537571,-1.514873,1.775919,...,-0.974402,0.162289,-1.923557,-5.199338,-0.29429,0.255842,-1.193272,-0.995595,-1.05448,1.808328
1,-0.131414,0.023966,0.437574,0.764879,0.523249,-0.284682,-0.995157,-5.199338,1.024532,-0.438077,...,0.284263,-1.110514,1.121353,-5.199338,0.536864,-0.456692,0.719591,0.834166,0.786157,-1.321962
2,-0.035745,-0.299731,0.018772,0.139076,-0.298284,-0.765698,-0.973781,1.502118,0.665551,-5.199338,...,0.186408,-0.146036,-1.167403,-5.199338,-0.925847,1.651268,-0.36804,-0.359249,-0.396409,1.392141
3,1.294683,1.440076,-0.683569,0.089148,0.440148,0.981194,0.733536,-0.026336,0.520081,2.003542,...,0.369826,0.882127,-0.492587,-5.199338,-0.138198,0.648145,1.797141,-0.255752,-0.291503,-0.320457
4,-1.843318,-1.653062,-0.729148,-1.479083,-1.512341,-1.488238,-0.015929,-5.199338,-0.833025,0.973781,...,-0.84029,0.139517,1.29576,-5.199338,0.145687,-0.728069,-1.665709,-1.351808,-1.443883,-0.305606


In [23]:
# Predict a cluster id for every test row from the core-preprocessed features.
cluster_classif = load('artifacts/cluster_id_classifier.joblib')
cluster = cluster_classif.predict(test_X)

# Partition the raw test rows by predicted cluster id (positional boolean mask).
clustered_test_dfs = {}
for cid in np.unique(cluster):
    members = cluster == cid
    clustered_test_dfs[cid] = test_df.loc[members]
clustered_test_dfs[1].head(n=5)

Unnamed: 0,Index,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
6,6,0.519914,0.562527,0.571658,0.600621,0.600621,0.99903,0.797459,0.809367,0.303518,...,0.81192,0.002655,0.624866,0.600616,0.841371,0.280075,0.027008,0.566017,1,0.028179
9,9,0.551455,0.567706,0.608384,0.610523,0.610523,0.999145,0.797578,0.809474,0.303485,...,0.814718,0.004763,0.623611,0.610524,0.841388,0.278412,0.026841,0.565384,1,0.036182
10,10,0.500122,0.554405,0.542106,0.600455,0.600455,0.999062,0.797463,0.80936,0.303457,...,0.808566,0.010313,0.623839,0.600456,0.84089,0.277775,0.026885,0.565569,1,0.041522
21,21,0.514113,0.563509,0.569998,0.603237,0.603237,0.999038,0.797502,0.809416,0.303575,...,0.814105,0.01178,0.623228,0.603238,0.84133,0.278304,0.02687,0.565508,1,0.036956
22,22,0.479891,0.548899,0.539055,0.608066,0.608253,0.998972,0.797408,0.809345,0.303551,...,0.804593,0.001174,0.623695,0.608064,0.840797,0.280675,0.026912,0.565676,1,0.026383


# Cluster 0 Bankruptcy Predictions

In [24]:
# The subgroup-0 artifact is a mapping holding both the preprocessing
# pipeline and the fitted model.
sub0 = load("./artifacts/subgroup0_complete.joblib")
sub0_preproc, sub0_model = sub0['pipeline'], sub0['model']

In [25]:
# Preprocess the cluster-0 rows, predict, and pair each prediction with its
# original "Index" value for the submission.
sub0_X = sub0_preproc.transform(clustered_test_dfs[0])
sub0_test_pred = sub0_model.predict(sub0_X)
print("Total Bankrupt Predictions For Subgroup 0 Test :", sum(sub0_test_pred))
indices_sub0 = clustered_test_dfs[0]["Index"]
submission_sub0 = pd.DataFrame(
    data={"Index": indices_sub0, "Bankrupt?": sub0_test_pred},
)
submission_sub0.head(n=5)

Total Bankrupt Predictions For Subgroup 0 Test : 6




Unnamed: 0,Index,Bankrupt?
0,0,0
1,1,0
2,2,0
4,4,0
11,11,0


# Cluster 1 Bankruptcy Predictions

In [26]:
# The subgroup-1 artifact is a mapping with the selected feature names, the
# fitted scaler and the fitted model.
sub1 = load("./artifacts/preprocessing_pipeline_subgroup1.joblib")
sub1_feats, sub1_scaler, sub1_model = (
    sub1['selected_features'],
    sub1['scaler'],
    sub1['model'],
)

In [27]:
# Strip whitespace from column names to match trained model's feature names.
# ``rename`` returns a new frame, so the shared entry in ``clustered_test_dfs``
# is left untouched (assigning ``.columns`` on an alias would mutate it).
cluster1_data = clustered_test_dfs[1].rename(columns=str.strip)

# Scale only the features the subgroup-1 model was trained on, then predict.
sub1_X = sub1_scaler.transform(cluster1_data[sub1_feats])
sub1_test_pred = sub1_model.predict(sub1_X)
print("Total Bankrupt Predictions For Subgroup 1 Test :", sum(sub1_test_pred))

# Pair each prediction with its original "Index" value for the submission.
indices_sub1 = cluster1_data["Index"]
submission_sub1 = pd.DataFrame({
    "Index": indices_sub1,
    "Bankrupt?": sub1_test_pred
})
submission_sub1.head()

Total Bankrupt Predictions For Subgroup 1 Test : 20


Unnamed: 0,Index,Bankrupt?
6,6,0
9,9,0
10,10,0
21,21,0
22,22,0


# Cluster 2 Bankruptcy Predictions

In [28]:
# The subgroup-2 artifact is a mapping with the selected feature names, the
# fitted scaler and the fitted model.
sub2 = load("./artifacts/preprocessing_pipeline_subgroup2.joblib")
sub2_feats, sub2_scaler, sub2_model = (
    sub2['selected_features'],
    sub2['scaler'],
    sub2['model'],
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [29]:
# Strip whitespace from column names to match trained model's feature names.
# ``rename`` returns a new frame, so the shared entry in ``clustered_test_dfs``
# is left untouched (assigning ``.columns`` on an alias would mutate it).
cluster2_data = clustered_test_dfs[2].rename(columns=str.strip)

# Scale only the features the subgroup-2 model was trained on, then predict.
sub2_X = sub2_scaler.transform(cluster2_data[sub2_feats])
sub2_test_pred = sub2_model.predict(sub2_X)
print("Total Bankrupt Predictions For Subgroup 2 Test :", sum(sub2_test_pred))

# Pair each prediction with its original "Index" value for the submission.
indices_sub2 = cluster2_data["Index"]
submission_sub2 = pd.DataFrame({
    "Index": indices_sub2,
    "Bankrupt?": sub2_test_pred
})
submission_sub2.head()

Total Bankrupt Predictions For Subgroup 2 Test : 15


Unnamed: 0,Index,Bankrupt?
5,5,0
19,19,1
31,31,0
37,37,0
62,62,0


# Cluster 3 Bankruptcy Predictions

In [30]:
# Subgroup 3 ships as three separate artifacts. In the prediction cell they
# are applied in the order: preprocessing pipeline -> feature selector -> model.
# joblib unpickles these files, so only load artifacts from a trusted source.
sub3_selector = load("./artifacts/feature_selector_subgroup3.joblib")
sub3_preproc = load("./artifacts/preprocessing_pipeline_subgroup3.joblib")
sub3_model = load("./artifacts/stacking_model_subgroup3.joblib")

In [31]:

# Subgroup 3 takes the cluster rows with their original column names (no
# whitespace stripping) and without the "Index" identifier column.
# NOTE(review): presumably the subgroup-3 pipeline was fitted without "Index" - confirm.
sub3_features = clustered_test_dfs[3].drop(columns=["Index"])
sub3_X = sub3_preproc.transform(sub3_features)
sub3_X_Selected = sub3_selector.transform(sub3_X)
sub3_test_pred = sub3_model.predict(sub3_X_Selected)
print("Total Bankrupt Predictions For Subgroup 3 Test :", sum(sub3_test_pred))

# Pair each prediction with its original "Index" value for the submission.
indices_sub3 = clustered_test_dfs[3]["Index"]
submission_sub3 = pd.DataFrame(
    data={"Index": indices_sub3, "Bankrupt?": sub3_test_pred},
)
submission_sub3.head(n=5)

Total Bankrupt Predictions For Subgroup 3 Test : 3




Unnamed: 0,Index,Bankrupt?
3,3,0
7,7,0
8,8,0
13,13,0
15,15,0


# Combine Predictions
We can now combine all the predictions into a single DataFrame, sort them by `Index`, and write them to a CSV file.

In [32]:
# Stack the per-cluster submissions, restore the original row order via
# "Index", and renumber the rows before writing the final CSV.
per_cluster_submissions = [submission_sub0, submission_sub1, submission_sub2, submission_sub3]
all_submissions = (
    pd.concat(per_cluster_submissions)
    .sort_values(by="Index")
    .reset_index(drop=True)
)
all_submissions.to_csv("./artifacts/Group6_Generalization.csv", index=False)
all_submissions.head(n=5)

Unnamed: 0,Index,Bankrupt?
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
