In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw bankruptcy dataset from the Kaggle input directory
csv_path = '/kaggle/input/american-companies-bankruptcy-prediction-dataset/american_bankruptcy.csv'
df = pd.read_csv(csv_path)

## Exploratory data analysis and some adjustments

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# List the columns that contain at least one missing value
has_missing = df.isna().any()
df.columns[has_missing].tolist()

In [None]:
# Drop the company_name column (the `columns=` keyword already targets the column axis)
df = df.drop(columns=['company_name'])

In [None]:
# Give the anonymous "X<n>" columns descriptive names.
# NOTE(review): 'depreciation _and_amortization' contains a stray space; it is
# kept unchanged because the outlier cell selects the column by this exact spelling.
descriptive_names = {
    'X1': 'current_assets',
    'X2': 'cost_of_goods_sold',
    'X3': 'depreciation _and_amortization',
    'X4': 'ebitda',
    'X5': 'inventory',
    'X6': 'net_income',
    'X7': 'total_receivables',
    'X8': 'market_value',
    'X9': 'net_sales',
    'X10': 'total_assets',
    'X11': 'total_long_term_debt',
    'X12': 'ebit',
    'X13': 'gross_profit',
    'X14': 'total_current_liabilities',
    'X15': 'retained_earnings',
    'X16': 'total_revenue',
    'X17': 'total_liabilities',
    'X18': 'total_operating_expenses',
}

df = df.rename(columns=descriptive_names)

In [None]:
# Recode the target: 'alive' becomes 0, every other label becomes 1
alive_mask = df['status_label'].eq('alive')
df['status_label'] = (~alive_mask).astype('int64')

# Rename the recoded target column
df = df.rename(columns={'status_label': 'bankruptcy'})

In [None]:
# Display the current column labels
df.columns

In [None]:
## Target distribution
target = df['bankruptcy']

# Absolute counts first, then relative frequencies, each followed by a blank line
for as_fraction in (False, True):
    print(target.value_counts(normalize=as_fraction))
    print('')

# Pie chart of the class sizes
class_sizes = df.groupby('bankruptcy').size()
class_sizes.plot.pie(autopct='%.1f%%',
                     fontsize=13,
                     colors=['skyblue', 'tomato'])

plt.title('Distribution bankruptcy', size=20)
plt.tight_layout()
plt.show()

Comments:
- The dataset has imbalanced labels: as the target distribution shows, only 6.6% of the observations belong to class 1 (bankruptcy).

In [None]:
## Outliers
# Columns to inspect for outliers: all columns except bankruptcy and year
columns_out = [
    'current_assets',
    'cost_of_goods_sold',
    'depreciation _and_amortization',
    'ebitda',
    'inventory',
    'net_income',
    'total_receivables',
    'market_value',
    'net_sales',
    'total_assets',
    'total_long_term_debt',
    'ebit',
    'gross_profit',
    'total_current_liabilities',
    'retained_earnings',
    'total_revenue',
    'total_liabilities',
    'total_operating_expenses',
]

# Subsample restricted to those columns
df_out = df.loc[:, columns_out]

len(columns_out)

In [None]:
# Boxplot of every column in df_out, laid out on a 6 x 3 grid
fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(14, 18))

# axes.flat walks the grid row by row, i.e. position i lands on (i // 3, i % 3)
for ax, col in zip(axes.flat, columns_out):
    df_out.boxplot(column=col, ax=ax)
    ax.set_title(col)
    ax.set_xticks([])  # x axis without label

plt.tight_layout()
plt.show()

In [None]:
# Boxplot of df_out with the outlier markers (fliers) hidden
fig, axes = plt.subplots(6, 3, figsize=(14, 18))

# Flatten the 6 x 3 grid row by row and pair each axis with one column
for ax, col in zip(axes.ravel(), columns_out):
    df_out.boxplot(column=col, ax=ax, showfliers=False)
    ax.set_title(col)
    ax.set_xticks([])  # x axis without label

plt.tight_layout()
plt.show()

Comments:
- There are many outliers.
- The outliers are not removed: when I tried removing them, no bankrupt companies were left in the dataset. I therefore kept all the data, since the outliers may carry important information about class 1 (bankruptcy).

## Feature selection: correlation

In [None]:
# Separate the target from the candidate features
y = df['bankruptcy']
X = df.drop(columns='bankruptcy')

# Pairwise correlation matrix of the features
corr = X.corr()

In [None]:
## Heatmap of X
# Shared look of the annotated correlation heatmap
heatmap_style = {
    'annot': True,
    'cmap': 'coolwarm',
    'fmt': '.2f',
    'linewidth': 0.5,
    'annot_kws': {'size': 7},
}

plt.figure(figsize=(10, 6))
sns.heatmap(corr, **heatmap_style)

plt.title('Correlation Matrix Features (X)', fontsize=17)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()

In [None]:
# Identify corr features
threshold = 0.95

# Flag every |correlation| at or above the threshold, then keep only the
# strictly-lower triangle so the diagonal is ignored and each pair is
# examined once (row index > column index).
strong_pairs = np.tril(corr.abs().to_numpy() >= threshold, k=-1)

# A column is marked when it is strongly correlated with any column before it
corr_features = set(corr.columns[strong_pairs.any(axis=1)])

print('Features correlations:', corr_features)

In [None]:
# Remove the highly correlated features from X
X = X.drop(columns=list(corr_features))

In [None]:
# Heatmap of X without corr_features
reduced_corr = X.corr()

plt.figure(figsize=(10, 6))
sns.heatmap(
    reduced_corr,
    annot=True,
    annot_kws={'size': 7},
    fmt='.2f',
    cmap='coolwarm',
    linewidth=0.5,
)

plt.title('Correlation Matrix Features (X)', fontsize=17)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()

## Split data + SMOTE

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
## Split data
# Stratify on the target so the rare bankruptcy class keeps the same
# proportion in the train and test sets; a purely random split of an
# imbalanced target can shift the class ratio between the two.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,      # 30% of the rows are held out for testing
    stratify=y,
    random_state=123,   # fixed seed for a reproducible split
)

In [None]:
## Rebalance train data
# Only the training split is resampled; the test split is left untouched
smote = SMOTE(
    sampling_strategy='auto',
    random_state=123,
)

resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled

In [None]:
# Relative class frequencies of the training target, before and after SMOTE
share_before = y_train.value_counts(normalize=True)
share_after = y_train_smote.value_counts(normalize=True)

print('y_train distribution before SMOTE:',
      share_before,
      '',
      'y_train distribution after SMOTE:',
      share_after,
      sep='\n')

In [None]:
# Compare shapes before/after resampling: target first, then features
for original, oversampled in ((y_train, y_train_smote), (X_train, X_train_smote)):
    print(original.shape, oversampled.shape)

Comments:
- SMOTE oversamples the training data by creating synthetic samples, guided by the characteristics shared by the minority class (1).
- The resulting class distribution of `y_train` can be adjusted through `sampling_strategy`. By default, the classes end up equally balanced.

## Apply classification models
- We will apply two classification models: Random Forest and Gradient Boosting.
- We will not apply cross-validation because of time constraints.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint, uniform

In [None]:
# Disabled hyper-parameter search for the Random Forest, kept inside a string
# literal so that none of it executes (the cell only evaluates the string).
# NOTE(review): RandomizedSearchCV and randint are only imported in
# commented-out lines of the import cell, so re-enabling this search also
# requires restoring those imports.
'''
## Cross validation 
param_dist_rf = {'n_estimators':randint(400, 800),
                 'max_depth':randint(3, 10)}

rand_search_rf = RandomizedSearchCV(RandomForestClassifier(),
                                    param_distributions=param_dist_rf,
                                    n_iter=5,
                                    cv=5)

rand_search_rf.fit(X_train_smote, y_train_smote)

print('Best hiperparameters:', rand_search_rf.best_params_)
'''

In [None]:
## Random Forest Classifier
rf_params = {
    'n_estimators': 450,
    'max_depth': 8,
    'random_state': 123,
}
rfCl = RandomForestClassifier(**rf_params)

# Fit the model on the SMOTE-resampled training data
rfCl.fit(X_train_smote, y_train_smote)

In [None]:
# Predict class labels for the test features with the fitted Random Forest
y_rfCl = rfCl.predict(X_test)

In [None]:
# Evaluate the Random Forest predictions against the test labels
print('Classification Report Random Forest Classifier:')
print(classification_report(y_test, y_rfCl))

# Confusion Matrix (rows: true class, columns: predicted class)
cm_rfCl = confusion_matrix(y_test, y_rfCl)

# Print the matrix computed above; the previous code printed the undefined
# name `cm_rf`, which raised a NameError before the plot was drawn.
print(cm_rfCl, '\n')
ConfusionMatrixDisplay(confusion_matrix=cm_rfCl).plot();

In [None]:
# Disabled learning-rate sweep for Gradient Boosting, kept inside a string
# literal so that none of it executes (the cell only evaluates the string).
# NOTE(review): the score labelled "validation" is computed on X_test/y_test,
# so choosing a learning rate from it would tune on the test set - confirm
# and use a separate validation split before re-enabling.
'''
## Looking for the best learning rate
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for lr in lr_list:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate=lr, max_features=2, max_depth=2, random_state=123)
    gb.fit(X_train_smote, y_train_smote)

    print('Learning rate: ', lr)
    print('Accuracy score (training): {0:.3f}'.format(gb.score(X_train_smote, y_train_smote)))
    print('Accuracy score (validation): {0:.3f}'.format(gb.score(X_test, y_test)), '\n')
'''

In [None]:
## Gradient Boosting Classifier
gb_params = {
    'n_estimators': 300,
    'learning_rate': 0.5,
    'max_depth': 8,
    'random_state': 123,
}
gbCl = GradientBoostingClassifier(**gb_params)

# Fit the model on the SMOTE-resampled training data
gbCl.fit(X_train_smote, y_train_smote)

In [None]:
# Predict class labels for the test features with the fitted Gradient Boosting model
y_gbCl = gbCl.predict(X_test)

In [None]:
# Evaluate the Gradient Boosting predictions against the test labels
print('Classification Report Gradient Boosting Classifier:')
print(classification_report(y_test, y_gbCl))

# Confusion Matrix (rows: true class, columns: predicted class)
cm_gbCl = confusion_matrix(y_test, y_gbCl)

# Print this model's own matrix; the previous code printed `cm_rf`, a name
# that is never defined in the notebook, which raised a NameError.
print(cm_gbCl, '\n')
ConfusionMatrixDisplay(confusion_matrix=cm_gbCl).plot();