# This is the Failure model

Below is the prompt given to ChatGPT:

Please help me write Python code for a classification task that uses ensemble learning to determine whether a bank will go bankrupt after a certain period of time. 
I will input many features, but ultimately, I want to keep only the k most important features. 
The ensemble should include penalized logistic regression with Lasso, penalized logistic regression with elastic net, penalized regression with Ridge, and bootstrapped decision trees. It should also include a Cox proportional hazards model.

By including all the models above, we want to know: 
1. which features are the most important features in predicting the failure of a bank. 
2. At which values of those variables should we start worrying about the failure of a specific bank? 
3. If those variables reach those values and we do nothing about it, when will the bank fail?

The data comes from two CSV files. The first CSV file contains the cert (unique identifier) and bankruptcy date of each bank, while the second CSV file contains the cert and all asset data for different dates during the bank's existence, which can be used as features. The rows don't exist after the bank fails. Some banks may appear in the first CSV file but not in the second. 

Please write your code in one window. Regardless of the length of your answer, please provide all your code in one response.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from lifelines import CoxPHFitter
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer





ModuleNotFoundError: ignored

In [5]:
# 1. Load and preprocess the data
# Reads the failure list and the per-date financials, parses both date
# columns, and builds one row per bank/report date with the label `failed`
# and the duration `time_to_failure` (in days).
bankruptcies = pd.read_csv('failure_data.csv')
assets = pd.read_csv('financials.csv')

# Convert FAILDATE to a datetime object with the format '%m/%d/%y'
# NOTE(review): two-digit years are pivoted (69-99 -> 19xx, 00-68 -> 20xx);
# confirm the file holds no failures before 1969.
bankruptcies['FAILDATE'] = pd.to_datetime(bankruptcies['FAILDATE'], format='%m/%d/%y')
# The later cells refer to the failure date as 'bankruptcy_date', so expose
# the parsed column under that name as well.
bankruptcies['bankruptcy_date'] = bankruptcies['FAILDATE']

# Convert date to a datetime object with the format '%Y%m'. The parsed value
# is stored under both 'CALLYM' and 'date' so that date arithmetic on 'date'
# operates on real timestamps instead of the raw YYYYMM codes.
assets['CALLYM'] = pd.to_datetime(assets['date'], format='%Y%m')
assets['date'] = assets['CALLYM']

# Merge the data. The left join keeps every financial row; banks that are
# absent from the failure list get NaT as their failure date.
data = pd.merge(assets, bankruptcies, on='cert', how='left')
data['failed'] = data['bankruptcy_date'].notna().astype(int)
# NaN for banks that never failed (no failure date to subtract from).
data['time_to_failure'] = (data['bankruptcy_date'] - data['date']).dt.days

FileNotFoundError: ignored

In [2]:
# 1.1 Generating the variables

In [None]:
# 2. Feature selection
# Keeps the k features with the highest ANOVA F-score against `failed`.
k = 10

# Identifier, label and duration columns must not be used as predictors.
# Only numeric columns are kept because f_classif cannot score datetime or
# text columns (e.g. the report date that is still present in `data`).
X = data.drop(['cert', 'bankruptcy_date', 'failed', 'time_to_failure'], axis=1)
X = X.select_dtypes(include='number')
y = data['failed']

# NOTE(review): the split is row-wise, so report dates of the same bank can
# land in both train and test; consider a group-wise split on 'cert'.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Columns with no observed value in the training split cannot be imputed or
# scored; drop them everywhere so column positions stay aligned.
usable_columns = X_train.columns[X_train.notna().any()]
X = X[usable_columns]
X_train = X_train[usable_columns]
X_test = X_test[usable_columns]

# f_classif rejects NaN, so fill gaps with training-set means before scoring.
selection_imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_filled = selection_imputer.transform(X_train)
X_test_filled = selection_imputer.transform(X_test)

# SelectKBest raises if k exceeds the number of available columns.
selector = SelectKBest(f_classif, k=min(k, X.shape[1])).fit(X_train_filled, y_train)
X_train_selected = selector.transform(X_train_filled)
X_test_selected = selector.transform(X_test_filled)

selected_features = X.columns[selector.get_support()]

# 3. Train ensemble learning models
# Four classifiers are trained on the selected training features; the Cox
# model is trained on the matching rows of `data` with a duration and an
# event indicator.
models = {
    'lasso': Pipeline([
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='mean')),
        ('classifier', LogisticRegression(penalty='l1', solver='liblinear'))
    ]),
    'elastic_net': Pipeline([
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='mean')),
        ('classifier', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5))
    ]),
    'ridge': Pipeline([
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='mean')),
        ('classifier', RidgeClassifier())
    ]),
    'random_forest': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('classifier', RandomForestClassifier(n_estimators=100))
    ]),
    'cox_ph': CoxPHFitter()
}

for name, model in models.items():
    if name == 'cox_ph':
        cox_features = list(selected_features)
        # Fit on the training rows only so the test rows stay unseen.
        cox_frame = data.loc[X_train.index, ['time_to_failure', 'failed'] + cox_features].copy()

        # Banks that never failed have no time_to_failure. Treat them as
        # right-censored at the last report date in the data; the fitter
        # does not accept NaN durations.
        # Assumes data['date'] holds datetimes — TODO confirm.
        days_observed = (data['date'].max() - data.loc[X_train.index, 'date']).dt.days
        cox_frame['time_to_failure'] = cox_frame['time_to_failure'].fillna(days_observed)

        # Missing covariates are mean-filled, matching the imputation
        # strategy used inside the classifier pipelines.
        cox_frame[cox_features] = cox_frame[cox_features].fillna(cox_frame[cox_features].mean())

        model.fit(cox_frame, duration_col='time_to_failure', event_col='failed')
    else:
        model.fit(X_train_selected, y_train)

# 4. Evaluate the importance of features
# Collects one importance column per model, indexed by selected feature.
# NOTE: the columns are not on a common scale. Linear models and the Cox
# model report signed coefficients, while the random forest reports
# non-negative impurity importances.
importances = {}
for name, model in models.items():
    if name == 'cox_ph':
        # Series indexed by covariate name; align it to the feature order.
        importances[name] = model.summary['coef'].reindex(selected_features)
        continue

    classifier = model.named_steps['classifier']
    if hasattr(classifier, 'coef_'):
        importances[name] = np.ravel(classifier.coef_)
    else:
        # Tree ensembles expose feature_importances_ instead of coef_.
        importances[name] = classifier.feature_importances_

importances_df = pd.DataFrame(importances, index=selected_features)
print(importances_df)

# 5. Determine the values of the features that indicate a bank is at risk of failure
# For each selected feature, the band is the test-set mean plus/minus two
# standard deviations of that feature's column in X_test_selected.
thresholds = {}
for feature, column_values in zip(selected_features, X_test_selected.T):
    centre = np.mean(column_values)
    spread = np.std(column_values)
    thresholds[feature] = {
        'low': centre - 2 * spread,
        'high': centre + 2 * spread
    }

print("Feature thresholds:")
print(thresholds)

# 6. Predict when a bank will fail
def predict_failure(model, data, threshold=0.5):
    """Flag the rows of *data* that *model* considers at risk of failure.

    Args:
        model: A fitted survival model exposing ``predict_survival_function``
            (e.g. the Cox model), or a fitted classifier / pipeline exposing
            ``predict_proba`` or, failing that, ``predict``.
        data: Feature rows to score, in the layout the model was fitted on.
        threshold: Failure probability above which a row counts as at risk.
            Ignored for classifiers that only offer ``predict``.

    Returns:
        Survival models: a pandas Series with one entry per scored row,
        holding the first time at which the predicted failure probability
        exceeds *threshold*, or NaN if it never does within the predicted
        time grid.
        Classifiers: a numpy array with the positions of the rows whose
        failure probability exceeds *threshold*.
    """
    # Duck-typed check so any survival model with this method is supported.
    if hasattr(model, "predict_survival_function"):
        # Rows are time points, columns are the scored subjects.
        survival_pred = model.predict_survival_function(data)
        crossed = (1 - survival_pred).gt(threshold)
        # idxmax already yields the time label of the first crossing. It
        # also yields the first label when nothing crosses, so those
        # subjects are masked out.
        return crossed.idxmax().where(crossed.any())

    if hasattr(model, "predict_proba"):
        at_risk = model.predict_proba(data)[:, 1] > threshold
    else:
        # Some classifiers (e.g. RidgeClassifier) have no predict_proba;
        # fall back to the hard label, where 1 marks a predicted failure.
        at_risk = np.asarray(model.predict(data)) == 1
    return np.where(at_risk)[0]

# Score the whole test split once per model. The pipelines apply their own
# scaling and imputation, so they receive the selected features untouched;
# transforming by hand first would preprocess the data twice (and the
# random-forest pipeline has no 'scaler' step to call).
test_frame = pd.DataFrame(X_test_selected, columns=selected_features, index=X_test.index)

# Maps model name -> list of (row index, predicted time to failure).
# The time is None for classifiers, which only flag a row as at risk.
at_risk_banks = {}
for name, model in models.items():
    if name == 'cox_ph':
        # One failure time per test row; rows whose failure probability
        # never crosses the threshold carry NaN and are dropped.
        failure_times = pd.Series(
            np.asarray(predict_failure(model, test_frame)),
            index=test_frame.index,
        )
        at_risk_banks[name] = list(failure_times.dropna().items())
    else:
        # predict_failure returns positions within the test matrix; map
        # them back to the row labels of X_test.
        flagged_positions = predict_failure(model, X_test_selected)
        at_risk_banks[name] = [(test_frame.index[pos], None) for pos in flagged_positions]

print("At-risk banks:")
print(at_risk_banks)