### PART I: Probability prediction
- Predict probabilities.
- Look at cross-validated performance and pick your favorite model.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
import statsmodels.formula.api as smf
import warnings
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from statsmodels.tools.eval_measures import mse,rmse
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
import sklearn.metrics as metrics
import patsy
from stargazer.stargazer import Stargazer
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
import datetime



# PART I: Probability prediction

In [2]:
# read in the clean dataset
firms_df = pd.read_csv("bisnode_firms_clean.csv")

In [3]:
rawvars = ["curr_assets", "curr_liab", "extra_exp", "extra_inc", "extra_profit_loss", "fixed_assets",
              "inc_bef_tax", "intang_assets", "inventories", "liq_assets", "material_exp", "personnel_exp",
              "profit_loss_year", "sales", "share_eq", "subscribed_cap"]

qualityvars = ["balsheet_flag", "balsheet_length", "balsheet_notfullyear"]

engvar = ["total_assets_bs", "fixed_assets_bs", "liq_assets_bs", "curr_assets_bs",
            "share_eq_bs", "subscribed_cap_bs", "intang_assets_bs", "extra_exp_pl",
            "extra_inc_pl", "extra_profit_loss_pl", "inc_bef_tax_pl", "inventories_pl",
            "material_exp_pl", "profit_loss_year_pl", "personnel_exp_pl"]

engvar2 = ["extra_profit_loss_pl_quad", "inc_bef_tax_pl_quad",
             "profit_loss_year_pl_quad", "share_eq_bs_quad"]

engvar3 = []
for col in firms_df.columns:
    if col.endswith('flag_low') or col.endswith('flag_high') or col.endswith('flag_error') or col.endswith('flag_zero'):
        engvar3.append(col)

d1 =  ["d1_sales_mil_log_mod", "d1_sales_mil_log_mod_sq",
         "flag_low_d1_sales_mil_log", "flag_high_d1_sales_mil_log"]

hr = ["female", "ceo_age", "flag_high_ceo_age", "flag_low_ceo_age",
        "flag_miss_ceo_age", "ceo_count", "labor_avg_mod",
        "flag_miss_labor_avg", "foreign_management"]

In [4]:
all_vars = rawvars + qualityvars + engvar + engvar2 + engvar3 + d1 + hr 

In [5]:
firms_df[all_vars].isna().sum()

curr_assets            0
curr_liab              0
extra_exp              0
extra_inc              0
extra_profit_loss      0
                      ..
flag_miss_ceo_age      0
ceo_count              0
labor_avg_mod          0
flag_miss_labor_avg    0
foreign_management     0
Length: 78, dtype: int64

In [6]:
firms_df.dropna(inplace=True)

### Dealing with categorical variables
To avoide multicolinearity, we drop the first values

In [7]:
firms_df.head()

Unnamed: 0,year,comp_id,begin,end,amort,curr_assets,curr_liab,extra_exp,extra_inc,extra_profit_loss,...,flag_high_ceo_age,flag_miss_ceo_age,ceo_young,labor_avg_mod,flag_miss_labor_avg,sales_mil_log_sq,flag_low_d1_sales_mil_log,flag_high_d1_sales_mil_log,d1_sales_mil_log_mod,d1_sales_mil_log_mod_sq
0,2013,1002029.0,2013-01-01,2013-12-31,14255.555664,217103.703125,161174.078125,0.0,0.0,0.0,...,0,0,1,0.4375,0,1.054824,0,0,-1.155013,1.334055
1,2013,1011889.0,2013-01-01,2013-12-31,66125.929688,235114.8125,16555.554688,0.0,0.0,0.0,...,0,0,0,1.583333,0,0.66646,0,0,0.019109,0.000365
2,2013,1014183.0,2013-01-01,2013-12-31,6970.370605,209562.96875,5703.703613,0.0,0.0,0.0,...,0,0,0,0.819444,0,4.632597,0,0,-0.110044,0.01211
3,2013,1022796.0,2013-01-01,2013-12-31,503.703705,3859.259277,8114.814941,0.0,0.0,0.0,...,0,0,0,0.083333,0,9.971799,0,0,0.488146,0.238287
4,2013,1035705.0,2013-01-01,2013-12-31,244.444443,2392.592529,9733.333008,0.0,0.0,0.0,...,0,0,0,0.222222,0,14.500839,0,0,-0.079375,0.0063


In [8]:
firms_df["ind2_cat"].value_counts().sort_index()

ind2_cat
26.0     735
27.0     441
28.0    1389
29.0     179
30.0     104
33.0    1382
55.0    1299
56.0    8039
Name: count, dtype: int64

In [9]:
firms_df["urban_m"].value_counts().sort_index()

urban_m
1.0    4278
2.0    3872
3.0    5418
Name: count, dtype: int64

In [10]:
ind2_catmat = patsy.dmatrix("C(ind2_cat, Treatment(reference=26))", firms_df, return_type="dataframe") 

In [11]:
m_region_locmat = patsy.dmatrix("C(m_region_loc, Treatment(reference='Central'))", firms_df, return_type="dataframe") 

In [12]:
urban_mmat = patsy.dmatrix("C(urban_m, Treatment(reference=1))", firms_df, return_type="dataframe") 

In [None]:
# Define X1
basevars = firms_df[["sales_mil_log", "sales_mil_log_sq", "d1_sales_mil_log_mod", "profit_loss_year_pl"]]
X1 = pd.concat([basevars, ind2_catmat], axis=1)

# Define X2
X2additional_vars = firms_df[["fixed_assets_bs", "share_eq_bs","curr_liab_bs", "curr_liab_bs_flag_high", \
                          "curr_liab_bs_flag_error",  "age", "foreign_management"]]
X2 = pd.concat([X1, X2additional_vars], axis=1)

# Define X3
firm = pd.concat([firms_df[["age", "age2", "new"]], ind2_catmat, m_region_locmat, urban_mmat], axis=1)
X3 = pd.concat([firms_df[["sales_mil_log", "sales_mil_log_sq"] + engvar + d1], firm], axis=1)

# Define X4
X4 = pd.concat([firms_df[["sales_mil_log", "sales_mil_log_sq"] + engvar + d1 \
                                 + engvar2 + engvar3 + hr + qualityvars], firm], axis=1)

# Define X5

#Creat matrix for interactions1 variables
int1mat = patsy.dmatrix("0 + C(ind2_cat):age + C(ind2_cat):age2 + C(ind2_cat):d1_sales_mil_log_mod \
                + C(ind2_cat):sales_mil_log + C(ind2_cat):ceo_age + C(ind2_cat):foreign_management \
                + C(ind2_cat):female + C(ind2_cat):C(urban_m) + C(ind2_cat):labor_avg_mod", 
                        firms_df, return_type="dataframe")

#Drop first level to get k-1 dummies out of k categorical levels 
for col in int1mat.columns:
    if col.startswith('C(ind2_cat)[26]') or col.endswith('C(urban_m)[1]'):
        int1mat = int1mat.drop([col], axis=1)
        
#Creat matrix for interactions2 variables        
int2mat = patsy.dmatrix("0 + sales_mil_log:age + sales_mil_log:female + sales_mil_log:profit_loss_year_pl \
                + sales_mil_log:foreign_management", 
                        firms_df, return_type="dataframe")

X5 = pd.concat([X4, int1mat, int2mat], axis=1)

# Define logitvars for LASSO
logitvars = pd.concat([X4, int1mat, int2mat], axis=1)

# Define rfvars for RF (no interactions, no modified features)
rfvars  = pd.concat([firms_df[["sales_mil", "d1_sales_mil_log"] + rawvars + hr + qualityvars], firm], axis=1)

In [14]:
y = firms_df["is_fast_growing"]

In [15]:
y.mean()

np.float64(0.2318691037735849)

In [15]:
["Intercept"] + list(X1.columns)

['Intercept',
 'sales_mil_log',
 'sales_mil_log_sq',
 'd1_sales_mil_log_mod',
 'profit_loss_year_pl',
 'Intercept',
 'C(ind2_cat, Treatment(reference=26))[T.27.0]',
 'C(ind2_cat, Treatment(reference=26))[T.28.0]',
 'C(ind2_cat, Treatment(reference=26))[T.29.0]',
 'C(ind2_cat, Treatment(reference=26))[T.30.0]',
 'C(ind2_cat, Treatment(reference=26))[T.33.0]',
 'C(ind2_cat, Treatment(reference=26))[T.55.0]',
 'C(ind2_cat, Treatment(reference=26))[T.56.0]']

#### Helper Functions

In [None]:
# define helper functions

def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))
    
def create_coef_matrix(X, model):
    coef_matrix = pd.concat(
        [pd.DataFrame(X.columns),pd.DataFrame(model.coef_.flatten())], axis = 1
    )
    coef_matrix.columns = ['variable', 'coefficient']
    coef_matrix.iloc[-1] = ['Intercept', model.intercept_.flatten()[0]]
    return coef_matrix

def cv_summary(lambdas, C_values, model):
    d = {'lambdas': lambdas, 
         'C_values': C_values, 
         'mean_cv_score': model.scores_[1].mean(axis = 0)}
    return(pd.DataFrame(data=d))

"""def create_roc_plot(y_true, y_pred):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    all_coords = pd.DataFrame({
        'fpr': fpr,
        'tpr': tpr,
        'thresholds': thresholds
    })
    
    plot = ggplot(all_coords, aes(x = 'fpr', y = 'tpr')) \
        + geom_line(color=color[0], size = 0.7) \
        + geom_area(position = 'identity', fill = 'mediumaquamarine', alpha = 0.3) \
        + xlab("False Positive Rate (1-Specifity)") \
        + ylab("True Positive Rate (Sensitivity)") \
        + geom_abline(intercept = 0, slope = 1,  linetype = "dotted", color = "black") \
        + scale_y_continuous(limits = (0, 1), breaks = seq(0, 1, .1), expand = (0, 0.01)) \
        + scale_x_continuous(limits = (0, 1), breaks = seq(0, 1, .1), expand = (0.01, 0)) \
        + theme_bw()
    return(plot)
"""

def create_roc_plot(y_true, y_pred): # this is pretty important!
    # Calculate ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, y_pred) # on x false positive and on y true positive
    
    # Create figure and axis
    fig, ax = plt.subplots(figsize=(6, 6))
    
    # Plot ROC curve line
    ax.plot(fpr, tpr, color='k', linewidth=0.7)
    
    # Fill area under curve
    ax.fill_between(fpr, tpr, alpha=0.3, color='white')
    
    # Add diagonal dotted line
    ax.plot([0, 1], [0, 1], linestyle=':', color='black')
    
    # Set axis labels
    ax.set_xlabel('False Positive Rate (1-Specificity)')
    ax.set_ylabel('True Positive Rate (Sensitivity)')
    
    # Set axis limits and ticks
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    
    # Style similar to theme_bw()
    ax.grid(True, linestyle='-', alpha=0.2)
    ax.set_facecolor('white')
    for spine in ax.spines.values():
        spine.set_color('black')
    
    # Adjust layout
    plt.tight_layout()
    
    return fig, ax


def sigmoid_array(x):
    return(1 / (1 + np.exp(-x)))

def generate_fold_prediction(model, X, fold, param_index):
    fold_coef = model.coefs_paths_[1][fold,param_index,:]
    return(sigmoid_array(np.dot(X, np.transpose(fold_coef)[:-1]) +  np.transpose(fold_coef)[-1]))

"""def create_loss_plot(all_coords, optimal_threshold, curr_exp_loss):
    all_coords_copy = all_coords.copy()
    all_coords_copy['loss'] = (all_coords_copy.false_pos*FP + all_coords_copy.false_neg*FN)/all_coords_copy.n
    
    t = optimal_threshold
    l = curr_exp_loss
    
    plot = ggplot(all_coords_copy, aes(x = 'thresholds', y = 'loss')) + \
        geom_line(color=color[0], size=0.7) + \
        scale_x_continuous(breaks = seq(0, 1.1, by = 0.1)) + \
        coord_cartesian(xlim=(0,1))+ \
        geom_vline(xintercept = t , color = color[0] ) + \
        annotate(geom = "text", x = t - 0.01, y= max(all_coords_copy.loss) - 0.4,
                 label="best threshold: " + str(round(t,2)),
                 colour=color[1], angle=90, size = 7) +\
        annotate(geom = "text", x = t + 0.06, y= l,\
                 label= str(round(l, 2)), size = 7) +\
        theme_bw()
    return(plot)"""

def create_loss_plot(all_coords, optimal_threshold, curr_exp_loss): # what is optimal threshold here?
    # Create copy and calculate loss
    all_coords_copy = all_coords.copy()
    all_coords_copy['loss'] = (all_coords_copy.false_pos*FP + all_coords_copy.false_neg*FN)/all_coords_copy.n
    
    t = optimal_threshold
    l = curr_exp_loss

    # Create figure and axis
    fig, ax = plt.subplots(figsize=(6, 5))

    # Plot loss line
    ax.plot(all_coords_copy['thresholds'], all_coords_copy['loss'], 
            color= 'k', linewidth=0.7)

    # Add vertical line at optimal threshold
    ax.axvline(x=t, color = 'k')

    # Add annotations
    ax.text(t - 0.04, max(all_coords_copy.loss) - 0.5,
            f"best threshold: {t:.2f}", 
            color = 'k', 
            rotation=90, 
            fontsize = 9)
    
    ax.text(t + 0.06, l,
            f"{l:.2f}",
            fontsize = 9)

    # Set x-axis ticks and limits
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.set_xlim(0, 1)

    # Style similar to theme_bw()
    ax.grid(True, linestyle='-', alpha=0.2)
    ax.set_facecolor('white')
    ax.set_xlabel('threshold')
    ax.set_ylabel('loss')
    for spine in ax.spines.values():
        spine.set_color('black')

    # Adjust layout
    plt.tight_layout()

    return fig, ax



"""def create_roc_plot_with_optimal(all_coords, optimal_threshold):
    all_coords_copy = all_coords.copy()
    all_coords_copy['sp'] = all_coords_copy.true_neg/all_coords_copy.neg
    all_coords_copy['se'] = all_coords_copy.true_pos/all_coords_copy.pos
    
    best_coords = all_coords_copy[all_coords_copy.thresholds == optimal_threshold]
    sp = best_coords.sp.values[0]
    se = best_coords.se.values[0]

    plot = ggplot(all_coords_copy, aes(x = 'sp', y = 'se')) +\
        geom_line(color=color[0], size=0.7) +\
        scale_y_continuous(breaks = seq(0, 1.1, by = 0.1)) +\
        scale_x_reverse(breaks = seq(0, 1.1, by = 0.1)) +\
        geom_point(data = pd.DataFrame({'sp': [sp], 'se': [se]})) +\
        annotate(geom = "text", x = sp, y = se + 0.03,
                 label = str(round(sp, 2)) + ', ' + str(round(se, 2)), size = 7) +\
        theme_bw()
    return(plot)
"""
def create_roc_plot_with_optimal(all_coords, optimal_threshold):
    # Create copy and calculate metrics
    all_coords_copy = all_coords.copy()
    all_coords_copy['sp'] = all_coords_copy.true_neg/all_coords_copy.neg
    all_coords_copy['se'] = all_coords_copy.true_pos/all_coords_copy.pos
    
    # Get optimal point
    best_coords = all_coords_copy[all_coords_copy.thresholds == optimal_threshold]
    sp = best_coords.sp.values[0]
    se = best_coords.se.values[0]
    
    # Create figure and axis
    fig, ax = plt.subplots(figsize=(6, 6))
    
    # Plot ROC curve
    ax.plot(all_coords_copy['sp'], all_coords_copy['se'],
            color='k', linewidth=0.9)
    
    # Add optimal point
    ax.scatter([sp], [se], color='k', s = 100)
    
    # Add text annotation
    ax.text(sp, se + 0.03,
            f"{sp:.2f}, {se:.2f}",
            fontsize = 9,
            ha='center')
    ax.text(sp - 0.02, se - 0.18,
            'specificity (TNR) \n& sensitivity (TPR) \nat the best threshold',
            fontsize = 9,
            ha='center'
           )
    
    # Set axis ticks and limits
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.set_xlabel('specificity')
    ax.set_ylabel('sensitivity')
    
    # Reverse x-axis
    ax.set_xlim(1, 0)
    
    # Style similar to theme_bw()
    ax.grid(True, linestyle='-', alpha=0.2)
    ax.set_facecolor('white')
    for spine in ax.spines.values():
        spine.set_color('black')
    
    # Adjust layout
    plt.tight_layout()
    plt.show()
    return fig, ax

### OLS

In [16]:
ols_modelx1 = smf.ols("y ~ X1", data=firms_df).fit()
ols1_summary = Stargazer([ols_modelx1])
ols1_summary.dependent_variable_name("is_fast_growing")
ols_modelx1_param_names = ols_modelx1.params.index.tolist()
ols1_summary.rename_covariates(dict(zip(ols_modelx1_param_names, ["Intercept"] + list(X1.columns))))
ols1_summary

0,1
,
,Dependent variable: is_fast_growing
,
,(1)
,
Intercept,0.083***
,(0.007)
sales_mil_log,-0.007
,(0.004)
"C(ind2_cat, Treatment(reference=26))[T.55.0]",-0.026


### OLS with train-test split

In [17]:
np.random.seed(1234)
smp_size = round(0.2 * firms_df.shape[0])-1

# train - test split
df_train, df_test=train_test_split(firms_df, test_size=smp_size)

In [18]:
rmse_modelx1_test, r2_modelx1_test, pred_modelx1_test = [], [], []
rmse_modelx1_train, r2_modelx1_train, pred_modelx1_train = [], [], []


k = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in k.split(firms_df):
    # Select and add constant to X
    X_train = sm.add_constant(X1.iloc[train_index])
    X_test = sm.add_constant(X1.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Fit and Predict
    mod1 = sm.OLS(y_train, X_train).fit()
    y_pred_test = mod1.predict(X_test) # Use X_test here
    y_pred_train = mod1.predict(X_train)
    
    # Store results
    pred_modelx1_test.append(np.mean(y_pred_test))
    rmse_modelx1_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_modelx1_test.append(r2_score(y_test, y_pred_test))
    
    pred_modelx1_train.append(np.mean(y_pred_train))
    rmse_modelx1_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_modelx1_train.append(r2_score(y_train, y_pred_train))
    

In [19]:
results_modelx1 = {
        "predict train": pred_modelx1_train,
        "r2 train": r2_modelx1_train,
        "rmse train": rmse_modelx1_train,
        "predict test": pred_modelx1_test,
        "r2 test": r2_modelx1_test,
        "rmse test": rmse_modelx1_test
    }

results_modelx1 = pd.concat([pd.DataFrame(results_modelx1), pd.DataFrame(pd.DataFrame(results_modelx1).mean(), columns=["Average"]).T])
results_modelx1

Unnamed: 0,predict train,r2 train,rmse train,predict test,r2 test,rmse test
0,0.23208,0.324471,0.346976,0.226664,0.331744,0.344554
1,0.232541,0.323224,0.347536,0.23528,0.336454,0.342375
2,0.228764,0.326587,0.344689,0.234068,0.323046,0.353516
3,0.230769,0.326725,0.345711,0.234714,0.322874,0.34955
4,0.235191,0.329732,0.347225,0.228416,0.309042,0.343535
Average,0.231869,0.326148,0.346428,0.231828,0.324632,0.346706


In [187]:
import warnings
warnings.filterwarnings("ignore")

### Logistic Regression with Cross-Validation for X1:X5

In [91]:
index_train, index_holdout= train_test_split(
    firms_df.index.values, train_size=round(0.8*len(firms_df.index)), random_state=42)

y_train = y[index_train]
y_holdout = y[index_holdout]

In [124]:
logit_model_vars = [X1.loc[index_train], X2.loc[index_train], X3.loc[index_train], X4.loc[index_train], X5.loc[index_train]]

logit_models = dict()
CV_RMSE_folds = dict()

In [136]:
#### runs for more than 3 minutes!!! ####
import datetime
logit_r2 = {}


for i in range(len(logit_model_vars)):
    print(datetime.datetime.now(), f'Running regression {i}...')
    LRCV_brier = LogisticRegressionCV(
        Cs = [1e20], 
        cv = k, # simply the number of folds
        refit = True, 
        scoring = 'neg_brier_score', 
        solver = "newton-cg", 
        tol=1e-7, 
        random_state = 20250224)
    logit_models['X'+str(i+1)] = LRCV_brier.fit(logit_model_vars[i], y_train)
    
    # Calculate RMSE on test for each fold
    CV_RMSE_folds['X'+str(i+1)] = np.sqrt(-1*(logit_models['X'+str(i+1)].scores_[1])).flatten()
    logit_r2['X'+str(i+1)] = logit_models['X'+str(i+1)].score(logit_model_vars[i], y_train)

2026-02-07 15:41:25.282933 Running regression 0...
2026-02-07 15:41:25.451869 Running regression 1...
2026-02-07 15:41:25.880724 Running regression 2...
2026-02-07 15:41:31.308692 Running regression 3...
2026-02-07 15:42:05.407982 Running regression 4...


In [183]:
cv_rmse_folds = pd.DataFrame(CV_RMSE_folds)
glm_model_overview = pd.concat([cv_rmse_folds, pd.DataFrame(cv_rmse_folds.mean(), columns = ["Average"]).T,
                                pd.DataFrame(logit_r2, index= ["R2"])])
glm_model_overview


Unnamed: 0,X1,X2,X3,X4,X5
0,0.421394,0.421508,0.422607,0.422571,0.424482
1,0.423291,0.423447,0.423769,0.425082,0.425385
2,0.41914,0.419183,0.419941,0.419957,0.421114
3,0.426344,0.426338,0.427292,0.428739,0.430256
4,0.419368,0.419979,0.419998,0.420025,0.42206
Average,0.421907,0.422091,0.422721,0.423275,0.424659
R2,-0.177448,-0.177386,-0.177174,-0.176449,-0.175276


In [None]:
### honestly, all of the models are kind of equally bad; any difference between them might just be random
### still, going off the numbers, we'll want to pick X1

### Lasso Logit

In [186]:
# first we normalize the lasso variables
normalized_logitvars = pd.DataFrame(StandardScaler().fit_transform(logitvars.loc[index_train]))
normalized_logitvars.columns = logitvars.columns

In [188]:
lambdas=list(10**np.arange(-1,-4.01, -1/3))
n_obs = normalized_logitvars.shape[0]*4/5
Cs_values = [1/(l*n_obs) for l in lambdas]

In [189]:
Cs_values # the strength of the regularization -> supressing unimportant variables

[np.float64(0.00115164916159941),
 np.float64(0.002481152904495904),
 np.float64(0.0053454818887193395),
 np.float64(0.011516491615994096),
 np.float64(0.024811529044959025),
 np.float64(0.053454818887193334),
 np.float64(0.1151649161599409),
 np.float64(0.24811529044959024),
 np.float64(0.5345481888719334),
 np.float64(1.1516491615994078)]

In [190]:
logLasso = LogisticRegressionCV(
    Cs = Cs_values, 
    penalty = 'l1', # L1 makes it lasso
    cv = k, 
    refit = True, 
    scoring = 'accuracy', 
    solver = 'liblinear',
    random_state = 20250224)

In [204]:
# for some reason y_train has one more row than normalized_logitvars; I just cut it
logit_models["LASSO"] = logLasso.fit(normalized_logitvars, y_train[:len(y_train)-1])

In [205]:
cv_summary_lasso = cv_summary(lambdas, Cs_values, logit_models["LASSO"])
cv_summary_lasso

Unnamed: 0,lambdas,C_values,mean_cv_score
0,0.1,0.001152,0.767183
1,0.046416,0.002481,0.767183
2,0.021544,0.005345,0.767183
3,0.01,0.011516,0.767183
4,0.004642,0.024812,0.767091
5,0.002154,0.053455,0.766999
6,0.001,0.115165,0.766999
7,0.000464,0.248115,0.766999
8,0.000215,0.534548,0.766723
9,0.0001,1.151649,0.766354


In [206]:
#refit with negative brier score so we have RMSE values for the same cv split
#### takes 

logLasso_brier = LogisticRegressionCV(
    Cs = Cs_values, 
    penalty = 'l1', 
    cv = k, 
    refit = True, 
    scoring = 'neg_brier_score', # now negative; before we optimized based on accuracy
    solver = "liblinear", 
    random_state = 20250224)
logLasso_brier_fitted = logLasso_brier.fit(normalized_logitvars, y_train[:len(y_train)-1])

In [208]:
best_lambda = cv_summary_lasso.sort_values('mean_cv_score', ascending = False).iloc[0,0]
best_lambda

np.float64(0.1)

In [None]:
r2_loglasso = {}

for i, l in enumerate(lambdas):
    if l == best_lambda:
        best_lambda_i = i
        CV_RMSE_folds['LASSO'] = np.sqrt(-1*(logLasso_brier_fitted.scores_[1][:,i])).tolist()
        

In [None]:
loglasso_overview = pd.DataFrame(CV_RMSE_folds)

loglasso_overview = pd.concat([loglasso_overview, pd.DataFrame(loglasso_overview.mean(), columns = ["Average"]).T])
loglasso_overview

# TODO: one could add an R2 to this
# but really, LASSO is gonna be worse regardless

Unnamed: 0,X1,X2,X3,X4,X5,LASSO
0,0.421394,0.421508,0.422607,0.422571,0.424482,0.436054
1,0.423291,0.423447,0.423769,0.425082,0.425385,0.434986
2,0.41914,0.419183,0.419941,0.419957,0.421114,0.434683
3,0.426344,0.426338,0.427292,0.428739,0.430256,0.433779
4,0.419368,0.419979,0.419998,0.420025,0.42206,0.432025
Average,0.421907,0.422091,0.422721,0.423275,0.424659,0.434305


# PART II: Classification

Think about the business problem, and define your loss function (like FP=X dollars, FN=Y dollars).

Idea 1: We have some spare money and want to do some investments. Overall, riskier firms, thus firms with a higher probability to default also pay higher returns. On the other hand, we lose money when a risky firm defaults. The money lost from an unexpected default is about the same, as money lost from a risky firm that ends up well performing that we decided not to invest in.
Therefore, a suggested loss function would be:
FP = 0.5 FN = 0.5

## MODELS WE CAN'T USE  :/

My dumb ass forgot that we have binary outcome variables and calculated all the regular models.

For now, I'm keeping them in the script in case that some  of the syntax might come in handy.

### LASSO

In [None]:
from sklearn.model_selection import GridSearchCV
# define model
model = Lasso()

grid = dict()
grid["alpha"] = np.arange(0.05, 1, 0.05)
# define search
search = GridSearchCV(model, grid, scoring="neg_root_mean_squared_error", cv = k, verbose= 3) # control your output with the 'verbose' option

In [None]:
# Initialize lists for both sets
rmse_lasso_test, r2_lasso_test = [], []
rmse_lasso_train, r2_lasso_train = [], []
pred_lasso_test, pred_lasso_train = [], []

k = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in k.split(rfvars):
    
    X_train, X_test = logitvars.iloc[train_index], logitvars.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ### LASSO MODEL ###
    lasso_mod = search.fit(X_train, y_train)

    y_pred_test = lasso_mod.predict(X_test)
    y_pred_train = lasso_mod.predict(X_train)
    
    pred_lasso_test.append(y_pred_test.mean())
    pred_lasso_train.append(y_pred_train.mean())

    rmse_lasso_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_lasso_test.append(r2_score(y_test, y_pred_test))
    
    rmse_lasso_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_lasso_train.append(r2_score(y_train, y_pred_train))

# Quick summary of the averages
print(f"Train RMSE: {np.mean(rmse_lasso_train):.4f} vs Test RMSE: {np.mean(rmse_lasso_test):.4f}")
print(f"Train R2:   {np.mean(r2_lasso_train):.4f} vs Test R2:   {np.mean(r2_lasso_test):.4f}")

In [None]:
results_lasso_mod = {
        "predicted train": pred_lasso_train,
        "r2 train": r2_lasso_train,
        "rmse train": rmse_lasso_train,
        "predicted test": pred_lasso_test,
        "r2 test": r2_lasso_test,
        "rmse test": pred_lasso_test
    }
results_lasso_mod = pd.concat([pd.DataFrame(results_lasso_mod), pd.DataFrame(pd.DataFrame(results_lasso_mod).mean(), columns=["Average"]).T])
results_lasso_mod

### RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state = 20250224)
tune_grid = {"max_features": [6, 8, 10, 12], "min_samples_leaf": [5, 10, 15]}

rf_random = GridSearchCV(
    estimator = rfr,
    param_grid = tune_grid,
    cv = 5,
    scoring = "neg_root_mean_squared_error",
    verbose = 3,
)
# Built into grid search, it will run on the test set, not on the train set!

In [None]:
# Watch out, this takes 10 minutes to run!
 
rmse_rf_test, r2_rf_test = [], []
rmse_rf_train, r2_rf_train = [], []
pred_rf_test, pred_rf_train = [], []

k = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in k.split(rfvars):
    
    X_train, X_test = rfvars.iloc[train_index], rfvars.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ### Random Forest Model ###
    rf_mod = rf_random.fit(X_train, y_train)

    y_pred_test = rf_mod.predict(X_test)
    y_pred_train = rf_mod.predict(X_train)
    
    pred_rf_test.append(y_pred_test.mean())
    pred_rf_train.append(y_pred_train.mean())

    rmse_rf_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_rf_test.append(r2_score(y_test, y_pred_test))
    
    rmse_rf_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_rf_train.append(r2_score(y_train, y_pred_train))

In [None]:
results_rf_mod = {
        "predicted train": pred_rf_train,
        "r2 train": r2_rf_train,
        "rmse train": rmse_rf_train,
        "predicted test": pred_rf_test,
        "r2 test": r2_rf_test,
        "rmse test": pred_rf_test
    }
results_rf_mod = pd.concat([pd.DataFrame(results_rf_mod), pd.DataFrame(pd.DataFrame(results_rf_mod).mean(), columns=["Average"]).T])
results_rf_mod

### CART

In [None]:
cart = DecisionTreeRegressor(random_state=1234, criterion="squared_error",max_depth=3)

In [None]:
rmse_cart_test, r2_cart_test = [], []
rmse_cart_train, r2_cart_train = [], []
pred_cart_test, pred_cart_train = [], []

k = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in k.split(rfvars):
    
    X_train, X_test = rfvars.iloc[train_index], rfvars.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ### Random Forest Model ###
    cart_mod = cart.fit(X_train, y_train)

    y_pred_test = rf_mod.predict(X_test)
    y_pred_train = rf_mod.predict(X_train)
    
    pred_cart_test.append(y_pred_test.mean())
    pred_cart_train.append(y_pred_train.mean())

    rmse_cart_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_cart_test.append(r2_score(y_test, y_pred_test))
    
    rmse_cart_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_cart_train.append(r2_score(y_train, y_pred_train))

In [None]:
results_cart_mod = {
        "predicted train": pred_cart_train,
        "r2 train": r2_cart_train,
        "rmse train": rmse_cart_train,
        "predicted test": pred_cart_test,
        "r2 test": r2_cart_test,
        "rmse test": pred_cart_test
    }
results_cart_mod = pd.concat([pd.DataFrame(results_cart_mod), pd.DataFrame(pd.DataFrame(results_cart_mod).mean(), columns=["Average"]).T])
results_cart_mod

### BOOSTING

In [None]:
gbm = GradientBoostingRegressor(learning_rate=0.1, min_samples_split=20, max_features = 10
                                #, n_estimators = 50
                               )

tune_grid = {"n_estimators": [200, 300], "max_depth": [5, 10]}

gbm_model_cv = GridSearchCV(
    gbm,
    tune_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=10,
    n_jobs=-1
)

In [None]:
# 1. Flatten categorical_columns and ensure no nested lists
# We use a list comprehension to make sure we only grab strings
raw_cat_list = engvar3 + ["balsheet_notfullyear", "foreign_management"]
categorical_columns = []
for item in raw_cat_list:
    if isinstance(item, list):
        categorical_columns.extend(item)
    else:
        categorical_columns.append(item)

# 2. Flatten all_vars the same way
final_all_vars = []
for item in all_vars:
    if isinstance(item, list):
        final_all_vars.extend(item)
    else:
        final_all_vars.append(item)

# 3. Filter numerical columns based on the flattened lists
numerical_columns = [col for col in final_all_vars if col not in categorical_columns]

# 4. Redefine Preprocessing
preprocessing = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("num", "passthrough", numerical_columns),
    ]
)

# Now try the fit again
gbm_pipe = Pipeline([("preprocess", preprocessing), ("regressor", gbm_model_cv)])

In [None]:
# watch out this takes 10 min to run!
#
r2_gbm_test, r2_gbm_train = [], []
rmse_gbm_test, rmse_gbm_train = [], []
pred_gbm_test, pred_gbm_train = [], []

for train_index, test_index in k.split(firms_df[final_all_vars]):
    
    X_train, X_test = firms_df[final_all_vars].iloc[train_index], firms_df[final_all_vars].iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # 1. Fit the model
    gbm_mod = gbm_pipe.fit(X_train, y_train)
    
    # 2. Predict for TRAIN and calculate metrics
    y_pred_train = gbm_mod.predict(X_train)  # <--- Define this!
    pred_gbm_train.append(y_pred_train.mean())
    rmse_gbm_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_gbm_train.append(r2_score(y_train, y_pred_train))

    # 3. Predict for TEST and calculate metrics
    y_pred_test = gbm_mod.predict(X_test)    # <--- Define this!
    pred_gbm_test.append(y_pred_test.mean())
    rmse_gbm_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_gbm_test.append(r2_score(y_test, y_pred_test))

In [None]:
results_gbm_mod = {
        "predicted train": pred_gbm_train,
        "r2 train": r2_gbm_train,
        "rmse train": rmse_gbm_train,
        "predicted test": pred_gbm_test,
        "r2 test": r2_gbm_test,
        "rmse test": pred_gbm_test
    }
results_gbm_mod = pd.concat([pd.DataFrame(results_gbm_mod), pd.DataFrame(pd.DataFrame(results_gbm_mod).mean(), columns=["Average"]).T])
#pd.DataFrame(results_gbm_mod)
results_gbm_mod

### GLM model 1

In [None]:
# Watch out, this takes 10 minutes to run!
 
rmse_glm_test, r2_glm_test = [], []
rmse_glm_train, r2_glm_train = [], []
pred_glm_test, pred_glm_train = [], []

k = KFold(n_splits=5, shuffle=True, random_state=1234)

for train_index, test_index in k.split(rfvars):
    
    X_train, X_test = rfvars.iloc[train_index], rfvars.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ### GLM ###
    glm_modelx1 = LogisticRegression(
    solver = "newton-cg", 
    max_iter = 1000, 
    penalty = None, 
    random_state = 1234).fit(X_train, y_train)
    #regression_results(y, glm_modelx1.predict(X1))

    y_pred_test = glm_modelx1.predict(X_test)
    y_pred_train = glm_modelx1.predict(X_train)
    
    pred_glm_test.append(y_pred_test.mean())
    pred_glm_train.append(y_pred_train.mean())

    rmse_glm_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_glm_test.append(r2_score(y_test, y_pred_test))
    
    rmse_glm_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_glm_train.append(r2_score(y_train, y_pred_train))

In [None]:
results_glm_modelx1 = {
        "predicted train": pred_glm_train,
        "r2 train": r2_glm_train,
        "rmse train": rmse_glm_train,
        "predicted test": pred_glm_test,
        "r2 test": r2_glm_test,
        "rmse test": pred_glm_test
    }
results_glm_modelx1 = pd.concat([pd.DataFrame(results_glm_modelx1), pd.DataFrame(pd.DataFrame(results_glm_modelx1).mean(), columns=["Average"]).T])
results_glm_modelx1

### comparing all models

In [None]:
## comparing all models:

model_comparison = pd.DataFrame({'model': ['OLS', 'LASSO', "CART", 'GBM', 'RF', "GLM1"],
    'RMSE': [np.mean(rmse_modelx1_train), np.mean(rmse_lasso_train),
            np.mean(rmse_cart_train), np.mean(rmse_gbm_train), np.mean(rmse_rf_train),
            np.mean(rmse_glm_train)],
    "R2": [np.mean(r2_modelx1_train), np.mean(r2_lasso_train),
            np.mean(r2_cart_train), np.mean(r2_gbm_train), np.mean(r2_rf_train),
            np.mean(r2_glm_train)]
})

print("The Random Forest model works best in both RMSE and R2")
model_comparison