### PART I: Probability prediction
- Predict probabilities.
- Look at cross-validated performance and pick your favorite model.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
import statsmodels.formula.api as smf
import warnings
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from statsmodels.tools.eval_measures import mse,rmse
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
import sklearn.metrics as metrics
import patsy
from stargazer.stargazer import Stargazer
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [2]:
# read in the clean dataset
firms_df = pd.read_csv("bisnode_firms_clean.csv")

In [3]:
rawvars = ["curr_assets", "curr_liab", "extra_exp", "extra_inc", "extra_profit_loss", "fixed_assets",
              "inc_bef_tax", "intang_assets", "inventories", "liq_assets", "material_exp", "personnel_exp",
              "profit_loss_year", "sales", "share_eq", "subscribed_cap"]

qualityvars = ["balsheet_flag", "balsheet_length", "balsheet_notfullyear"]

engvar = ["total_assets_bs", "fixed_assets_bs", "liq_assets_bs", "curr_assets_bs",
            "share_eq_bs", "subscribed_cap_bs", "intang_assets_bs", "extra_exp_pl",
            "extra_inc_pl", "extra_profit_loss_pl", "inc_bef_tax_pl", "inventories_pl",
            "material_exp_pl", "profit_loss_year_pl", "personnel_exp_pl"]

engvar2 = ["extra_profit_loss_pl_quad", "inc_bef_tax_pl_quad",
             "profit_loss_year_pl_quad", "share_eq_bs_quad"]

engvar3 = []
for col in firms_df.columns:
    if col.endswith('flag_low') or col.endswith('flag_high') or col.endswith('flag_error') or col.endswith('flag_zero'):
        engvar3.append(col)

d1 =  ["d1_sales_mil_log_mod", "d1_sales_mil_log_mod_sq",
         "flag_low_d1_sales_mil_log", "flag_high_d1_sales_mil_log"]

hr = ["female", "ceo_age", "flag_high_ceo_age", "flag_low_ceo_age",
        "flag_miss_ceo_age", "ceo_count", "labor_avg_mod",
        "flag_miss_labor_avg", "foreign_management"]

In [4]:
all_vars = rawvars + qualityvars + engvar + engvar2 + engvar3 + d1 + hr 

In [5]:
firms_df[all_vars].isna().sum()

curr_assets            0
curr_liab              0
extra_exp              0
extra_inc              0
extra_profit_loss      0
                      ..
flag_miss_ceo_age      0
ceo_count              0
labor_avg_mod          0
flag_miss_labor_avg    0
foreign_management     0
Length: 78, dtype: int64

In [6]:
firms_df.dropna(inplace=True)

### Dealing with categorical variables
To avoide multicolinearity, we drop the first values

In [7]:
firms_df.head()

Unnamed: 0,year,comp_id,begin,end,amort,curr_assets,curr_liab,extra_exp,extra_inc,extra_profit_loss,...,flag_high_ceo_age,flag_miss_ceo_age,ceo_young,labor_avg_mod,flag_miss_labor_avg,sales_mil_log_sq,flag_low_d1_sales_mil_log,flag_high_d1_sales_mil_log,d1_sales_mil_log_mod,d1_sales_mil_log_mod_sq
0,2013,1002029.0,2013-01-01,2013-12-31,14255.555664,217103.703125,161174.078125,0.0,0.0,0.0,...,0,0,1,0.4375,0,1.054824,0,0,-1.155013,1.334055
1,2013,1011889.0,2013-01-01,2013-12-31,66125.929688,235114.8125,16555.554688,0.0,0.0,0.0,...,0,0,0,1.583333,0,0.66646,0,0,0.019109,0.000365
2,2013,1014183.0,2013-01-01,2013-12-31,6970.370605,209562.96875,5703.703613,0.0,0.0,0.0,...,0,0,0,0.819444,0,4.632597,0,0,-0.110044,0.01211
3,2013,1022796.0,2013-01-01,2013-12-31,503.703705,3859.259277,8114.814941,0.0,0.0,0.0,...,0,0,0,0.083333,0,9.971799,0,0,0.488146,0.238287
4,2013,1035705.0,2013-01-01,2013-12-31,244.444443,2392.592529,9733.333008,0.0,0.0,0.0,...,0,0,0,0.222222,0,14.500839,0,0,-0.079375,0.0063


In [8]:
firms_df["ind2_cat"].value_counts().sort_index()

ind2_cat
26.0     735
27.0     441
28.0    1389
29.0     179
30.0     104
33.0    1382
55.0    1299
56.0    8039
Name: count, dtype: int64

In [9]:
firms_df["urban_m"].value_counts().sort_index()

urban_m
1.0    4278
2.0    3872
3.0    5418
Name: count, dtype: int64

In [10]:
ind2_catmat = patsy.dmatrix("C(ind2_cat, Treatment(reference=26))", firms_df, return_type="dataframe") 

In [11]:
m_region_locmat = patsy.dmatrix("C(m_region_loc, Treatment(reference='Central'))", firms_df, return_type="dataframe") 

In [12]:
urban_mmat = patsy.dmatrix("C(urban_m, Treatment(reference=1))", firms_df, return_type="dataframe") 

In [13]:
# Define X1
basevars = firms_df[["sales_mil_log", "sales_mil_log_sq", "d1_sales_mil_log_mod", "profit_loss_year_pl"]]
X1 = pd.concat([basevars, ind2_catmat], axis=1)

# Define X2
X2additional_vars = firms_df[["fixed_assets_bs", "share_eq_bs","curr_liab_bs", "curr_liab_bs_flag_high", \
                          "curr_liab_bs_flag_error",  "age", "foreign_management"]]
X2 = pd.concat([X1, X2additional_vars], axis=1)

# Define X3
firm = pd.concat([firms_df[["age", "age2", "new"]], ind2_catmat, m_region_locmat, urban_mmat], axis=1)
X3 = pd.concat([firms_df[["sales_mil_log", "sales_mil_log_sq"] + engvar + d1], firm], axis=1)

# Define X4
X4 = pd.concat([firms_df[["sales_mil_log", "sales_mil_log_sq"] + engvar + d1 \
                                 + engvar2 + engvar3 + hr + qualityvars], firm], axis=1)

# Define X5

#Creat matrix for interactions1 variables
int1mat = patsy.dmatrix("0 + C(ind2_cat):age + C(ind2_cat):age2 + C(ind2_cat):d1_sales_mil_log_mod \
                + C(ind2_cat):sales_mil_log + C(ind2_cat):ceo_age + C(ind2_cat):foreign_management \
                + C(ind2_cat):female + C(ind2_cat):C(urban_m) + C(ind2_cat):labor_avg_mod", 
                        firms_df, return_type="dataframe")

#Drop first level to get k-1 dummies out of k categorical levels 
for col in int1mat.columns:
    if col.startswith('C(ind2_cat)[26]') or col.endswith('C(urban_m)[1]'):
        int1mat = int1mat.drop([col], axis=1)
        
#Creat matrix for interactions2 variables        
int2mat = patsy.dmatrix("0 + sales_mil_log:age + sales_mil_log:female + sales_mil_log:profit_loss_year_pl \
                + sales_mil_log:foreign_management", 
                        firms_df, return_type="dataframe")

X5 = pd.concat([X4, int1mat, int2mat], axis=1)

# Define logitvars for LASSO
logitvars = pd.concat([X4, int1mat, int2mat], axis=1)

# Define rfvars for RF (no interactions, no modified features)
rfvars  = pd.concat([firms_df[["sales_mil", "d1_sales_mil_log"] + rawvars + hr + qualityvars], firm], axis=1)

In [14]:
y = firms_df["is_fast_growing"]

In [15]:
y.mean()

np.float64(0.2318691037735849)

In [15]:
["Intercept"] + list(X1.columns)

['Intercept',
 'sales_mil_log',
 'sales_mil_log_sq',
 'd1_sales_mil_log_mod',
 'profit_loss_year_pl',
 'Intercept',
 'C(ind2_cat, Treatment(reference=26))[T.27.0]',
 'C(ind2_cat, Treatment(reference=26))[T.28.0]',
 'C(ind2_cat, Treatment(reference=26))[T.29.0]',
 'C(ind2_cat, Treatment(reference=26))[T.30.0]',
 'C(ind2_cat, Treatment(reference=26))[T.33.0]',
 'C(ind2_cat, Treatment(reference=26))[T.55.0]',
 'C(ind2_cat, Treatment(reference=26))[T.56.0]']

### OLS

In [16]:
ols_modelx1 = smf.ols("y ~ X1", data=firms_df).fit()
ols1_summary = Stargazer([ols_modelx1])
ols1_summary.dependent_variable_name("is_fast_growing")
ols_modelx1_param_names = ols_modelx1.params.index.tolist()
ols1_summary.rename_covariates(dict(zip(ols_modelx1_param_names, ["Intercept"] + list(X1.columns))))
ols1_summary

0,1
,
,Dependent variable: is_fast_growing
,
,(1)
,
Intercept,0.083***
,(0.007)
sales_mil_log,-0.007
,(0.004)
"C(ind2_cat, Treatment(reference=26))[T.55.0]",-0.026


### OLS with train-test split

In [17]:
np.random.seed(1234)
smp_size = round(0.2 * firms_df.shape[0])-1

# train - test split
df_train, df_test=train_test_split(firms_df, test_size=smp_size)

In [18]:
rmse_modelx1_test, r2_modelx1_test, pred_modelx1_test = [], [], []
rmse_modelx1_train, r2_modelx1_train, pred_modelx1_train = [], [], []


k = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in k.split(firms_df):
    # Select and add constant to X
    X_train = sm.add_constant(X1.iloc[train_index])
    X_test = sm.add_constant(X1.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Fit and Predict
    mod1 = sm.OLS(y_train, X_train).fit()
    y_pred_test = mod1.predict(X_test) # Use X_test here
    y_pred_train = mod1.predict(X_train)
    
    # Store results
    pred_modelx1_test.append(np.mean(y_pred_test))
    rmse_modelx1_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_modelx1_test.append(r2_score(y_test, y_pred_test))
    
    pred_modelx1_train.append(np.mean(y_pred_train))
    rmse_modelx1_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_modelx1_train.append(r2_score(y_train, y_pred_train))
    

In [19]:
results_modelx1 = {
        "predict train": pred_modelx1_train,
        "r2 train": r2_modelx1_train,
        "rmse train": rmse_modelx1_train,
        "predict test": pred_modelx1_test,
        "r2 test": r2_modelx1_test,
        "rmse test": rmse_modelx1_test
    }

results_modelx1 = pd.concat([pd.DataFrame(results_modelx1), pd.DataFrame(pd.DataFrame(results_modelx1).mean(), columns=["Average"]).T])
results_modelx1

Unnamed: 0,predict train,r2 train,rmse train,predict test,r2 test,rmse test
0,0.23208,0.324471,0.346976,0.226664,0.331744,0.344554
1,0.232541,0.323224,0.347536,0.23528,0.336454,0.342375
2,0.228764,0.326587,0.344689,0.234068,0.323046,0.353516
3,0.230769,0.326725,0.345711,0.234714,0.322874,0.34955
4,0.235191,0.329732,0.347225,0.228416,0.309042,0.343535
Average,0.231869,0.326148,0.346428,0.231828,0.324632,0.346706


### LASSO

In [None]:
from sklearn.model_selection import GridSearchCV
# define model
model = Lasso()

grid = dict()
grid["alpha"] = np.arange(0.05, 1, 0.05)
# define search
search = GridSearchCV(model, grid, scoring="neg_root_mean_squared_error", cv = k, verbose= 3) # control your output with the 'verbose' option

In [21]:
# Initialize lists for both sets
rmse_lasso_test, r2_lasso_test = [], []
rmse_lasso_train, r2_lasso_train = [], []
pred_lasso_test, pred_lasso_train = [], []

k = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in k.split(rfvars):
    
    X_train, X_test = logitvars.iloc[train_index], logitvars.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ### LASSO MODEL ###
    lasso_mod = search.fit(X_train, y_train)

    y_pred_test = lasso_mod.predict(X_test)
    y_pred_train = lasso_mod.predict(X_train)
    
    pred_lasso_test.append(y_pred_test.mean())
    pred_lasso_train.append(y_pred_train.mean())

    rmse_lasso_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_lasso_test.append(r2_score(y_test, y_pred_test))
    
    rmse_lasso_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_lasso_train.append(r2_score(y_train, y_pred_train))

# Quick summary of the averages
print(f"Train RMSE: {np.mean(rmse_lasso_train):.4f} vs Test RMSE: {np.mean(rmse_lasso_test):.4f}")
print(f"Train R2:   {np.mean(r2_lasso_train):.4f} vs Test R2:   {np.mean(r2_lasso_test):.4f}")

Fitting 5 folds for each of 19 candidates, totalling 95 fits
[CV 1/5] END .......................alpha=0.05;, score=-0.351 total time=   0.4s
[CV 2/5] END .......................alpha=0.05;, score=-0.347 total time=   0.7s
[CV 3/5] END .......................alpha=0.05;, score=-0.369 total time=   0.4s
[CV 4/5] END .......................alpha=0.05;, score=-0.359 total time=   0.4s
[CV 5/5] END .......................alpha=0.05;, score=-0.356 total time=   0.7s
[CV 1/5] END ........................alpha=0.1;, score=-0.399 total time=   0.1s
[CV 2/5] END ........................alpha=0.1;, score=-0.398 total time=   0.1s
[CV 3/5] END ........................alpha=0.1;, score=-0.420 total time=   0.1s
[CV 4/5] END ........................alpha=0.1;, score=-0.413 total time=   0.1s
[CV 5/5] END ........................alpha=0.1;, score=-0.405 total time=   0.1s
[CV 1/5] END ........alpha=0.15000000000000002;, score=-0.402 total time=   0.1s
[CV 2/5] END ........alpha=0.15000000000000002;,

In [22]:
results_lasso_mod = {
        "predicted train": pred_lasso_train,
        "r2 train": r2_lasso_train,
        "rmse train": rmse_lasso_train,
        "predicted test": pred_lasso_test,
        "r2 test": r2_lasso_test,
        "rmse test": pred_lasso_test
    }
results_lasso_mod = pd.concat([pd.DataFrame(results_lasso_mod), pd.DataFrame(pd.DataFrame(results_lasso_mod).mean(), columns=["Average"]).T])
results_lasso_mod

Unnamed: 0,predicted train,r2 train,rmse train,predicted test,r2 test,rmse test
0,0.23208,0.290936,0.355484,0.23171,0.293404,0.23171
1,0.232541,0.293518,0.355081,0.234956,0.289925,0.234956
2,0.228764,0.291686,0.353509,0.23331,0.290329,0.23331
3,0.230769,0.295778,0.353567,0.231458,0.287037,0.231458
4,0.235191,0.297414,0.355498,0.228528,0.293234,0.228528
Average,0.231869,0.293866,0.354628,0.231992,0.290786,0.231992


### RANDOM FOREST

In [23]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state = 20250224)
tune_grid = {"max_features": [6, 8, 10, 12], "min_samples_leaf": [5, 10, 15]}

rf_random = GridSearchCV(
    estimator = rfr,
    param_grid = tune_grid,
    cv = 5,
    scoring = "neg_root_mean_squared_error",
    verbose = 3,
)
# Built into grid search, it will run on the test set, not on the train set!

In [24]:
# Watch out, this takes 10 minutes to run!
 
rmse_rf_test, r2_rf_test = [], []
rmse_rf_train, r2_rf_train = [], []
pred_rf_test, pred_rf_train = [], []

k = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in k.split(rfvars):
    
    X_train, X_test = rfvars.iloc[train_index], rfvars.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ### Random Forest Model ###
    rf_mod = rf_random.fit(X_train, y_train)

    y_pred_test = rf_mod.predict(X_test)
    y_pred_train = rf_mod.predict(X_train)
    
    pred_rf_test.append(y_pred_test.mean())
    pred_rf_train.append(y_pred_train.mean())

    rmse_rf_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_rf_test.append(r2_score(y_test, y_pred_test))
    
    rmse_rf_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_rf_train.append(r2_score(y_train, y_pred_train))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END max_features=6, min_samples_leaf=5;, score=-0.149 total time=   1.5s
[CV 2/5] END max_features=6, min_samples_leaf=5;, score=-0.148 total time=   1.5s
[CV 3/5] END max_features=6, min_samples_leaf=5;, score=-0.145 total time=   1.4s
[CV 4/5] END max_features=6, min_samples_leaf=5;, score=-0.145 total time=   1.3s
[CV 5/5] END max_features=6, min_samples_leaf=5;, score=-0.148 total time=   1.3s
[CV 1/5] END max_features=6, min_samples_leaf=10;, score=-0.156 total time=   1.2s
[CV 2/5] END max_features=6, min_samples_leaf=10;, score=-0.151 total time=   1.2s
[CV 3/5] END max_features=6, min_samples_leaf=10;, score=-0.152 total time=   1.2s
[CV 4/5] END max_features=6, min_samples_leaf=10;, score=-0.155 total time=   1.2s
[CV 5/5] END max_features=6, min_samples_leaf=10;, score=-0.149 total time=   1.2s
[CV 1/5] END max_features=6, min_samples_leaf=15;, score=-0.160 total time=   1.2s
[CV 2/5] END max_features=6, mi

In [25]:
results_rf_mod = {
        "predicted train": pred_rf_train,
        "r2 train": r2_rf_train,
        "rmse train": rmse_rf_train,
        "predicted test": pred_rf_test,
        "r2 test": r2_rf_test,
        "rmse test": pred_rf_test
    }
results_rf_mod = pd.concat([pd.DataFrame(results_rf_mod), pd.DataFrame(pd.DataFrame(results_rf_mod).mean(), columns=["Average"]).T])
results_rf_mod

Unnamed: 0,predicted train,r2 train,rmse train,predicted test,r2 test,rmse test
0,0.232221,0.954752,0.0898,0.235137,0.902993,0.235137
1,0.232867,0.952609,0.091966,0.229971,0.926402,0.229971
2,0.228745,0.954234,0.089859,0.244807,0.906237,0.244807
3,0.230763,0.953635,0.090722,0.23108,0.918962,0.23108
4,0.235084,0.954229,0.090736,0.219282,0.906147,0.219282
Average,0.231936,0.953892,0.090617,0.232055,0.912148,0.232055


### CART

In [None]:
cart = DecisionTreeRegressor(random_state=1234, criterion="squared_error",max_depth=3)

In [28]:
rmse_cart_test, r2_cart_test = [], []
rmse_cart_train, r2_cart_train = [], []
pred_cart_test, pred_cart_train = [], []

k = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in k.split(rfvars):
    
    X_train, X_test = rfvars.iloc[train_index], rfvars.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ### Random Forest Model ###
    cart_mod = cart.fit(X_train, y_train)

    y_pred_test = rf_mod.predict(X_test)
    y_pred_train = rf_mod.predict(X_train)
    
    pred_cart_test.append(y_pred_test.mean())
    pred_cart_train.append(y_pred_train.mean())

    rmse_cart_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_cart_test.append(r2_score(y_test, y_pred_test))
    
    rmse_cart_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_cart_train.append(r2_score(y_train, y_pred_train))

In [29]:
results_cart_mod = {
        "predicted train": pred_cart_train,
        "r2 train": r2_cart_train,
        "rmse train": rmse_cart_train,
        "predicted test": pred_cart_test,
        "r2 test": r2_cart_test,
        "rmse test": pred_cart_test
    }
results_cart_mod = pd.concat([pd.DataFrame(results_cart_mod), pd.DataFrame(pd.DataFrame(results_cart_mod).mean(), columns=["Average"]).T])
results_cart_mod

Unnamed: 0,predicted train,r2 train,rmse train,predicted test,r2 test,rmse test
0,0.231669,0.943934,0.09996,0.232944,0.94938,0.232944
1,0.232471,0.940821,0.102769,0.229734,0.961984,0.229734
2,0.228835,0.943555,0.099793,0.244276,0.950566,0.244276
3,0.23156,0.942451,0.101073,0.233379,0.955125,0.233379
4,0.235084,0.954229,0.090736,0.219282,0.906147,0.219282
Average,0.231924,0.944998,0.098866,0.231923,0.94464,0.231923


### BOOSTING

In [None]:
gbm = GradientBoostingRegressor(learning_rate=0.1, min_samples_split=20, max_features = 10
                                #, n_estimators = 50
                               )

tune_grid = {"n_estimators": [200, 300], "max_depth": [5, 10]}

gbm_model_cv = GridSearchCV(
    gbm,
    tune_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=10,
    n_jobs=-1
)

In [31]:
# 1. Flatten categorical_columns and ensure no nested lists
# We use a list comprehension to make sure we only grab strings
raw_cat_list = engvar3 + ["balsheet_notfullyear", "foreign_management"]
categorical_columns = []
for item in raw_cat_list:
    if isinstance(item, list):
        categorical_columns.extend(item)
    else:
        categorical_columns.append(item)

# 2. Flatten all_vars the same way
final_all_vars = []
for item in all_vars:
    if isinstance(item, list):
        final_all_vars.extend(item)
    else:
        final_all_vars.append(item)

# 3. Filter numerical columns based on the flattened lists
numerical_columns = [col for col in final_all_vars if col not in categorical_columns]

# 4. Redefine Preprocessing
preprocessing = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("num", "passthrough", numerical_columns),
    ]
)

# Now try the fit again
gbm_pipe = Pipeline([("preprocess", preprocessing), ("regressor", gbm_model_cv)])

In [32]:
# watch out this takes 10 min to run!
#
r2_gbm_test, r2_gbm_train = [], []
rmse_gbm_test, rmse_gbm_train = [], []
pred_gbm_test, pred_gbm_train = [], []

for train_index, test_index in k.split(firms_df[final_all_vars]):
    
    X_train, X_test = firms_df[final_all_vars].iloc[train_index], firms_df[final_all_vars].iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # 1. Fit the model
    gbm_mod = gbm_pipe.fit(X_train, y_train)
    
    # 2. Predict for TRAIN and calculate metrics
    y_pred_train = gbm_mod.predict(X_train)  # <--- Define this!
    pred_gbm_train.append(y_pred_train.mean())
    rmse_gbm_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    r2_gbm_train.append(r2_score(y_train, y_pred_train))

    # 3. Predict for TEST and calculate metrics
    y_pred_test = gbm_mod.predict(X_test)    # <--- Define this!
    pred_gbm_test.append(y_pred_test.mean())
    rmse_gbm_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
    r2_gbm_test.append(r2_score(y_test, y_pred_test))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START max_depth=5, n_estimators=200...............................
[CV 3/5; 1/4] START max_depth=5, n_estimators=200...............................
[CV 2/5; 1/4] START max_depth=5, n_estimators=200...............................
[CV 2/5; 2/4] START max_depth=5, n_estimators=300...............................
[CV 3/5; 2/4] START max_depth=5, n_estimators=300...............................
[CV 4/5; 1/4] START max_depth=5, n_estimators=200...............................
[CV 1/5; 2/4] START max_depth=5, n_estimators=300...............................
[CV 5/5; 1/4] START max_depth=5, n_estimators=200...............................
[CV 5/5; 1/4] END max_depth=5, n_estimators=200;, score=-0.132 total time=   6.9s
[CV 4/5; 2/4] START max_depth=5, n_estimators=300...............................
[CV 3/5; 1/4] END max_depth=5, n_estimators=200;, score=-0.130 total time=   7.4s
[CV 5/5; 2/4] START max_depth=5, n_estimators=3

In [33]:
results_gbm_mod = {
        "predicted train": pred_gbm_train,
        "r2 train": r2_gbm_train,
        "rmse train": rmse_gbm_train,
        "predicted test": pred_gbm_test,
        "r2 test": r2_gbm_test,
        "rmse test": pred_gbm_test
    }
results_gbm_mod = pd.concat([pd.DataFrame(results_gbm_mod), pd.DataFrame(pd.DataFrame(results_gbm_mod).mean(), columns=["Average"]).T])
#pd.DataFrame(results_gbm_mod)
results_gbm_mod

Unnamed: 0,predicted train,r2 train,rmse train,predicted test,r2 test,rmse test
0,0.23208,0.954391,0.090158,0.234763,0.889247,0.234763
1,0.232541,0.948994,0.095409,0.229222,0.906724,0.229222
2,0.228764,0.949745,0.094163,0.244884,0.897009,0.244884
3,0.230769,0.95008,0.094136,0.230559,0.905647,0.230559
4,0.235191,0.953813,0.091148,0.220576,0.888011,0.220576
Average,0.231869,0.951404,0.093003,0.232001,0.897327,0.232001


In [42]:
np.mean(rmse_modelx1_train)

np.float64(0.34642751635347707)

In [None]:
## comparing all models:

model_comparison = pd.DataFrame({'model': ['OLS', 'LASSO', "CART", 'GBM', 'RF'],
    'RMSE': [np.mean(rmse_modelx1_train), np.mean(rmse_lasso_train),
            np.mean(rmse_cart_train), np.mean(rmse_gbm_train), np.mean(rmse_rf_train)],
    "R2": [np.mean(r2_modelx1_train), np.mean(r2_lasso_train),
            np.mean(r2_cart_train), np.mean(r2_gbm_train), np.mean(r2_rf_train)]
})

print("The Random Forest model works best in both RMSE and R2")
model_comparison

The Random Forest model works best in both RMSE and R2


Unnamed: 0,model,RMSE,R2
0,OLS,0.346428,0.326148
1,LASSO,0.354628,0.293866
2,CART,0.098866,0.944998
3,GBM,0.093003,0.951404
4,RF,0.090617,0.953892
