### PART I: Probability prediction
- Predict probabilities.
- Look at cross-validated performance and pick your favorite model.

In [235]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
import statsmodels.formula.api as smf
import warnings
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from statsmodels.tools.eval_measures import mse,rmse
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
import sklearn.metrics as metrics
import patsy
from stargazer.stargazer import Stargazer

In [236]:
# read in the clean dataset
firms_df = pd.read_csv("bisnode_firms_clean.csv")

In [237]:
rawvars = ["curr_assets", "curr_liab", "extra_exp", "extra_inc", "extra_profit_loss", "fixed_assets",
              "inc_bef_tax", "intang_assets", "inventories", "liq_assets", "material_exp", "personnel_exp",
              "profit_loss_year", "sales", "share_eq", "subscribed_cap"]

qualityvars = ["balsheet_flag", "balsheet_length", "balsheet_notfullyear"]

engvar = ["total_assets_bs", "fixed_assets_bs", "liq_assets_bs", "curr_assets_bs",
            "share_eq_bs", "subscribed_cap_bs", "intang_assets_bs", "extra_exp_pl",
            "extra_inc_pl", "extra_profit_loss_pl", "inc_bef_tax_pl", "inventories_pl",
            "material_exp_pl", "profit_loss_year_pl", "personnel_exp_pl"]

engvar2 = ["extra_profit_loss_pl_quad", "inc_bef_tax_pl_quad",
             "profit_loss_year_pl_quad", "share_eq_bs_quad"]

engvar3 = []
for col in firms_df.columns:
    if col.endswith('flag_low') or col.endswith('flag_high') or col.endswith('flag_error') or col.endswith('flag_zero'):
        engvar3.append(col)

d1 =  ["d1_sales_mil_log_mod", "d1_sales_mil_log_mod_sq",
         "flag_low_d1_sales_mil_log", "flag_high_d1_sales_mil_log"]

hr = ["female", "ceo_age", "flag_high_ceo_age", "flag_low_ceo_age",
        "flag_miss_ceo_age", "ceo_count", "labor_avg_mod",
        "flag_miss_labor_avg", "foreign_management"]

In [238]:
all_vars = rawvars + qualityvars + engvar + engvar2 + engvar3 + d1 + hr 

In [239]:
firms_df[all_vars].isna().sum()

curr_assets            0
curr_liab              0
extra_exp              0
extra_inc              0
extra_profit_loss      0
                      ..
flag_miss_ceo_age      0
ceo_count              0
labor_avg_mod          0
flag_miss_labor_avg    0
foreign_management     0
Length: 78, dtype: int64

In [240]:
firms_df.dropna(inplace=True)

### Dealing with categorical variables
To avoide multicolinearity, we drop the first values

In [241]:
firms_df.head()

Unnamed: 0,year,comp_id,begin,end,amort,curr_assets,curr_liab,extra_exp,extra_inc,extra_profit_loss,...,flag_high_ceo_age,flag_miss_ceo_age,ceo_young,labor_avg_mod,flag_miss_labor_avg,sales_mil_log_sq,flag_low_d1_sales_mil_log,flag_high_d1_sales_mil_log,d1_sales_mil_log_mod,d1_sales_mil_log_mod_sq
0,2013,1002029.0,2013-01-01,2013-12-31,14255.555664,217103.703125,161174.078125,0.0,0.0,0.0,...,0,0,1,0.4375,0,1.054824,0,0,-1.155013,1.334055
1,2013,1011889.0,2013-01-01,2013-12-31,66125.929688,235114.8125,16555.554688,0.0,0.0,0.0,...,0,0,0,1.583333,0,0.66646,0,0,0.019109,0.000365
2,2013,1014183.0,2013-01-01,2013-12-31,6970.370605,209562.96875,5703.703613,0.0,0.0,0.0,...,0,0,0,0.819444,0,4.632597,0,0,-0.110044,0.01211
3,2013,1022796.0,2013-01-01,2013-12-31,503.703705,3859.259277,8114.814941,0.0,0.0,0.0,...,0,0,0,0.083333,0,9.971799,0,0,0.488146,0.238287
4,2013,1035705.0,2013-01-01,2013-12-31,244.444443,2392.592529,9733.333008,0.0,0.0,0.0,...,0,0,0,0.222222,0,14.500839,0,0,-0.079375,0.0063


In [242]:
firms_df["ind2_cat"].value_counts().sort_index()

ind2_cat
26.0     735
27.0     441
28.0    1389
29.0     179
30.0     104
33.0    1382
55.0    1299
56.0    8039
Name: count, dtype: int64

In [243]:
ind2_catmat = patsy.dmatrix("C(ind2_cat, Treatment(reference=26))", firms_df, return_type="dataframe") 

In [244]:
m_region_locmat = patsy.dmatrix("C(m_region_loc, Treatment(reference='Central'))", firms_df, return_type="dataframe") 

In [245]:
urban_mmat = patsy.dmatrix("C(urban_m, Treatment(reference=1))", firms_df, return_type="dataframe") 

In [246]:
# Define X1
basevars = firms_df[["sales_mil_log", "sales_mil_log_sq", "d1_sales_mil_log_mod", "profit_loss_year_pl"]]
X1 = pd.concat([basevars, ind2_catmat], axis=1)

# Define X2
X2additional_vars = firms_df[["fixed_assets_bs", "share_eq_bs","curr_liab_bs", "curr_liab_bs_flag_high", \
                          "curr_liab_bs_flag_error",  "age", "foreign_management"]]
X2 = pd.concat([X1, X2additional_vars], axis=1)

# Define X3
firm = pd.concat([firms_df[["age", "age2", "new"]], ind2_catmat, m_region_locmat, urban_mmat], axis=1)
X3 = pd.concat([firms_df[["sales_mil_log", "sales_mil_log_sq"] + engvar + d1], firm], axis=1)

# Define X4
X4 = pd.concat([firms_df[["sales_mil_log", "sales_mil_log_sq"] + engvar + d1 \
                                 + engvar2 + engvar3 + hr + qualityvars], firm], axis=1)

# Define X5

#Creat matrix for interactions1 variables
int1mat = patsy.dmatrix("0 + C(ind2_cat):age + C(ind2_cat):age2 + C(ind2_cat):d1_sales_mil_log_mod \
                + C(ind2_cat):sales_mil_log + C(ind2_cat):ceo_age + C(ind2_cat):foreign_management \
                + C(ind2_cat):female + C(ind2_cat):C(urban_m) + C(ind2_cat):labor_avg_mod", 
                        firms_df, return_type="dataframe")

#Drop first level to get k-1 dummies out of k categorical levels 
for col in int1mat.columns:
    if col.startswith('C(ind2_cat)[26]') or col.endswith('C(urban_m)[1]'):
        int1mat = int1mat.drop([col], axis=1)
        
#Creat matrix for interactions2 variables        
int2mat = patsy.dmatrix("0 + sales_mil_log:age + sales_mil_log:female + sales_mil_log:profit_loss_year_pl \
                + sales_mil_log:foreign_management", 
                        firms_df, return_type="dataframe")

X5 = pd.concat([X4, int1mat, int2mat], axis=1)

# Define logitvars for LASSO
logitvars = pd.concat([X4, int1mat, int2mat], axis=1)

# Define rfvars for RF (no interactions, no modified features)
rfvars  = pd.concat([firms_df[["sales_mil", "d1_sales_mil_log"] + rawvars + hr + qualityvars], firm], axis=1)

In [247]:
y = firms_df["is_fast_growing"]

In [248]:
y.mean()

np.float64(0.2318691037735849)

In [249]:
["Intercept"] + list(X1.columns)

['Intercept',
 'sales_mil_log',
 'sales_mil_log_sq',
 'd1_sales_mil_log_mod',
 'profit_loss_year_pl',
 'Intercept',
 'C(ind2_cat, Treatment(reference=26))[T.27.0]',
 'C(ind2_cat, Treatment(reference=26))[T.28.0]',
 'C(ind2_cat, Treatment(reference=26))[T.29.0]',
 'C(ind2_cat, Treatment(reference=26))[T.30.0]',
 'C(ind2_cat, Treatment(reference=26))[T.33.0]',
 'C(ind2_cat, Treatment(reference=26))[T.55.0]',
 'C(ind2_cat, Treatment(reference=26))[T.56.0]']

### OLS

In [258]:
ols_modelx1 = smf.ols("y ~ X1", data=firms_df).fit()
ols1_summary = Stargazer([ols_modelx1])
ols1_summary.dependent_variable_name("is_fast_growing")
ols_modelx1_param_names = ols_modelx1.params.index.tolist()
ols1_summary.rename_covariates(dict(zip(ols_modelx1_param_names, ["Intercept"] + list(X1.columns))))
ols1_summary

0,1
,
,Dependent variable: is_fast_growing
,
,(1)
,
Intercept,0.083***
,(0.007)
sales_mil_log,-0.007
,(0.004)
"C(ind2_cat, Treatment(reference=26))[T.55.0]",-0.026


In [251]:
# define seed
np.random.seed(1234)

# holdout set
smp_size = round(0.2 * firms_df.shape[0])-1

# train - test split
firms_df_train, firms_df_test=train_test_split(firms_df, test_size=smp_size)

In [252]:
# creating folds
k = KFold(n_splits=5, shuffle=True, random_state=None)