### EV/sales modeling with SIC code and NO business descriptions in input database

In this notebook we train ML models from three main families: Random Forest, Gradient Boosting, and Support Vector Machine. The dependent variable is the EV/Sales multiple; the independent variables are the financial metrics selected in the financial-features analysis plus each firm's SIC code, which serves as a qualitative description of the business's operating activity. Each model is trained and tested 50 times with different initialization seeds and hyperparameters.

In [5]:
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): with no `category`/`module` argument this filter also hides
# library diagnostics (deprecation, convergence, ...) raised by the modelling
# cells below — consider narrowing it.
warnings.filterwarnings("ignore")

We now define the project-related folders and load our pandas DataFrame.

In [6]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from nltk.corpus import stopwords
import random
import statistics

import string
from sklearn.decomposition import PCA
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [1]:
import os
from pathlib import Path
import pandas as pd

# Raw-data location. Defaults to the original author's local folder; set the
# QVF_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
data_folder: Path = Path(
    os.environ.get(
        "QVF_DATA_DIR",
        "/Users/giovanni/Documents/QVF - quantitative framework for valuation multiples computation in mergers and acquisitions/data/00_raw",
    )
)
dataset_excel_file_name: str = (
    "sic_industrial_and_commercial_machinery_and_computer_equipment.xlsx"
)
dataset_excel_sheet_reference: str = "Filtered Results"

# Load the selected sheet of the Excel workbook into a DataFrame.
data_frame: pd.DataFrame = pd.read_excel(
    io=data_folder / dataset_excel_file_name,
    sheet_name=dataset_excel_sheet_reference,
)
# Force string column labels so that columns can always be selected by name.
data_frame.columns = data_frame.columns.astype(str)

## Setting independent and dependent variables

In [10]:
from sklearn.preprocessing import StandardScaler
from numpy.typing import NDArray

# Dependent variable: name of the column holding the value to be predicted.
target_variable_name: str = "ev_sales_22"

# Independent variables: the SIC code column followed by the financial metrics.
# The order of this list fixes the column order of the design matrix `X`.
dataset_features_list: list = [
    "sic_prim",
    "n_empl_22",
    "n_publications",
    "cagr_revs",
    "cagr_ta",
    "2y_avg_ta",
    "ebit_m_22",
    "ni_m_22",
    "capex_revs_22",
    "cagr_capex",
    "ta_turnover_22",
    "ca_turnover_22",
    "cap_intensity_22",
    "roa_22",
    "ta_tl_22",
]

# Slice the loaded frame into the feature table and the target vector.
feature_datasets: pd.DataFrame = data_frame.loc[:, dataset_features_list]
target_vector: NDArray = data_frame.loc[:, target_variable_name].to_numpy()

# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows, i.e. before any of the
# train/test splits performed further down, so held-out rows contribute to the
# mean/variance used to scale the training rows. `sic_prim` is scaled as an
# ordinary number as well.
scaler: StandardScaler = StandardScaler()
X = scaler.fit_transform(feature_datasets)

In [11]:
# Display the (unscaled) feature table selected above for a visual sanity check.
feature_datasets

Unnamed: 0,sic_prim,n_empl_22,n_publications,cagr_revs,cagr_ta,2y_avg_ta,ebit_m_22,ni_m_22,capex_revs_22,cagr_capex,ta_turnover_22,ca_turnover_22,cap_intensity_22,roa_22,ta_tl_22
0,3571,164000,157916,0.103848,-0.008986,3.518785e+08,0.302887,0.253096,0.027155,-0.052982,1.117852,2.912212,0.964294,0.282924,1.167742
1,7372,221000,274012,0.157741,0.089593,3.493095e+08,0.420043,0.366863,0.120472,0.197077,0.543444,1.168466,2.170530,0.199370,1.839857
2,3571,133000,52015,0.030772,-0.053849,9.117300e+07,0.060508,0.023871,0.029355,0.176393,1.141612,2.415551,0.885510,0.027251,0.966333
3,3724,182000,206529,0.179107,0.043060,1.601340e+08,0.082059,0.077482,0.041372,0.104154,0.422210,1.580331,2.467292,0.032714,1.842286
4,3724,172000,284661,-0.120298,-0.117297,1.938625e+08,-0.004664,0.000861,0.025543,-0.320156,0.307645,0.995119,3.344331,0.000265,1.217176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,7371,67,3,-0.074816,0.228730,9.254000e+03,-2.162704,-2.155532,0.196440,0.644956,0.811752,1.388602,1.274564,-1.749757,0.687546
211,3679,350,1749,-0.559323,0.494800,1.226105e+05,-81.159639,-79.956325,6.564759,0.405194,0.005774,0.007296,45.998400,-0.461677,4.553395
212,3519,13,6,-0.026281,0.325932,5.439575e+04,-9.733878,-9.724274,0.037424,0.208634,0.134532,0.245627,16.390755,-1.308224,22.491748
213,3589,16,222,0.010952,0.148741,4.393500e+03,-2.009714,-1.883972,0.043713,0.242372,0.413247,0.479555,1.141839,-0.778546,7.798261


## Random Forest

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Per-run test metrics, one entry per train/test split.
rf_maes = []
rf_mses = []
rf_rsqr = []

# Hyper-parameter grid explored by the cross-validated search.
# It does not depend on the run, so it is built once outside the loop.
param_grid: dict[str, list[int | None]] = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

for iteration in range(50):
    # A different but fixed seed per run: the 50 splits differ from each other
    # yet the whole experiment is reproducible after a kernel restart.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        target_vector,
        test_size=0.25,
        random_state=iteration,
    )

    # Estimator used only inside the grid search (seeded like the split).
    cv_rf_regressor: RandomForestRegressor = RandomForestRegressor(
        random_state=iteration
    )

    # 5-fold cross-validated grid search on the training rows only.
    grid_search: GridSearchCV = GridSearchCV(
        estimator=cv_rf_regressor,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Best hyper-parameter combination found by the search.
    best_params = grid_search.best_params_

    # Re-train on the full training split with the selected hyper-parameters.
    random_forest_model: RandomForestRegressor = RandomForestRegressor(
        **best_params, random_state=iteration
    )
    random_forest_model.fit(X_train, y_train)

    # Predict the held-out rows once and reuse the result for every metric.
    test_predictions = random_forest_model.predict(X_test)

    # TEST evaluation
    test_mae = mean_absolute_error(y_test, test_predictions)
    rf_maes.append(test_mae)

    test_mse = mean_squared_error(y_test, test_predictions)
    rf_mses.append(test_mse)

    test_r2 = r2_score(y_test, test_predictions)
    rf_rsqr.append(test_r2)

# Aggregate the 50 runs: mean and sample standard deviation of each metric.
rf_avg_mae = statistics.mean(rf_maes)
rf_avg_mse = statistics.mean(rf_mses)
rf_avg_r2 = statistics.mean(rf_rsqr)
rf_std_mae = statistics.stdev(rf_maes)
rf_std_mse = statistics.stdev(rf_mses)
rf_std_r2 = statistics.stdev(rf_rsqr)

print(f"Average Test MAE: {rf_avg_mae}, STD: {rf_std_mae}")
print(f"Average Test MSE: {rf_avg_mse}, STD: {rf_std_mse}")
print(f"Average Test R-squared: {rf_avg_r2}, STD: {rf_std_r2}")

Average Test MAE: 2.907438048776477, STD: 3.381915463696909
Average Test MSE: 779.3685789266142, STD: 1558.6961619505246
Average Test R-squared: -0.40819254761673973, STD: 2.7361054568747374


In [9]:
# Predict the target for every row of X (training and test rows alike) with
# the forest left over from the last iteration of the loop above.
predicted_y = random_forest_model.predict(X)

print(f"Predicted Target Variable: {predicted_y}")

Predicted Target Variable: [ 6.61826109  9.36530497  1.0338057   2.40932298  1.6980636   2.78031469
  2.87742336  3.59428747 10.16211047  0.7348095   0.70227386  1.44255545
  2.28789393  3.39375028  2.39750572  2.95747068  3.9512269   3.47543958
  1.21284456  3.00435251  3.47670873  1.08553623  2.34940028  4.77588495
  2.32770475  1.80197236  4.9237651   0.70926634  4.53634023  1.35902067
  2.27063932  1.93853424  2.95998462  1.0058872   2.56761208  0.56130329
  1.65360307  2.98097282  2.78411732  4.10421003  6.73181521  2.2779393
  0.9212265   4.00391035  2.02782298  1.93591711  3.379334    2.59505781
  2.31405453  1.4525825   2.25818415  1.50526412  2.76697804  2.42341677
  1.86410544  1.83537683  1.60530039  4.05351378  1.75398831  5.08955643
  4.98578953  2.35664362  1.90558977  2.27366289  3.9322936   0.98781165
  2.23236427  3.09282594  2.33303957  1.75344038  2.94298076  4.31692437
  2.01903225  1.14346568  1.37651915  5.35925019  1.59092086  2.00223857
  4.46712149  1.16645673 

## Gradient Boosting

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Per-run test metrics, one entry per train/test split.
gb_maes = []
gb_mses = []
gb_rsqr = []

# Hyper-parameter grid explored by the cross-validated search.
# It does not depend on the run, so it is built once outside the loop.
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
}

for rsid in range(50):
    # `rsid` is the random-state id of the run: it seeds the split so that the
    # 50 runs use different partitions but the experiment is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        target_vector,
        test_size=0.25,
        random_state=rsid,
    )

    # Estimator used only inside the grid search.
    cv_xgb_regressor = XGBRegressor(objective="reg:squarederror")

    # 5-fold cross-validated grid search on the training rows only.
    grid_search = GridSearchCV(
        estimator=cv_xgb_regressor,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Best hyper-parameter combination found by the search.
    best_params = grid_search.best_params_

    # Re-train on the full training split with the selected hyper-parameters,
    # using the same objective as the estimator that was searched over.
    xgboost_model = XGBRegressor(
        objective="reg:squarederror", **best_params, random_state=42
    )
    xgboost_model.fit(X_train, y_train)

    # Predict the held-out rows once and reuse the result for every metric.
    test_predictions = xgboost_model.predict(X_test)

    # TEST evaluation
    test_mae = mean_absolute_error(y_test, test_predictions)
    gb_maes.append(test_mae)

    test_mse = mean_squared_error(y_test, test_predictions)
    gb_mses.append(test_mse)

    test_r2 = r2_score(y_test, test_predictions)
    gb_rsqr.append(test_r2)

# Aggregate the 50 runs: mean and sample standard deviation of each metric.
gb_avg_mae = statistics.mean(gb_maes)
gb_avg_mse = statistics.mean(gb_mses)
gb_avg_r2 = statistics.mean(gb_rsqr)
gb_std_mae = statistics.stdev(gb_maes)
gb_std_mse = statistics.stdev(gb_mses)
gb_std_r2 = statistics.stdev(gb_rsqr)

print(f"Average Test MAE: {gb_avg_mae}, STD: {gb_std_mae}")
print(f"Average Test MSE: {gb_avg_mse}, STD: {gb_std_mse}")
print(f"Average Test R-squared: {gb_avg_r2}, STD: {gb_std_r2}")

Average Test MAE: 4.160340231447927, STD: 3.41603224442729
Average Test MSE: 1077.4946572103493, STD: 1668.3511573058925
Average Test R-squared: -81.81490779252186, STD: 287.7189316520744


In [11]:
# Predict the target for every row of X (the same matrix the splits were drawn
# from, not a new feature set) with the XGBoost model left over from the last
# iteration of the loop above.
predicted_y = xgboost_model.predict(X)

print(f"Predicted Target Variable: {predicted_y}")

Predicted Target Variable: [4.220229  4.220229  1.4435946 2.1104164 1.4518372 2.3864708 2.4026008
 2.2463183 3.9871473 1.4055482 1.2756628 1.6751648 2.0559688 2.6951225
 2.096041  2.3339667 2.6951225 3.5347645 1.4482124 2.2323773 3.5347645
 1.735531  2.2400284 2.6951225 2.079911  2.2200634 5.199701  1.4540391
 2.4741192 1.4056908 2.2463183 2.0492544 2.36959   1.4681818 2.1675596
 1.2756628 2.0516913 2.3864708 2.307712  5.4705915 5.015082  2.1836896
 1.328322  2.6436265 1.8494046 1.8651943 2.7435849 2.307712  2.1938818
 1.5354208 1.9818792 1.328322  2.4341807 2.079911  1.8651719 1.8429699
 1.9890797 5.248163  1.9218837 4.2686915 2.1675596 2.0294845 1.9745307
 2.0877402 4.2686915 1.4012156 2.0441606 2.2463183 1.7603837 2.0908713
 2.307712  4.146424  2.1995115 1.3238468 1.5259386 3.946886  2.0360765
 2.0131998 4.220229  1.4585143 1.5125935 2.1192503 2.2624483 5.23751
 2.509939  1.5619367 1.8230263 1.8093265 2.4584424 1.336893  2.3864708
 3.1856637 2.3339667 1.2472839 2.509939  2.5253742 2

## SVM

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Per-run test metrics, one entry per train/test split.
svm_maes = []
svm_mses = []
svm_rsqr = []

# Hyper-parameter grid explored by the cross-validated search.
# It does not depend on the run, so it is built once outside the loop.
param_grid = {
    "kernel": ["linear", "poly", "rbf"],
    "C": [0.1, 1, 10],
    "gamma": ["scale", "auto"],
}

for rsid in range(50):
    # `rsid` is the random-state id of the run: it seeds the split so that the
    # 50 runs use different partitions but the experiment is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        target_vector,
        test_size=0.25,
        random_state=rsid,
    )

    # Estimator used only inside the grid search.
    cv_svr_regressor = SVR()

    # 5-fold cross-validated grid search on the training rows only.
    # NOTE(review): model selection here optimises MAE, whereas the Random
    # Forest and XGBoost searches optimise MSE — confirm this is intended.
    grid_search = GridSearchCV(
        estimator=cv_svr_regressor,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Best hyper-parameter combination found by the search.
    best_params = grid_search.best_params_

    # Re-train on the full training split with the selected hyper-parameters.
    svr_model = SVR(**best_params)
    svr_model.fit(X_train, y_train)

    # Predict the held-out rows once and reuse the result for every metric.
    test_predictions = svr_model.predict(X_test)

    # TEST evaluation
    test_mae = mean_absolute_error(y_test, test_predictions)
    svm_maes.append(test_mae)

    test_mse = mean_squared_error(y_test, test_predictions)
    svm_mses.append(test_mse)

    test_r2 = r2_score(y_test, test_predictions)
    svm_rsqr.append(test_r2)

# Aggregate the 50 runs: mean and sample standard deviation of each metric.
svm_avg_mae = statistics.mean(svm_maes)
svm_avg_mse = statistics.mean(svm_mses)
svm_avg_r2 = statistics.mean(svm_rsqr)
svm_std_mae = statistics.stdev(svm_maes)
svm_std_mse = statistics.stdev(svm_mses)
svm_std_r2 = statistics.stdev(svm_rsqr)

print(f"Average Test MAE: {svm_avg_mae}, STD: {svm_std_mae}")
print(f"Average Test MSE: {svm_avg_mse}, STD: {svm_std_mse}")
print(f"Average Test R-squared: {svm_avg_r2}, STD: {svm_std_r2}")

Average Test MAE: 4.094211787648164, STD: 3.7459421299139013
Average Test MSE: 1172.9510606345966, STD: 1790.0465201915392
Average Test R-squared: -1.5841380349857772, STD: 3.1831455587211726


In [23]:
# Predict the target for every row of X (the same matrix the splits were drawn
# from, not a new feature set) with the SVR model left over from the last
# iteration of the loop above.
predicted_y = svr_model.predict(X)

print(f"Predicted Target Variable: {predicted_y}")

Predicted Target Variable: [ 5.90759251e+00  9.58598092e+00  2.11856066e+00  2.49490881e+00
  1.74951658e+00  2.21014625e+00  2.70171121e+00  3.86060683e+00
  3.78116122e+00  1.04581697e+00  8.90911076e-01  1.81369176e+00
  3.38463164e+00  3.15678143e+00  2.50126961e+00  2.57938153e+00
  3.40968902e+00  2.62494823e+00  1.17217647e+00  2.41308912e+00
  4.29284954e+00  1.73888388e+00  1.24907026e+00  3.73615915e+00
  2.58525414e+00  2.22304563e+00  3.39353556e+00  4.56830027e-01
  3.28095267e+00  6.64697842e-01  2.51291284e+00  1.58695901e+00
  2.95106146e+00  1.44958166e+00  3.55106263e+00 -1.21223014e-01
  1.96245592e+00  3.49671423e+00  3.05936237e+00  6.32135934e+00
  3.34635444e+00  2.70839441e+00  1.18195627e+00  2.49998960e+00
  2.05877289e+00  1.71251884e+00  2.45733252e+00  2.69325185e+00
  1.58099005e+00  1.78735736e+00  1.15470789e+00  1.68291010e+00
  2.07427728e+00  2.32420328e+00  2.11990011e+00  1.28749979e+00
  1.64366572e+00  4.03487370e+00  1.47391804e+00  3.54091510e+0

In [26]:
import keras
from keras import layers

# Shape of one input sample: a 1-tuple holding the number of feature columns.
input_shape: tuple[int] = (feature_datasets.shape[1],)

input_layer = layers.Input(shape=input_shape)

# Fully connected stack that widens to 256 units and narrows back to 32.
# NOTE(review): the first three layers are linear, so taken together they are a
# single affine map; non-linearity enters only at the two ReLU layers.
layer = layers.Dense(64, activation="linear")(input_layer)
layer = layers.Dense(128, activation="linear")(layer)
layer = layers.Dense(256, activation="linear")(layer)
layer = layers.Dense(128, activation="relu")(layer)
layer = layers.Dense(64, activation="relu")(layer)
layer = layers.Dense(32, activation="linear")(layer)

# Single linear output unit: the network regresses one scalar per sample.
output_layer = layers.Dense(1, activation="linear")(layer)

nn_model = keras.Model(input_layer, output_layer)
nn_model.summary()

Let's split the dataset into training and test sets

In [27]:
# Hold out 25% of the rows for testing. The fixed seed makes the partition
# identical on every run, so the evaluation cells below always score the
# network on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target_vector,
    test_size=0.25,
    random_state=42,
)

Let's save a diagram of the model architecture into a new folder

In [28]:
from keras.utils import plot_model

# Folder that collects the architecture diagrams; created on first use.
nn_model_path: Path = Path("model_figure/")
nn_model_path.mkdir(parents=True, exist_ok=True)

# Random numeric suffix so successive runs do not overwrite each other's image.
nn_model_name: str = f"nn_model_{np.random.randint(1, 1000)}.png"

# Write the diagram inside `nn_model_path` (previously the folder was defined
# but never used, so the image landed in the current working directory).
plot_model(nn_model, to_file=str(nn_model_path / nn_model_name))

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [None]:
# Build the optimiser and the loss first so the compile call stays short.
adamw_optimizer = keras.optimizers.AdamW(learning_rate=1e-6)
mse_loss = keras.losses.MeanSquaredError()

# Train against mean squared error and additionally track MSE and MAE.
nn_model.compile(
    optimizer=adamw_optimizer,
    loss=mse_loss,
    metrics=["mse", "mae"],
)

# 1000 passes over the training split in reshuffled mini-batches of 32 rows.
nn_model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=1000,
    shuffle=True,
)

Epoch 1/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 12.5815 - mae: 2.5232 - mse: 11.9969  
Epoch 2/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 11.3526 - mae: 2.4494 - mse: 11.2360
Epoch 3/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 11.2572 - mae: 2.4601 - mse: 10.7062
Epoch 4/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 10.9941 - mae: 2.3590 - mse: 11.2260
Epoch 5/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 10.6017 - mae: 2.2640 - mse: 10.2602
Epoch 6/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 10.3765 - mae: 2.3862 - mse: 10.4510
Epoch 7/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 11.3695 - mae: 2.4620 - mse: 11.4021
Epoch 8/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 10.

In [70]:
# Score the trained network on the held-out split. With the compile settings
# used in this notebook the result is expected to be [loss, mse, mae].
# NOTE(review): in the saved output the first and last values coincide, which
# would mean the loss was MAE when that output was produced — presumably the
# output is stale relative to the compile cell; re-run to confirm.
model_evaluation = nn_model.evaluate(X_test, y_test)
print(model_evaluation)

[1.1493641138076782, 3.2422587871551514, 1.1493641138076782]


In [71]:
# Per-sample residuals on the held-out split (actual minus predicted).
# `.flatten()` collapses the network output to 1-D so the subtraction is
# element-wise; `.T` leaves a 1-D `y_test` unchanged.
y_test.T - nn_model.predict(X_test).flatten()



array([ 0.6455199 , -0.32320224,  2.05673388, -1.91085627,  0.46097948,
       -0.03182908, -0.23658887,  0.70209006, -0.33642493, -1.26407142,
       -0.40745157,  7.14060782,  1.923334  ,  3.53708464, -0.05360954,
        4.51605324, -2.33193166,  1.24163666, -0.29332674,  0.11361901,
       -0.81547168, -1.30514682,  1.02622401, -2.16278886, -0.10234982,
       -0.3094264 , -0.52588248,  2.16344362,  0.09150185, -1.01514744,
        0.6114198 ,  0.56674162, -0.09862026,  0.44384843, -0.72829447,
        1.13339798,  0.42322622, -0.62595552])