### EV/sales modeling with business descriptions and NO SIC code in input database

In this script we are going to train different ML models with three main types of architecture: Random Forest, Gradient Boosting, and Support Vector Machine. Our dependent variable is the EV/Sales multiple, and the independent ones are the financial metrics selected from the financial features analysis + the encoded business description of the firm, accounting for a qualitative description of the business' operating activity. Each model is trained and tested 50 times with different initialization seeds and hyper parameters

In [6]:
import warnings

warnings.filterwarnings("ignore")

We are now going to define all the project-related folders and loading out Pandas DataFrame

In [7]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from nltk.corpus import stopwords
import random
import statistics

import string
from sklearn.decomposition import PCA
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [1]:
from pathlib import Path
import pandas as pd

data_folder: Path = Path("/Users/giovanni/Documents/QVF - quantitative framework for valuation multiples computation in mergers and acquisitions/data/00_raw")
dataset_excel_file_name: str = (
    "sic_industrial_and_commercial_machinery_and_computer_equipment.xlsx"
)
dataset_excel_sheet_reference: str = "Filtered Results"

# Load Excel file
data_frame: pd.DataFrame = pd.read_excel(
    io=data_folder.joinpath(dataset_excel_file_name),
    sheet_name=dataset_excel_sheet_reference,
)
data_frame.columns = data_frame.columns.astype(str)

## Setting independent and dependent variables

In [23]:
# Including Business Descriptions
bd = pd.read_excel("/Users/giovanni/Documents/QVF - quantitative framework for valuation multiples computation in mergers and acquisitions/data/10_draft_processing/encoded_business_descriptions_with_pca.xlsx")
bd.columns = bd.columns.astype(str)
bd.drop([bd.columns[0]], axis = 1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,2.940031,-1.200982,1.386171,1.421958,1.295752,1.127542,-0.830551,1.265011,-0.948901,-0.650675,0.457350,-1.682913,0.035319,-1.430782,-1.052864
1,2.969031,0.568739,-0.995405,-0.047311,0.065348,0.300497,-1.188035,0.392371,0.262520,-1.473936,0.133124,-0.383836,-0.231574,-0.033198,-0.238394
2,1.991280,0.521885,-0.436549,0.004570,-0.152528,-0.051827,-0.032181,0.157405,-0.084538,0.317521,0.186401,-0.063512,0.148592,0.760500,-0.068546
3,0.071584,0.996061,-0.075963,0.183511,0.772674,2.509305,0.796459,-1.710631,0.078906,0.619381,-0.470390,-0.409396,-0.884893,-0.360189,0.075540
4,-0.378382,0.166162,0.388914,-1.702416,-1.438576,1.328583,-0.152559,0.229676,-0.745418,-0.293819,0.358246,-0.725692,0.428501,0.600632,-0.138775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,-0.038565,0.198642,0.312745,0.838248,0.058639,1.084777,0.696707,-0.766096,-0.045153,0.662046,-0.647555,0.462762,-0.617401,-0.578446,-0.377706
148,-1.125430,-0.742834,-0.235994,-0.097729,-0.267132,0.020630,1.631020,0.505772,0.506033,-0.286671,0.347114,-0.337646,0.321922,-0.495215,-0.202751
149,-1.198544,0.054829,-0.717960,1.128502,-0.740486,-0.855033,-0.099541,0.878284,-0.312854,-0.082944,-0.972884,-0.442968,-0.379204,-0.656778,-0.249108
150,-0.245823,1.087030,0.727674,-0.287521,0.254816,-0.525746,0.820819,0.105950,0.174259,-0.134876,-0.716738,0.015089,0.183705,0.059694,-0.352881


In [25]:
from sklearn.preprocessing import StandardScaler
from numpy.typing import NDArray

# Model Financial Features
dataset_features_list: list = [
    "sic_prim",
    "n_empl_22",
    "n_publications",
    "cagr_revs",
    "cagr_ta",
    "2y_avg_ta",
    "ebit_m_22",
    "ni_m_22",
    "capex_revs_22",
    "cagr_capex",
    "ta_turnover_22",
    "ca_turnover_22",
    "cap_intensity_22",
    "roa_22",
    "ta_tl_22",
]

# Define Financial Feature Dataset
feature_datasets: pd.DataFrame = data_frame[dataset_features_list]
#feature_datasets.columns = feature_datasets.columns.astype(str)

# Including Business Descriptions
feature_datasets = pd.concat([feature_datasets, bd], axis=1)

In [26]:
# One-dimensional Model Target Variable
target_variable_name: str = "ev_sales_22"
target_vector: NDArray = data_frame[target_variable_name].to_numpy()

# Split the data into features (X) and target (y)
# Normalize numerical features
scaler: StandardScaler = StandardScaler()
X = scaler.fit_transform(feature_datasets)

In [27]:
feature_datasets

Unnamed: 0,sic_prim,n_empl_22,n_publications,cagr_revs,cagr_ta,2y_avg_ta,ebit_m_22,ni_m_22,capex_revs_22,cagr_capex,...,5,6,7,8,9,10,11,12,13,14
0,3571,164000,157916,0.103848,-0.008986,3.518785e+08,0.302887,0.253096,0.027155,-0.052982,...,1.127542,-0.830551,1.265011,-0.948901,-0.650675,0.457350,-1.682913,0.035319,-1.430782,-1.052864
1,7372,221000,274012,0.157741,0.089593,3.493095e+08,0.420043,0.366863,0.120472,0.197077,...,0.300497,-1.188035,0.392371,0.262520,-1.473936,0.133124,-0.383836,-0.231574,-0.033198,-0.238394
2,3571,133000,52015,0.030772,-0.053849,9.117300e+07,0.060508,0.023871,0.029355,0.176393,...,-0.051827,-0.032181,0.157405,-0.084538,0.317521,0.186401,-0.063512,0.148592,0.760500,-0.068546
3,3724,182000,206529,0.179107,0.043060,1.601340e+08,0.082059,0.077482,0.041372,0.104154,...,2.509305,0.796459,-1.710631,0.078906,0.619381,-0.470390,-0.409396,-0.884893,-0.360189,0.075540
4,3724,172000,284661,-0.120298,-0.117297,1.938625e+08,-0.004664,0.000861,0.025543,-0.320156,...,1.328583,-0.152559,0.229676,-0.745418,-0.293819,0.358246,-0.725692,0.428501,0.600632,-0.138775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,3827,88,9,0.017859,0.099436,1.969600e+04,0.073583,0.057320,0.011482,0.113792,...,1.084777,0.696707,-0.766096,-0.045153,0.662046,-0.647555,0.462762,-0.617401,-0.578446,-0.377706
148,3533,86,30,0.011482,-0.027028,1.610637e+04,0.094570,0.055774,0.174378,0.453951,...,0.020630,1.631020,0.505772,0.506033,-0.286671,0.347114,-0.337646,0.321922,-0.495215,-0.202751
149,3669,200,11,0.148172,0.071812,5.408350e+04,0.272390,0.171980,0.018809,-0.347916,...,-0.855033,-0.099541,0.878284,-0.312854,-0.082944,-0.972884,-0.442968,-0.379204,-0.656778,-0.249108
150,3559,84,134,0.067183,0.133993,1.890012e+04,0.045356,0.042230,0.036915,0.003915,...,-0.525746,0.820819,0.105950,0.174259,-0.134876,-0.716738,0.015089,0.183705,0.059694,-0.352881


## Random Forest

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

rf_maes = []
rf_mses = []
rf_rsqr = []

for iteration in range(50):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        target_vector,
        test_size=0.25,
    )

    # Define the parameter grid for Random Forest
    param_grid: dict[str, list[int | None]] = {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }

    # Initialize Random Forest Regressor
    cv_rf_regressor: RandomForestRegressor = RandomForestRegressor()

    # Grid search with Cross-Validation
    grid_search: GridSearchCV = GridSearchCV(
        estimator=cv_rf_regressor,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Train the model with the best parameters
    random_forest_model: RandomForestRegressor = RandomForestRegressor(**best_params)
    random_forest_model.fit(X_train, y_train)


    test_mae = mean_absolute_error(y_test, random_forest_model.predict(X_test))
    rf_maes.append(test_mae)

    test_mse = mean_squared_error(y_test, random_forest_model.predict(X_test))
    rf_mses.append(test_mse)

    test_r2 = r2_score(y_test, random_forest_model.predict(X_test))
    rf_rsqr.append(test_r2)

rf_avg_mae = statistics.mean(rf_maes)
rf_avg_mse = statistics.mean(rf_mses)
rf_avg_r2 = statistics.mean(rf_rsqr)
rf_std_mae = statistics.stdev(rf_maes)
rf_std_mse = statistics.stdev(rf_mses)
rf_std_r2 = statistics.stdev(rf_rsqr)

print(f"Average Test MAE: {rf_avg_mae}, STD: {rf_std_mae}")
print(f"Average Test MSE: {rf_avg_mse}, STD: {rf_std_mse}")
print(f"Average Test R-squared: {rf_avg_r2}, STD: {rf_std_r2}")

Average Test MAE: 37.77709304811455, STD: 17.69928393546866
Average Test MSE: 15758.560127381506, STD: 24246.470206710623
Average Test R-squared: -2.5917177144266845, STD: 5.966937757214315


## Gradient Boosting

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

gb_maes = []
gb_mses = []
gb_rsqr = []

for rsid in range(50):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        target_vector,
        test_size=0.25,
    )

    # Define the parameter grid for XGBoost
    param_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.05, 0.1],
    }

    # Initialize XGBoost Regressor
    cv_xgb_regressor = XGBRegressor(objective="reg:squarederror")

    # Grid search with cross-validation
    grid_search = GridSearchCV(
        estimator=cv_xgb_regressor,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Train the model with the best parameters
    xgboost_model = XGBRegressor(**best_params, random_state=42)
    xgboost_model.fit(X_train, y_train)

    # Predictions
    train_predictions = xgboost_model.predict(X_train)
    test_predictions = xgboost_model.predict(X_test)

    # Evaluation
    test_mae = mean_absolute_error(y_test, test_predictions)
    gb_maes.append(test_mae)

    test_mse = mean_squared_error(y_test, test_predictions)
    gb_mses.append(test_mse)

    test_r2 = r2_score(y_test, test_predictions)
    gb_rsqr.append(test_r2)

gb_avg_mae = statistics.mean(gb_maes)
gb_avg_mse = statistics.mean(gb_mses)
gb_avg_r2 = statistics.mean(gb_rsqr)
gb_std_mae = statistics.stdev(gb_maes)
gb_std_mse = statistics.stdev(gb_mses)
gb_std_r2 = statistics.stdev(gb_rsqr)

print(f"Average Test MAE: {gb_avg_mae}, STD: {gb_std_mae}")
print(f"Average Test MSE: {gb_avg_mse}, STD: {gb_std_mse}")
print(f"Average Test R-squared: {gb_avg_r2}, STD: {gb_std_r2}")

Average Test MAE: 44.44548191779294, STD: 20.37351138416319
Average Test MSE: 34280.530197984335, STD: 37217.899974718726
Average Test R-squared: -10.914935585122752, STD: 31.76002818445584


## SVM

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

svm_maes = []
svm_mses = []
svm_rsqr = []

for rsid in range(50):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        target_vector,
        test_size=0.25,
    )

    # Define the parameter grid for SVR
    param_grid = {
        "kernel": ["linear", "poly", "rbf"],
        "C": [0.1, 1, 10],
        "gamma": ["scale", "auto"],
    }

    # Initialize SVR
    cv_svr_regressor = SVR()

    # Grid search with cross-validation
    grid_search = GridSearchCV(
        estimator=cv_svr_regressor,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Train the model with the best parameters
    svr_model = SVR(**best_params)
    svr_model.fit(X_train, y_train)

    # Predictions
    train_predictions = svr_model.predict(X_train)
    test_predictions = svr_model.predict(X_test)

    # Evaluation
    test_mae = mean_absolute_error(y_test, test_predictions)
    svm_maes.append(test_mae)

    test_mse = mean_squared_error(y_test, test_predictions)
    svm_mses.append(test_mse)

    test_r2 = r2_score(y_test, test_predictions)
    svm_rsqr.append(test_r2)

svm_avg_mae = statistics.mean(svm_maes)
svm_avg_mse = statistics.mean(svm_mses)
svm_avg_r2 = statistics.mean(svm_rsqr)
svm_std_mae = statistics.stdev(svm_maes)
svm_std_mse = statistics.stdev(svm_mses)
svm_std_r2 = statistics.stdev(svm_rsqr)

print(f"Average Test MAE: {svm_avg_mae}, STD: {svm_std_mae}")
print(f"Average Test MSE: {svm_avg_mse}, STD: {svm_std_mse}")
print(f"Average Test R-squared: {svm_avg_r2}, STD: {svm_std_r2}")

Average Test MAE: 34.3274439911594, STD: 22.256201512489653
Average Test MSE: 28246.17293511209, STD: 39402.81390423328
Average Test R-squared: -0.026494028212099, STD: 0.0561012208384979
