In [1]:
import os
import sys
import warnings

import pandas as pd
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import numpy as np
import patsy
import sklearn.metrics as metrics #pip install scikit-learn
import statsmodels.formula.api as smf
#from helper_functions import *
from plotnine import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    LogisticRegressionCV,
)

from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.tools.eval_measures import rmse

warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'patsy'

In [None]:
# Load the M&A sample from the local course data directory.
# `path` stays a plain string because it is a notebook-level name.
path = "~/Documents/GeorgiaTech/VIP/VIP-team-Machine-Learning-for-Financial-Markets/data/intro/"
data = pd.read_excel(os.path.join(path, "ma_data.xlsx"), engine="openpyxl")

In [None]:
# Preview the first rows. `head` is a method: without the call parentheses
# the cell shows the bound-method repr instead of a five-row preview.
data.head()

In [None]:
######
# EDA
#
# Parse the two date columns first so that they are summarised as datetimes.
data['period'] = pd.to_datetime(data['period'], format='%Y/%m/%d')
data['event_period'] = pd.to_datetime(data['event_period'], format='%Y/%m/%d')

# Summary statistics for every column. This must be the last expression of
# the cell: a notebook only renders the value of the final statement, so a
# describe() call placed above the assignments is computed and thrown away.
data.describe(include='all')

In [None]:
# Histograms of the model variables plus the event flag
# (the full column list is available via data.columns.tolist()).
hist_columns = [
    "ret", "sale", "prc", "ret2yr",
    "logsize", "logBM", "mombcz", "roa", "beta",
    "dividend", "irisk", "illiq", "turnover", "leverage",
    "salesprice", "event_firm",
]
data[hist_columns].hist(bins=15, figsize=(10, 8), grid=False, rwidth=0.9)
plt.show()


In [None]:
# Show descriptives for event vs non-event firms.
# Several columns must be selected with a list (double brackets); the
# tuple-style groupby(...)['a', 'b'] selection was removed in pandas 2.0.
data.groupby('event_firm')[['ret', 'sale', 'logsize']].describe(include='all')

In [None]:
# Returns by event flag: box-plot with red whisker caps and a red dot at the
# group mean.
ret_boxplot = (
    ggplot(data, aes(x="event_firm", y="ret", group="event_firm"))
    + geom_boxplot(color="blue", size=0.5, width=0.1, alpha=0.5)
    + stat_boxplot(geom="errorbar", width=0.05, size=0.5, color="red")
    + stat_summary(fun_data="mean_se", geom="point", size=4, color="red", fill="red")
    + labs(x="M&A", y="Return")
    + theme_bw()
)
ret_boxplot

In [None]:
# Log size by event flag: box-plot with red whisker caps and a red dot at the
# group mean.
logsize_boxplot = (
    ggplot(data, aes(x="event_firm", y="logsize", group="event_firm"))
    + geom_boxplot(color="blue", size=0.5, width=0.1, alpha=0.5)
    + stat_boxplot(geom="errorbar", width=0.05, size=0.5, color="red")
    + stat_summary(fun_data="mean_se", geom="point", size=4, color="red", fill="red")
    + labs(x="M&A", y="Log Size")
    + theme_bw()
)
logsize_boxplot

In [None]:
# We want to check the short-run differences around the event month.
# Work on an explicit copy so that adding a column does not write into a
# slice of `data`.
ma_short = data.loc[data['event_firm'] == 1].copy()

# Signed distance in whole calendar months from the observation month to the
# event month (negative when the event month precedes the observation month).
# Computed from year/month components because a "month" is not a fixed-length
# timedelta: dividing by np.timedelta64(1, 'M') is rejected by pandas 2.x,
# and truncating the resulting fraction maps 28-30 day gaps to 0 instead of 1.
ma_short['devent_period'] = (
    (ma_short['event_period'].dt.year - ma_short['period'].dt.year) * 12
    + (ma_short['event_period'].dt.month - ma_short['period'].dt.month)
)

# Keep only the observations exactly one month on either side of the event;
# cast to int afterwards so rows with a missing date cannot break the cast.
ma_short = ma_short.loc[ma_short['devent_period'].isin([-1, 1])].copy()
ma_short['devent_period'] = ma_short['devent_period'].astype(int)

# List-style column selection (tuple-style selection was removed in pandas 2.0).
ma_short.groupby('devent_period')[['ret', 'sale', 'logsize']].describe(include='all')

In [None]:
# Long-short strategy: mean return at offset -1 minus mean return at offset +1.
# NOTE(review): the original header says "in 2016"; the sample year is not
# visible in this notebook - confirm against the data.
is_offset_minus_one = ma_short['devent_period'] == -1
is_offset_plus_one = ma_short['devent_period'] == 1
ma_short.loc[is_offset_minus_one, 'ret'].mean() - ma_short.loc[is_offset_plus_one, 'ret'].mean()

In [None]:
# Returns by event-window offset: the same box-plot layout, now grouped by
# devent_period instead of the event flag.
event_window_boxplot = (
    ggplot(ma_short, aes(x="devent_period", y="ret", group="devent_period"))
    + geom_boxplot(color="blue", size=0.5, width=0.1, alpha=0.5)
    + stat_boxplot(geom="errorbar", width=0.05, size=0.5, color="red")
    + stat_summary(fun_data="mean_se", geom="point", size=4, color="red", fill="red")
    + labs(x="M&A", y="Return")
    + theme_bw()
)
event_window_boxplot

In [None]:
#####
# Modeling

# Handling the missing values - two options:
#   a) remove the affected rows
#   b) impute (optionally adding a "<col>_missing" flag per variable)
#
# Removing rows would lose many observations, so the gaps are imputed.
# No flag columns are created here; only the mean fill is applied.

###
# Prepare the data
features = [
    'ret', 'sale', 'prc', 'ret2yr', 'logsize', 'logBM', 'mombcz', 'roa',
    'beta', 'dividend', 'irisk', 'illiq', 'turnover',
    'leverage', 'salesprice',
]

# Replace every missing value by the mean of its own column.
# NOTE(review): the means are taken over the full sample, i.e. before the
# train/holdout split performed in a later cell.
feature_means = data[features].mean()
data[features] = data[features].fillna(feature_means)

In [None]:
######################
# STEP 0)
# Split into an 80% training sample and a 20% holdout sample (fixed seed).
data_train, data_holdout = train_test_split(data, train_size=0.8, random_state=42)

# Share of event / non-event firms in each sample, to check that the split
# kept the class balance roughly intact.
for sample_label, sample in (
    ("Total", data),
    ("Train", data_train),
    ("Holdout", data_holdout),
):
    print(sample_label)
    print(sample["event_firm"].value_counts(normalize=True))


In [None]:
###
# GENERAL MODEL SETUP
# 5-fold cross-validation splitter shared by all cross-validated models
k = KFold(n_splits=5, shuffle=True, random_state=42)

# Create training matrices from the formula "event_firm ~ f1 + f2 + ..."
model_equation = "event_firm~" + "+".join(features)
y_train, X_train = patsy.dmatrices(model_equation, data_train)

# patsy returns the outcome as an (n, 1) design matrix. scikit-learn expects a
# 1-D target: with a column vector the classifiers emit a DataConversionWarning
# and LinearRegression.predict returns 2-D output. Flatten once here.
y_train = np.asarray(y_train).ravel()

In [None]:
########
# Linear probability model: OLS of the event flag on the design matrix,
# fitted once on the training sample (no cross-validation).
LPM_brier = LinearRegression()
LPM_brier.fit(X_train, y_train)
lpm = LPM_brier  # fit() returns the estimator itself, so both names share one object

In [None]:
##########
# Simple Logit Model with CV
#
# `Cs` is the grid of inverse regularisation strengths. In Python `10^20` is
# a bitwise XOR (= 30), which asks for a 30-point grid of penalised models
# rather than a plain logit. A single, very large C makes the default L2
# penalty negligible, so this is effectively an unregularised logit whose
# Brier score is still evaluated by 5-fold CV.
LRCV_brier = LogisticRegressionCV(
    Cs=[1e20],
    cv=k,
    refit=True,
    scoring="neg_brier_score",
    solver="newton-cg",
    tol=1e-7,
    random_state=42,
)

logit = LRCV_brier.fit(X_train, y_train)

In [None]:
########
# LASSO with logit model
from sklearn.pipeline import make_pipeline

# Standardised copy of the training design matrix (kept for inspection).
normalized_logitvars = pd.DataFrame(
    StandardScaler().fit_transform(X_train),
    columns=X_train.design_info.column_names,
)

# Set regularization parameters
lambdas = list(10 ** np.arange(-1, -4.01, -1 / 5))
# Number of rows in each CV training fold (4 of the 5 folds)
n_obs = normalized_logitvars.shape[0] * 4 / 5
C_values = [
    1 / (l * n_obs) for l in lambdas
]  # Cs are the inverse of regularization strength

# Initialize Logit Lasso
logLasso_brier = LogisticRegressionCV(
    Cs=C_values,
    penalty="l1",
    cv=k,
    refit=True,
    scoring="neg_brier_score",
    solver="liblinear",
    random_state=42,
)

# Estimate. The scaler is bundled with the estimator so that the
# standardisation learned on the training sample is re-applied automatically
# whenever `lasso` predicts on new data. Fitting on pre-scaled data with a
# throw-away scaler means later predictions receive raw, unscaled inputs.
# The scaler is still fit on all of X_train before the inner CV, as before;
# `logLasso_brier` is fitted in place and keeps its coef_ / C_ attributes.
lasso = make_pipeline(StandardScaler(), logLasso_brier).fit(X_train, y_train)

In [None]:
#################
# Random Forest
#
# a) show a single tree
#
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# A random forest is built from trees like this one; fit a small, pruned tree
# on four variables so that it can be drawn.
data_for_graph = data_train[["ret", "logsize", "logBM", "mombcz"]]
rf_for_graph = DecisionTreeClassifier(
    ccp_alpha=0.00005, min_samples_leaf=100, max_depth=5, random_state=41
).fit(data_for_graph, y_train)

# Create exactly one figure: an extra bare plt.figure() call would open a
# second, empty figure that is rendered as a blank plot.
plt.figure(figsize=(11, 11))
plot_tree(
    rf_for_graph,
    # plain list: some scikit-learn versions reject a pandas Index here
    feature_names=list(data_for_graph.columns),
    filled=True,
    rounded=True,
    proportion=True,
    fontsize=10,
)
plt.title("Decision tree")
plt.show()

In [None]:
######
# b) Do the actual forest
# Single-point tuning grid (one candidate per hyper-parameter).
grid = dict(
    max_features=[5],
    criterion=["gini"],
    min_samples_split=[16],
)

# 100-tree probability forest, evaluated with the shared 5-fold splitter and
# refitted on the whole training sample using ROC AUC.
prob_forest = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
prob_forest_grid = GridSearchCV(
    estimator=prob_forest,
    param_grid=grid,
    scoring=["roc_auc"],
    refit="roc_auc",
    cv=k,
)

# Fit the model; fit() returns the search object itself, so both names refer
# to the same fitted GridSearchCV.
prob_forest_grid.fit(X_train, y_train)
prob_forest_fit = prob_forest_grid

In [None]:
###########
# Compare predictions
_, X_holdout = patsy.dmatrices(model_equation, data_holdout)

# Work on an explicit copy so that adding prediction columns does not write
# into a slice of `data`.
data_holdout = data_holdout.copy()

# Predict on holdout sample
# (ravel: the linear model returns an (n, 1) array when trained on a column-vector target)
data_holdout["lpm_pred"] = np.ravel(lpm.predict(X_holdout))
data_holdout["logit_pred"] = logit.predict_proba(X_holdout)[:, 1]
data_holdout["lasso_pred"] = lasso.predict_proba(X_holdout)[:, 1]
data_holdout["rf_pred"] = prob_forest_fit.predict_proba(X_holdout)[:, 1]

# Calculate the RMSE of each model's predicted probability against the
# realised event flag. Each value is printed explicitly: a notebook cell only
# displays its last bare expression, so unprinted intermediate results are lost.
for model_label, pred_col in (
    ("Linear Probability Model", "lpm_pred"),
    ("Logit Model", "logit_pred"),
    ("LASSO with Logit", "lasso_pred"),
    ("Random Forest", "rf_pred"),
):
    print(model_label)
    print(round(rmse(data_holdout[pred_col], data_holdout["event_firm"]), 4))
