In [1]:
# Seed NumPy's global random number generator before anything else runs.
# NOTE: the "Define functions for base models" cell later re-seeds NumPy,
# `random` and TensorFlow with a different value (2).
from numpy.random import seed
seed(1)
# Seed TensorFlow's global random number generator.
import tensorflow
tensorflow.random.set_seed(1)

# Base Models for Ensemble Learner

#### Import data and packages

In [2]:
# Import packages
import os
import pandas as pd
import numpy as np
import warnings
from matplotlib import pyplot as plt
import pickle

from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LinearRegression as lin
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import callbacks
from keras.models import load_model

from datetime import date
import datetime as dt

In [3]:
# Project directory: every relative path below ("Data/...") is resolved against it.
# Set the MASTER_THESIS_DIR environment variable to run the notebook on another
# machine; the original author's location is kept as the fallback.
os.chdir(os.environ.get(
    "MASTER_THESIS_DIR",
    "/Users/benjaminlian/Documents/School/5th_year/Spring_2021/Master_Thesis",
))

# Define periodic prediction function
def periods_lags(df, y_variable, n_periods, n_lags, frequency):
    """Build multi-horizon targets and lagged-target features from `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    y_variable : str
        Name of the column in `df` holding the target level series.
    n_periods : int
        Highest horizon; `n_periods + 1` target columns named
        "<y_variable>_q_<period>" are produced. Column `period` holds the
        percentage change over `period + 1` rows, shifted back by `period` rows.
    n_lags : int
        Number of lagged one-row percentage changes of the target added to the
        features as "<y_variable>_for_pred_lag_<lag>".
    frequency : any
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        Features (target column removed, lag columns added, rows with any NaN
        dropped) and targets, restricted to their common index and sorted by it.

    Notes
    -----
    Missing target values are replaced by 0, exactly as before.
    NOTE(review): this includes the trailing rows of the longer horizons, whose
    future value does not exist yet - confirm that a 0 target is intended there.
    """
    target = df.loc[:, y_variable]
    base_name = target.name

    # New frame without the target column; the caller's frame stays untouched.
    features = df.drop(columns = [y_variable])

    # One column per horizon, assembled in a single step instead of growing a
    # DataFrame with repeated concat calls starting from an empty frame.
    horizon_targets = {
        "{}_q_{}".format(base_name, period): target.pct_change(period + 1).shift(-period)
        for period in range(n_periods + 1)
    }
    y_out = pd.DataFrame(horizon_targets)

    # One-row percentage change without its first observation. It does not
    # depend on the lag, so it is computed once outside the loop.
    lag_base = target.pct_change().replace(np.nan, 0).iloc[1:]
    for lag in range(1, n_lags + 1):
        features["{}_for_pred_lag_{}".format(base_name, lag)] = lag_base.shift(lag)

    # Make sure X and y share the same index
    y_out = y_out.replace(to_replace = np.nan, value = 0)
    common = list(set(features.dropna().index) & set(y_out.index))
    X = features.loc[common, :].sort_index()
    y = y_out.loc[common, :].sort_index()

    return X, y

# User inputs
n_periods = 4   # highest horizon; periods_lags builds n_periods + 1 target columns
n_lags = 1      # number of lagged target changes added to the features
frequency = 1   # 0 -> "daily", 1 -> "pdates" (label used in the export file names)

# Import data and publication dates
df = pd.read_excel("Data/master_thesis_data.xlsx", index_col = 0)

# Extract NO_folio_publication_date
pub_dates = df.NO_folio_publication_date

df, folio_rate = periods_lags(df, "NO_folio_publication_date", n_periods, n_lags, frequency)
# Every target column is named "NO_folio_publication_date_q_<period>", so a
# rename keyed on the bare name never matched anything. Replace the prefix of
# each column instead.
folio_rate.rename(
    columns = {col: col.replace("NO_folio_publication_date", "NO_folio_rate", 1)
               for col in folio_rate.columns},
    inplace = True,
)

print("Data succesfully imported. Shape of X = {}, y = {}".format(df.shape, folio_rate.shape))

Data succesfully imported. Shape of X = (159, 687), y = (159, 5)


In [4]:
# Chronological split (no shuffling): the first 35 % of rows train, the rest test
X_train, X_test, y_train, y_test = tts(
    df,
    folio_rate,
    test_size = 0.65,
    shuffle = False,
    random_state = None,
)

# Remember the row and column labels; the targets become bare arrays below
X_train_index, X_test_index = X_train.index, X_test.index
y_train_index, y_test_index = y_train.index, y_test.index
X_columns, y_columns = X_train.columns, y_train.columns

# Targets as NumPy arrays (one column per horizon)
y_train, y_test = np.array(y_train), np.array(y_test)

# Report the shapes of all four pieces
print("X train: {}".format(X_train.shape))
print("X test: {}".format(X_test.shape))
print("y train: {}".format(y_train.shape))
print("y test: {}".format(y_test.shape))

X train: (55, 687)
X test: (104, 687)
y train: (55, 5)
y test: (104, 5)


#### Define functions for base models

In [5]:
# === !IMPORTANT! ===
# To get similar results on every run, this segment of code has to be executed
# first, before any model is built or fitted.
# Set seed for keras models
import random as rn
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.compat.v1.keras import backend as K

# Drop any Keras state left over from a previous run of this cell
tf.keras.backend.clear_session()

seed_num = 2
# NOTE(review): PYTHONHASHSEED is presumably only read when an interpreter
# starts, so setting it here may not affect the running kernel - confirm.
os.environ['PYTHONHASHSEED'] = '0'
# Seed NumPy, Python's `random` module and TensorFlow with the same value
np.random.seed(seed_num)
rn.seed(seed_num)
tf.random.set_seed(seed_num)
# Single-threaded TF1-compat session registered with the Keras backend
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
K.set_session(sess)

# Set seed for random forest algos
# (same value as seed_num above; the tree-based models in this cell are
# additionally constructed with random_state = 0)
np.random.seed(2)

def prediction_outputs(model, X_train, X_test, train_index = None, test_index = None):
    """Predict on the train and test inputs and label the results with row indices.

    Parameters
    ----------
    model : object
        Fitted estimator exposing `predict`; its output is flattened to 1-D.
    X_train, X_test : array-like
        Inputs handed to `model.predict`. They may hold fewer rows than the
        corresponding index (e.g. after windowing); the surplus leading index
        entries are then skipped.
    train_index, test_index : index-like, optional
        Row labels for the two prediction series. They default to the
        module-level `X_train_index` / `X_test_index`.

    Returns
    -------
    (is_predictions, oos_predictions) : tuple of pandas.Series
    """
    if train_index is None:
        train_index = X_train_index
    if test_index is None:
        test_index = X_test_index

    train = X_train.copy()
    test = X_test.copy()

    # Each set gets its own offset. Previously the offset measured on the
    # training data was reused for the test index, which is only correct when
    # both sets happen to lose the same number of leading rows.
    train_gap = len(train_index) - len(train)
    test_gap = len(test_index) - len(test)

    is_predictions = pd.Series(model.predict(train).reshape(-1), index = train_index[train_gap:])
    oos_predictions = pd.Series(model.predict(test).reshape(-1), index = test_index[test_gap:])

    return is_predictions, oos_predictions

def elastic_net(X_train, y_train, X_test, y_test):
    """Fit a cross-validated elastic net and predict on both samples.

    The L1 ratio and the penalty strength are chosen by `ElasticNetCV` over
    fixed grids, using a 3-fold `TimeSeriesSplit`.

    Returns
    -------
    tuple
        (fitted model, in-sample predictions, out-of-sample predictions,
        y_train, y_test); the targets are passed through unchanged.
    """
    # Grids for the search: mixing parameter (0.0 ... 0.9) and penalty strength
    l1_ratios = np.arange(0, 1, 0.1)
    penalties = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]

    # Cross validation that respects the time ordering
    tscv = TimeSeriesSplit(n_splits = 3)

    # Instantiate model
    model_tscv = ElasticNetCV(l1_ratio = l1_ratios,
                              alphas = penalties,
                              cv = tscv,
                              n_jobs = -1,
                              verbose = 0
                              )

    # Silence warnings only while fitting and predicting. The context manager
    # restores the previous filters on exit, even if fitting raises, instead
    # of wiping every existing filter with warnings.resetwarnings().
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        # Fit model
        model_tscv.fit(X_train, y_train)

        # Predictions
        is_pred, oos_pred = prediction_outputs(model_tscv, X_train, X_test)

    return model_tscv, is_pred, oos_pred, y_train, y_test
    
def linear_regression(X_train, y_train, X_test, y_test):
    """Fit a `LinearRegression` on the training data and predict on both samples.

    Returns (fitted model, in-sample predictions, out-of-sample predictions,
    y_train, y_test); the targets are handed back unchanged.
    """
    fitted = lin().fit(X_train, y_train)
    in_sample, out_of_sample = prediction_outputs(fitted, X_train, X_test)
    return fitted, in_sample, out_of_sample, y_train, y_test

def k_nearest(X_train, y_train, X_test, y_test):
    """Fit a `KNeighborsRegressor` with default settings and predict on both samples.

    Returns (fitted model, in-sample predictions, out-of-sample predictions,
    y_train, y_test); the targets are handed back unchanged.
    """
    fitted = KNeighborsRegressor().fit(X_train, y_train)
    in_sample, out_of_sample = prediction_outputs(fitted, X_train, X_test)
    return fitted, in_sample, out_of_sample, y_train, y_test

def decision_tree(X_train, y_train, X_test, y_test):
    """Fit a `DecisionTreeRegressor` (random_state 0) and predict on both samples.

    Returns (fitted model, in-sample predictions, out-of-sample predictions,
    y_train, y_test); the targets are handed back unchanged.
    """
    fitted = DecisionTreeRegressor(random_state = 0).fit(X_train, y_train)
    in_sample, out_of_sample = prediction_outputs(fitted, X_train, X_test)
    return fitted, in_sample, out_of_sample, y_train, y_test

def ada_booster(X_train, y_train, X_test, y_test):
    """Fit an `AdaBoostRegressor` (random_state 0) and predict on both samples.

    Returns (fitted model, in-sample predictions, out-of-sample predictions,
    y_train, y_test); the targets are handed back unchanged.
    """
    fitted = AdaBoostRegressor(random_state = 0).fit(X_train, y_train)
    in_sample, out_of_sample = prediction_outputs(fitted, X_train, X_test)
    return fitted, in_sample, out_of_sample, y_train, y_test

def bagging_regressor(X_train, y_train, X_test, y_test):
    """Fit a `BaggingRegressor` (random_state 0) and predict on both samples.

    Returns (fitted model, in-sample predictions, out-of-sample predictions,
    y_train, y_test); the targets are handed back unchanged.
    """
    fitted = BaggingRegressor(random_state = 0).fit(X_train, y_train)
    in_sample, out_of_sample = prediction_outputs(fitted, X_train, X_test)
    return fitted, in_sample, out_of_sample, y_train, y_test

def random_forest(X_train, y_train, X_test, y_test):
    """Fit a `RandomForestRegressor` (random_state 0) and predict on both samples.

    Returns (fitted model, in-sample predictions, out-of-sample predictions,
    y_train, y_test); the targets are handed back unchanged.
    """
    fitted = RandomForestRegressor(random_state = 0).fit(X_train, y_train)
    in_sample, out_of_sample = prediction_outputs(fitted, X_train, X_test)
    return fitted, in_sample, out_of_sample, y_train, y_test

def extra_trees(X_train, y_train, X_test, y_test):
    """Fit an `ExtraTreesRegressor` (random_state 0) and predict on both samples.

    Returns (fitted model, in-sample predictions, out-of-sample predictions,
    y_train, y_test); the targets are handed back unchanged.
    """
    fitted = ExtraTreesRegressor(random_state = 0).fit(X_train, y_train)
    in_sample, out_of_sample = prediction_outputs(fitted, X_train, X_test)
    return fitted, in_sample, out_of_sample, y_train, y_test

def neural_network(X_train, y_train, X_test, y_test):
    """Fit a three-hidden-layer dense network and predict on both samples.

    Each hidden layer is Dense(250, selu) followed by batch normalisation and
    50 % dropout; the output is a single linear unit. The model is trained
    with Adam on mean squared error and early stopping.

    Early stopping (with `restore_best_weights`) is driven by a validation
    slice taken from the *training* data. Previously it monitored the loss on
    `(X_test, y_test)`, so the weights were selected on the very data the
    out-of-sample predictions are produced for.

    Returns
    -------
    tuple
        (fitted model, in-sample predictions, out-of-sample predictions,
        y_train, y_test); the targets are passed through unchanged.
    """
    # Inputs
    units = 250
    act = "selu"
    dr = 0.5
    batch_size = 256
    epochs = 500
    validation_share = 0.2  # same hold-out share the CNN base model uses

    # Construction
    model = keras.Sequential([
        layers.Dense(units = units, activation = act, input_shape = [X_train.shape[1]]),
        layers.BatchNormalization(),
        layers.Dropout(rate = dr),
        layers.Dense(units = units, activation = act),
        layers.BatchNormalization(),
        layers.Dropout(rate = dr),
        layers.Dense(units = units, activation = act),
        layers.BatchNormalization(),
        layers.Dropout(rate = dr),
        layers.Dense(units = 1, activation = "linear")
    ])

    # Define the loss function and optimizer algorithm
    model.compile(
        optimizer = "adam",
        loss = "mse"
    )

    # Early stopping on the validation loss
    early_stopping = EarlyStopping(
        monitor = "val_loss",
        min_delta = 0.001,
        patience = 20,
        restore_best_weights = True
    )

    # Fit model. The inputs are converted to arrays so that Keras can split
    # off the validation slice itself.
    model.fit(
        np.asarray(X_train), np.asarray(y_train),
        validation_split = validation_share,
        batch_size = batch_size,
        epochs = epochs,
        callbacks = [early_stopping],
        verbose = 0
    )

    # Predictions
    is_pred, oos_pred = prediction_outputs(model, X_train, X_test)

    return model, is_pred, oos_pred, y_train, y_test

def convolutional_neural_network(X_train, y_train, X_test, y_test):
    """Fit a 1-D convolutional network on sliding windows and predict on both samples.

    Train and test inputs are each cut into windows of 5 consecutive rows;
    the target of a window is the observation directly after it. Because of
    the windowing, the returned targets are shorter than the inputs.

    Returns
    -------
    tuple
        (fitted model, in-sample predictions, out-of-sample predictions,
        windowed y_train, windowed y_test)
    """
    def single_splitter(x_train, y_train, timesteps):
        """Return (windows, targets): `timesteps` rows each, target is the next row."""
        n_windows = len(x_train) - timesteps
        windows = [x_train[first:first + timesteps] for first in range(n_windows)]
        targets = [y_train[first + timesteps] for first in range(n_windows)]
        return np.array(windows), np.array(targets)

    window = 5
    x_cnn_train, y_cnn_train = single_splitter(X_train, y_train, timesteps = window)
    x_cnn_test, y_cnn_test = single_splitter(X_test, y_test, timesteps = window)

    # Architecture: convolution -> pooling -> flatten -> dropout -> dense head
    cnn = keras.models.Sequential([
        keras.layers.Conv1D(
            filters=75,
            kernel_size=4,
            activation='relu',
            input_shape=(x_cnn_train.shape[1], x_cnn_train.shape[2]),
        ),
        keras.layers.MaxPool1D(pool_size=10, padding='same'),
        keras.layers.Flatten(),
        keras.layers.Dropout(rate=0.8),
        keras.layers.Dense(500, activation='relu'),
        keras.layers.Dense(1, activation='linear'),
    ])
    cnn.compile(optimizer='adam', loss='mse')

    # Stop once the validation loss no longer improves; keep the best weights
    stopper = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0.00005,
        patience = 20,
        restore_best_weights=True,
    )

    # Fit on the training windows; the last 20 % of them serve as validation data
    cnn.fit(
        x_cnn_train,
        y_cnn_train,
        batch_size=500,
        epochs=1000,
        callbacks=[stopper],
        validation_split=0.2,
        verbose=0,
    )

    # Predictions
    is_pred, oos_pred = prediction_outputs(cnn, x_cnn_train, x_cnn_test)

    return cnn, is_pred, oos_pred, y_cnn_train, y_cnn_test

def lstm(X_train, y_train, X_test, y_test):
    """Fit a single-layer LSTM (50 units) and predict on both samples.

    Every observation is fed as a sequence of length one: the inputs are
    reshaped to (samples, 1, features).

    Early stopping (with `restore_best_weights`) is driven by a validation
    slice taken from the *training* data. Previously it monitored the loss on
    the test set, so the weights were selected on the very data the
    out-of-sample predictions are produced for.

    Returns
    -------
    tuple
        (fitted model, in-sample predictions, out-of-sample predictions,
        y_train as array, y_test as array)
    """
    # Data split and reshape
    X_train_array = np.asarray(X_train).reshape(X_train.shape[0], 1, X_train.shape[1])
    y_train_array = np.asarray(y_train)
    X_test_array = np.asarray(X_test).reshape(X_test.shape[0], 1, X_test.shape[1])
    y_test_array = np.asarray(y_test)

    # Construction (named `model` so the function's own name is not shadowed)
    model = keras.Sequential([
        layers.LSTM(50, input_shape = (X_train_array.shape[1], X_train_array.shape[2])),
        layers.Dense(1)
    ])

    # Define the loss function and optimizer algorithm
    model.compile(
        optimizer = "adam",
        loss = "mse"
    )

    # Early stopping on the validation loss
    early_stopping = EarlyStopping(
        monitor = "val_loss",
        min_delta = 0.001,
        patience = 20,
        restore_best_weights = True
    )

    # Fit the model; a share of the training samples is held out for
    # validation (same share as the CNN base model)
    model.fit(X_train_array, y_train_array,
              epochs = 50,
              batch_size = 500,
              callbacks = [early_stopping],
              validation_split = 0.2,
              verbose = 0,
              shuffle = False)

    # Predictions
    is_pred, oos_pred = prediction_outputs(model, X_train_array, X_test_array)

    return model, is_pred, oos_pred, y_train_array, y_test_array

In [6]:
# Registry of base learners: name used in the outputs -> fitting function
base_models = {
    "linear_regression": linear_regression,
    "elastic_net": elastic_net,
    "k_nearest_neighbors": k_nearest,
    "decision_tree_regressor": decision_tree,
    "ada_boost_regressor": ada_booster,
    "bagging_regressor": bagging_regressor,
    "random_forest": random_forest,
    "extra_trees": extra_trees,
    "neural_network": neural_network,
    "convolutional_neural_network": convolutional_neural_network,
    "lstm": lstm,
}

fitted_models, is_pred_dict, oos_pred_dict = {}, {}, {}

# Rounded seconds spent per horizon (seeded with 0), summed for the running total
z_ = [0]
for period in range(y_train.shape[1]):
    total_time = list()
    is_predictions = pd.DataFrame()
    oos_predictions = pd.DataFrame()

    # Fit every base learner on the target column of this horizon
    for model, fit_function in base_models.items():
        start = dt.datetime.now()
        mdl, is_pred, oos_pred, y_train_, y_test_ = fit_function(
            X_train, y_train[:, period], X_test, y_test[:, period]
        )
        total_time.append((dt.datetime.now() - start).total_seconds())

        # Keep the fitted model and add its predictions as a column named after it
        fitted_models[model + "_{}".format(period)] = mdl
        is_predictions = pd.concat([is_predictions, is_pred.rename(model)], axis = 1)
        oos_predictions = pd.concat([oos_predictions, oos_pred.rename(model)], axis = 1)

    is_pred_dict["is_pred_{}".format(period)] = is_predictions
    oos_pred_dict["oos_pred_{}".format(period)] = oos_predictions

    # Seconds for this horizon and cumulative seconds so far
    period_seconds = round(sum(total_time))
    z_.append(period_seconds)
    print("Total time of fitting all base models for t = {}: {} ({})".format(period, period_seconds, sum(z_)))

Total time of fitting all base models for t = 0: 9 (9)
Total time of fitting all base models for t = 1: 9 (18)
Total time of fitting all base models for t = 2: 9 (27)
Total time of fitting all base models for t = 3: 10 (37)
Total time of fitting all base models for t = 4: 9 (46)


#### Save base-model predictions

In [7]:
# Translate the numeric frequency flag into the label used in the file names
if frequency == 0:
    freq = "daily"
elif frequency == 1:
    freq = "pdates"
else:
    # Without this branch `freq` stayed undefined and the export below failed
    # with an unrelated NameError.
    raise ValueError("Unsupported frequency {!r}; expected 0 (daily) or 1 (pdates)".format(frequency))

# One workbook per horizon with the out-of-sample predictions of all base
# models; rows containing a missing prediction are dropped
for period in range(y_test.shape[1]):
    export = oos_pred_dict["oos_pred_{}".format(period)].dropna()
    export.to_excel(os.getcwd() + "/Data/base_models_pred_{}_{}.xlsx".format(freq, period))

# The publication-date series is written to its own workbook
pub_dates.to_excel(os.getcwd() + "/Data/base_models_predictions_{}.xlsx".format(freq))