In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import Functions needed for Data Preprocessing
from data import ( preprocess_stock_data,
                  clean_financial_ratios,
                  merge_and_clean_data,
                  filter_useful_features,
                  calculate_technical_indicators,
                  normalize_data,
                  merge_macro_data)

# import helper functions to run LSTM Training and Predictions
from helper import (create_df_per_stock,
                    run_for_stocks,
                    get_best_configuration,
                    final_df_cleaning,
                    create_return_arrays,
                    create_returns_for_cov, 
                    calculate_covariance_matrix)

# Load Dataset

In [2]:
# Load Data
stock = pd.read_csv('../data/stocks_1.csv')
stock_factor = pd.read_csv('../data/ratios_2.csv')
macro = pd.read_csv('../data/bond_and_cpi.csv')

In [3]:
# Select stocks
stock_avg = preprocess_stock_data(stock)
# Clean financial ratios data
stock_factor_1 = clean_financial_ratios(stock_avg, stock_factor)
# Merge trading data and financial ratios data and select proper companies
stock_all_final = merge_and_clean_data(stock_avg, stock_factor_1)
# Select features that are meaningful and useful
stock_final = filter_useful_features(stock_all_final)
# Calculate momentum technical indicators
stock_final_1 = stock_final.groupby('gvkey').apply(calculate_technical_indicators).reset_index(drop=True)
stock_use = stock_final_1.dropna()
# Merge macro data
stock_use = merge_macro_data(stock_use, macro)
# Normalization
stock_n = normalize_data(stock_use)
# Save to CSV
stock_n.to_csv('../data/normalized_data.csv', index=False)

In [4]:
data = pd.read_csv("../data/normalized_data.csv") # Load Dataset from File generated previously. 
data = final_df_cleaning(data) # Drop unnecessary columns and arrange data by ticker and dates
tickers = data['tic'].unique() # Create a List of the Unique Stock Tickers
data.head()

Unnamed: 0,index,datadate,tic,cshtrm,prccm,prchm,prclm,trt1m,CAPEI,evm,...,b30ret,b20ret,b10ret,b7ret,b5ret,b2ret,b1ret,t90ret,t30ret,cpiret
0,3388,2011-03-31,AMZN,0.057669,0.050441,0.047726,0.04827,0.449252,0.657229,0.104348,...,0.447572,0.505968,0.485734,0.460448,0.479048,0.409137,0.349846,0.074603,0.021886,0.009751
1,3389,2011-04-30,AMZN,0.055855,0.054863,0.05203,0.052741,0.479247,0.657454,0.104348,...,0.524495,0.625134,0.632978,0.696662,0.777043,0.612828,0.390447,0.068691,0.018939,0.006439
2,3390,2011-05-31,AMZN,0.051408,0.055111,0.054307,0.057432,0.4272,0.657571,0.104227,...,0.577259,0.700208,0.734779,0.755825,0.762814,0.561705,0.414261,0.03934,0.010522,0.004704
3,3391,2011-06-30,AMZN,0.046094,0.05731,0.05427,0.054622,0.449368,0.6577,0.104227,...,0.352766,0.407129,0.432365,0.440776,0.524773,0.489151,0.358103,0.047289,0.015152,-0.001071
4,3392,2011-07-31,AMZN,0.044648,0.062395,0.059825,0.061283,0.479955,0.657969,0.104227,...,0.62488,0.760939,0.794987,0.803837,0.83912,0.529782,0.281747,0.009376,0.001894,0.000886


In [5]:
tickers

array(['AMZN', 'AVY', 'AXON', 'BBWI', 'BKNG', 'BLDR', 'CBRE', 'CDNS',
       'CE', 'CF', 'CHD', 'CMCSA', 'CMS', 'CNC', 'COST', 'DECK', 'DLTR',
       'EA', 'EQIX', 'FI', 'FICO', 'GOOGL', 'INCY', 'LULU', 'MA', 'MOH',
       'NDAQ', 'NI', 'NVDA', 'ODFL', 'OKE', 'PKG', 'SBAC', 'STLD', 'TDG',
       'TGT', 'TYL', 'UNH', 'URI', 'V', 'VLO', 'WST'], dtype=object)

In [6]:
# Dictionary to hold dataframe for each stock
df_per_stock = create_df_per_stock(tickers=tickers, dataframe=data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trt1m'] = df['trt1m'].shift(-1) # Shift Target Return up


# Hyperparameter Tuning for LSTMs and Return Predictions

In [7]:
# Test with small list
param_grid = {
    'lstm_units': [100, 150],
    'dense_units1': [50, 100, 150],
    'dense_units2': [50, 75],
    'batch_size': [32, 64],
    'optimizer': ['adam']
}

In [8]:
# Only Run the below command when running a new model - otherwise use the saved data within the 
# past_results folder

# models = run_for_stocks(tickers, df_per_stock, param_grid)

In [9]:
# Get Best Configuration for each stock via the CSV files
# Only run when you've finished running a new model above to confirm all best_configurations are saved
# Otherwise use best_configs inside the past_results folder
"""
best_configurations = get_best_configuration(tickers=tickers)
best_configurations_df = pd.DataFrame(best_configurations).T.reset_index()
best_configurations_df.columns = ['ticker',
                                  'lstm_units',
                                  'dense_units1',
                                  'dense_units2',
                                  'batch_size',
                                  'optimizer', 
                                  'avg_val_mse']
best_configurations_df
best_configurations_df.to_csv("../results/best_configs.csv")
"""

'\nbest_configurations = get_best_configuration(tickers=tickers)\nbest_configurations_df = pd.DataFrame(best_configurations).T.reset_index()\nbest_configurations_df.columns = [\'ticker\',\n                                  \'lstm_units\',\n                                  \'dense_units1\',\n                                  \'dense_units2\',\n                                  \'batch_size\',\n                                  \'optimizer\', \n                                  \'avg_val_mse\']\nbest_configurations_df\nbest_configurations_df.to_csv("../results/best_configs.csv")\n'

In [10]:
# Create a Numpy Array of Returns
without_macro_features = "../past_results/without_macro_data/"
with_macro_features = "../past_results/with_macro_data/"
default_lstm_no_burning_window_12m = create_return_arrays(tickers=tickers,
                                                          folder=f"{without_macro_features}default_lstm_no_burning_window_12m")
default_lstm_with_burning_window_12m = create_return_arrays(tickers=tickers,
                                                            folder=f"{without_macro_features}default_lstm_burning_window_12m")
default_lstm_no_burning_window_3m = create_return_arrays(tickers=tickers,
                                                         folder=f"{without_macro_features}default_lstm_no_burning_window_3m")
default_lstm_with_burning_window_3m = create_return_arrays(tickers=tickers,
                                                            folder=f"{without_macro_features}default_lstm_with_burning_window_3m")
default_lstm_with_burning_window_6m = create_return_arrays(tickers=tickers,
                                                           folder=f"{without_macro_features}default_lstm_burning_window_6m")

# Covariance Matrix via Shrinkage

In [12]:
returns_data_12m_no_burning_window = create_returns_for_cov(df_per_stock=df_per_stock,
                                      return_array=default_lstm_no_burning_window_12m)

returns_data_12m_with_burning_window = create_returns_for_cov(df_per_stock=df_per_stock,
                                      return_array=default_lstm_with_burning_window_12m)

returns_data_3m_no_burning_window = create_returns_for_cov(df_per_stock=df_per_stock,
                                      return_array=default_lstm_no_burning_window_3m)

returns_data_3m_with_burning_window = create_returns_for_cov(df_per_stock=df_per_stock,
                                      return_array=default_lstm_with_burning_window_3m)

returns_data_6m_with_burning_window = create_returns_for_cov(df_per_stock=df_per_stock,
                                      return_array=default_lstm_with_burning_window_6m)

In [13]:
cov_matrices_12m_no_burning_window, shrinkages_12m_no_burning_window = calculate_covariance_matrix(returns_data_12m_no_burning_window)

cov_matrices_12m_with_burning_window, shrinkages_12m_with_burning_window = calculate_covariance_matrix(returns_data_12m_with_burning_window)

cov_matrices_3m_no_burning_window, shrinkages_3m_no_burning_window = calculate_covariance_matrix(returns_data_3m_no_burning_window)

cov_matrices_3m_with_burning_window, shrinkages_3m_with_burning_window = calculate_covariance_matrix(returns_data_3m_with_burning_window)

cov_matrices_6m_with_burning_window, shrinkages_6m_with_burning_window = calculate_covariance_matrix(returns_data_6m_with_burning_window)

In [21]:
print(f"The Shape of 12M No Burning Window Returns: {default_lstm_no_burning_window_12m.shape}")
print(f"The Length of the list of Cov Matrix for 12M No Burning Window Returns: {len(cov_matrices_12m_no_burning_window)}")

print(f"The Shape of 12M With Burning Window Returns: {default_lstm_with_burning_window_12m.shape}")
print(f"The Length of the list of Cov Matrix for 12M with Burning Window Returns: {len(cov_matrices_12m_with_burning_window)}")

print(f"The Shape of 3M No Burning Window Returns: {default_lstm_no_burning_window_3m.shape}")
print(f"The Length of the list of Cov Matrix for 3M No Burning Window Returns: {len(cov_matrices_3m_no_burning_window)}")

print(f"The Shape of 3M With Burning Window Returns: {default_lstm_with_burning_window_3m.shape}")
print(f"The Length of the list of Cov Matrix for 3M with Burning Window Returns: {len(cov_matrices_3m_with_burning_window)}")

print(f"The Shape of 6M With Burning Window Returns: {default_lstm_with_burning_window_6m.shape}")
print(f"The Length of the list of Cov Matrix for 6M with Burning Window Returns: {len(cov_matrices_6m_with_burning_window)}")


The Shape of 12M No Burning Window Returns: (29, 42)
The Length of the list of Cov Matrix for 12M No Burning Window Returns: 29
The Shape of 12M With Burning Window Returns: (19, 42)
The Length of the list of Cov Matrix for 12M with Burning Window Returns: 19
The Shape of 3M No Burning Window Returns: (30, 42)
The Length of the list of Cov Matrix for 3M No Burning Window Returns: 30
The Shape of 3M With Burning Window Returns: (28, 42)
The Length of the list of Cov Matrix for 3M with Burning Window Returns: 28
The Shape of 6M With Burning Window Returns: (25, 42)
The Length of the list of Cov Matrix for 6M with Burning Window Returns: 25


# Mean Variance Portfolio Optimization

In [22]:
# Code of Kangjin

# Backtesting Results

In [23]:
# Code of Kangjin

# Use below Data for Presentation Slides

In [14]:
# Lets first try to get the model to run for 1 stock
amzn = data[data['tic'] == 'AMZN'].iloc[:, :]
y = amzn['trt1m'].values
amzn.drop(columns=['trt1m'], inplace=True)
X = amzn.iloc[:, 2:].values
print(f"The Shape of X Features is: {X.shape}")
print(f"The Shape of y Features is: {y.shape}")

The Shape of X Features is: (154, 74)
The Shape of y Features is: (154,)


In [15]:
sequence_length = 12 # Feed past 12 month returns into sequence for LSTM
# Converting Features into 3D space for LSTM to add a time component
X_features, y_target = [], []
for i in range(X.shape[0] - sequence_length):
    X_features.append(X[i:i+sequence_length])
    y_target.append(y[i + sequence_length])
X_features = np.array(X_features)
y_target = np.array(y_target)

print(f"The Shape of X Features is: {X_features.shape}")
print(f"The Shape of y Features is: {y_target.shape}")

The Shape of X Features is: (142, 12, 74)
The Shape of y Features is: (142,)


In [23]:
train_size = int(len(amzn) * 0.8)
print(f"The size of my training set will be : {train_size} and the test set will be : {int(len(amzn)) - train_size}")
X_train, y_train = X_features[:train_size], y_target[:train_size]
y_train.reshape(-1, 1)
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
X_test, y_test = X_features[train_size:], y_target[train_size:]
y_target.reshape(-1, 1)
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

The size of my training set will be : 123 and the test set will be : 31
Shape of X_train: (123, 12, 74)
Shape of y_train: (123,)
Shape of X_test: (19, 12, 74)
Shape of y_test: (19,)
