In [1]:
# Import dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import hvplot.pandas
import holoviews as hv
hv.extension('bokeh')

In [3]:
# Set pandas display options
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [5]:
# Fetch S&P 500 tickers
#Define dates
start_date = '2020-01-01'
end_date = '2024-10-14'

#Get tickers from wikipedia
sp500_tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]['Symbol'].tolist()

# Filter out Class B shares that have a '.B' in the ticker name
sp500_tickers = [ticker for ticker in sp500_tickers if '.B' not in ticker]

print(f"Initial total S&P 500 tickers: {len(sp500_tickers)}")

Initial total S&P 500 tickers: 501


In [7]:
# Download historical prices
historical_prices = yf.download(sp500_tickers, start=start_date, end=end_date)['Adj Close']

# Fill NaN values with 0
historical_prices.fillna(0, inplace=True)

# Check if data was downloaded for all tickers
print(f"Successfully downloaded historical_prices: {len(historical_prices.columns)} out of {len(sp500_tickers)}")

# Display tickers that were successfully downloaded
downloaded_tickers = historical_prices.columns.get_level_values(0).unique()
print(f"Successfully downloaded tickers: {len(downloaded_tickers)} out of {len(sp500_tickers)}")
# Display data Frame
historical_prices.head()

[*********************100%***********************]  501 of 501 completed


Successfully downloaded historical_prices: 501 out of 501
Successfully downloaded tickers: 501 out of 501


Ticker,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 00:00:00+00:00,83.215,72.876,72.265,0.0,79.69,43.4,195.264,334.43,110.378,40.694,...,191.208,24.533,140.138,54.0,56.088,75.451,93.541,139.626,259.14,129.515
2020-01-03 00:00:00+00:00,81.879,72.168,71.579,0.0,78.718,43.35,194.938,331.81,108.435,40.614,...,191.255,24.575,138.059,54.26,55.637,75.895,93.248,139.261,256.05,129.534
2020-01-06 00:00:00+00:00,82.121,72.743,72.144,0.0,79.131,43.52,193.665,333.71,107.161,40.297,...,191.489,24.525,137.786,54.182,56.065,75.404,93.193,138.456,258.01,128.54
2020-01-07 00:00:00+00:00,82.373,72.401,71.732,0.0,78.691,43.16,189.484,333.39,109.599,39.811,...,191.096,24.366,138.44,54.07,55.606,75.12,93.358,138.335,256.47,128.974
2020-01-08 00:00:00+00:00,83.186,73.565,72.241,0.0,79.012,42.73,189.856,337.87,110.589,39.37,...,191.311,24.458,139.309,54.018,54.767,75.376,93.523,139.991,247.64,128.694


In [17]:
# Function to calculate forward returns for a given horizon
def calculate_forward_returns(historical_prices, horizon):
    f_returns = historical_prices.pct_change(horizon)
    f_returns = f_returns.shift(-horizon)
    return f_returns

In [19]:
# Initialize DataFrame to hold all forward returns
total_returns = pd.DataFrame()

In [23]:
# Iterate over forecast horizons and calculate forward returns for each
for horizon in range(1, 6):
    f_returns = calculate_forward_returns(historical_prices, horizon)
    f_returns = pd.DataFrame(f_returns.unstack())
    name = f"F_{horizon}_d_returns"
    f_returns.rename(columns={0: name}, inplace=True)
    if total_returns.empty:
        total_returns = f_returns
    else:
        total_returns = total_returns.merge(f_returns, left_index=True, right_index=True, how='outer')
        
# Drop rows with any NaN values
total_returns.dropna(axis=0, how='any', inplace=True)
total_returns.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns_x,F_2_d_returns_x,F_3_d_returns_x,F_4_d_returns_x,F_5_d_returns_x,F_1_d_returns_y,F_2_d_returns_y,F_3_d_returns_y,F_4_d_returns_y,F_5_d_returns_y
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A,2020-01-02 00:00:00+00:00,-0.016,-0.013,-0.01,-0.0,0.015,-0.016,-0.013,-0.01,-0.0,0.015
A,2020-01-03 00:00:00+00:00,0.003,0.006,0.016,0.032,0.036,0.003,0.006,0.016,0.032,0.036
A,2020-01-06 00:00:00+00:00,0.003,0.013,0.029,0.033,0.031,0.003,0.013,0.029,0.033,0.031
A,2020-01-07 00:00:00+00:00,0.01,0.026,0.03,0.028,0.034,0.01,0.026,0.03,0.028,0.034
A,2020-01-08 00:00:00+00:00,0.016,0.019,0.018,0.024,0.031,0.016,0.019,0.018,0.024,0.031


In [9]:
# Initialize the forecast horizon (how many days out from today it will forcast)
forecast_horizon = 5
# Compute forward returns by taking percentage change of close prices
f_returns = historical_prices.pct_change(forecast_horizon)

In [8]:
list_of_momentums = [1, 2, 3, 4 ,5]

In [13]:
# We then shift the forward returns
f_returns = f_returns.shift(-forecast_horizon)
f_returns.iloc[:,0:10].head()

Ticker,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-02 00:00:00+00:00,0.015,0.031,0.007,,-0.006,0.005,-0.019,0.018,0.002,-0.043
2020-01-03 00:00:00+00:00,0.036,0.043,0.004,,-0.006,0.004,-0.01,0.024,0.002,-0.045
2020-01-06 00:00:00+00:00,0.031,0.057,-0.01,,-0.014,0.004,0.006,0.036,0.018,-0.027
2020-01-07 00:00:00+00:00,0.034,0.048,0.006,,0.003,0.012,0.019,0.034,-0.0,-0.014
2020-01-08 00:00:00+00:00,0.031,0.027,0.011,,0.018,0.031,0.018,0.015,-0.026,-0.005


In [15]:
# Pivot the dataframe
f_returns = pd.DataFrame(f_returns.unstack())
# Name the column based on the forecast horizon
name = "F_" + str(forecast_horizon) + "_d_returns"
f_returns.rename(columns={0: name}, inplace=True)
f_returns.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,F_5_d_returns
Ticker,Date,Unnamed: 2_level_1
A,2020-01-02 00:00:00+00:00,0.015
A,2020-01-03 00:00:00+00:00,0.036
A,2020-01-06 00:00:00+00:00,0.031
A,2020-01-07 00:00:00+00:00,0.034
A,2020-01-08 00:00:00+00:00,0.031


In [9]:
# Initialize total_returns with forward returns
total_returns = f_returns

In [10]:
# Iterate over the list of momentum values
for i in list_of_momentums:  
    # Compute returns for each momentum value
    feature = historical_prices.pct_change(i)
    feature = pd.DataFrame(feature.unstack())
    # Name the column based on the momentum value
    name = str(i) + "_d_returns"        
    feature.rename(columns={0: name}, inplace=True)
    # Rename columns and reset index
    feature.rename(columns={0: name, 'level_0': 'Ticker'}, inplace=True)
    # Merge computed feature returns with total_returns based on Ticker and Date
    total_returns = pd.merge(total_returns, feature, left_index=True, right_index=True,how='outer')

In [11]:
# Drop rows with any NaN values
total_returns.dropna(axis=0, how='any', inplace=True) 
total_returns.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns,1_d_returns,2_d_returns,3_d_returns,4_d_returns,5_d_returns
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,2020-01-09 00:00:00+00:00,0.004,0.016,0.026,0.029,0.032,0.015
A,2020-01-10 00:00:00+00:00,-0.001,0.004,0.019,0.03,0.033,0.036
A,2020-01-13 00:00:00+00:00,0.006,-0.001,0.002,0.018,0.028,0.031
A,2020-01-14 00:00:00+00:00,0.007,0.006,0.005,0.008,0.024,0.034
A,2020-01-15 00:00:00+00:00,0.009,0.007,0.013,0.012,0.015,0.031


In [12]:
total_returns.corr()

Unnamed: 0,F_1_d_returns,1_d_returns,2_d_returns,3_d_returns,4_d_returns,5_d_returns
F_1_d_returns,1.0,-0.044,0.003,-0.011,-0.029,-0.02
1_d_returns,-0.044,1.0,0.688,0.581,0.49,0.423
2_d_returns,0.003,0.688,1.0,0.81,0.71,0.618
3_d_returns,-0.011,0.581,0.81,1.0,0.861,0.771
4_d_returns,-0.029,0.49,0.71,0.861,1.0,0.887
5_d_returns,-0.02,0.423,0.618,0.771,0.887,1.0


In [13]:
# Function to create 'go_long' based on bottom n% of '1_d_returns'
def create_go_long_signal(df, column,n=0.50):
    # Calculate the nth percentile threshold for '1_d_returns'
    threshold = df[column].quantile(n)
    
    # Create a 'go_long' column: 1 if '1_d_returns' is in the bottom 20%, else 0
    df['go_long'] = (df[column] <= threshold).astype(int)
    
    return df

# Apply the function to create the 'go_long' column
total_returns_with_signal = create_go_long_signal(total_returns, '1_d_returns')
total_returns_with_signal


Unnamed: 0_level_0,Unnamed: 1_level_0,F_1_d_returns,1_d_returns,2_d_returns,3_d_returns,4_d_returns,5_d_returns,go_long
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2020-01-09 00:00:00+00:00,0.004,0.016,0.026,0.029,0.032,0.015,0
A,2020-01-10 00:00:00+00:00,-0.001,0.004,0.019,0.030,0.033,0.036,0
A,2020-01-13 00:00:00+00:00,0.006,-0.001,0.002,0.018,0.028,0.031,1
A,2020-01-14 00:00:00+00:00,0.007,0.006,0.005,0.008,0.024,0.034,0
A,2020-01-15 00:00:00+00:00,0.009,0.007,0.013,0.012,0.015,0.031,0
...,...,...,...,...,...,...,...,...
ZTS,2024-10-04 00:00:00+00:00,-0.014,-0.005,-0.013,-0.023,-0.027,-0.025,1
ZTS,2024-10-07 00:00:00+00:00,0.000,-0.014,-0.019,-0.027,-0.037,-0.041,1
ZTS,2024-10-08 00:00:00+00:00,0.015,0.000,-0.014,-0.019,-0.026,-0.036,1
ZTS,2024-10-09 00:00:00+00:00,0.001,0.015,0.015,0.001,-0.004,-0.012,0


In [14]:
# Compare performance of go_long with rest of the universe, notice we double the mean
total_returns_with_signal[['F_1_d_returns']].describe()

Unnamed: 0,F_1_d_returns
count,591722.0
mean,0.001
std,0.024
min,-0.539
25%,-0.01
50%,0.001
75%,0.011
max,0.442


In [15]:
# Compare performance of go_long with rest of the universe
total_returns_with_signal[total_returns_with_signal['go_long']==1]['F_1_d_returns'].describe()

count   295861.000
mean         0.001
std          0.025
min         -0.539
25%         -0.010
50%          0.001
75%          0.012
max          0.440
Name: F_1_d_returns, dtype: float64

In [16]:
# Compute 'realized_returns' by multiplying 'go_long' with 'F_1_d_returns'
total_returns_with_signal['realized_returns'] = total_returns_with_signal['go_long'] * total_returns_with_signal['F_1_d_returns']


In [17]:
# Calculate daily average realized returns (only where 'go_long' is 1)
average_realized_per_day = total_returns_with_signal.groupby('Date')['realized_returns'].mean()

# Calculate daily average for the entire universe (using 'F_1_d_returns')
average_all_per_day = total_returns_with_signal.groupby('Date')['F_1_d_returns'].mean()

# Compute cumulative returns for both realized and all universe
cumulative_realized = average_realized_per_day.cumsum()
cumulative_all = average_all_per_day.cumsum()

In [18]:
# Create a DataFrame for plotting
cumulative_returns = pd.DataFrame({
    'Cumulative Realized Returns (Bottom 20%)': cumulative_realized,
    'Cumulative Average Returns (All)': cumulative_all
})

In [19]:
# Plot using hvplot
cumulative_returns.hvplot(
    title='Cumulative Returns: Realized (Bottom n%) vs All Universe',
    xlabel='Date', ylabel='Cumulative Return', 
    width=800, height=400, grid=True, legend=True
)