## Packages & Preamble

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from return_utils import factors_to_pos,pos_to_return,collecting_stock_data_avg,spearman_corr,collecting_stock_percentages
import os
import seaborn as sns
import matplotlib.pyplot as plt
import empyrical
from Stock_tda_calc import s_and_p



In [2]:
%load_ext autoreload
%autoreload 2

Collect stock close and returns

In [3]:
stock_closes_df = collecting_stock_data_avg(tickers=s_and_p)
stock_closes_df.index=pd.to_datetime(stock_closes_df.index.values,format='%Y-%m-%d')
stock_closes_df.fillna(stock_closes_df.mean(axis=1),inplace=True)
stock_perc_df = collecting_stock_percentages(s_and_p)
stock_perc_df.index=pd.to_datetime(stock_perc_df.index.values,format='%Y-%m-%d')
stock_perc_df.fillna(stock_perc_df.mean(axis=1),inplace=True)

Collect bnh returns only

In [4]:
start = pd.to_datetime("2018-01-01")
end = pd.to_datetime("2022-06-01")
bnh_returns_df = pd.read_csv('csvs//bnh_returns.csv', index_col = 0)

This will be used to collect various factors

In [5]:
def factor_collecting(tickers, col='Avg_%_Betti',inputpath='Betti_experiments/betti_dim_3_points_7', index_col = 0):
    fact_df = pd.DataFrame(dtype=np.float64)
    for t in tqdm(tickers):
        df = pd.read_csv(f'{inputpath}/{t}.csv', index_col = index_col)
        if not col in df.columns:
            print(t)
            continue
        else: 
            df2 = pd.DataFrame({t: df[col]}, index=df.index)
            fact_df = pd.concat([fact_df, df2], axis=1)
            continue
    fact_df.index.name = 'Date'
    return(fact_df)

# Optimising Embedding Dim

In [6]:
dimlow = 2
dimup = 6
pointlow = 5
pointup = 9
outpath = 'Betti_experiments'

In [7]:
for d in range(dimlow, dimup+1):
    for point in range(pointlow, pointup+1):
        if not os.path.exists(f'{outpath}/Sharpe_values/betti_dim_{d}_points_{point}'):
            os.makedirs(
                    f'{outpath}/Sharpe_values/betti_dim_{d}_points_{point}')

Using values calculate meandiff & sort factors

In [8]:
for d in range(dimlow, dimup+1):
    for point in tqdm(range(pointlow, pointup+1)):
        data_read=f'{outpath}/betti_dim_{d}_points_{point}'
        data_output=f'{outpath}/Sharpe_values/betti_dim_{d}_points_{point}/'
        if not os.path.exists(f'{data_output}relative_betti_values.csv'):   
            unweighted_betti_df=factor_collecting(tickers=s_and_p,col='Avg_rel_%_Betti',inputpath=data_read)
            unweighted_betti_df.to_csv(f'{data_output}relative_betti_values.csv')   
        else: 
            unweighted_betti_df=pd.read_csv(f'{data_output}relative_betti_values.csv',index_col=0)
        # if not os.path.exists(f"{data_output}relative_unweighted_prop_factors.csv"):
        #     prop_factors_df=unweighted_betti_df.rolling(window=d*point).apply(proportional_corr)
        #     prop_factors_df.to_csv(f"{data_output}relative_unweighted_prop_factors.csv")
        if not os.path.exists(f"{data_output}relative_sort_factors.csv"):
            sort_factors_df=unweighted_betti_df.rolling(window=d*point).apply(spearman_corr)
            sort_factors_df.to_csv(f"{data_output}relative_sort_factors.csv")
        if not os.path.exists(f"{data_output}relative_meandiff_factors.csv"):
            meandiff_factors_df=unweighted_betti_df-unweighted_betti_df.rolling(window=d*point).mean()
            meandiff_factors_df.to_csv(f"{data_output}relative_meandiff_factors.csv")

100%|██████████| 5/5 [00:01<00:00,  3.57it/s]
100%|██████████| 5/5 [00:01<00:00,  3.96it/s]
100%|██████████| 5/5 [00:00<00:00,  5.34it/s]
100%|██████████| 5/5 [00:00<00:00,  5.70it/s]
100%|██████████| 5/5 [00:00<00:00,  6.01it/s]


Loop over dimension, N_p, and hold length for meandiff/sort strategies,
calculate various strategy return values

In [9]:
perf_stats=pd.DataFrame(dtype=np.float64)
for d in range(dimlow, dimup+1):
    for point in range(pointlow, pointup+1):

        data_read=f'{outpath}/Sharpe_values/betti_dim_{d}_points_{point}'
        data_output=f'{outpath}/Sharpe_values/betti_dim_{d}_points_{point}'

        sort_factors_df=pd.read_csv(f"{data_read}/relative_sort_factors.csv",index_col=0)
        sort_factors_df.index.name='Date'
        meandiff_factors_df=pd.read_csv(f"{data_read}/relative_meandiff_factors.csv",index_col=0)
        meandiff_factors_df.index.name='Date'   
        for df in [sort_factors_df,meandiff_factors_df]:
            df.index=pd.to_datetime(df.index.values,format='%Y-%m-%d')
        
        sort_pos=factors_to_pos(stock_factor_df=sort_factors_df, collected_stocks_df=stock_perc_df, tickers=s_and_p, start='2017-01-01', end='2022-08-01')
        md_pos=factors_to_pos(stock_factor_df=meandiff_factors_df, collected_stocks_df=stock_perc_df, tickers=s_and_p, start='2017-01-01', end='2022-08-01')    
        for hold_len in tqdm(range(1,31)):
            #if not os.path.exists(f'{data_output}/hold_len_{hold_len}_alpha_beta'):
            sort_rets=pos_to_return(positions_df=sort_pos, tickers=s_and_p, start='2018-01-01', end='2022-06-01',  hold_len=hold_len)
            sort_rets = sort_rets.loc["2018-01-01":"2022-06-01"]
            sort_rets.to_csv(f"{data_output}/sort_hold_len_{hold_len}.csv")

            md_rets=pos_to_return(positions_df=md_pos, tickers=s_and_p, start='2018-01-01', end='2022-06-01',  hold_len=hold_len)
            md_rets.to_csv(f"{data_output}/meandiff_hold_len_{hold_len}.csv")

            stats_df=pd.DataFrame({
                'Dimension':[d,d],
                'N_p':[point,point],
                'Hold length':[hold_len,hold_len],
                'Sharpe ratio':[empyrical.sharpe_ratio(sort_rets),empyrical.sharpe_ratio(md_rets)],
                'Sortino ratio':[empyrical.sortino_ratio(sort_rets),empyrical.sortino_ratio(md_rets)],    
                'alpha':[empyrical.alpha(sort_rets,sort_rets[:,]),empyrical.alpha(md_rets,sort_rets[:,])],
                'beta':[empyrical.beta(sort_rets,sort_rets[:,]),empyrical.beta(md_rets,sort_rets[:,])]
            },
            index=['sort','meandiff'])
            stats_df.index.name='Strategy'
            stats_df.to_csv(f'{data_output}/hold_len_{hold_len}_alpha_beta')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  columnlist[:] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  columnlist[:] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  columnlist[:] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  columnlist[:] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

In [10]:
perf_stats=pd.DataFrame(dtype=np.float64)
for d in range(dimlow, dimup+1):
    for point in range(pointlow, pointup+1):
        for hold_len in range(1,31):
            data_read=f'{outpath}/Sharpe_values/betti_dim_{d}_points_{point}'
            stats_df=pd.read_csv(f'{data_read}/hold_len_{hold_len}_alpha_beta',index_col='Strategy')
            perf_stats=pd.concat([perf_stats,stats_df])
perf_stats.to_csv(f'{outpath}/Sharpe_values/relative_perf_stats')

In [11]:
perf_stats

Unnamed: 0_level_0,Dimension,N_p,Hold length,Sharpe ratio,Sortino ratio,alpha,beta
Strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sort,2,5,1,0.675476,0.929698,0.000000,1.000000
meandiff,2,5,1,0.659977,0.913391,-0.002666,0.997488
sort,2,5,2,0.684645,0.940386,0.000000,1.000000
meandiff,2,5,2,0.594218,0.810007,-0.019699,1.000699
sort,2,5,3,0.694239,0.957585,0.000000,1.000000
...,...,...,...,...,...,...,...
meandiff,6,9,28,0.779586,1.087129,0.021484,0.978451
sort,6,9,29,0.682178,0.946930,0.000000,1.000000
meandiff,6,9,29,0.741841,1.036214,0.014593,0.977669
sort,6,9,30,0.795490,1.112126,0.000000,1.000000


Now we'll plot to see the relationshio

In [None]:
fig1,ax1=plt.subplots(1,3,figsize=(15,5),constrained_layout=True)
fig2,ax2=plt.subplots(1,2,figsize=(10,5),constrained_layout=True)
ax=[ax1[0],ax1[1],ax1[2],ax2[0],ax2[1]]

(
    perf_stats.loc[perf_stats.index.values=='meandiff']
    .groupby(['Dimension','N_p'])
    .apply(
        lambda df: sns.regplot(
            data=df,
            x='Hold length',
            y='Sharpe ratio',
            ci=None,
            ax=ax[df.Dimension.values[0]-2],
            label=f'N_p={df.N_p.values[0]}').set(title=f'Sharpe ratios for dimension d={df.Dimension.values[0]}')
    )
)
for i in range(5):
    ax[i].legend()

In [67]:
max_df=(
    perf_stats
    .reset_index()
    .loc[lambda df:   (df['Hold length']==21)
    & (df['N_p']==7)
    & (df['Strategy'] == 'meandiff')    
        ])
max_df=max_df.reset_index().set_index('Dimension')
print(max_df[['Sharpe ratio','Sortino ratio','alpha','beta']].to_latex())


\begin{tabular}{lrrrr}
\toprule
{} &  Sharpe ratio &  Sortino ratio &     alpha &      beta \\
Dimension &               &                &           &           \\
\midrule
2         &      0.661640 &       0.923719 &  0.002659 &  0.982689 \\
3         &      0.794776 &       1.108833 &  0.021516 &  0.986231 \\
4         &      0.645463 &       0.891503 &  0.021033 &  0.979205 \\
5         &      0.593076 &       0.818965 &  0.003026 &  0.978761 \\
6         &      0.726548 &       1.009086 &  0.005093 &  0.992692 \\
\bottomrule
\end{tabular}



  print(max_df[['Sharpe ratio','Sortino ratio','alpha','beta']].to_latex())
