<h1> Determine historical periods highly correlated with a recent one (target)

In [27]:
import yfinance as yf
import datetime as dt
import pandas as pd
import statistics as stats
import plotly.express as px

target_sample_start_date = '2022-02-24'
target_sample_end_date = '2022-09-26'
asset = '^GSPC'
correlation_coefficient_threshold = 0.75
target_sample_start_date = dt.datetime.strptime(target_sample_start_date, '%Y-%m-%d')
target_sample_end_date = dt.datetime.strptime(target_sample_end_date, '%Y-%m-%d')

In [28]:
def sort_dict_by_desc_values(dictionary):
    # deliver tuples of dates (dict keys) sorted in the descending order of correl coeffs
    
    return {k: v for k, v in sorted(dictionary.items(), key=lambda item: item[1], reverse=True)}

In [29]:
asset_object = yf.Ticker(asset)
df = asset_object.history(start='1900-01-01')
df.reset_index(inplace=True)
df = df[['Date', 'Close', 'Low']].iloc[:]
df.Close = round(df.Close.astype('float64'), 2)
df.Low = round(df.Low.astype('float64'), 2)
target_series = df.Close[df.Date.between(target_sample_start_date, target_sample_end_date)].values.tolist()
target_series_length = len(target_series)

In [30]:
df = df[df.Date < target_sample_start_date]
df['dma10'] = round(df['Close'].rolling(10).mean(), 2)
large_series_dates = df.Date.values.tolist()
large_series_quotes = df.Close.values.tolist()
large_series_dates = [dt.datetime.fromtimestamp(x/1000000000).replace(hour=0) for x in large_series_dates]
df.head

<bound method NDFrame.head of             Date    Close      Low    dma10
0     1927-12-30    17.66    17.66      NaN
1     1928-01-03    17.76    17.76      NaN
2     1928-01-04    17.72    17.72      NaN
3     1928-01-05    17.55    17.55      NaN
4     1928-01-06    17.66    17.66      NaN
...          ...      ...      ...      ...
23645 2022-02-16  4475.01  4429.68  4484.10
23646 2022-02-17  4380.26  4373.81  4474.38
23647 2022-02-18  4348.87  4327.22  4459.22
23648 2022-02-22  4304.76  4267.11  4441.31
23649 2022-02-23  4225.50  4221.51  4411.70

[23650 rows x 4 columns]>

In [31]:
results = {}
i = 0
while len(large_series_quotes) >= target_series_length + i:
    r = stats.correlation(target_series, large_series_quotes[i:i+target_series_length])
    if r >= correlation_coefficient_threshold:
        result_period_start, result_period_end = (large_series_dates[i], \
                                                  large_series_dates[i+target_series_length-1])
        results[result_period_start, result_period_end] = round(r, 2)
        i += target_series_length
    else:
        i += 1
results = sort_dict_by_desc_values(results)

In [32]:
print(f'There are {len(results)} periods for {asset} with a Pearson-r of minimum \
{correlation_coefficient_threshold} with the target period btw {target_sample_start_date.date()} and \
{target_sample_end_date.date()} (desc order of r):\n')

for k, v in results.items():
    print(f'{k[0].date()} -- {k[1].date()}: r={v}')   


There are 27 periods for ^GSPC with a Pearson-r of minimum 0.75 with the target period btw 2022-02-24 and 2022-09-26 (desc order of r):

1971-03-23 -- 1971-10-20: r=0.77
1929-08-09 -- 1930-03-14: r=0.76
1953-01-29 -- 1953-08-27: r=0.76
1956-06-29 -- 1957-01-31: r=0.76
1969-09-22 -- 1970-04-22: r=0.76
1973-09-07 -- 1974-04-08: r=0.76
1975-05-21 -- 1975-12-18: r=0.76
1977-06-10 -- 1978-01-11: r=0.76
1978-07-31 -- 1979-02-28: r=0.76
1990-06-01 -- 1990-12-31: r=0.76
1993-12-10 -- 1994-07-13: r=0.76
2007-11-01 -- 2008-06-04: r=0.76
2008-07-16 -- 2009-02-13: r=0.76
2011-05-23 -- 2011-12-20: r=0.76
1931-01-20 -- 1931-08-18: r=0.75
1931-09-16 -- 1932-04-20: r=0.75
1937-01-26 -- 1937-08-26: r=0.75
1941-08-07 -- 1942-03-13: r=0.75
1946-06-18 -- 1947-01-20: r=0.75
1959-06-29 -- 1960-01-28: r=0.75
1962-02-15 -- 1962-09-17: r=0.75
1966-05-27 -- 1966-12-28: r=0.75
1968-10-25 -- 1969-06-10: r=0.75
1974-05-06 -- 1974-12-03: r=0.75
1981-06-19 -- 1982-01-19: r=0.75
1987-08-04 -- 1988-03-03: r=0.75
2000-

In [33]:
period_start_date, period_end_date = '1971-03-23', '1971-10-20'
extra_charting_sessions = 160

chart_start_date = dt.datetime.strptime(period_start_date+' 00:00:00', '%Y-%m-%d %H:%M:%S')
chart_end_date = dt.datetime.strptime(period_end_date+' 00:00:00', '%Y-%m-%d %H:%M:%S') + dt.timedelta(extra_charting_sessions)
df1 = df[df.Date.between(chart_start_date, chart_end_date)]
df1.set_index('Date', inplace=True)
fig = px.line(df1.Close)
fig.show()