# Distribution Shifts

+ Consider our stock data. 
+ We are interested in testing for changes in the return distribution of our sample data around the onset of the COVID-19 pandemic.

In [1]:
%load_ext dotenv
%dotenv ../05_src/.env


In [None]:

import sys
import os

# The notebook's working directory is 01_materials/labs, while 05_src lives at
# the repository root. The utils folder is therefore two levels up; a single
# "../" resolves to a path inside 01_materials instead.
sys.path.append("../../05_src/utils")
from logger import get_logger

# Module-level logger used by the rest of the notebook.
_logs = get_logger(__name__)


In [3]:
# Report the notebook's working directory and the directory one level above it.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))
print(f"Notebook directory: {notebook_dir}")
print(f"Project root directory: {project_root}")

Notebook directory: c:\Users\J2F\Desktop\UoTDS\production\01_materials\labs
Project root directory: c:\Users\J2F\Desktop\UoTDS\production\01_materials


In [2]:
import sys
import os

# The notebook runs from .../01_materials/labs, so the repository root
# (production) is two directory levels above the working directory.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..', '..'))
print(f"Project root directory (fixed): {project_root}")

# Make the packages under 05_src importable. The membership check keeps the
# cell idempotent: re-running it does not add duplicate entries to sys.path.
src_dir = os.path.join(project_root, '05_src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from utils.logger import get_logger

# Module-level logger used by the rest of the notebook.
_logs = get_logger(__name__)

Project root directory (fixed): c:\Users\J2F\Desktop\UoTDS\production


In [5]:
import dask
import os
# Configure query planning before dask.dataframe is imported (order kept from the original cell).
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd
import pandas as pd
import numpy as np
from glob import glob

# FEATURES_DATA, when set, overrides the default location of the feature parquet files.
ft_dir = os.getenv("FEATURES_DATA", '../../05_src/data/features/stock_features')
print(f"Dask is searching in this directory: {ft_dir}")

ft_glob = glob(ft_dir+'/*.parquet')
print(f"Files found by glob: {ft_glob}")

# Fail early with an explicit message instead of letting read_parquet choke on
# an empty file list when the directory is wrong or has no parquet files.
if not ft_glob:
    raise FileNotFoundError(
        f"No parquet files found in {os.path.abspath(ft_dir)}; "
        "check FEATURES_DATA or the default features path."
    )

# Materialise all partitions as a single pandas DataFrame with a fresh index.
df = dd.read_parquet(ft_glob).compute().reset_index()



Dask is searching in this directory: ../../05_src/data/features/stock_features
Files found by glob: ['../../05_src/data/features/stock_features\\part.0.parquet', '../../05_src/data/features/stock_features\\part.1.parquet', '../../05_src/data/features/stock_features\\part.10.parquet', '../../05_src/data/features/stock_features\\part.11.parquet', '../../05_src/data/features/stock_features\\part.12.parquet', '../../05_src/data/features/stock_features\\part.13.parquet', '../../05_src/data/features/stock_features\\part.14.parquet', '../../05_src/data/features/stock_features\\part.15.parquet', '../../05_src/data/features/stock_features\\part.16.parquet', '../../05_src/data/features/stock_features\\part.17.parquet', '../../05_src/data/features/stock_features\\part.18.parquet', '../../05_src/data/features/stock_features\\part.19.parquet', '../../05_src/data/features/stock_features\\part.2.parquet', '../../05_src/data/features/stock_features\\part.20.parquet', '../../05_src/data/features/stock_

In [6]:
# Preview the loaded feature table (the rendering is abbreviated to the first and last rows).
df

Unnamed: 0,ticker,Date,Open,High,Low,Close,Adj Close,Volume,source,Year,Close_lag_1
0,ACN,2001-07-19,15.10,15.29,15.00,15.17,11.404394,34994300.0,ACN.csv,2001,
1,ACN,2001-07-20,15.05,15.05,14.80,15.01,11.284108,9238500.0,ACN.csv,2001,15.17
2,ACN,2001-07-23,15.00,15.01,14.55,15.00,11.276587,7501000.0,ACN.csv,2001,15.01
3,ACN,2001-07-24,14.95,14.97,14.70,14.86,11.171341,3537300.0,ACN.csv,2001,15.00
4,ACN,2001-07-25,14.70,14.95,14.65,14.95,11.238999,4208100.0,ACN.csv,2001,14.86
...,...,...,...,...,...,...,...,...,...,...,...
353260,ZIXI,2020-03-26,4.06,4.53,3.88,4.51,4.510000,1668500.0,ZIXI.csv,2020,4.00
353261,ZIXI,2020-03-27,4.49,4.71,4.10,4.60,4.600000,1146800.0,ZIXI.csv,2020,4.51
353262,ZIXI,2020-03-30,4.83,4.87,4.44,4.64,4.640000,1212000.0,ZIXI.csv,2020,4.60
353263,ZIXI,2020-03-31,4.60,4.69,4.10,4.31,4.310000,1057200.0,ZIXI.csv,2020,4.64


In [3]:
# Resolve the features directory again (default: ./data/features/stock_features)
# and list its parquet files; the DataFrame itself is not reloaded in this cell.
ft_dir = os.environ.get("FEATURES_DATA", './data/features/stock_features')
ft_glob = glob(f"{ft_dir}/*.parquet")
# df = dd.read_parquet(ft_glob).compute().reset_index()

## Data Preparation

+ First, prepare five datasets, each with returns between March of a given year and March of the following year.
+ For each data set, we can compute some descriptive statistics.
+ We observe that there may be some distribution changes.

In [7]:
def _march_to_march(frame, start_year):
    """Return the rows of `frame` dated from 1 March of `start_year` up to,
    but excluding, 1 March of the following year."""
    lower = f'{start_year}-03-01'
    upper = f'{start_year + 1}-03-01'
    return frame[(frame['Date'] >= lower) & (frame['Date'] < upper)]


# The cells below read a 'returns' column. When the loaded features do not
# provide one, derive it as the simple return Close / Close_lag_1 - 1.
# NOTE(review): assumed definition of returns -- confirm against the feature pipeline.
if 'returns' in df.columns:
    df_with_returns = df
else:
    df_with_returns = df.assign(returns=df['Close'] / df['Close_lag_1'] - 1)

# One slice per March-to-March window.
df_2018 = _march_to_march(df_with_returns, 2018)
df_2019 = _march_to_march(df_with_returns, 2019)
df_2020 = _march_to_march(df_with_returns, 2020)
df_2021 = _march_to_march(df_with_returns, 2021)
df_2022 = _march_to_march(df_with_returns, 2022)

In [None]:
# Descriptive statistics of returns for the March 2018 - February 2019 window.
df_2018.loc[:, 'returns'].describe()

In [6]:
# Descriptive statistics of returns for the March 2019 - February 2020 window.
df_2019.loc[:, 'returns'].describe()

count    2267.000000
mean        0.007072
std         0.216974
min        -0.303547
25%        -0.007273
50%         0.001091
75%         0.008462
max         9.660822
Name: returns, dtype: float64

In [7]:
# Descriptive statistics of returns for the March 2020 - February 2021 window.
df_2020.loc[:, 'returns'].describe()

count    2259.000000
mean        0.009681
std         0.177753
min        -0.345949
25%        -0.010707
50%         0.002011
75%         0.014895
max         5.675929
Name: returns, dtype: float64

In [8]:
# Descriptive statistics of returns for the March 2021 - February 2022 window.
df_2021.loc[:, 'returns'].describe()

count    2277.000000
mean        0.034039
std         1.116057
min        -0.101915
25%        -0.007290
50%         0.001064
75%         0.009198
max        51.348436
Name: returns, dtype: float64

In [9]:
# Descriptive statistics of returns for the March 2022 - February 2023 window.
df_2022.loc[:, 'returns'].describe()

count    2259.000000
mean        0.016357
std         0.510301
min        -0.167932
25%        -0.012159
50%        -0.000541
75%         0.011908
max        22.977526
Name: returns, dtype: float64

# Kolmogorov-Smirnov Test

+ The KS test can be accessed via the scipy library: `scipy.stats.kstest`
+ This function can be used to perform two-sample tests by passing the second sample in place of the reference distribution.
+ The null hypothesis is that the two distributions are identical.

In [10]:
from scipy.stats import kstest

# Two-sample KS test: March 2018 window returns vs. March 2019 window returns,
# with missing values removed from each sample first.
returns_before = df_2018['returns'].dropna()
returns_after = df_2019['returns'].dropna()
kstest(returns_before, returns_after)

KstestResult(statistic=0.034314065596832595, pvalue=0.13480604903839485, statistic_location=0.013485812569593802, statistic_sign=-1)

In [11]:
# Two-sample KS test: March 2019 window returns vs. March 2020 window returns,
# with missing values removed from each sample first.
returns_before = df_2019['returns'].dropna()
returns_after = df_2020['returns'].dropna()
kstest(returns_before, returns_after)

KstestResult(statistic=0.13064753191322345, pvalue=2.670261822755509e-17, statistic_location=0.013644776357206068, statistic_sign=1)

In [12]:
# Two-sample KS test: March 2020 window returns vs. March 2021 window returns,
# with missing values removed from each sample first.
returns_before = df_2020['returns'].dropna()
returns_after = df_2021['returns'].dropna()
kstest(returns_before, returns_after)

KstestResult(statistic=0.1100472943535476, pvalue=2.0196636826634023e-12, statistic_location=0.01143078433526279, statistic_sign=-1)

In [13]:
# Two-sample KS test: March 2021 window returns vs. March 2022 window returns,
# with missing values removed from each sample first.
returns_before = df_2021['returns'].dropna()
returns_after = df_2022['returns'].dropna()
kstest(returns_before, returns_after)

KstestResult(statistic=0.0940567987164211, pvalue=3.4449944295519247e-09, statistic_location=-0.007372480262691217, statistic_sign=-1)