##### Dependencies

In [85]:
!pip install arch
!pip install statsmodels
!pip install stylefact
!pip install yfinance
!pip install seaborn

Collecting arch
  Downloading arch-7.0.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Downloading arch-7.0.0-cp311-cp311-win_amd64.whl (924 kB)
   ---------------------------------------- 0.0/924.9 kB ? eta -:--:--
   - ------------------------------------- 41.0/924.9 kB 991.0 kB/s eta 0:00:01
   --- ------------------------------------ 92.2/924.9 kB 1.7 MB/s eta 0:00:01
   -------- ------------------------------- 194.6/924.9 kB 1.5 MB/s eta 0:00:01
   -------- ------------------------------- 194.6/924.9 kB 1.5 MB/s eta 0:00:01
   ------------ --------------------------- 286.7/924.9 kB 1.3 MB/s eta 0:00:01
   ------------------ --------------------- 419.8/924.9 kB 1.6 MB/s eta 0:00:01
   ---------------------- ----------------- 522.2/924.9 kB 1.7 MB/s eta 0:00:01
   ------------------------- -------------- 583.7/924.9 kB 1.7 MB/s eta 0:00:01
   ------------------------------- -------- 727.0/924.9 kB 1.8 MB/s eta 0:00:01
   ------------------------------------- -- 860.2/924.9 kB 1.9 MB/s 


[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import datetime as dt
import pandas as pd
import yfinance as yf
import numpy as np
import stylefact.finance as sff
import stylefact.visualize as sfv
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import kpss
import seaborn as sns
from scipy.stats import kurtosis, norm, jarque_bera
from arch import arch_model

##### Preprocessing

Downloaded data from cTrader and Yahoo Finance

In [2]:
def clean_data_structure(df):
    """Normalise one raw quote export and add a mid-price column.

    The input is expected to carry the string columns ``Date``, ``Time``,
    ``Ask`` and ``Bid`` plus a throw-away ``Empty`` column, i.e. the layout
    produced by the ``pd.read_csv(..., sep=" ", names=[...])`` calls in this
    notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from disk. It is NOT modified; all work happens on
        a new frame, so the function can safely be re-run on the same input.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``Date`` (combined date + time parsed with
        ``pd.to_datetime``), ``Ask`` and ``Bid`` (floats) and ``Midprice``
        (arithmetic mean of ask and bid).
    """
    # drop() without inplace returns a new frame, leaving the caller's intact.
    cleaned = df.drop(columns="Empty")

    # Time: strip every comma (the raw field ends with one).
    time_text = cleaned["Time"].str.replace(",", "", regex=False)

    # Ask: the first comma becomes the decimal point, one further comma is
    # removed. `n` and `regex` are passed by keyword so the call does not
    # depend on positional order or on the pandas-version-specific default.
    ask_text = (
        cleaned["Ask"]
        .str.replace(",", ".", n=1, regex=False)
        .str.replace(",", "", n=1, regex=False)
    )

    # Bid: every comma becomes a decimal point.
    bid_text = cleaned["Bid"].str.replace(",", ".", regex=False)

    cleaned["Ask"] = ask_text.astype("float")
    cleaned["Bid"] = bid_text.astype("float")

    # Build the timestamp directly instead of via a scratch column.
    # NOTE(review): the format is inferred by pandas; pass an explicit
    # `format=` once the export's date layout has been confirmed.
    cleaned["Date"] = pd.to_datetime(cleaned["Date"] + " " + time_text)
    cleaned = cleaned.drop(columns="Time")

    # Mid-price = average of best ask and best bid.
    cleaned["Midprice"] = (cleaned["Ask"] + cleaned["Bid"]) / 2

    return cleaned

In [3]:
# Column layout shared by all four space-separated raw files. "Empty" is a
# placeholder column that clean_data_structure drops again.
RAW_COLUMNS = ["Date", "Time", "Ask", "Bid", "Empty"]


def load_dax_quotes(path):
    """Read one raw DAX quote file and return it cleaned by clean_data_structure."""
    raw_quotes = pd.read_csv(path, sep=" ", names=RAW_COLUMNS)
    return clean_data_structure(raw_quotes)


# One frame per sampling frequency.
df_tick = load_dax_quotes("./dax/tick_dax.txt")
df_minute = load_dax_quotes("./dax/minutely_dax.txt")
df_hour = load_dax_quotes("./dax/hourly_dax.txt")
df_day = load_dax_quotes("./dax/daily_dax.txt")

  df["Date"] = pd.to_datetime(df[4])
  df["Date"] = pd.to_datetime(df[4])
  df["Date"] = pd.to_datetime(df[4])


In [91]:
def create_analysis(df, prefix="null"):
    """Run the stylefact computations on the log returns of ``df`` and plot them.

    Log returns are the first differences of ``log(df['Midprice'])``. For each
    statistic the matching ``stylefact.finance`` function yields an ``(x, y)``
    pair which is handed to the ``stylefact.visualize`` function of the same
    name together with a name string built as ``"<prefix>_<statistic>"``.
    NOTE(review): the name string is presumably used as the output file name;
    confirm against the stylefact documentation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Midprice`` column.
    prefix : object, default "null"
        Converted with ``str`` and prepended (plus ``"_"``) to every name.
    """
    tag = str(prefix) + "_"
    log_returns = np.diff(np.log(df['Midprice'].to_numpy()))

    # Return distribution on a linear scale.
    grid, density = sff.linear_distribution(log_returns)
    sfv.linear_distribution(grid, density, tag + 'linear_distribution')

    # Positive tail on a log scale.
    grid, density = sff.log_distribution(log_returns, 'positive')
    sfv.log_distribution(grid, density, tag + 'log_positive_distribution')

    # Negative tail on a log scale; x is negated before plotting.
    grid, density = sff.log_distribution(log_returns, 'negative')
    sfv.log_distribution(-grid, density, tag + 'log_negative_distribution')

    # Autocorrelation of raw returns (linear axis) ...
    lags, corr = sff.autocorrelation(log_returns)
    sfv.autocorrelation(lags, corr, tag + 'autocorrelation', scale='linear')

    # ... and of absolute returns (log axis).
    lags, corr = sff.autocorrelation(np.abs(log_returns))
    sfv.autocorrelation(lags, corr, tag + 'abs_autocorrelation', scale='log')

    # Leverage effect.
    lags, effect = sff.leverage_effect(log_returns)
    sfv.leverage_effect(lags, effect, tag + 'leverage_effect')

    # Coarse/fine volatility relation.
    lags, relation = sff.coarsefine_volatility(log_returns)
    sfv.coarsefine_volatility(lags, relation, tag + 'coarsefine')

In [None]:
def gain_loss_asymmetry(df, prefix):
    """Compute and plot stylefact's gain/loss asymmetry for ``df``.

    Log returns are the first differences of ``log(df['Midprice'])``. They are
    passed to ``sff.gainloss_asymmetry``; the two returned distributions go to
    ``sfv.gainloss_asymmetry`` together with the name
    ``"<prefix>_gainloss_asymmetry"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Midprice`` column.
    prefix : object
        Converted with ``str`` and prepended (plus ``"_"``) to the name.
    """
    prefix = str(prefix) + "_"
    returns = np.diff(np.log(df['Midprice'].to_numpy()))
    positive_dist, negative_dist = sff.gainloss_asymmetry(returns)
    # Fix: the prefix was built but never used, so every sampling frequency
    # passed the identical name. Prefix it as create_analysis does.
    sfv.gainloss_asymmetry(positive_dist, negative_dist, prefix + 'gainloss_asymmetry')

In [None]:
create_analysis(df_tick, prefix="tick")

In [None]:
create_analysis(df_minute, prefix="minute")

In [None]:
create_analysis(df_hour, prefix="hour")

In [None]:
create_analysis(df_day, prefix="day")

#### Visualizations

In [14]:
# Save one mid-price-over-time figure per sampling frequency. The current
# figure is cleared after each save so the curves do not pile up.
price_frames = (
    ("hour", df_hour),
    ("day", df_day),
    ("minute", df_minute),
    ("tick", df_tick),
)
for figure_name, frame in price_frames:
    plt.plot(frame["Date"], frame["Midprice"])
    plt.savefig(figure_name)
    plt.clf()

# Autocorrelation of the tick mid-price level itself (not of its returns).
plt.acorr(df_tick["Midprice"])
plt.savefig("tick_acorr")
plt.clf()

![Unbenannt.png](attachment:77cd8576-2ccf-4d8a-b0ab-20301151d88b.png)

#### Autocorrelation

1. Check first for stationarity by applying the Augmented Dickey-Fuller (ADF) test.

2. Compute the autocorrelation on the returns series of the data



In [69]:
def calculate_autocorrelation(df, prefix=None):
    """Check the return series for stationarity and report its autocorrelation.

    Prints the ADF statistic and p-value, prints the sample autocorrelation
    at lags 1, 5, 10 and 20, and saves an ACF plot (lags 0-20).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Midprice`` column. A ``Returns`` column holding the
        simple returns (``pct_change``) is written to it, as before.
    prefix : str, optional
        Label of the sampling frequency. When given it is used in the plot
        title and the figure is saved as ``"<prefix>_acf"``, so calls for
        different frequencies no longer overwrite each other. When omitted,
        the original title and the file name ``"acf"`` are kept.
    """
    # Kept for backward compatibility: the frame gains a "Returns" column.
    df["Returns"] = df['Midprice'].pct_change()

    # pct_change() leaves NaN in the first row. Assigning `.dropna()` back to
    # the frame re-aligns on the index and restores that NaN, so work on a
    # separate NaN-free series instead.
    returns = df["Returns"].dropna()

    # Check for stationarity first.
    result = adfuller(returns)
    print('ADF Statistic:', result[0])
    print('p-value:', result[1])

    for lag in (1, 5, 10, 20):
        print("autocorrelation for lag=" + str(lag) + ": " + str(returns.autocorr(lag=lag)))

    if prefix is None:
        title = 'Autocorrelation Function of DAX Tick-by-Tick Returns'
        filename = "acf"
    else:
        title = f'Autocorrelation Function of DAX {prefix} Returns'
        filename = f"{prefix}_acf"

    # Draw on an explicit axes: the former plt.cla() created an unrelated,
    # empty figure because plot_acf opens its own. plot_acf is given the
    # NaN-free series since its default missing='none' does not skip NaN.
    fig, ax = plt.subplots()
    plot_acf(returns, lags=20, ax=ax)
    ax.set_title(title)
    fig.savefig(filename)

In [70]:
calculate_autocorrelation(df_tick)

ADF Statistic: -90.14689560380943
p-value: 0.0
autocorrelation for lag=1: -0.11206504800187123
autocorrelation for lag=5: -0.010173013441416162
autocorrelation for lag=10: -0.0018500302090013957
autocorrelation for lag=20: 0.005926171470852754


An additional KPSS test is run to cross-check the extreme test statistic reported by the ADF test.

In [11]:
# KPSS uses stationarity as its null hypothesis (the reverse of ADF), so a
# large p-value here supports the ADF result. regression='c' tests for
# stationarity around a constant.
df = df_tick
df["Returns"] = df['Midprice'].pct_change()
# Drop the undefined first return by value. The former `df.loc[1:]` is a
# label slice and only removes that row when the index is the default 0..n-1.
df = df.dropna(subset=["Returns"])
kpss_test = kpss(df["Returns"], regression='c')
print('KPSS Test Statistic:', kpss_test[0])
# The p-value comes from a look-up table; statsmodels warns when the true
# value lies outside the table's range.
print('p-value:', kpss_test[1])

KPSS Test Statistic: 0.22862198862839078
p-value: 0.1


look-up table. The actual p-value is greater than the p-value returned.

  kpss_test = kpss(df["Returns"], regression='c')


KPSS test for the tick data, run because of the extremely large ADF statistic:
KPSS Test Statistic: 0.22862198862839078
p-value: 0.1

In [None]:
calculate_autocorrelation(df_minute)

calculate_autocorrelation(df_minute):

ADF Statistic: -74.60797822505579
p-value: 0.0
autocorrelation for lag=1: 0.001955406488161188
autocorrelation for lag=5: -0.005807167639498408
autocorrelation for lag=10: 0.0031824493885383985
autocorrelation for lag=20: 0.008993355861628527

In [None]:
calculate_autocorrelation(df_hour)

calculate_autocorrelation(df_hour)

ADF Statistic: -115.93954444131936
p-value: 0.0
autocorrelation for lag=1: -0.003918922743758015
autocorrelation for lag=5: -0.00342393648256756
autocorrelation for lag=10: 0.0008667056346734177
autocorrelation for lag=20: 0.0018134632474832249

In [None]:
calculate_autocorrelation(df_day)

ADF Statistic: -75.9852901454594
p-value: 0.0
autocorrelation for lag=1: -0.008409652985268596
autocorrelation for lag=5: -0.0027683475824439533
autocorrelation for lag=10: 0.007041048707762627
autocorrelation for lag=20: 0.0022470519690978305

#### Heavy Tails

In [33]:
def calculate_heavy_tails(df, prefix):
    """Compare the empirical return distribution with a fitted normal curve.

    Saves a density histogram (100 bins, with KDE) of the simple returns
    overlaid with the normal pdf that has the same mean and standard
    deviation, then prints the sample kurtosis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Midprice`` column. A ``Returns`` column holding the
        simple returns (``pct_change``) is written to it, as before.
    prefix : object
        Label used in the plot title, the printed line and the file name
        (``str(prefix) + "heavytails"``).
    """
    # Kept for backward compatibility: the frame gains a "Returns" column.
    df["Returns"] = df['Midprice'].pct_change()

    # Drop the undefined first return by value. Assigning `.dropna()` to the
    # frame re-aligned on the index and restored the NaN, and `df.loc[1:]`
    # removed it only under a default 0..n-1 index.
    returns = df["Returns"].dropna()

    # One explicit figure; the former plt.cla() + plt.figure() pair left an
    # additional empty figure behind.
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(returns, bins=100, kde=True, stat='density', color='blue', label='Returns', ax=ax)

    # Normal pdf with matching mean/std over the visible x-range.
    xmin, xmax = ax.get_xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, returns.mean(), returns.std())
    ax.plot(x, p, 'k', linewidth=2, label='Normal distribution')

    ax.set_title(f'Distribution of {prefix} Returns')
    ax.legend()
    fig.savefig(str(prefix) + "heavytails")

    # scipy's default is Fisher (excess) kurtosis: a normal distribution
    # scores 0, positive values indicate heavier tails.
    returns_kurtosis = kurtosis(returns)
    print('Kurtosis for ' + str(prefix) + " : " f'{returns_kurtosis}')

In [None]:
calculate_heavy_tails(df_tick, "tick")

Kurtosis for tick : 5.226867177871771

![Unbenannt.png](attachment:c2675b0d-c3b6-4f4f-ada9-d87c3ced9048.png)

In [None]:
calculate_heavy_tails(df_minute, "minute")

Kurtosis for minute : 101.91556483016107

![Unbenannt.png](attachment:a437269d-9257-430d-a602-096dcf32d2e1.png)

In [None]:
calculate_heavy_tails(df_hour, "hour")

Kurtosis for hour : 80.5711534283122

![Unbenannt-1.png](attachment:efea214f-a12c-47e4-9011-b9b66e1ba689.png)

In [None]:
calculate_heavy_tails(df_day, "day")

Kurtosis for day : 326.63121316256814

![Unbenannt.png](attachment:400e6d9b-a74d-421c-ae82-43f3b80c233a.png)

#### Volatility Clustering

In [28]:
df = df_tick
df["Returns"] = df['Midprice'].pct_change()
# Drop the undefined first return by value. `.loc[1:]` depended on a default
# 0..n-1 index, and assigning `.dropna()` to the frame was a no-op.
df = df.dropna(subset=["Returns"])

# Plot the returns on one explicit figure. The former plt.cla() followed by
# plt.figure() produced an additional, empty figure.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df["Returns"], label='Returns')
ax.set_title('Tick-by-tick Returns')
ax.legend()
fig.savefig("plot_returns_tick")

![Unbenannt-1.png](attachment:4ea756be-96ca-4557-8caf-fe2966932d98.png)

![Unbenannt.png](attachment:7207a10d-af38-4266-9295-5bdc1cdf025b.png)

![Unbenannt-1.png](attachment:1afa29c6-23d6-4834-b15e-df54a33175ed.png)

##### ARCH Models

In [None]:
df = df_day

# Simple returns of the daily frame's mid-price.
df["Returns"] = df['Midprice'].pct_change()
# Drop the undefined first return by value rather than by label (`.loc[1:]`).
df = df.dropna(subset=["Returns"])

# Optionally filter out extreme returns (absolute value of 10% or more).
# .copy() makes the result an independent frame, so the rescaling below does
# not raise SettingWithCopyWarning and cannot touch df_day.
df = df[(df['Returns'] > -0.1) & (df['Returns'] < 0.1)].copy()

# Rescale the returns (x1000) before fitting.
# NOTE(review): presumably to keep the optimiser numerically well scaled.
# Estimated parameters are in the rescaled units.
df['Returns'] = df['Returns'] * 1000

# Define and fit a GARCH(1,1) model.
garch_model = arch_model(df['Returns'], vol='Garch', p=1, q=1)
garch_fit = garch_model.fit(disp='off')

print(garch_fit.summary())

In [21]:
# Plot the conditional volatility series of the fitted GARCH(1,1) model
# (`garch_fit` comes from the fitting cell). An explicit figure/axes replaces
# plt.cla() + plt.figure(), which left an additional empty figure behind.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(garch_fit.conditional_volatility, label='Conditional Volatility', color='blue')
ax.set_title('Conditional Volatility Estimated by GARCH(1,1)')
ax.legend()
fig.savefig("conditional_volatility_garch.png")

![grafik.png](attachment:d8c0a78b-d198-4c38-a950-412637440534.png)

![Unbenannt.png](attachment:d0839715-24c6-4156-80af-991191c58104.png)

#### Gain Loss Asymmetry and Aggregational Gaussianity

We can observe the gain/loss asymmetry across the different sampling frequencies, as well as aggregational Gaussianity: the lower the sampling frequency, the more closely the distribution of returns converges towards a normal distribution.

In [54]:
def calculate_gain_loss_asymmetry(df, prefix):
    """Summarise positive and negative returns separately and plot a histogram.

    Prints ``describe()`` statistics for the strictly positive and the
    strictly negative simple returns (zero returns belong to neither group)
    and saves a histogram of all returns as ``"<prefix> hist_returns"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Midprice`` column. A ``Returns`` column holding the
        simple returns (``pct_change``) is written to it, as before.
    prefix : object
        Label used in the file name and the plot title.
    """
    # Kept for backward compatibility: the frame gains a "Returns" column.
    df["Returns"] = df['Midprice'].pct_change()

    # Drop the undefined first return by value. `.loc[1:]` removed it only
    # under a default 0..n-1 index, and assigning `.dropna()` to the frame
    # was a no-op.
    returns = df["Returns"].dropna()

    positive_returns = returns[returns > 0]
    negative_returns = returns[returns < 0]

    # Descriptive statistics per sign.
    print("Positive Returns Statistics:\n", positive_returns.describe())
    print("\nNegative Returns Statistics:\n", negative_returns.describe())

    # Explicit axes with a title and labels, so the saved histograms of the
    # different frequencies can be told apart.
    fig, ax = plt.subplots()
    returns.hist(ax=ax)
    ax.set_title(f'Histogram of {prefix} Returns')
    ax.set_xlabel('Return')
    ax.set_ylabel('Count')
    fig.savefig(f'{prefix} hist_returns')

In [55]:
calculate_gain_loss_asymmetry(df_tick, "tick")

Positive Returns Statistics:
 count    2.711400e+04
mean     4.002526e-05
std      2.337352e-05
min      2.220446e-16
25%      2.739471e-05
50%      2.771749e-05
75%      5.460125e-05
max      6.012506e-04
Name: Returns, dtype: float64

Negative Returns Statistics:
 count    2.751300e+04
mean    -4.031865e-05
std      2.419977e-05
min     -9.828976e-04
25%     -5.461303e-05
50%     -2.771826e-05
75%     -2.739171e-05
max     -2.220446e-16
Name: Returns, dtype: float64




![Unbenannt.png](attachment:637f1cf4-997d-403d-9b9a-e45d49c50f29.png)

In [56]:
calculate_gain_loss_asymmetry(df_minute, "minute")

Positive Returns Statistics:
 count    21606.000000
mean         0.000181
std          0.000199
min          0.000005
25%          0.000055
50%          0.000135
75%          0.000243
max          0.010316
Name: Returns, dtype: float64

Negative Returns Statistics:
 count    20870.000000
mean        -0.000189
std          0.000192
min         -0.007451
25%         -0.000249
50%         -0.000136
75%         -0.000059
max         -0.000005
Name: Returns, dtype: float64


![Unbenannt-1.png](attachment:ff88a686-9659-4875-a0f8-ed4b2b602258.png)

In [57]:
calculate_gain_loss_asymmetry(df_hour, "hour")

Positive Returns Statistics:
 count    95983.000000
mean         0.000174
std          0.000190
min          0.000005
25%          0.000060
50%          0.000127
75%          0.000226
max          0.010316
Name: Returns, dtype: float64

Negative Returns Statistics:
 count    93225.000000
mean        -0.000177
std          0.000185
min         -0.008687
25%         -0.000236
50%         -0.000128
75%         -0.000060
max         -0.000005
Name: Returns, dtype: float64


![Unbenannt.png](attachment:eb3b9bf5-961f-48f6-9b3f-a295241eff6f.png)

In [58]:
calculate_gain_loss_asymmetry(df_day, "day")

Positive Returns Statistics:
 count    297535.000000
mean          0.000238
std           0.000311
min           0.000005
25%           0.000071
50%           0.000158
75%           0.000298
max           0.026110
Name: Returns, dtype: float64

Negative Returns Statistics:
 count    291617.000000
mean         -0.000243
std           0.000319
min          -0.032448
25%          -0.000307
50%          -0.000159
75%          -0.000072
max          -0.000005
Name: Returns, dtype: float64


![Unbenannt-1.png](attachment:f1649b95-2667-4193-9041-5f365af1a858.png)