In [34]:
import pandas as pd
import numpy as np
import os
from joblib import load
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import scipy.stats as stats
from statsmodels.stats.stattools import durbin_watson

In [11]:
path = os.getcwd()

In [12]:
folder_path = path + '/FX-Data'

# List all files in the directory
all_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.csv')]

# Create a dictionary to hold dataframes
dfs = {}

for file in all_files:
    # The key will be the filename without .csv and the value will be the dataframe
    dfs[file.split('.csv')[0]] = pd.read_csv(os.path.join(folder_path, file))


In [14]:
dfs['USDGBP']

Unnamed: 0,Date,Trend,Open,High,Low,Adj Close,Parabolic_SAR,Coppock_Curve,Typical_Price,RSI,...,KAMA,MI,MSD,TRIX,VORTEX_NEG,VORTEX_POS,MACD,PPO,APO,DO_UP
0,2005-01-03,1,0.51975,0.52067,0.51584,0.51682,0.512239,-1.006718,0.517777,41.287423,...,0.519233,25.785806,0.003656,-0.143640,0.934489,1.047003,-0.003811,-0.729161,-0.002870,0.521455
1,2005-01-04,1,0.51690,0.51883,0.51520,0.51889,0.512462,-0.762175,0.517640,44.534664,...,0.519231,25.660571,0.002886,-0.140113,1.027002,1.016570,-0.003525,-0.674811,-0.002461,0.520370
2,2005-01-05,0,0.51905,0.52277,0.51712,0.52140,0.512681,-0.481777,0.520430,48.270591,...,0.519330,25.601936,0.002704,-0.135842,0.990563,1.016863,-0.003061,-0.585986,-0.001803,0.520370
3,2005-01-06,1,0.52154,0.52189,0.51846,0.51980,0.512896,-0.317082,0.520050,46.137256,...,0.519336,25.525607,0.002726,-0.131269,1.058428,0.975828,-0.002790,-0.534269,-0.001267,0.520370
4,2005-01-07,1,0.51989,0.52219,0.51674,0.52089,0.513107,-0.111390,0.519940,47.828876,...,0.519408,25.461470,0.002771,-0.126297,1.015088,0.966683,-0.002458,-0.470926,-0.000463,0.520370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4784,2023-05-29,0,0.80249,0.80548,0.80176,0.80250,0.793703,-0.016280,0.803247,51.299017,...,0.802117,24.623867,0.004404,-0.061180,0.789194,0.835741,-0.001097,-0.136858,-0.002398,0.802140
4785,2023-05-30,0,0.80412,0.80819,0.80341,0.80422,0.795018,0.335212,0.805273,53.341117,...,0.802308,24.597436,0.004457,-0.057341,0.722974,0.874913,-0.000696,-0.086775,-0.001519,0.804285
4786,2023-05-31,1,0.80525,0.80919,0.80230,0.80533,0.796599,0.782585,0.805607,54.662287,...,0.802649,24.649405,0.004659,-0.053193,0.703377,0.950857,-0.000285,-0.035516,-0.000281,0.804795
4787,2023-06-01,0,0.80851,0.81175,0.80737,0.80834,0.798361,1.325136,0.809153,58.125092,...,0.803432,24.708855,0.004883,-0.048552,0.643325,1.026408,0.000280,0.034951,0.000873,0.806560


In [18]:
for df in dfs:
  print(df)

EURNZD
USDCHF
EURCHF
USDEUR
USDGBP
EURNOK
EURSEK
USDNZD
USDCAD
EURDKK
USDSEK
USDJPY
USDDKK
EURJPY
EURCAD
EURGBP
USDNOK


In [40]:
import warnings
warnings.filterwarnings("ignore")

# Basic

In [41]:
# Initialize a DataFrame to store statistics about the datasets
statistics_df = pd.DataFrame(columns=['Name', 'Mean', 'Std', 'Skewness', 'Kurtosis', 'K-S test p-value', 'Durbin-Watson statistic'])

# Loop through each DataFrame in the dfs dictionary
for df_name, df in dfs.items():
    # Filter the DataFrame to only consider data we used (before 2019-12-31) and get the 'Adj Close' column
    subset = df[df['Date'] <= '2019-12-31']['Adj Close']

    # Compute basic statistical metrics
    mean = subset.mean()
    std = subset.std()
    skewness = subset.skew()
    kurtosis = subset.kurt()

    # Standardize the data for the Kolmogorov–Smirnov test (zero mean and unit variance)
    standardized_data = (subset - mean) / std

    # Perform the Kolmogorov–Smirnov test against a normal distribution
    ks_stat, ks_p_value = stats.kstest(standardized_data, 'norm')

    # Calculate the Durbin–Watson statistic to detect the presence of autocorrelation
    dw_stat = durbin_watson(subset)

    # Append the results to the statistics DataFrame
    statistics_df = statistics_df.append({
        'Name': df_name,
        'Mean': mean,
        'Std': std,
        'Skewness': skewness,
        'Kurtosis': kurtosis,
        'K-S test p-value': ks_p_value,
        'Durbin-Watson statistic': dw_stat
    }, ignore_index=True)

# Display the statistics DataFrame
statistics_df


Unnamed: 0,Name,Mean,Std,Skewness,Kurtosis,K-S test p-value,Durbin-Watson statistic
0,EURNZD,1.762382,0.204566,1.253305,1.680997,1.686426e-49,5.031752e-05
1,USDCHF,1.03209,0.120508,0.760071,-0.406385,3.8363579999999997e-110,4.845835e-05
2,EURCHF,1.309713,0.201481,0.405645,-1.40765,2.751056e-141,2.682381e-05
3,USDEUR,0.795413,0.077484,0.138942,-0.914814,9.079269999999999e-26,5.316531e-05
4,USDGBP,0.642179,0.089392,0.202618,-0.783526,7.798306e-23,3.670658e-05
5,EURNOK,8.499107,0.754044,0.466459,-1.058186,2.100161e-64,2.533174e-05
6,EURSEK,9.48659,0.616952,0.726761,0.015132,1.898354e-67,2.157474e-05
7,USDNZD,1.393209,0.14434,0.841282,1.622468,1.800653e-06,7.110804e-05
8,USDCAD,1.153546,0.128349,0.15725,-1.407375,1.39925e-45,3.252015e-05
9,EURDKK,7.451356,0.009658,-0.113283,-0.178133,7.533932e-21,8.451651e-08


In [42]:
statistics_df.to_csv(path + '/statistics_df.csv', index=False)

# Advance (with quantile, min, max and mode)

In [43]:
# Initialize a DataFrame to store statistics about the datasets
statistics_df_advance = pd.DataFrame(columns=['Name', 'Mean', 'Std', 'Min', 'Q1', 'Median', 'Q3', 'Max', 'Mode', 'Skewness', 'Kurtosis', 'K-S test p-value', 'Durbin-Watson statistic'])

# Loop through each DataFrame in the dfs dictionary
for df_name, df in dfs.items():
    # Filter the DataFrame to only consider data we used (before 2019-12-31) and get the 'Adj Close' column
    subset = df[df['Date'] <= '2019-12-31']['Adj Close']

    # Compute basic statistical metrics
    mean = subset.mean()
    std = subset.std()
    min_val = subset.min()
    max_val = subset.max()
    q1 = subset.quantile(0.25)
    median = subset.quantile(0.5)
    q3 = subset.quantile(0.75)
    mode = subset.mode().iloc[0]
    skewness = subset.skew()
    kurtosis = subset.kurt()

    # Standardize the data for the Kolmogorov–Smirnov test (zero mean and unit variance)
    standardized_data = (subset - mean) / std

    # Perform the Kolmogorov–Smirnov test against a normal distribution
    ks_stat, ks_p_value = stats.kstest(standardized_data, 'norm')

    # Calculate the Durbin–Watson statistic to detect the presence of autocorrelation
    dw_stat = durbin_watson(subset)

    # Append the results to the statistics DataFrame
    statistics_df_advance = statistics_df_advance.append({
        'Name': df_name,
        'Mean': mean,
        'Std': std,
        'Min': min_val,
        'Q1': q1,
        'Median': median,
        'Q3': q3,
        'Max': max_val,
        'Mode': mode,
        'Skewness': skewness,
        'Kurtosis': kurtosis,
        'K-S test p-value': ks_p_value,
        'Durbin-Watson statistic': dw_stat
    }, ignore_index=True)
    
# Display the statistics DataFrame
statistics_df_advance


Unnamed: 0,Name,Mean,Std,Min,Q1,Median,Q3,Max,Mode,Skewness,Kurtosis,K-S test p-value,Durbin-Watson statistic
0,EURNZD,1.762382,0.204566,1.4002,1.61587,1.7152,1.8627,2.5498,1.8942,1.253305,1.680997,1.686426e-49,5.031752e-05
1,USDCHF,1.03209,0.120508,0.7228,0.9494,0.99266,1.108,1.3248,0.9553,0.760071,-0.406385,3.8363579999999997e-110,4.845835e-05
2,EURCHF,1.309713,0.201481,0.98054,1.137,1.22302,1.5372,1.6795,1.201,0.405645,-1.40765,2.751056e-141,2.682381e-05
3,USDEUR,0.795413,0.077484,0.62547,0.73681,0.7807,0.865752,0.96242,0.77821,0.138942,-0.914814,9.079269999999999e-26,5.316531e-05
4,USDGBP,0.642179,0.089392,0.47434,0.581482,0.63541,0.700922,0.83181,0.56532,0.202618,-0.783526,7.798306e-23,3.670658e-05
5,EURNOK,8.499107,0.754044,7.2704,7.9001,8.2386,9.2231,10.25342,7.8296,0.466459,-1.058186,2.100161e-64,2.533174e-05
6,EURSEK,9.48659,0.616952,8.1803,9.1185,9.34098,9.75146,11.6816,9.2751,0.726761,0.015132,1.898354e-67,2.157474e-05
7,USDNZD,1.393209,0.14434,1.1333,1.282325,1.39305,1.4738,2.0292,1.4738,0.841282,1.622468,1.800653e-06,7.110804e-05
8,USDCAD,1.153546,0.128349,0.9187,1.0295,1.1367,1.281535,1.4578,1.018,0.15725,-1.407375,1.39925e-45,3.252015e-05
9,EURDKK,7.451356,0.009658,7.4131,7.4429,7.4529,7.45918,7.5227,7.459,-0.113283,-0.178133,7.533932e-21,8.451651e-08


In [44]:
statistics_df_advance.to_csv(path + '/statistics_df_advance.csv', index=False)