In [75]:
# imports 
from statsmodels.stats.diagnostic import het_arch 
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import datetime as dt
# timestamp 
import time

In [76]:
# Load the three pre-processed BTCUSDT CSV exports (paths are relative to the notebook's folder).
# NOTE(review): the 'volume' file name ends in "2025-01-volume.csv" while the other two
# use "2025-01-01_..." — confirm this is the intended file.
csv_paths = {
    'original': '../data/processed/BTCUSDT_1m_2024-12-01_to_2025-01-01_cleaned_robust.csv',
    'volume': '../data/processed/BTCUSDT_1m_2024-12-01_to_2025-01-volume.csv',
    'dollar': '../data/processed/BTCUSDT_1m_2024-12-01_to_2025-01-01_dollar_bars_dyn.csv',
}
original_data = pd.read_csv(csv_paths['original'])
volume_data = pd.read_csv(csv_paths['volume'])
dolar_data = pd.read_csv(csv_paths['dollar'])
  

In [77]:
# date object to datetime

def init_date_conversion(data):
    """Inspect and lightly normalise the raw timestamp columns of ``data`` in place.

    Prints the dtype and first values of ``open_time``, strips whitespace and
    quote characters from the text-typed timestamp columns, and reports how
    many of the first (up to) 50 non-missing ``open_time`` values look like
    pure-digit epoch numbers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``open_time`` column and, optionally, a ``close_time``
        column. Modified in place.

    Returns
    -------
    None

    Notes
    -----
    Each column is cleaned according to its own dtype (a numeric
    ``close_time`` is left untouched), and missing values stay missing
    instead of being turned into the literal strings ``'nan'`` / ``'None'``.
    Relies on ``display`` being available (it is provided by IPython).
    """
    # 1) Show the incoming dtype and the first raw values.
    print("dtype original:", data['open_time'].dtype)
    print("primeros 10 valores (raw):")
    display(data['open_time'].head(10))

    # 2) Normalise text columns: strip whitespace, single and double quotes.
    for col in ('open_time', 'close_time'):
        if col not in data.columns:
            continue
        raw = data[col]
        is_text = pd.api.types.is_object_dtype(raw) or pd.api.types.is_string_dtype(raw)
        if not is_text:
            continue
        cleaned = (
            raw.astype(str)
            .str.strip()
            .str.replace('"', '', regex=False)
            .str.replace("'", "", regex=False)
            .str.strip()  # whitespace that was hidden inside the quotes
        )
        # astype(str) stringified the missing entries; restore them as missing.
        data[col] = cleaned.where(raw.notna())

    # 3) Count how many sampled values are digit-only (likely epoch timestamps).
    sample_vals = data['open_time'].dropna().astype(str).head(50)
    n_digit_samples = sum(value.isdigit() for value in sample_vals)
    print(f"de los primeros 50 valores, {n_digit_samples} parecen solo dígitos (epoch)")

# 4) Robust, vectorised conversion of a timestamp column to datetime.
def robust_to_datetime(series):
    """Convert a Series of timestamps (text or epoch numbers) to datetimes.

    Strategy:
      * datetime input (naive or tz-aware) is returned unchanged;
      * if more than half of the non-missing values are digit-only (or the
        Series has a float dtype), the values are treated as epoch numbers and
        the unit is inferred from the median magnitude: ``> 1e12`` means
        milliseconds, ``> 1e9`` means seconds, anything smaller falls back to
        the generic parser;
      * otherwise the values are parsed as date strings; if more than 20% fail,
        a few explicit formats are tried and the one with the fewest failures
        is kept.

    Parameters
    ----------
    series : pandas.Series
        Raw timestamp values.

    Returns
    -------
    pandas.Series
        Datetime Series aligned with ``series``; unparseable entries are ``NaT``.

    Notes
    -----
    Prints its decisions (median, inferred unit, NaT fractions) as it goes.
    In the epoch branch, values that are not digit-only become ``NaT`` rather
    than aborting the whole conversion.
    """
    # Already datetime: nothing to do. is_datetime64_any_dtype also accepts
    # tz-aware dtypes.
    if pd.api.types.is_datetime64_any_dtype(series):
        return series

    s = series.copy()
    present = s.notna()
    n_present = int(present.sum())

    # 4a) Decide whether the column holds epoch numbers.
    if pd.api.types.is_float_dtype(s):
        # Floats stringify as e.g. "1733029260000.0" and would fail the digit
        # test below, so treat a float column as numeric directly.
        numeric = s
        digit_frac = 1.0 if n_present else 0.0
    else:
        digit_mask = present & s.astype(str).str.match(r'^\d+$')
        digit_frac = digit_mask.sum() / n_present if n_present else 0.0
        numeric = None
        if digit_frac > 0.5:
            # Non-digit entries are masked out so they coerce to NaT instead of
            # raising during numeric conversion.
            numeric = pd.to_numeric(s.where(digit_mask), errors='coerce')

    if digit_frac > 0.5:
        # Most values are numeric timestamps: infer ms vs s from the magnitude.
        # The median of all values is used so repeated runs agree.
        median_value = float(numeric.median())
        print("mediana de muestra numérica:", median_value)
        if median_value > 1e12:  # typical of millisecond epochs
            print("-> inferido como epoch en MILLISEGUNDOS")
            out = pd.to_datetime(numeric, unit='ms', errors='coerce')
        elif median_value > 1e9:  # typical of second epochs
            print("-> inferido como epoch en SEGUNDOS")
            out = pd.to_datetime(numeric, unit='s', errors='coerce')
        else:
            # Small numbers: hand the raw values to the generic parser.
            out = pd.to_datetime(s, errors='coerce')
        return out

    # 4b) Mostly readable strings: try a direct parse first.
    out = pd.to_datetime(s, errors='coerce')
    nat_frac = out.isna().mean()
    print(f"frac NaT tras parse directo: {nat_frac:.3f}")
    if nat_frac > 0.2:
        # Too many failures: try a few explicit formats and keep the best one.
        fmts = [
            "%Y-%m-%d %H:%M:%S.%f",
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S.%fZ",
            "%Y-%m-%dT%H:%M:%SZ",
        ]
        for fmt in fmts:
            try:
                candidate = pd.to_datetime(s, format=fmt, errors='coerce')
            except (ValueError, TypeError) as exc:
                # Best effort: report the failure and move on to the next format.
                print(f"  intento con format {fmt} falló: {exc}")
                continue
            candidate_nat_frac = candidate.isna().mean()
            print(f"  intento con format {fmt} => NaT frac: {candidate_nat_frac:.3f}")
            if candidate_nat_frac < nat_frac:
                out = candidate
                nat_frac = candidate_nat_frac
    return out

# Apply robust_to_datetime to open_time and close_time.
def convert_datetime_columns(data, max_nat_frac=0.05):
    """Convert ``open_time`` / ``close_time`` to datetimes and drop unusable rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``open_time`` and ``close_time`` columns. A column is
        overwritten in place when its conversion succeeds (see below).
    max_nat_frac : float, default 0.05
        A column is replaced by its parsed version only when the fraction of
        ``NaT`` values after parsing is strictly below this threshold.

    Returns
    -------
    pandas.DataFrame
        A new frame without rows whose ``open_time`` is missing.

    Notes
    -----
    The parsed values are held in local variables, so no ``*_parsed`` helper
    columns are left behind in the caller's frame. Prints a conversion report
    and uses ``display`` (provided by IPython) for previews.
    """
    open_parsed = robust_to_datetime(data['open_time'])
    close_parsed = robust_to_datetime(data['close_time'])

    # 5) Report results and show problematic examples.
    n_total = len(data)
    n_nat_open = int(open_parsed.isna().sum())
    n_nat_close = int(close_parsed.isna().sum())
    # An empty frame has nothing that can fail; avoid dividing by zero.
    frac_nat_open = n_nat_open / n_total if n_total else 0.0
    frac_nat_close = n_nat_close / n_total if n_total else 0.0
    print(f"open_time -> NaT: {n_nat_open}/{n_total} ({frac_nat_open:.3%})")
    print(f"close_time -> NaT: {n_nat_close}/{n_total} ({frac_nat_close:.3%})")

    if n_nat_open > 0:
        print("Ejemplos de open_time problemáticos:")
        display(data.loc[open_parsed.isna(), 'open_time'].head(10))

    # 6) Overwrite the original columns only when most values converted.
    if frac_nat_open < max_nat_frac:
        data['open_time'] = open_parsed
    else:
        # Keep the raw column to avoid losing information.
        print("Advertencia: demasiados NaT en open_time — revisa los ejemplos anteriores.")

    if frac_nat_close < max_nat_frac:
        data['close_time'] = close_parsed
    else:
        print("Advertencia: demasiados NaT en close_time — se conserva la columna original.")

    # 7) Drop rows without an open_time. Helper columns left over from earlier
    #    runs of the previous implementation are removed from the result too.
    result = data.dropna(subset=['open_time']).drop(
        columns=['open_time_parsed', 'close_time_parsed'], errors='ignore'
    )

    print("Conversión final: dtype open_time =>", result['open_time'].dtype)
    display(result.head())

    return result


In [78]:
# Row counts of each loaded frame; the second line reports volume_data,
# so it is labelled as such.
print(f"original data: {len(original_data)}")
print(f"volume data: {len(volume_data)}")
print(f"dollar data: {len(dolar_data)}")

original data: 43138
original data: 5286
dollar data: 29397


In [79]:
# Inspect and convert the timestamp columns of each loaded frame.
# The resulting *_d frames are the ones consumed by stats_summary().
def _inspect_then_convert(frame):
    """Run init_date_conversion on ``frame``, then return convert_datetime_columns(frame)."""
    init_date_conversion(frame)
    return convert_datetime_columns(frame)


original_data_d = _inspect_then_convert(original_data)  # datetime for original data
volume_data_d = _inspect_then_convert(volume_data)      # datetime for volume data
dolar_data_d = _inspect_then_convert(dolar_data)        # datetime for dolar data



dtype original: object
primeros 10 valores (raw):


0    2024-12-01 05:01:00
1    2024-12-01 05:02:00
2    2024-12-01 05:03:00
3    2024-12-01 05:04:00
4    2024-12-01 05:05:00
5    2024-12-01 05:06:00
6    2024-12-01 05:07:00
7    2024-12-01 05:08:00
8    2024-12-01 05:09:00
9    2024-12-01 05:10:00
Name: open_time, dtype: object

de los primeros 50 valores, 0 parecen solo dígitos (epoch)
frac NaT tras parse directo: 0.000
frac NaT tras parse directo: 0.000
open_time -> NaT: 0/43138 (0.000%)
close_time -> NaT: 0/43138 (0.000%)
Conversión final: dtype open_time => datetime64[ns]


Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_volume,trades,taker_buy_base,taker_buy_quote,ignore,mid_price,return
0,2024-12-01 05:01:00,96473.19,96473.19,96464.16,96464.97,2.51941,2024-12-01 05:01:59.999,243038.6,1044.0,0.82286,79377.044051,0.0,96468.675,-8.5e-05
1,2024-12-01 05:02:00,96464.97,96509.99,96425.7,96509.99,40.57111,2024-12-01 05:02:59.999,3913119.0,4030.0,9.65762,931709.38092,0.0,96467.845,0.000467
2,2024-12-01 05:03:00,96509.99,96510.0,96476.0,96480.0,6.33996,2024-12-01 05:03:59.999,611807.0,2520.0,0.81025,78177.723577,0.0,96493.0,-0.000311
3,2024-12-01 05:04:00,96480.01,96480.01,96472.0,96472.0,2.02027,2024-12-01 05:04:59.999,194911.9,603.0,0.10671,10295.342507,0.0,96476.005,-8.3e-05
4,2024-12-01 05:05:00,96472.0,96472.01,96415.65,96420.02,8.16665,2024-12-01 05:05:59.999,787566.4,2192.0,0.70645,68130.564183,0.0,96443.83,-0.000539


dtype original: object
primeros 10 valores (raw):


0    2024-12-01 05:01:00
1    2024-12-01 05:24:00
2    2024-12-01 05:54:00
3    2024-12-01 06:21:00
4    2024-12-01 06:51:00
5    2024-12-01 07:24:00
6    2024-12-01 08:02:00
7    2024-12-01 08:16:00
8    2024-12-01 08:29:00
9    2024-12-01 08:32:00
Name: open_time, dtype: object

de los primeros 50 valores, 0 parecen solo dígitos (epoch)
frac NaT tras parse directo: 0.000
frac NaT tras parse directo: 0.000
open_time -> NaT: 0/5286 (0.000%)
close_time -> NaT: 0/5286 (0.000%)
Conversión final: dtype open_time => datetime64[ns]


Unnamed: 0,open_time,open,high,low,close,volume,close_time,return
0,2024-12-01 05:01:00,96473.19,96510.0,96385.47,96417.05,154.0115,2024-12-01 05:23:59.999,
1,2024-12-01 05:24:00,96417.04,96468.66,96355.0,96400.0,142.63492,2024-12-01 05:53:59.999,-0.000177
2,2024-12-01 05:54:00,96400.0,96483.98,96284.51,96388.01,141.1216,2024-12-01 06:20:59.999,-0.000124
3,2024-12-01 06:21:00,96388.01,96464.0,96341.88,96355.0,145.17075,2024-12-01 06:50:59.999,-0.000342
4,2024-12-01 06:51:00,96355.0,96459.77,96299.1,96449.48,141.15671,2024-12-01 07:23:59.999,0.000981


dtype original: object
primeros 10 valores (raw):


0    2024-12-01 05:01:00
1    2024-12-01 05:03:00
2    2024-12-01 05:07:00
3    2024-12-01 05:13:00
4    2024-12-01 05:18:00
5    2024-12-01 05:23:00
6    2024-12-01 05:26:00
7    2024-12-01 05:32:00
8    2024-12-01 05:38:00
9    2024-12-01 05:43:00
Name: open_time, dtype: object

de los primeros 50 valores, 0 parecen solo dígitos (epoch)
frac NaT tras parse directo: 0.000
frac NaT tras parse directo: 0.000
open_time -> NaT: 0/29397 (0.000%)
close_time -> NaT: 0/29397 (0.000%)
Conversión final: dtype open_time => datetime64[ns]


Unnamed: 0,open_time,close_time,open,high,low,close,volume,quote_volume,dollar_value,mid_price,num_ticks,return
0,2024-12-01 05:01:00,2024-12-01 05:02:59.999,96473.19,96509.99,96425.7,96509.99,43.09052,4156158.0,4158552.0,96468.26,2,0.000382
1,2024-12-01 05:03:00,2024-12-01 05:06:59.999,96509.99,96510.0,96415.65,96459.11,21.81001,2103749.0,2103613.0,96463.1,4,-0.000527
2,2024-12-01 05:07:00,2024-12-01 05:12:59.999,96459.11,96470.5,96409.54,96409.57,22.958,2213962.0,2213943.0,96433.87,6,-0.000514
3,2024-12-01 05:13:00,2024-12-01 05:17:59.999,96409.56,96460.0,96385.47,96460.0,21.20811,2044785.0,2045027.0,96417.936,5,0.000523
4,2024-12-01 05:18:00,2024-12-01 05:22:59.999,96460.0,96500.0,96428.57,96457.11,29.258,2822420.0,2822329.0,96470.548,5,-3e-05


In [80]:
# NOTE(review): the earlier remark on this line said the frame has no `return` column,
# but the rendered preview below lists one — confirm which is current.
dolar_data_d.head() # preview of the converted dollar-bar frame

Unnamed: 0,open_time,close_time,open,high,low,close,volume,quote_volume,dollar_value,mid_price,num_ticks,return
0,2024-12-01 05:01:00,2024-12-01 05:02:59.999,96473.19,96509.99,96425.7,96509.99,43.09052,4156158.0,4158552.0,96468.26,2,0.000382
1,2024-12-01 05:03:00,2024-12-01 05:06:59.999,96509.99,96510.0,96415.65,96459.11,21.81001,2103749.0,2103613.0,96463.1,4,-0.000527
2,2024-12-01 05:07:00,2024-12-01 05:12:59.999,96459.11,96470.5,96409.54,96409.57,22.958,2213962.0,2213943.0,96433.87,6,-0.000514
3,2024-12-01 05:13:00,2024-12-01 05:17:59.999,96409.56,96460.0,96385.47,96460.0,21.20811,2044785.0,2045027.0,96417.936,5,0.000523
4,2024-12-01 05:18:00,2024-12-01 05:22:59.999,96460.0,96500.0,96428.57,96457.11,29.258,2822420.0,2822329.0,96470.548,5,-3e-05


In [81]:
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import het_arch
from scipy.stats import jarque_bera, shapiro
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mutual_info_score

# ljung box test for returns
def returns(df):
    """Run a Ljung-Box autocorrelation test at lag 10 on ``df['return']``.

    Missing returns are dropped first. The result table is shown with
    ``display`` (provided by IPython); nothing is returned.
    """
    clean_returns = df['return'].dropna()
    ljung_box_table = acorr_ljungbox(clean_returns, lags=[10], return_df=True)
    print("Ljung-Box test results for returns:")
    display(ljung_box_table)

# arch test
def arch_test(df):
    """Print the p-value of the ARCH test on ``df['return']``.

    Missing returns are dropped before calling ``het_arch``; the second element
    of its result is the value printed. Nothing is returned.
    """
    clean_returns = df['return'].dropna()
    lm_pvalue = het_arch(clean_returns)[1]
    print(f"ARCH test p-value: {lm_pvalue}")

# normality tests
def normality_tests(df):
    """Print Jarque-Bera and Shapiro-Wilk p-values for ``df['return']``.

    Missing returns are dropped before both tests. Nothing is returned.
    NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be
    inaccurate for samples above 5000 observations — confirm it is still
    meaningful for these series.
    """
    clean_returns = df['return'].dropna()
    _, jb_pvalue = jarque_bera(clean_returns)
    _, shapiro_pvalue = shapiro(clean_returns)
    print(f"Jarque-Bera test p-value: {jb_pvalue}")
    print(f"Shapiro-Wilk test p-value: {shapiro_pvalue}")

# kurtosis and skewness
def kurtosis_skewness(df):
    """Print the kurtosis and skewness of ``df['return']``.

    Uses the pandas ``Series.kurtosis`` and ``Series.skew`` estimators, which
    skip missing values by default. Nothing is returned.
    """
    series = df['return']
    report = (
        ("Kurtosis", series.kurtosis()),
        ("Skewness", series.skew()),
    )
    for label, value in report:
        print(f"{label}: {value}")

# predictability with AutoReg
def predictability(df):
    """Fit an AR(1) model to ``df['return']`` and print its parameters.

    Missing returns are dropped first (the volume-bar preview shows a missing
    first return), and the index is reset so the model receives a plain 0..n-1
    index. Nothing is returned.
    """
    clean_returns = df['return'].dropna().reset_index(drop=True)
    model = AutoReg(clean_returns, lags=1).fit()
    print(model.params)

# mutual information
def mutual_information(df):
    """Print the mutual information between each return and the previous one.

    Returns from ``df['return']`` (missing values dropped) are discretised into
    20 equal-width bins over their observed range, and the mutual information
    between the series and its one-step lag is printed. Nothing is returned.
    """
    returns = df['return'].dropna().to_numpy()
    if len(returns) < 3:
        print("Not enough data for MI.")
        return

    # np.digitize needs an array of bin edges, not a bin count. Passing only the
    # interior edges yields exactly 20 bins (0..19), with the maximum falling in
    # the last bin rather than in a bin of its own.
    edges = np.histogram_bin_edges(returns, bins=20)
    lagged_bins = np.digitize(returns[:-1], edges[1:-1])
    current_bins = np.digitize(returns[1:], edges[1:-1])
    mi = mutual_info_score(lagged_bins, current_bins)
    print(f"Mutual Information between returns and lagged returns: {mi}")



In [82]:
# comparative table 

from statsmodels.stats.diagnostic import acorr_ljungbox, het_arch
from scipy.stats import jarque_bera

def get_return_col(df):
    """Return the first column label of ``df`` whose name contains "return".

    The match is case-insensitive. Labels are compared through ``str()`` so
    frames that also carry non-string labels (e.g. integers) do not crash the
    lookup.

    Raises
    ------
    ValueError
        If no column name contains "return".
    """
    for col in df.columns:
        if 'return' in str(col).lower():
            return col
    raise ValueError("No return column found")

def return_test(df):
    """Run a Ljung-Box autocorrelation test at lag 10 on the return column of ``df``.

    The column is located with ``get_return_col``; missing values are dropped.
    The result table is shown with ``display`` and nothing is returned.
    """
    clean_returns = df[get_return_col(df)].dropna()
    ljung_box_table = acorr_ljungbox(clean_returns, lags=[10], return_df=True)
    print("Ljung-Box test results for returns:")
    display(ljung_box_table)

def arch_test(df):
    """Run the ARCH test on the return column of ``df`` and display the raw result.

    The column is located with ``get_return_col``; missing values are dropped.
    Whatever ``het_arch`` returns is shown as-is with ``display``; nothing is
    returned.
    """
    clean_returns = df[get_return_col(df)].dropna()
    arch_result = het_arch(clean_returns)
    print("ARCH test results:")
    display(arch_result)

def normality_tests(df):
    """Print Jarque-Bera and Shapiro-Wilk p-values for the return column of ``df``.

    The column is located with ``get_return_col``; missing values are dropped
    before both tests. Nothing is returned.
    NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be
    inaccurate for samples above 5000 observations — confirm it is still
    meaningful for these series.
    """
    clean_returns = df[get_return_col(df)].dropna()
    _, jb_pvalue = jarque_bera(clean_returns)
    _, shapiro_pvalue = shapiro(clean_returns)
    print(f"Jarque-Bera test p-value: {jb_pvalue}")
    print(f"Shapiro-Wilk test p-value: {shapiro_pvalue}")

def kurtosis_skewness(df):
    """Print the kurtosis and skewness of the return column of ``df``.

    The column is located with ``get_return_col``. Uses the pandas
    ``Series.kurtosis`` and ``Series.skew`` estimators, which skip missing
    values by default. Nothing is returned.
    """
    series = df[get_return_col(df)]
    report = (
        ("Kurtosis", series.kurtosis()),
        ("Skewness", series.skew()),
    )
    for label, value in report:
        print(f"{label}: {value}")

def predictability(df):
    """Fit an AR(1) model to the return column of ``df`` and print its parameters.

    The column is located with ``get_return_col``. Missing values are dropped
    and the index is then reset: once rows are dropped the index no longer
    starts at 0, and the model is better served by a plain 0..n-1 index.
    Nothing is returned.
    """
    col = get_return_col(df)
    clean_returns = df[col].dropna().reset_index(drop=True)
    model = AutoReg(clean_returns, lags=1).fit()
    print("AutoReg model parameters:")
    print(model.params)

def mutual_information(df, n_bins=20):
    """Print the mutual information between each return and the previous one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a return column, located with ``get_return_col`` like
        the other diagnostics in this cell.
    n_bins : int, default 20
        Number of equal-width bins used to discretise the returns.

    Returns
    -------
    None
        The result, or the reason it could not be computed, is printed.
    """
    try:
        col = get_return_col(df)
    except ValueError:
        print("Column 'return' not found.")
        return

    returns = df[col].dropna().values
    if len(returns) < 3:
        print("Not enough data for MI.")
        return

    # Shared bin edges for both series. Only the interior edges go to
    # np.digitize, so the codes span exactly n_bins bins (0..n_bins-1) and the
    # maximum lands in the last bin instead of a bin of its own.
    edges = np.histogram_bin_edges(returns, bins=n_bins)
    x = np.digitize(returns[:-1], edges[1:-1])
    y = np.digitize(returns[1:], edges[1:-1])

    try:
        mi = mutual_info_score(x, y)
        print(f"Mutual Information between returns and lagged returns: {mi:.6f}")
    except Exception as e:
        # Best effort: report the failure without aborting the caller.
        print(f"Error computing MI: {e}")

# Summary table
def stats_summary(datasets=None):
    """Print the full set of return diagnostics for each dataset.

    Parameters
    ----------
    datasets : dict[str, pandas.DataFrame] or None, default None
        Mapping of display name to frame. When omitted, the notebook's three
        converted frames (``original_data_d``, ``volume_data_d``,
        ``dolar_data_d``) are used.

    Returns
    -------
    None

    Notes
    -----
    For each dataset the diagnostics run in order: Ljung-Box, ARCH, normality,
    kurtosis/skewness, AR(1) and mutual information. A failure in one dataset
    is reported, with the exception type, and the remaining datasets are
    still processed.
    """
    if datasets is None:
        datasets = {
            'original_data_d': original_data_d,
            'volume_data_d': volume_data_d,
            'dolar_data_d': dolar_data_d
        }

    for name, df in datasets.items():
        print(f"\n{'='*10} Statistics for {name} {'='*10}\n")
        try:
            return_test(df)
            arch_test(df)
            normality_tests(df)
            kurtosis_skewness(df)
            predictability(df)
            mutual_information(df)
        except Exception as e:
            # Deliberate best effort: one broken dataset must not hide the others.
            print(f"Error processing {name}: {type(e).__name__}: {e}")
        print("\n")


In [83]:
# Print the diagnostics report for the three converted frames.
stats_summary()



Ljung-Box test results for returns:


Unnamed: 0,lb_stat,lb_pvalue
10,20.781633,0.022669


ARCH test results:


(np.float64(4265.560420705975), np.float64(0.0), 473.25430582997, 0.0)

Jarque-Bera test p-value: 4.895556295957343e-89
Shapiro-Wilk test p-value: 4.406711175654514e-41
Kurtosis: 0.47345681004569107
Skewness: 0.0238874003962344
AutoReg model parameters:
const       -0.000002
return.L1   -0.004775
dtype: float64
Mutual Information between returns and lagged returns: 0.025488




Ljung-Box test results for returns:


  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,lb_stat,lb_pvalue
10,8.541645,0.576083


ARCH test results:


(np.float64(140.66559265865746),
 np.float64(3.078377846414865e-25),
 14.421804678254276,
 1.3665223150404916e-25)

Jarque-Bera test p-value: 0.0
Shapiro-Wilk test p-value: 5.914716830156528e-29
Kurtosis: 3.9305760759643293
Skewness: -0.0029506524887492765
AutoReg model parameters:
const       -0.000003
return.L1    0.023383
dtype: float64
Mutual Information between returns and lagged returns: 0.014274






  res = hypotest_fun_out(*samples, **kwds)
  self._init_dates(dates, freq)


Ljung-Box test results for returns:


Unnamed: 0,lb_stat,lb_pvalue
10,11.343937,0.331357


ARCH test results:


(np.float64(592.5115196409017),
 np.float64(7.081051812690305e-121),
 60.44774302152857,
 3.832726282460205e-122)

Jarque-Bera test p-value: 5.377711061767652e-07
Shapiro-Wilk test p-value: 3.4065622594447027e-12
Kurtosis: -0.15051199262964632
Skewness: 0.014693334976086778
AutoReg model parameters:
const       -0.000002
return.L1   -0.008100
dtype: float64
Mutual Information between returns and lagged returns: 0.008713




  res = hypotest_fun_out(*samples, **kwds)
