In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from darts import TimeSeries
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from typing import List, Dict, Tuple
import os
import warnings

# Suppress specific warnings that might clutter output
warnings.filterwarnings("ignore", category=UserWarning, module='statsmodels')
warnings.filterwarnings("ignore", category=FutureWarning, module='statsmodels')

## Data Loading

In [2]:
def load_data(file_path: str = '../data/CRE.csv',
              exclude_columns: Tuple[str, ...] = ()) -> List[TimeSeries]:
    """
    Loads data from a CSV file into a list of Darts TimeSeries.

    Each kept column is cast to float32 and wrapped in its own TimeSeries via
    ``TimeSeries.from_series``, using whatever index ``pd.read_csv`` produced.

    Args:
        file_path: Path of the CSV file to read.
        exclude_columns: Names of columns that must not become series
            (for example a time-axis column that is not itself a signal).
            They are dropped *before* the float32 cast, so a non-numeric
            helper column cannot make the cast fail. Names that are not
            present in the file are ignored. Defaults to ``()``, which keeps
            every column.

    Returns:
        One TimeSeries per kept column, in file column order. An empty list
        is returned when the file is missing or any other error occurs while
        reading/converting; the error is printed rather than raised.
    """
    # A bare string would otherwise be treated as a container of characters
    # by the membership test below.
    if isinstance(exclude_columns, str):
        excluded = {exclude_columns}
    else:
        excluded = set(exclude_columns)

    try:
        df = pd.read_csv(file_path)
        kept_columns = [col for col in df.columns if col not in excluded]
        df = df[kept_columns].astype('float32')
        series_list = [TimeSeries.from_series(df[col]) for col in kept_columns]
    except FileNotFoundError:
        print(f"Error: Data file not found at '{file_path}'. Please ensure the path is correct.")
        return []
    except Exception as e:
        # Deliberately best-effort: downstream cells check for an empty list.
        print(f"An error occurred during data loading: {e}")
        return []
    else:
        # Only claim success once every column has actually been converted.
        print(f"Successfully loaded data from {file_path}")
        return series_list

# Read the default CSV once. The cells below iterate over this list and skip
# their work when it is empty (which is what load_data returns on failure).
all_series: List[TimeSeries] = load_data()

Successfully loaded data from ../data/CRE.csv


## Descriptive Statistics

In [3]:
def get_descriptive_stats(series: TimeSeries) -> pd.Series:
    """
    Summarise a TimeSeries with pandas' ``describe()``.

    The series is converted with ``to_series()`` and the result of calling
    ``describe()`` on that pandas Series is returned unchanged.
    """
    as_pandas = series.to_series()
    summary = as_pandas.describe()
    return summary

# Show one describe() table per loaded series; nothing is printed when the
# list is empty.
if all_series:
    print(f"Performing descriptive statistics for {len(all_series)} series.")
    for i, series in enumerate(all_series):
        section_header = f"\n--- Series {i+1} (Column: {series.columns[0]}) ---"
        print(section_header)
        stats_table = get_descriptive_stats(series)
        # display() gives the notebook's rich rendering instead of plain text
        display(stats_table)

Performing descriptive statistics for 41 series.

--- Series 1 (Column: time-axis) ---


count    626.000000
mean       2.757778
std        0.726523
min        1.502417
25%        2.130097
50%        2.757778
75%        3.385459
max        4.013140
Name: time-axis, dtype: float64


--- Series 2 (Column: 1) ---


count    626.000000
mean       0.193127
std        0.088824
min       -0.071438
25%        0.140657
50%        0.173126
75%        0.222595
max        0.372102
Name: 1, dtype: float64


--- Series 3 (Column: 2) ---


count    626.000000
mean       0.326562
std        0.171588
min       -0.131583
25%        0.178665
50%        0.307455
75%        0.477493
max        0.599394
Name: 2, dtype: float64


--- Series 4 (Column: 3) ---


count    626.000000
mean       0.014788
std        0.138334
min       -0.159414
25%       -0.088223
50%       -0.032128
75%        0.066604
max        0.404775
Name: 3, dtype: float64


--- Series 5 (Column: 4) ---


count    626.000000
mean       0.857897
std        0.632654
min       -0.206133
25%        0.248752
50%        0.767486
75%        1.449553
max        1.952338
Name: 4, dtype: float64


--- Series 6 (Column: 5) ---


count    626.000000
mean       0.110370
std        0.109403
min       -0.098612
25%        0.030227
50%        0.074963
75%        0.178977
max        0.372941
Name: 5, dtype: float64


--- Series 7 (Column: 6) ---


count    626.000000
mean       0.369888
std        0.273839
min       -0.507847
25%        0.145878
50%        0.370444
75%        0.526687
max        0.814828
Name: 6, dtype: float64


--- Series 8 (Column: 7) ---


count    626.000000
mean       0.085346
std        0.297607
min       -0.283201
25%       -0.134977
50%       -0.041850
75%        0.361451
max        0.633775
Name: 7, dtype: float64


--- Series 9 (Column: 8) ---


count    626.000000
mean       0.262107
std        0.290785
min       -0.153749
25%        0.086490
50%        0.166413
75%        0.224646
max        1.107216
Name: 8, dtype: float64


--- Series 10 (Column: 9) ---


count    626.000000
mean       0.113022
std        0.187956
min       -0.105982
25%       -0.008869
50%        0.018531
75%        0.252365
max        0.581263
Name: 9, dtype: float64


--- Series 11 (Column: 10) ---


count    626.000000
mean       0.100582
std        0.137052
min       -0.107220
25%       -0.038369
50%        0.108930
75%        0.207608
max        0.350897
Name: 10, dtype: float64


--- Series 12 (Column: 11) ---


count    626.000000
mean       0.278661
std        0.206681
min       -0.251133
25%        0.147249
50%        0.202803
75%        0.398128
max        0.763799
Name: 11, dtype: float64


--- Series 13 (Column: 12) ---


count    626.000000
mean       0.192531
std        0.302221
min       -0.266375
25%       -0.016570
50%        0.120802
75%        0.369464
max        0.899899
Name: 12, dtype: float64


--- Series 14 (Column: 13) ---


count    626.000000
mean      -0.046348
std        0.091397
min       -0.208709
25%       -0.121286
50%       -0.047832
75%        0.001285
max        0.169105
Name: 13, dtype: float64


--- Series 15 (Column: 14) ---


count    626.000000
mean       0.095942
std        0.150962
min       -0.531582
25%        0.000555
50%        0.114991
75%        0.212228
max        0.305765
Name: 14, dtype: float64


--- Series 16 (Column: 15) ---


count    626.000000
mean      -0.177415
std        0.107667
min       -0.392605
25%       -0.263672
50%       -0.148983
75%       -0.088903
max       -0.002213
Name: 15, dtype: float64


--- Series 17 (Column: 16) ---


count    626.000000
mean       0.384699
std        0.339790
min       -0.039809
25%        0.105956
50%        0.257107
75%        0.665062
max        1.050695
Name: 16, dtype: float64


--- Series 18 (Column: 17) ---


count    626.000000
mean      -0.125191
std        0.123353
min       -0.376415
25%       -0.210947
50%       -0.119775
75%       -0.040735
max        0.171354
Name: 17, dtype: float64


--- Series 19 (Column: 18) ---


count    626.000000
mean       0.107594
std        0.145778
min       -0.100741
25%       -0.017149
50%        0.085814
75%        0.208175
max        0.376851
Name: 18, dtype: float64


--- Series 20 (Column: 19) ---


count    626.000000
mean      -0.033672
std        0.318927
min       -0.385937
25%       -0.233125
50%       -0.173548
75%        0.136035
max        0.863887
Name: 19, dtype: float64


--- Series 21 (Column: 20) ---


count    626.000000
mean      -0.018419
std        0.058492
min       -0.094516
25%       -0.060896
50%       -0.042046
75%        0.006949
max        0.149279
Name: 20, dtype: float64


--- Series 22 (Column: 21) ---


count    626.000000
mean       0.524813
std        0.466660
min       -0.210916
25%        0.085892
50%        0.515517
75%        0.822957
max        1.753811
Name: 21, dtype: float64


--- Series 23 (Column: 22) ---


count    626.000000
mean      -0.092539
std        0.182655
min       -0.251113
25%       -0.201842
50%       -0.164322
75%       -0.073017
max        0.416394
Name: 22, dtype: float64


--- Series 24 (Column: 23) ---


count    626.000000
mean       0.034168
std        0.093520
min       -0.084655
25%       -0.033219
50%        0.006416
75%        0.071653
max        0.253963
Name: 23, dtype: float64


--- Series 25 (Column: 24) ---


count    626.000000
mean       0.478375
std        0.494303
min       -0.047067
25%        0.148134
50%        0.264729
75%        0.812896
max        1.736553
Name: 24, dtype: float64


--- Series 26 (Column: 25) ---


count    626.000000
mean       0.214804
std        0.435959
min       -0.060701
25%       -0.016059
50%        0.026203
75%        0.082211
max        1.499007
Name: 25, dtype: float64


--- Series 27 (Column: 26) ---


count    626.000000
mean       0.081985
std        0.044148
min       -0.086077
25%        0.042317
50%        0.086794
75%        0.119432
max        0.156104
Name: 26, dtype: float64


--- Series 28 (Column: 27) ---


count    626.000000
mean       0.391915
std        0.594811
min       -0.458958
25%       -0.194509
50%        0.286312
75%        1.075423
max        1.427382
Name: 27, dtype: float64


--- Series 29 (Column: 28) ---


count    626.000000
mean       0.016402
std        0.047418
min       -0.106589
25%       -0.022615
50%        0.033710
75%        0.049995
max        0.086333
Name: 28, dtype: float64


--- Series 30 (Column: 29) ---


count    626.000000
mean      -0.168229
std        0.151939
min       -0.390040
25%       -0.301003
50%       -0.175916
75%       -0.019078
max        0.102914
Name: 29, dtype: float64


--- Series 31 (Column: 30) ---


count    626.000000
mean      -0.044682
std        0.047546
min       -0.130273
25%       -0.082411
50%       -0.047404
75%       -0.001808
max        0.037825
Name: 30, dtype: float64


--- Series 32 (Column: 31) ---


count    626.000000
mean       0.036269
std        0.284200
min       -0.203972
25%       -0.100465
50%       -0.062615
75%        0.017501
max        1.113498
Name: 31, dtype: float64


--- Series 33 (Column: 32) ---


count    626.000000
mean       0.198819
std        0.302807
min       -0.137355
25%        0.015179
50%        0.052242
75%        0.238719
max        1.109930
Name: 32, dtype: float64


--- Series 34 (Column: 33) ---


count    626.000000
mean       0.208691
std        0.287848
min       -0.104532
25%       -0.019242
50%        0.139243
75%        0.284103
max        0.908646
Name: 33, dtype: float64


--- Series 35 (Column: 34) ---


count    626.000000
mean       0.760605
std        0.947123
min       -0.168406
25%       -0.086531
50%        0.435571
75%        1.515520
max        2.652871
Name: 34, dtype: float64


--- Series 36 (Column: 35) ---


count    626.000000
mean      -0.106716
std        0.081268
min       -0.283920
25%       -0.176151
50%       -0.095639
75%       -0.055474
max        0.078578
Name: 35, dtype: float64


--- Series 37 (Column: 36) ---


count    626.000000
mean       0.130880
std        0.154775
min       -0.135475
25%       -0.025844
50%        0.182924
75%        0.255522
max        0.378522
Name: 36, dtype: float64


--- Series 38 (Column: 37) ---


count    626.000000
mean       1.043830
std        1.162541
min       -0.208049
25%       -0.039952
50%        0.854787
75%        1.729016
max        3.799403
Name: 37, dtype: float64


--- Series 39 (Column: 38) ---


count    626.000000
mean       0.008657
std        0.092254
min       -0.137059
25%       -0.067368
50%        0.003123
75%        0.098395
max        0.160203
Name: 38, dtype: float64


--- Series 40 (Column: 39) ---


count    626.000000
mean       0.116079
std        0.240502
min       -0.085074
25%       -0.030389
50%        0.014538
75%        0.175288
max        0.909034
Name: 39, dtype: float64


--- Series 41 (Column: 40) ---


count    626.000000
mean       0.011244
std        0.164422
min       -0.151848
25%       -0.075894
50%       -0.047023
75%       -0.006947
max        0.510110
Name: 40, dtype: float64

## Stationarity Analysis (Augmented Dickey-Fuller Test)

In [4]:
def perform_adf_test(series: TimeSeries, alpha: float = 0.05) -> Dict:
    """
    Performs the Augmented Dickey-Fuller test on a TimeSeries.

    Args:
        series: Series to test. Its values are flattened to 1-D before being
            handed to ``adfuller``.
        alpha: Significance level used for the verdict; the series is labelled
            stationary when the p-value is ``<= alpha``. Defaults to 0.05.

    Returns:
        A dictionary with the keys
        ``"ADF Statistic"`` (float, NaN if the test was not run),
        ``"p-value"`` (float, NaN if the test was not run),
        ``"Critical Values"`` (dict as returned by ``adfuller``, empty if the
        test was not run) and
        ``"Conclusion"`` (human-readable verdict, skip reason, or error text).

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1.
    """
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}")

    results = {"ADF Statistic": np.nan, "p-value": np.nan, "Critical Values": {}}
    try:
        # ADF needs enough data points and some variation to be meaningful.
        testable = series.to_series().nunique() > 1 and len(series) > 8
        if not testable:
            results["Conclusion"] = "Cannot perform ADF test: Series is constant or too short."
            return results

        adf_result = adfuller(series.values().flatten())
        # Indices 0, 1 and 4 are read as: test statistic, p-value, critical values.
        p_value = float(adf_result[1])
        results["ADF Statistic"] = float(adf_result[0])
        results["p-value"] = p_value
        results["Critical Values"] = adf_result[4]
        results["Conclusion"] = (
            "Stationary (reject H0)" if p_value <= alpha
            else "Non-stationary (fail to reject H0)"
        )
    except Exception as e:
        # Best-effort: report the failure in the result instead of aborting
        # the loop over all series.
        results["Conclusion"] = f"Error during ADF test: {e}"
    return results

# Run the ADF test on every loaded series and print each result field:
# floats with three decimals, nested dicts (critical values) indented with
# two decimals, everything else verbatim.
if all_series:
    print(f"Performing ADF tests for {len(all_series)} series.")
    for i, series in enumerate(all_series):
        print(f"\n--- Series {i+1} (Column: {series.columns[0]}) ---")
        adf_results = perform_adf_test(series)
        for key, value in adf_results.items():
            if isinstance(value, float):
                print(f"  {key}: {value:.3f}")
                continue
            if not isinstance(value, dict):
                print(f"  {key}: {value}")
                continue
            # Nested mapping: print the key as a heading, then its entries.
            print(f"  {key}:")
            for k, v in value.items():
                print(f"    {k}: {v:.2f}")

Performing ADF tests for 41 series.

--- Series 1 (Column: time-axis) ---
  ADF Statistic: -0.305
  p-value: 0.925
  Critical Values:
    1%: -3.44
    5%: -2.87
    10%: -2.57
  Conclusion: Non-stationary (fail to reject H0)

--- Series 2 (Column: 1) ---
  ADF Statistic: -1.211
  p-value: 0.669
  Critical Values:
    1%: -3.44
    5%: -2.87
    10%: -2.57
  Conclusion: Non-stationary (fail to reject H0)

--- Series 3 (Column: 2) ---
  ADF Statistic: -2.352
  p-value: 0.156
  Critical Values:
    1%: -3.44
    5%: -2.87
    10%: -2.57
  Conclusion: Non-stationary (fail to reject H0)

--- Series 4 (Column: 3) ---
  ADF Statistic: -3.661
  p-value: 0.005
  Critical Values:
    1%: -3.44
    5%: -2.87
    10%: -2.57
  Conclusion: Stationary (reject H0)

--- Series 5 (Column: 4) ---
  ADF Statistic: -1.589
  p-value: 0.489
  Critical Values:
    1%: -3.44
    5%: -2.87
    10%: -2.57
  Conclusion: Non-stationary (fail to reject H0)

--- Series 6 (Column: 5) ---
  ADF Statistic: -1.160
  p-

## Autocorrelation and Partial Autocorrelation (ACF/PACF) Plots

In [5]:
def create_acf_pacf_plots(series: TimeSeries, series_name: str, save_dir: str = 'plots'):
    """
    Generates and saves ACF and PACF plots for a given TimeSeries.

    The two plots are drawn side by side in one figure and written to
    ``<save_dir>/<series_name>_acf_pacf.png``. The figure is always closed
    afterwards, including when plotting or saving fails, so calling this in a
    loop does not accumulate open figures.

    Args:
        series: Series to plot; its values are flattened to 1-D.
        series_name: Label used in the plot titles and in the output file
            name. It is inserted into the path as-is, so the caller is
            responsible for making it file-system safe.
        save_dir: Directory for the PNG; created if it does not exist.

    Returns:
        None. Progress and errors are reported with ``print``; plotting
        errors are not raised.
    """
    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, f'{series_name}_acf_pacf.png')

    fig = None
    try:
        if len(series) > 2 and series.to_series().nunique() > 1:
            values = series.values().flatten()
            # Cap at 40 lags, and stay below half the series length.
            max_lags = min(len(series) // 2 - 1, 40)

            fig, axes = plt.subplots(1, 2, figsize=(14, 5))
            plot_acf(values, ax=axes[0], lags=max_lags, title=f'ACF for {series_name}')
            plot_pacf(values, ax=axes[1], lags=max_lags, title=f'PACF for {series_name}')

            # Use the figure's own methods rather than pyplot's "current
            # figure" so the right figure is laid out and saved.
            fig.tight_layout()
            fig.savefig(file_path)
            print(f"  ACF/PACF plots saved to {file_path}")
        else:
            print(f"  Cannot generate ACF/PACF plots for {series_name}: Not enough data or series is constant.")
    except Exception as e:
        print(f"  Error generating ACF/PACF plots for {series_name}: {e}")
    finally:
        # Release the figure on both the success and the error path.
        if fig is not None:
            plt.close(fig)

# Output directory for the "<name>_acf_pacf.png" files. It is not created
# here: create_acf_pacf_plots makes save_dir itself before writing.
plots_dir = 'models/plots'

if all_series:
    print(f"Generating ACF/PACF plots for {len(all_series)} series. Plots will be saved in '{plots_dir}/'.")
    for i, series in enumerate(all_series):
        # Sanitize the column name for use in a file name.
        series_name = str(series.columns[0]).replace(' ', '_').replace('/', '_')
        create_acf_pacf_plots(series, series_name, save_dir=plots_dir)
    print("\nComprehensive statistical analysis complete. Review the output and generated plots.")
else:
    # Do not claim the analysis is complete when there was nothing to analyse.
    print("\nNo series were loaded; skipping ACF/PACF plot generation.")

Generating ACF/PACF plots for 41 series. Plots will be saved in 'models/plots/'.
  ACF/PACF plots saved to models/plots/time-axis_acf_pacf.png
  ACF/PACF plots saved to models/plots/1_acf_pacf.png
  ACF/PACF plots saved to models/plots/2_acf_pacf.png
  ACF/PACF plots saved to models/plots/3_acf_pacf.png
  ACF/PACF plots saved to models/plots/4_acf_pacf.png
  ACF/PACF plots saved to models/plots/5_acf_pacf.png
  ACF/PACF plots saved to models/plots/6_acf_pacf.png
  ACF/PACF plots saved to models/plots/7_acf_pacf.png
  ACF/PACF plots saved to models/plots/8_acf_pacf.png
  ACF/PACF plots saved to models/plots/9_acf_pacf.png
  ACF/PACF plots saved to models/plots/10_acf_pacf.png
  ACF/PACF plots saved to models/plots/11_acf_pacf.png
  ACF/PACF plots saved to models/plots/12_acf_pacf.png
  ACF/PACF plots saved to models/plots/13_acf_pacf.png
  ACF/PACF plots saved to models/plots/14_acf_pacf.png
  ACF/PACF plots saved to models/plots/15_acf_pacf.png
  ACF/PACF plots saved to models/plots/16