# CRE Statistical Analysis

In [5]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from scipy.stats import shapiro
import matplotlib.pyplot as plt
import os

In [12]:
# Notebook-level path configuration.
# All paths are relative to the kernel's working directory; the earlier
# script-relative lookup via `__file__` is not used here because `__file__`
# is not available inside a notebook kernel.

# Input CSV. Kept in sync with the '../data/CRE.csv' location that the
# data-loading cell reads (the previous value, 'data/CRE.csv', pointed at a
# different directory and was never used).
data_path = os.path.join('..', 'data', 'CRE.csv')

# Output directory for saved figures; created up front so that later
# savefig() calls do not fail on a missing folder.
plot_dir = os.path.join('models', 'plots')
os.makedirs(plot_dir, exist_ok=True)

In [28]:
# Load the raw CSV, convert the 'time-axis' column to timestamps and make it
# the index of the frame.
# NOTE(review): unit='s' interprets the values as seconds since the epoch. The
# recorded output shows the whole range falling within a few seconds of
# 1970-01-01, so the column may not actually hold epoch seconds -- confirm the
# intended unit before relying on the timestamps.
df = pd.read_csv('../data/CRE.csv')
df = df.assign(
    **{'time-axis': pd.to_datetime(df['time-axis'], unit='s')}
).set_index('time-axis')

# Series under analysis: column '1' with missing observations removed.
time_series = df['1'].dropna()

# Short textual summary of what was loaded, followed by descriptive statistics.
summary_lines = [
    "--- Data Loading and Preparation ---",
    f"Time series range: {time_series.index.min()} to {time_series.index.max()}",
    f"Number of observations: {len(time_series)}",
    "\n",
]
print(*summary_lines, sep="\n")
print(time_series.describe())


--- Data Loading and Preparation ---
Time series range: 1970-01-01 00:00:01.502416596 to 1970-01-01 00:00:04.013139518
Number of observations: 626


count    626.000000
mean       0.193127
std        0.088824
min       -0.071438
25%        0.140657
50%        0.173126
75%        0.222595
max        0.372102
Name: 1, dtype: float64


In [21]:
# Augmented Dickey-Fuller test on the series.
# Index 0 of the result is the test statistic, index 1 the p-value and
# index 4 the mapping of critical values, as used below.
adf_result = adfuller(time_series)
adf_stat, adf_pvalue = adf_result[0], adf_result[1]
adf_critical_values = adf_result[4]

print(f'ADF Statistic: {adf_stat}')
print(f'p-value: {adf_pvalue}')
print('Critical Values:')
for level, threshold in adf_critical_values.items():
    print(f'\t{level}: {threshold}')

# Verdict at the 0.05 level: a p-value above it is reported as
# "likely non-stationary", anything else as "likely stationary".
adf_verdict = (
    "Result: The series is likely non-stationary (p-value > 0.05)."
    if adf_pvalue > 0.05
    else "Result: The series is likely stationary (p-value <= 0.05)."
)
print(adf_verdict)
print("\n")

ADF Statistic: -1.2107349478708258
p-value: 0.6689608444594178
Critical Values:
	1%: -3.4411869900487906
	5%: -2.866321181236609
	10%: -2.569316262893616
Result: The series is likely non-stationary (p-value > 0.05).




In [22]:
# Additive decomposition of the series into trend, seasonal and residual parts.
# The seasonal period is an assumption: 12 is only a generic starting point and
# has not been derived from the data. Spectral analysis or inspection of the
# ACF plot would give a better-founded value.
seasonal_period = 12

# Heading numbered "4." to agree with the "5." and "6." headings of the
# following sections.
print("--- 4. Time Series Decomposition ---")
decomposition = sm.tsa.seasonal_decompose(
    time_series, model='additive', period=seasonal_period
)

fig = decomposition.plot()
fig.set_size_inches(10, 8)
fig.suptitle('Time Series Decomposition', y=1.02)
fig.tight_layout()

decomposition_plot_path = os.path.join(plot_dir, 'cre_decomposition.png')
# The title is anchored at y=1.02, i.e. above the figure canvas. Without
# bbox_inches='tight' the saved image is cropped to the canvas and the title
# is cut off; 'tight' expands the saved area to include it.
fig.savefig(decomposition_plot_path, bbox_inches='tight')
# Close this specific figure so it is neither displayed inline nor kept in memory.
plt.close(fig)
print(f"Decomposition plot saved to '{decomposition_plot_path}'")
print("\n")

--- 4. Time Series Decomposition ---
Decomposition plot saved to 'models/plots/cre_decomposition.png'




In [23]:
# Autocorrelation (top panel) and partial autocorrelation (bottom panel) of the
# series for lags up to 40, written to a PNG in the plot directory.
print("--- 5. Autocorrelation Analysis ---")
fig, panels = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
correlogram_plotters = (sm.graphics.tsa.plot_acf, sm.graphics.tsa.plot_pacf)
for draw_correlogram, panel in zip(correlogram_plotters, panels):
    draw_correlogram(time_series, lags=40, ax=panel)
fig.tight_layout()

acf_pacf_plot_path = os.path.join(plot_dir, 'cre_acf_pacf.png')
fig.savefig(acf_pacf_plot_path)
# The figure is only needed on disk; close it once it has been saved.
plt.close(fig)
print(f"ACF and PACF plots saved to '{acf_pacf_plot_path}'")
print("\n")

--- 5. Autocorrelation Analysis ---
ACF and PACF plots saved to 'models/plots/cre_acf_pacf.png'




In [24]:
# Normality test (Shapiro-Wilk).
# Applied to the residual component of the decomposition computed earlier,
# after dropping the missing values it contains.
print("--- 6. Normality Test (Shapiro-Wilk) ---")
residuals = decomposition.resid.dropna()
shapiro_result = shapiro(residuals)
shapiro_stat, shapiro_p = shapiro_result
print(f"Shapiro-Wilk Statistic: {shapiro_stat}", f"P-value: {shapiro_p}", sep="\n")

# Verdict at the 0.05 level: a p-value above it is reported as consistent
# with normally distributed residuals.
normality_verdict = (
    "Result: The residuals appear to be normally distributed (p-value > 0.05)."
    if shapiro_p > 0.05
    else "Result: The residuals do not appear to be normally distributed (p-value <= 0.05)."
)
print(normality_verdict)
print("\n")

--- 6. Normality Test (Shapiro-Wilk) ---
Shapiro-Wilk Statistic: 0.8944648393815715
P-value: 2.6084567307943963e-20
Result: The residuals do not appear to be normally distributed (p-value <= 0.05).


