# Ibis
Created: 09/13/2024\
Updated: 09/14/2024

## Setup Environment

In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import add_trading_hours
from fredapi import Fred
from openai import OpenAI

In [2]:
# MUST SET COMPUTING ENVIRONMENT
# Selects which machine-specific directories the setup cell uses.
# Exactly one assignment should be active: 'windows', 'ubuntu', or 'aws'.
COMPUTING_ENV = 'windows'
#COMPUTING_ENV = 'ubuntu'
#COMPUTING_ENV = 'aws'

In [None]:
# Display the kernel's current working directory.
os.getcwd()

In [None]:
# Resolve the machine-specific root directories from COMPUTING_ENV.
if COMPUTING_ENV == 'windows':
    WORKING_DIR = "C:\\Users\\regin\\Dropbox\\ibis"
    API_KEYS_DIR = "C:\\Users\\regin\\Dropbox\\API_KEYS"
elif COMPUTING_ENV == 'ubuntu':
    WORKING_DIR = "/home/reggie//Dropbox/ibis"
    API_KEYS_DIR = "/home/reggie/Dropbox/API_KEYS"
elif COMPUTING_ENV == 'aws':
    WORKING_DIR = "/home/ubuntu/ibis"
    API_KEYS_DIR = "/home/ubuntu/API_KEYS"
else:
    # Fail here with a clear message instead of a NameError on WORKING_DIR below.
    raise ValueError(
        f"Unknown COMPUTING_ENV {COMPUTING_ENV!r}; expected 'windows', 'ubuntu' or 'aws'."
    )

# Data directories derived from the working directory.
DATA_DIR = os.path.join(WORKING_DIR, "data")
STOCK_DATA_DIR  = os.path.join(DATA_DIR, 'tmp')
FRD_DATA_DIR = os.path.join(DATA_DIR, 'frd-historical')
print(f"Working directory is\n\t{WORKING_DIR}")
print(f"Data directory is\n\t{DATA_DIR}")
print(f"Stock data directory is\n\t{STOCK_DATA_DIR}")
print(f"FRD data directory is\n\t{FRD_DATA_DIR}")

# data dictionary: series metadata persisted as JSON; start empty when the file is missing
data_dictionary_fp = os.path.join(DATA_DIR, "data_dictionary.json")
if not os.path.exists(data_dictionary_fp):
    print(f"Data dictionary does not exist in {data_dictionary_fp}. Initializing empty data dictionary.")
    data_dictionary = {}
else:
    print(f"Data dictionary exists in {data_dictionary_fp}. Loading data dictionary.")
    with open(data_dictionary_fp, "r") as f:
        data_dictionary = json.load(f)
    print('\tKeys:', data_dictionary.keys())

# OpenAI API key
# The key itself is never printed: notebook outputs are saved with the file,
# so echoing a secret would leak it to anyone the notebook is shared with.
openai_api_key_fp = os.path.join(API_KEYS_DIR, 'openai-api-key-1.txt')
with open(openai_api_key_fp) as f:
    OPENAI_API_KEY = f.read().strip()
print(f"OpenAI API key loaded from {openai_api_key_fp}")

# FRED Data
fred_api_key_fp = os.path.join(API_KEYS_DIR, 'FRED-API-KEY')
with open(fred_api_key_fp) as f:
    fred_api_key = f.read().strip()
print(f"FRED API key loaded from {fred_api_key_fp}")
FRED_DIR = os.path.join(DATA_DIR, "FRED")
fred_daily_data_fp = os.path.join(FRED_DIR, 'daily', 'FRED_daily.csv')
if not os.path.exists(fred_daily_data_fp):
    print(f"FRED daily data do not exist in {fred_daily_data_fp}.")
else:
    print(f"FRED daily data are in {fred_daily_data_fp}")

# a table to map asset types to download directories
frd_download_directories = pd.read_csv(os.path.join(FRD_DATA_DIR, 'frd-download-directories.csv'))
frd_download_directories


# Utilities

In [5]:
def load_stock_prices(ticker, asset_type, period='full', timeframe='1min', adjustment='adj_splitdiv', data_dir='./'):
    """Load one ticker's price file and add return columns.

    Parameters:
    ----------
        ticker : str : symbol used in the file name and written to the 'ticker' column
        asset_type : str : 'stock' (OHLC + volume file) or 'index' (OHLC file)
        period : str : period token of the file name (e.g. 'full')
        timeframe : str : bar size token of the file name (e.g. '1min', '1day')
        adjustment : str : adjustment token for stock files; underscores are removed
            when building the file name
        data_dir : str : directory containing the price file

    Returns:
    -------
        prices_df : pd.DataFrame : prices indexed by 'date' (ascending) with a 'day'
            column, return columns and a 'ticker' column. For intraday timeframes
            'time' and 'time_id' are added and the frame is passed through
            add_trading_hours.

    Raises:
    ------
        ValueError : if asset_type is neither 'stock' nor 'index'
    """
    # The two supported asset types differ only in file name and column layout.
    if asset_type == 'stock':
        csv_filename = f"{ticker}_{period}_{timeframe}_{adjustment.replace('_','')}.txt"
        column_names = ['date', 'open', 'high', 'low', 'close', 'volume']
    elif asset_type == 'index':
        csv_filename = f"{ticker}_{period}_{timeframe}.txt"
        column_names = ['date', 'open', 'high', 'low', 'close']
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f"Unsupported asset_type {asset_type!r}; expected 'stock' or 'index'.")

    fp = os.path.join(data_dir, csv_filename)
    prices_df = pd.read_csv(
        fp,
        sep=",",
        names=column_names,
        header=0,  # the first row is treated as a header and replaced by `names`
        on_bad_lines='warn',  # malformed lines are skipped with a warning
        engine='python'  # Use the Python engine for more flexible error handling
    )

    # Convert 'date' column to datetime if it's not already
    prices_df['date'] = pd.to_datetime(prices_df['date'])

    # Extract the calendar day of each bar
    prices_df['day'] = prices_df['date'].dt.date

    # Intraday handling is keyed on the bar size (`timeframe`). The previous check
    # used `period`, whose values ('full', ...) never match these bar sizes, so the
    # intraday columns were never added.
    if timeframe in ['1min', '5min', '30min', '1hour']:
        # Extract the time of day
        prices_df['time'] = prices_df['date'].dt.time

        # Calculate the time ID (minute of the day from 1 to 1440)
        prices_df['time_id'] = prices_df['date'].dt.hour * 60 + prices_df['date'].dt.minute + 1

        prices_df = prices_df.set_index('date')
        # add_trading_hours comes from the project's utils module; its effect is not visible here.
        prices_df = add_trading_hours(prices_df)
    else:
        prices_df = prices_df.set_index('date')

    # add returns (rows must be in chronological order for the shifted/lagged terms)
    prices_df = prices_df.sort_index(ascending=True)
    prices_df['open_to_close_ret'] = prices_df['close']/prices_df['open'] - 1
    prices_df['close_to_close_ret'] = prices_df['close'].pct_change()
    prices_df['overnight_ret'] = prices_df['open']/prices_df['close'].shift(1) - 1
    prices_df['open_to_high_ret'] = prices_df['high']/prices_df['open'] - 1
    prices_df['open_to_low_ret'] = prices_df['low']/prices_df['open'] - 1
    prices_df['low_to_high'] = prices_df['high']/prices_df['low'] - 1 # max possible return

    prices_df['ticker'] = ticker

    return prices_df

# Load Index Data

In [None]:
# Parameters identifying the index series to load.
asset_type = 'index'
ticker = 'SPX'
period = 'full'
timeframe = '1day'

# Find the download sub-directory registered for this asset type and timeframe;
# the first matching row is used.
matching_directories = frd_download_directories.query(f"type == '{asset_type}' & timeframe == '{timeframe}'")
index_subdirectory = matching_directories['directory'].values[0]
index_data_dir = os.path.join(FRD_DATA_DIR, index_subdirectory, 'csv')
print(f"Index data directory is {index_data_dir}")

spx_df = load_stock_prices(ticker, asset_type, period=period, timeframe=timeframe, data_dir=index_data_dir)
spx_df


# GPT Prompt

In [None]:
# OpenAI client used by gpt_code(); authenticated with the key read in the setup cell.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
def parse_python_from_gpt_markdown(content: str, save: bool = False, filename: str = None):
    """Extract the Python code from a ChatGPT response written in Markdown.

    Only lines inside fenced blocks opened with ```python are kept; every other
    line (prose, other fenced blocks, the fences themselves) is dropped.

    Parameters:
    ----------
        content : str : The ChatGPT response in Markdown form
        save : bool : When True, also write the extracted code to `filename`
        filename : str : Destination path; required when `save` is True

    Returns:
    -------
        python_str : str : The Python code as a string, one "\\n"-terminated line
            per extracted line ("" when no Python block is present)

    Raises:
    ------
        ValueError : if `save` is True and no filename is given
    """
    # Validate up front so the failure is explicit rather than an opaque
    # TypeError raised from open(None).
    if save and not filename:
        raise ValueError("A filename is required when save=True.")

    code_lines = []
    in_python_block = False
    for raw_line in content.split("\n"):
        # Responses with Windows line endings would otherwise carry a stray "\r"
        # into every extracted line.
        line = raw_line.rstrip("\r")
        if line.startswith("```python"):
            in_python_block = True
        elif line.startswith("```"):
            # Any other fence closes the Python block (or opens/closes a non-Python one).
            in_python_block = False
        elif in_python_block:
            code_lines.append(line + "\n")
    python_str = "".join(code_lines)

    if save:
        with open(filename, "w") as f:
            f.write(python_str)
    return python_str

def gpt_code(system_prompt: str, user_prompt: str, filename, save, model='gpt-4o') -> "tuple[str, str]":
    """Send a system/user prompt pair to the chat completions API and extract the Python code.

    Parameters:
    ----------
        system_prompt : str : Content of the "system" message
        user_prompt : str : Content of the "user" message
        filename : str : Path the extracted code is written to when `save` is True
        save : bool : Whether to write the extracted code to `filename`
        model : str : Model identifier passed to the API

    Returns:
    -------
        content : str : The raw response text of the first choice
        python_str : str : The Python code extracted from `content`
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # The message content may be None; fall back to an empty string so the
    # parser below does not fail on it.
    content = completion.choices[0].message.content or ""
    # Forward the caller's save/filename; they were previously ignored.
    return content, parse_python_from_gpt_markdown(content, save=save, filename=filename)

# Download FRED Data

In [6]:
# FRED client, authenticated with the key loaded in the setup cell.
fred = Fred(api_key=fred_api_key)

# Series identifiers fetched from FRED, one list per section below.
fred_tips_series = ['DFII10', 'DFII5', 'DFII20', 'DFII30']
fred_treasury_series = ['DGS10', 'DGS2', 'DGS30']

# CSV files under FRED_DIR/daily that cache each downloaded table.
fred_tips_daily_data_fp, fred_treasuries_daily_data_fp = (
    os.path.join(FRED_DIR, 'daily', csv_name)
    for csv_name in ('FRED_daily_tips.csv', 'FRED_daily_treasuries.csv')
)

# Set a flag to True to re-download that table; False reads the cached CSV instead.
create_or_update_fred_tips_data = create_or_update_fred_treasuries_data = False

## Treasury Inflation-Indexed Securities

In [None]:

if not create_or_update_fred_tips_data:
    # Reuse the cached CSV written by a previous download.
    tips_df = pd.read_csv(fred_tips_daily_data_fp, index_col='date', parse_dates=True)
else:
    # create or update data dictionary: fetch metadata only for series not recorded yet
    for series in fred_tips_series:
        if series in data_dictionary:
            print(f"{series} already in data dictionary.")
        else:
            data_dictionary[series] = dict(fred.get_series_info(series))

    # export data_dictionary to json
    with open(data_dictionary_fp, 'w') as f:
        json.dump(data_dictionary, f, indent=2)

    # One column per series; the observation dates become a datetime index named 'date'.
    tips_df = (
        pd.DataFrame({series: fred.get_series(series) for series in fred_tips_series})
        .reset_index()
        .rename(columns={"index": "date"})
    )
    tips_df['date'] = pd.to_datetime(tips_df['date'])
    tips_df = tips_df.set_index('date')
    tips_df.to_csv(fred_tips_daily_data_fp)
tips_df


## Treasury Constant Maturity

In [None]:
if not create_or_update_fred_treasuries_data:
    # Reuse the cached CSV written by a previous download.
    treasuries_df = pd.read_csv(fred_treasuries_daily_data_fp, index_col='date', parse_dates=True)
else:
    # Fetch metadata only for series that the data dictionary does not hold yet.
    for series in fred_treasury_series:
        if series in data_dictionary:
            print(f"{series} already in data dictionary.")
        else:
            data_dictionary[series] = dict(fred.get_series_info(series))

    # export data_dictionary to json
    with open(data_dictionary_fp, 'w') as f:
        json.dump(data_dictionary, f, indent=2)

    # One column per series; the observation dates become a datetime index named 'date'.
    treasuries_df = (
        pd.DataFrame({series: fred.get_series(series) for series in fred_treasury_series})
        .reset_index()
        .rename(columns={"index": "date"})
    )
    treasuries_df['date'] = pd.to_datetime(treasuries_df['date'])
    treasuries_df = treasuries_df.set_index('date')
    treasuries_df.to_csv(fred_treasuries_daily_data_fp)
treasuries_df

# Analyze Inflation and Interest Rate Data

## Load Series and Plot Levels and Changes

In [None]:
# Levels: read the cached TIPS table, carry the last observation forward over gaps,
# then drop rows that still contain missing values.
fred_tips_daily_df = pd.read_csv(fred_tips_daily_data_fp, index_col='date', parse_dates=True)
X = fred_tips_daily_df.ffill().dropna()

# add pct change (row-over-row); the first row has no predecessor and is dropped
# NOTE(review): percent changes of a series that touches or crosses zero can be
# infinite or very large, and dropna() does not remove infinities - confirm this is intended.
X_pct_change = X.pct_change().dropna()

# One line chart per table, one line per column.
for frame, title in (
    (X, "Treasury Inflation-Indexed Securities (TIPS)"),
    (X_pct_change, "Treasury Inflation-Indexed Securities (TIPS) Percent Change"),
):
    plt.figure(figsize=(12, 8))
    for col in frame.columns:
        plt.plot(frame.index, frame[col], label=col)
    plt.title(title)
    plt.legend()
    plt.show()

X_pct_change

## Correlation Matrix

In [None]:
# Pairwise correlations of the percent changes.
corr = X_pct_change.corr()
print(corr.round(2))

# heatmap: one cell per pair of series, both axes labelled with the column names
tick_positions = list(range(len(corr)))
fig, ax = plt.subplots(figsize=(6, 6))
heatmap = ax.imshow(corr, cmap='coolwarm', interpolation='none')
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)
plt.show()

In [None]:
# Pairwise correlations of the levels (this replaces the `corr` computed in the previous cell).
corr = X.corr()
print(corr.round(2))

# heatmap: one cell per pair of series, both axes labelled with the column names
tick_positions = list(range(len(corr)))
fig, ax = plt.subplots(figsize=(6, 6))
heatmap = ax.imshow(corr, cmap='coolwarm', interpolation='none')
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)
plt.show()

## Perform Singular Value Decomposition

In [None]:
# Thin SVD of X as left by the preceding cells (the TIPS levels when the notebook
# is run top to bottom): X = U @ diag(s) @ Vt.
#   U  - left singular vectors, one row per row of X (temporal patterns)
#   s  - singular values, largest first
#   Vt - right singular vectors, one column per column of X (relationships between variables)
U, s, Vt = np.linalg.svd(X, full_matrices=False)

# Retain only top k components for dimensionality reduction
k = 3
U_k, S_k, Vt_k = U[:, :k], np.diag(s[:k]), Vt[:k, :]

# Low-rank approximation of X
X_approx = U_k @ (S_k @ Vt_k)

# Singular values show the importance of each component
print("Top singular values:", s[:10])

# Use Vt_k to analyze relationships between variables (columns of X)
print("Right singular vectors (V^T):\n", Vt_k)

# The k retained left singular vectors serve as factors
factors = U_k



### Scree Plot (Singular Values)
A scree plot displays the magnitude of the singular values, which tells you how much variance each component explains. This can help identify how many components are important and where the diminishing returns occur.

In [None]:
def plot_scree(s):
    """Draw a scree plot: each singular value against its 1-based component number.

    Parameters:
    ----------
        s : sequence of numbers : singular values (e.g. from an SVD), in plotting order
    """
    component_numbers = np.arange(1, len(s) + 1)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(component_numbers, s, marker='o', linestyle='-')
    ax.set_title("Scree Plot of Singular Values")
    ax.set_xlabel("Component Number")
    ax.set_ylabel("Singular Value")
    ax.grid(True)
    plt.show()

# Example usage:
# Slicing stops at the end of the array, so all values are plotted when s has fewer than 50.
plot_scree(s[:50])  # Plot the first 50 singular values


### Cumulative Variance Explained

Another useful plot is the cumulative explained variance, which shows how much total variance is explained as you include more singular values. It helps to decide how many components are necessary to capture most of the variance.

In [None]:
def plot_cumulative_variance(s):
    """Plot the cumulative share of the squared singular values.

    The share is relative to the sum of squares of the values passed in, so the
    curve always ends at 1 for whatever subset `s` contains.

    Parameters:
    ----------
        s : np.ndarray : singular values, in the order they should be accumulated
    """
    squared = s**2
    explained_variance = np.cumsum(squared) / np.sum(squared)
    component_counts = np.arange(1, len(s) + 1)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(component_counts, explained_variance, marker='o', linestyle='-')
    ax.set_title("Cumulative Explained Variance")
    ax.set_xlabel("Number of Components")
    ax.set_ylabel("Cumulative Explained Variance")
    ax.grid(True)
    plt.show()

# Example usage:
# Slicing stops at the end of the array, so all values are used when s has fewer than 50.
plot_cumulative_variance(s[:50])  # Plot cumulative variance for the first 50 components


### Heatmap of Top Singular Vectors (Right Singular Vectors $V^T$)

You can visualize the right singular vectors, which describe how the variables (columns of the original data) contribute to each principal component. A heatmap can highlight the relationship between variables and components.

This heatmap shows the contribution of each time series (variables) to the top principal components. Patterns, correlations, and groups of similar variables will be visible, showing how variables are related to each principal component.

In [None]:
def plot_singular_vectors(Vt, k=10):
    """Draw a heatmap of the leading right singular vectors (rows of V^T).

    Parameters:
    ----------
        Vt : np.ndarray : matrix whose rows are right singular vectors
        k : int : maximum number of rows to show; fewer are shown when Vt has
            fewer than k rows
    """
    # Slicing silently returns fewer than k rows when Vt is short, so the title
    # reports the number of rows actually drawn instead of the requested k.
    shown = Vt[:k, :]
    n_shown = len(shown)

    plt.figure(figsize=(10, 6))
    sns.heatmap(shown, cmap='coolwarm', center=0)
    plt.title(f"Heatmap of Top {n_shown} Right Singular Vectors (V^T)")
    plt.xlabel("Variables (Time Series)")
    plt.ylabel("Singular Vector Index")
    plt.show()

# Example usage:
# Vt_k holds only the rows kept by the SVD cell, so at most that many vectors are drawn.
plot_singular_vectors(Vt_k, k=10)  # Request up to 10 right singular vectors


### Time Series Projections (Left Singular Vectors U)

The left singular vectors represent temporal patterns. You can project the original time series data onto the top components and visualize these projections to understand the key dynamics of the system over time.

This plot shows how the time series evolve in terms of the most important components. You can spot trends, cycles, or other important temporal patterns.

In [None]:
def plot_time_series_projections(U, components=[0, 1]):
    """Plot selected columns of U as lines against the row position.

    Parameters:
    ----------
        U : np.ndarray : matrix of left singular vectors, one column per component
        components : list of int : zero-based column indices to plot; the legend
            labels them with one-based component numbers
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for component_index in components:
        ax.plot(U[:, component_index], label=f"Component {component_index+1}")

    ax.set_title("Time Series Projections onto Top Components")
    ax.set_xlabel("Time Steps")
    ax.set_ylabel("Projection")
    ax.legend()
    ax.grid(True)
    plt.show()

# Example usage:
# Indices are zero-based columns of U_k; the legend shows them as Component 1 and 2.
plot_time_series_projections(U_k, components=[0, 1])  # Plot the first two components


### 2D/3D Scatter Plot of Time Series in Reduced Space

By projecting the time series into a reduced space (using the top singular vectors), you can visualize the relationships between time steps or time series in lower dimensions (e.g., a 2D or 3D scatter plot).

These projections show how the data points cluster in lower dimensions, helping to identify groups, trends, or anomalies. In 2D or 3D, clusters or separations between groups of time steps or variables may become more evident.

In [None]:
def plot_2d_projection(U, Vt, singular_values):
    """Scatter the rows of the data in the plane of the first two components.

    Parameters:
    ----------
        U : np.ndarray : left singular vectors; the first two columns are used
        Vt : np.ndarray : right singular vectors (accepted but not used)
        singular_values : np.ndarray : singular values; the first two scale the columns of U
    """
    # Scale the first two left singular vectors by their singular values.
    scaling = np.diag(singular_values[:2])
    scores = U[:, :2] @ scaling

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(scores[:, 0], scores[:, 1], alpha=0.5, s=10)
    ax.set_title("2D Projection of Time Series Data")
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    ax.grid(True)
    plt.show()

# Example usage:
# The second argument (Vt_k) is accepted by the function but not used in the plot.
plot_2d_projection(U_k, Vt_k, s[:2])


In [None]:
def plot_3d_projection(U, singular_values):
    """Scatter the rows of the data in the space of the first three components.

    Parameters:
    ----------
        U : np.ndarray : left singular vectors; the first three columns are used
        singular_values : np.ndarray : singular values; the first three scale the columns of U
    """
    # Scale the first three left singular vectors by their singular values.
    scaling = np.diag(singular_values[:3])
    scores = U[:, :3] @ scaling
    xs, ys, zs = scores[:, 0], scores[:, 1], scores[:, 2]

    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    ax.scatter(xs, ys, zs, alpha=0.5, s=10)

    ax.set_title("3D Projection of Time Series Data")
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")
    ax.set_zlabel("Component 3")

    plt.show()

# Example usage:
# Uses the three components retained in U_k together with their singular values.
plot_3d_projection(U_k, s[:3])


# Analyze Rates II

In [None]:
# Combine the TIPS and Treasury tables on their indexes (inner join, the merge default).
# The suffixes only apply to column names that appear in both tables.
X = pd.merge(
    tips_df,
    treasuries_df,
    left_index=True,
    right_index=True,
    suffixes=('_tips', '_treasuries'),
)
# rename 'date' index to 'time_index'
X.index.rename('time_index', inplace=True)
X

In [None]:
# number of minutes in a year (365 days * 24 hours * 60 minutes)
minutes_in_year = 525600

In [None]:
# Synthetic stand-in data: n one-minute rows of k independent standard-normal features.
# NOTE: this replaces X (and k) defined by earlier cells.
k = 500
n = 100_000 # number of minutes

# Fixed seed so that re-running the notebook reproduces the same data.
rng = np.random.default_rng(42)
X = rng.standard_normal((n, k))

# 'min' is the minute frequency alias; the older 'T' spelling is deprecated in recent pandas.
time_index = pd.date_range(start='2020-01-01', periods=len(X), freq='min')
X = pd.DataFrame(X, index=time_index).round(2)
X.columns = [f"feature_{i}" for i in range(k)]
print(X.shape)

# Only the last expression of a cell is displayed, so show the first and last
# rows together instead of a head() call whose output would be discarded.
pd.concat([X.head(), X.tail()])

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests, adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import STL
from sklearn.decomposition import PCA
from scipy.signal import cwt, ricker
from scipy.fft import fft, fftfreq
import os
import warnings
warnings.filterwarnings('ignore')

# Output folders for the figures and tables written by the analysis loop;
# existing folders are left as they are.
for output_dir in ('./figures', './tables'):
    os.makedirs(output_dir, exist_ok=True)

# Column of X that every other series is compared against.
# Replace 'feature_0' with your actual target column name.
target_col = 'feature_0'
target_series = X[target_col]

# Frequencies X is resampled to before each analysis pass:
# 5 Minutes, 15 Minutes, 30 Minutes, Hourly, Daily
# NOTE(review): the 'T' and 'H' aliases are deprecated in recent pandas releases;
# they also appear in the output file names and in the loop's period table.
resolutions = ['5T', '15T', '30T', '1H', '1D']

# For each resolution: resample X, then run correlation, ADF, STL, FFT, wavelet,
# PCA, ACF/PACF and Granger analyses of the target column, writing tables to
# ./tables and figures to ./figures.

# Settings that do not depend on the resolution are defined once, outside the loop.
# STL period per resolution: the number of bars in one day for the intraday
# frequencies (288 x 5 min, 96 x 15 min, 48 x 30 min, 24 x 1 h) and 7 for daily bars.
freq_dict = {'5T': 288, '15T': 96, '30T': 48, '1H': 24, '1D': 7}  # Adjusted periods
widths = np.arange(1, 31)  # wavelet scales 1..30
max_lag = 10               # largest lag tried in the Granger causality tests

for res in resolutions:
    print(f"\nAnalyzing data resampled to {res} resolution.")

    # Resample the data (mean of every column within each bin)
    X_resampled = X.resample(res).mean()
    target_resampled = X_resampled[target_col]
    # Several steps need the target without missing values; compute it once.
    target_clean = target_resampled.dropna()

    # Save resampled target series
    target_resampled.to_csv(f'./tables/target_series_{res}.csv')

    # 1. Correlation Analysis at the current resolution
    correlations = X_resampled.corrwith(target_resampled).sort_values(ascending=False)
    top_correlations = correlations.head(10)
    print(f"Top correlated series with the target series at {res} resolution:")
    print(top_correlations)

    # Save top correlations to a CSV file
    top_correlations.to_csv(f'./tables/top_correlated_series_{res}.csv', header=['Correlation'])

    # Plot the top correlated series with the target series
    top_correlated_series = correlations.index[1:6]  # Exclude the first one (itself)
    for col in top_correlated_series:
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(target_resampled.index, target_resampled.values, label='Target Series')
        ax.plot(target_resampled.index, X_resampled[col].values, label=f'Series {col}')
        ax.legend()
        ax.set_title(f'Target Series vs Series {col} at {res} resolution')
        ax.set_xlabel('Time')
        ax.set_ylabel('Value')
        # Save the plot
        fig.savefig(f'./figures/target_vs_series_{col}_{res}.png')
        plt.close(fig)

    # 2. Stationarity Tests (ADF Test)
    adf_result = adfuller(target_clean)
    adf_output = pd.Series(adf_result[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    print(f"ADF Test Result for Target Series at {res} resolution:")
    print(adf_output)

    # Save ADF test result
    adf_output.to_csv(f'./tables/adf_test_{res}.csv', header=['Value'])

    # 3. Time Series Decomposition
    # STL requires a period, looked up from the table defined before the loop.
    freq = freq_dict[res]
    stl = STL(target_clean, period=freq)
    stl_result = stl.fit()

    # Plot the decomposition; plot() creates its own figure, which is kept in
    # hand so that exactly this figure is titled, saved and closed.
    stl_fig = stl_result.plot()
    stl_fig.suptitle(f'STL Decomposition of Target Series at {res} resolution')
    stl_fig.savefig(f'./figures/stl_decomposition_{res}.png')
    plt.close(stl_fig)

    # Save decomposition components
    decomposition_df = pd.DataFrame({
        'Trend': stl_result.trend,
        'Seasonal': stl_result.seasonal,
        'Residual': stl_result.resid
    }, index=target_clean.index)
    decomposition_df.to_csv(f'./tables/stl_decomposition_{res}.csv')

    # 4. Spectral Analysis (Fourier Transform)
    # A local name is used for the sample count so the module-level `n` is not overwritten.
    n_obs = len(target_clean)
    yf = fft(target_clean.values)  # Convert Series to NumPy array
    xf = fftfreq(n_obs, 1)[:n_obs//2]
    amplitude = 2.0/n_obs * np.abs(yf[0:n_obs//2])

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(xf, amplitude)
    ax.set_title(f'Frequency Domain of Target Series at {res} resolution')
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Amplitude')
    fig.savefig(f'./figures/frequency_domain_{res}.png')
    plt.close(fig)

    # Save frequency and amplitude data
    freq_ampl_df = pd.DataFrame({'Frequency': xf, 'Amplitude': amplitude})
    freq_ampl_df.to_csv(f'./tables/frequency_domain_{res}.csv', index=False)

    # 5. Wavelet Transform (Time-Frequency Analysis)
    # NOTE(review): scipy.signal.cwt / ricker are deprecated in recent SciPy
    # releases - confirm they exist in the installed version.
    cwt_matr = cwt(target_resampled.fillna(0).values, ricker, widths)  # Convert Series to NumPy array
    cwt_limit = abs(cwt_matr).max()  # symmetric colour range around zero

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.imshow(cwt_matr, extent=[0, len(target_resampled), 1, 31], cmap='PRGn', aspect='auto',
              vmax=cwt_limit, vmin=-cwt_limit)
    ax.set_title(f'Continuous Wavelet Transform of Target Series at {res} resolution')
    ax.set_xlabel('Time')
    ax.set_ylabel('Scale')
    fig.savefig(f'./figures/cwt_{res}.png')
    plt.close(fig)

    # Note: Saving the entire wavelet matrix may not be practical due to size

    # 6. PCA at the current resolution
    # At most 5 components, capped by the data's shape so narrow or short inputs still work.
    n_components = min(5, X_resampled.shape[0], X_resampled.shape[1])
    pc_names = [f'PC{i+1}' for i in range(n_components)]
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(X_resampled.fillna(0))
    explained_variance = pca.explained_variance_ratio_
    print(f"Explained Variance Ratios by Principal Components at {res} resolution:")
    for pc_name, ev in zip(pc_names, explained_variance):
        print(f"{pc_name}: {ev:.4f}")

    # Save explained variance ratios
    ev_df = pd.DataFrame({'Explained Variance Ratio': explained_variance}, index=pc_names)
    ev_df.to_csv(f'./tables/pca_explained_variance_{res}.csv')

    # Correlate PCs with target series
    pc_df = pd.DataFrame(principal_components, index=X_resampled.index, columns=pc_names)
    pc_df[target_col] = target_resampled.values
    pc_correlations = pc_df.corr()[target_col][:-1]  # Exclude the target column itself
    print(f"Correlation between Target Series and Principal Components at {res} resolution:")
    print(pc_correlations)

    # Save PC correlations with target series
    pc_correlations.to_csv(f'./tables/pc_correlations_with_target_{res}.csv', header=['Correlation'])

    # Plot the target series with principal components
    for pc in pc_names:
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(target_resampled.index, target_resampled.values, label='Target Series')
        ax.plot(target_resampled.index, pc_df[pc].values, label=pc)
        ax.legend()
        ax.set_title(f'Target Series vs {pc} at {res} resolution')
        ax.set_xlabel('Time')
        ax.set_ylabel('Value')
        # Save the plot
        fig.savefig(f'./figures/target_vs_{pc}_{res}.png')
        plt.close(fig)

    # 7. Autocorrelation and Partial Autocorrelation Plots at the current resolution
    # Determine the maximum number of lags based on data length
    max_lags = min(20, len(target_clean) - 1)

    if max_lags > 0:
        # plot_acf / plot_pacf create a new figure unless given an axes. Passing
        # `ax` makes the (12, 6) size apply to the saved plot and avoids leaving
        # an empty, never-closed figure behind on every iteration.
        fig, ax = plt.subplots(figsize=(12, 6))
        plot_acf(target_clean, lags=max_lags, ax=ax)
        ax.set_title(f'Autocorrelation Function of Target Series at {res} resolution')
        fig.savefig(f'./figures/target_series_acf_{res}.png')
        plt.close(fig)

        fig, ax = plt.subplots(figsize=(12, 6))
        plot_pacf(target_clean, lags=max_lags, ax=ax)
        ax.set_title(f'Partial Autocorrelation Function of Target Series at {res} resolution')
        fig.savefig(f'./figures/target_series_pacf_{res}.png')
        plt.close(fig)
    else:
        print(f"Not enough data to compute autocorrelation at {res} resolution.")

    # Optional: Granger Causality Tests at the current resolution
    # For each top-correlated series, collect the ssr chi2 test p-value at lags 1..max_lag.
    test_results = {}
    for col in top_correlated_series:
        pair_df = X_resampled[[target_col, col]].dropna()
        if len(pair_df) > max_lag:
            test = grangercausalitytests(pair_df, maxlag=max_lag, verbose=False)
            p_values = [round(test[lag][0]['ssr_chi2test'][1], 4) for lag in range(1, max_lag + 1)]
            test_results[col] = p_values
        else:
            print(f"Not enough data for Granger Causality Test at {res} resolution for series {col}.")
            test_results[col] = [np.nan]*max_lag

    gc_df = pd.DataFrame(test_results, index=range(1, max_lag+1))
    print(f"Granger Causality Test P-values at {res} resolution:")
    print(gc_df)

    # Save Granger causality test results
    gc_df.to_csv(f'./tables/granger_causality_pvalues_{res}.csv')


# Load RDFN Price Data

* Log(Price) transformation

In [None]:
# Define parameters
# These are passed to load_stock_prices() in the next cell.
ticker = 'RDFN'
asset_type = "stock"        # Example values: stock, etf, futures, crypto, index, fx
period = "full"             # Example values: full, month, week, day
timeframe = "1min"          # Example values: 1min, 5min, 30min, 1hour, 1day
adjustment = "adj_splitdiv"    # Example values: adj_split, adj_splitdiv, UNADJUSTED
# Same file-name pattern that load_stock_prices() builds for stocks
# (underscores are removed from the adjustment token).
stock_csv_filename = f"{ticker}_{period}_{timeframe}_{adjustment.replace('_','')}.txt"
print(stock_csv_filename)

In [None]:


rdfn_df = load_stock_prices(ticker, asset_type, period, timeframe, adjustment, STOCK_DATA_DIR)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
rdfn_df.info()

# Replace prices and volume with their natural logarithms.
# NOTE(review): the return columns added by load_stock_prices were computed from
# the raw prices and are left unchanged; np.log of a zero volume gives -inf -
# confirm whether zero-volume bars can occur in this file.
price_volume_cols = ['open', 'high', 'low', 'close', 'volume']
rdfn_df[price_volume_cols] = rdfn_df[price_volume_cols].apply(np.log)
rdfn_df

In [None]:
# alt: read the .csv counterpart of the price file from DATA_DIR instead.
# This replaces the rdfn_df produced by the previous cell.
rdfn_csv_fp = os.path.join(DATA_DIR, stock_csv_filename.replace('.txt', '.csv'))
rdfn_df = pd.read_csv(rdfn_csv_fp, index_col='date', parse_dates=True)
rdfn_df

In [None]:
# Keep only rows dated 2024-07-01 or later.
rdfn_df = rdfn_df.query("date >= '2024-07-01'")
rdfn_df

In [None]:
# Summary statistics of the columns of rdfn_df.
rdfn_df.describe()

In [None]:
# Line plots of the high price and of the volume, each in its own figure.
# Values are plotted against their row position, not against the date index.
for column, legend_label, figure_title in (
    ('high', f"{ticker} High", f"{ticker} High Price"),
    ('volume', f"{ticker} Volume", f"{ticker} Volume"),
):
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(rdfn_df[column].values, label=legend_label)
    ax.set_title(figure_title)
    ax.legend()
    plt.show()

In [None]:
# daily resampling: collapse the bars of each calendar day into one OHLCV row
# (first open, highest high, lowest low, last close, total volume)
# NOTE(review): days without any rows may still appear in the result with
# missing prices - confirm how downstream cells should treat them.
daily_aggregations = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
}
rdfn_daily_df = rdfn_df.resample('D').agg(daily_aggregations)
rdfn_daily_df

In [None]:
# Reload the cached TIPS table with its 'date' column as a datetime index.
tips_csv_fp = os.path.join(FRED_DIR, 'daily', 'FRED_daily_tips.csv')
tips_df = pd.read_csv(tips_csv_fp, index_col='date', parse_dates=True)
tips_df

In [None]:
# Reload the cached Treasury table with its 'date' column as a datetime index.
treasuries_csv_fp = os.path.join(FRED_DIR, 'daily', 'FRED_daily_treasuries.csv')
treasuries_df = pd.read_csv(treasuries_csv_fp, index_col='date', parse_dates=True)
treasuries_df

In [None]:
# Inner join on the index: keep only dates present in both the TIPS and Treasury tables.
macro_df = tips_df.join(treasuries_df, how='inner')
macro_df

In [None]:
# Left join: keep every row of the daily RDFN table and attach the macro series where the dates match.
# This replaces the X defined by earlier cells.
X = rdfn_daily_df.join(macro_df, how='left')
X

In [None]:
# Rows with positive volume dated from 2024-09-01 (inclusive) to 2024-09-13 (exclusive).
X.query("volume > 0").query("date >= '2024-09-01'").query("date < '2024-09-13'")

In [None]:
# Line plot of the DFII10 column of the joined table against its date index.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(X.index, X['DFII10'], label='DFII10')
# title
ax.set_title("Treasury Inflation-Indexed Securities (TIPS)")
ax.legend()
plt.show()