# Sparse Signals
Created: 07/12/2024\
Updated: 07/13/2024

## Notes
* Outline dataflow and transformations to create something like unit tests
* move to AWS/Colab
* simulate [See Chinco gist](https://gist.github.com/alexchinco/467325abbf11d5c8f565)
* restrict to trading days and model overnight separately
* keep track of selected stocks and their coefficients
* precompute the 30 minute training windows
* Use SGDRegressor with L1 instead of LassoCV
* Use Numpy instead of Pandas
* Remove stocks with limited obs
* Impute missing values
* choose a trading strategy over which to compute Sharpe ratios
* compute pct over/under metrics; max/min error; other asymmetric metrics
* Try other LLMs to critique the code
* Re-implement in Julia
* Speed up with Spark, Dask, Ray, or C++. ([See ChatGPT discussion](https://chatgpt.com/share/d010299a-3bb5-4230-8389-3530de660cf9))

In [None]:
#%pip install statsmodels

In [None]:
# suppress warnings (e.g. those raised by statsmodels)
import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from tqdm.notebook import tqdm
from tqdm import tqdm
from IPython.display import display

from statsmodels.tsa.ar_model import AutoReg
from joblib import Parallel, delayed
import pickle


# Sample window: first second of `_start_year` through the last second of
# `_end_year` (both 2017 here, i.e. a single calendar year).
_start_year, _end_year = 2017, 2017
start_year = pd.to_datetime(f"{_start_year}-01-01 00:00:00")
end_year = pd.to_datetime(f"{_end_year}-12-31 23:59:59")

# `asset_type` and `period` are path components of the CSV directory built
# further down. `timeframe`, `adjustment` and `N_TICKERS` are not read by the
# visible cells -- presumably consumed by the data-preparation step; verify.
asset_type, period = 'stock', '1min'
timeframe, adjustment = 'full', 'adjsplitdiv'
N_TICKERS = 300

def add_trading_hours(df, trading_start=930, trading_end=1600):
    """
    Add indicator for trading hours

    Two columns are added to ``df`` in place: ``time`` (the clock time of
    each row as an ``HHMM`` integer, e.g. 09:30 -> 930) and
    ``is_trading_hour`` (True when ``trading_start <= time <= trading_end``,
    both bounds inclusive). Only the clock time is examined; the calendar
    date (weekends, holidays) is not.

    Parameters
    ----------
    df: pd.DataFrame
        Must have index as datetime
    trading_start: int, default 930
        First minute of the session as an ``HHMM`` integer.
    trading_end: int, default 1600
        Last minute of the session as an ``HHMM`` integer.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in, with the two columns added.
    """
    # '%H%M' yields zero-padded strings such as '0930'; int() turns that into 930.
    df['time'] = df.index.strftime('%H%M').astype(int)
    df['is_trading_hour'] = df['time'].between(trading_start, trading_end)
    return df

# Choose the minute-bar CSV directory for this machine: a drive-letter path
# when os.name reports 'nt' (Windows), a mounted external disk otherwise.
CSV_DEST_DIR = (
    f"E:/frd-historical/data/{asset_type}/{period}/csv/"
    if os.name == 'nt'
    else f"/media/reggie/reg_ext/frd-historical/data/{asset_type}/{period}/csv/"
)
print(f"{len(os.listdir(CSV_DEST_DIR))} files found in {CSV_DEST_DIR}")
# Cell output: the first ten directory entries.
os.listdir(CSV_DEST_DIR)[:10]

In [None]:
# Load the precomputed feature matrix from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by this project.
with open('return_features_df.pickle', 'rb') as file:
    return_features_df = pickle.load(file)
print(return_features_df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line.
return_features_df.info()

# A ticker is the text before the first underscore of each column name.
# Sorting makes the order reproducible: iteration order of a set of strings
# can differ from one interpreter session to the next.
tickers = sorted({col.split("_")[0] for col in return_features_df.columns})
print(f"Number of tickers: {len(tickers)}")


# First slow attempt

In [None]:
# (scratch cell) The bare `X_rolling.` that stood here was an incomplete
# expression and a syntax error; X_rolling is built in the pre-cache cell.

In [None]:
# Rolling-window forecast comparison (pandas version).
# For every run of `window_size` consecutive rows this fits
#   * a cross-validated LASSO of the focal ticker's return on the lagged
#     returns of every ticker, and
#   * an autoregression (10 lags) on the focal ticker's own returns,
# and stores one forecast per model per window.
data = return_features_df.copy()
# For a quick smoke test, truncate first: data = data.iloc[:32]
focal_ticker = 'AAPL'
window_size = 30
prediction_horizon = 1
lasso_cv = 10

# Identify columns for the focal ticker
focal_columns = [f'{focal_ticker}_return',
                 f'{focal_ticker}_lag_1_return',
                 f'{focal_ticker}_lag_2_return',
                 f'{focal_ticker}_lag_3_return']

# Prepare the list of all lagged columns (three lags per ticker)
lagged_columns = [
    f'{ticker}_lag_{i}_return'
    for ticker in tickers
    for i in range(1, 4)
]

# Every list here receives exactly one entry per stored window, so the dict
# converts cleanly to a DataFrame. (A key that is never appended to would make
# pd.DataFrame(results) fail with unequal column lengths.)
results = {'date': [], 'lasso': [], 'ar': [], 'ret': [], 'lasso_coef': []}

n_windows = len(data) - window_size - prediction_horizon

# Loop-invariant selections, hoisted so each iteration only slices rows.
lagged_columns_df = data[lagged_columns]
lagged_values = lagged_columns_df.values
focal_returns = data[f'{focal_ticker}_return']

# K-fold cross-validation needs at least as many samples as folds.
min_train_rows = max(5, lasso_cv)

for i in tqdm(range(window_size, len(data) - prediction_horizon)):
    start_idx = i - window_size
    end_idx = i

    # Prepare predictors (X) and response (y): rows [start_idx, end_idx)
    X = lagged_values[start_idx:end_idx]
    y_window = focal_returns.iloc[start_idx:end_idx]
    y = y_window.values
    if len(X) < min_train_rows:
        continue

    # LASSO Model: fit on the window, forecast from the row right after it.
    # Fit and predict both receive plain arrays so scikit-learn sees a
    # consistent input type.
    lasso = LassoCV(cv=lasso_cv, n_jobs=10).fit(X, y)
    latest_data = lagged_values[end_idx:end_idx + 1]
    if latest_data.size == 0:
        continue
    lasso_pred = lasso.predict(latest_data)[0]

    # AR Model: one-step-ahead forecast past the end of the window.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ar_model = AutoReg(y_window.dropna(), lags=10).fit()
        n_obs = len(ar_model.model.endog)
        ar_forecast = ar_model.predict(start=n_obs, end=n_obs)
        # predict() returns a length-1 container; take the element explicitly
        # rather than calling float() on the container (deprecated).
        ar_pred = float(np.asarray(ar_forecast).ravel()[0])

    # Store predictions
    # NOTE(review): the LASSO forecast uses the features of row `end_idx`,
    # the stored date is row `end_idx + prediction_horizon`, and `ret` is the
    # last in-window return (row `end_idx - 1`). Confirm this labelling is
    # intended before joining forecasts to realized returns by date.
    results['date'].append(data.index[end_idx + prediction_horizon])
    results['lasso'].append(lasso_pred)
    results['ar'].append(ar_pred)
    results['ret'].append(y[-1])
    results['lasso_coef'].append(lasso.coef_)

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)
results_df.set_index('date', inplace=True)
results_df

In [None]:
# Cell output: the feature matrix as a raw NumPy array.
return_features_df.values

In [None]:
# Rolling-window forecast comparison (NumPy version of the loop above).
data = return_features_df.reset_index().values
# Column names in the same order as the columns of `data`: reset_index()
# places the former index first, followed by the DataFrame's own columns.
# (Rebuilding the names from `tickers` would assume a column order that the
# DataFrame is not guaranteed to have, and the indices below would then point
# at the wrong columns.) Assumes a single-level index -- as `dates` does too.
column_names = ['date'] + list(return_features_df.columns)

# Assuming the first column is the date
dates = data[:, 0]

# Find indices of relevant columns
focal_indices = [column_names.index(f'{focal_ticker}_return')]
focal_indices.extend(column_names.index(f'{focal_ticker}_lag_{i}_return') for i in range(1, 4))

lagged_indices = [
    column_names.index(f'{ticker}_lag_{i}_return')
    for ticker in tickers
    for i in range(1, 4)
]

results = {'date': [], 'lasso': [], 'ar': [], 'ret': [], 'lasso_coef': []}

n_windows = len(data) - window_size - prediction_horizon

# `data` also carries the date column, so it may be an object array; take
# float copies of the numeric columns once, outside the loop.
lagged_columns_array = data[:, lagged_indices].astype(float)
focal_returns = data[:, focal_indices[0]].astype(float)

# K-fold cross-validation needs at least as many samples as folds.
min_train_rows = max(5, lasso_cv)

for i in tqdm(range(window_size, len(data) - prediction_horizon)):
    start_idx = i - window_size
    end_idx = i

    # Prepare predictors (X) and response (y): rows [start_idx, end_idx)
    X = lagged_columns_array[start_idx:end_idx]
    y = focal_returns[start_idx:end_idx]

    if len(X) < min_train_rows:
        continue

    # LASSO Model: fit on the window, forecast from the row right after it.
    lasso = LassoCV(cv=lasso_cv, n_jobs=10).fit(X, y)
    latest_data = lagged_columns_array[end_idx:end_idx + 1]
    if latest_data.size == 0:
        continue
    lasso_pred = lasso.predict(latest_data)[0]

    # AR Model: one-step-ahead forecast past the end of the window.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ar_model = AutoReg(y, lags=10).fit()
        n_obs = len(ar_model.model.endog)
        ar_forecast = ar_model.predict(start=n_obs, end=n_obs)
        # predict() returns a length-1 container; take the element explicitly
        # rather than calling float() on the container (deprecated).
        ar_pred = float(np.asarray(ar_forecast).ravel()[0])

    # Store predictions
    # NOTE(review): the LASSO forecast uses the features of row `end_idx`,
    # the stored date is row `end_idx + prediction_horizon`, and `ret` is the
    # last in-window return (row `end_idx - 1`). Confirm this labelling is
    # intended before joining forecasts to realized returns by date.
    results['date'].append(dates[end_idx + prediction_horizon])
    results['lasso'].append(lasso_pred)
    results['ar'].append(ar_pred)
    results['ret'].append(y[-1])
    results['lasso_coef'].append(lasso.coef_)

# Convert results to numpy array for easier analysis:
# one row per window = (date, lasso, ar, ret, coef_0, coef_1, ...)
results_array = np.array([(d, l, a, r, *lc) for d, l, a, r, lc in zip(
    results['date'], results['lasso'], results['ar'], results['ret'], results['lasso_coef']
)])

# If you need to convert back to pandas DataFrame:
# import pandas as pd
# results_df = pd.DataFrame(results_array, columns=['date', 'lasso', 'ar', 'ret'] + [f'lasso_coef_{i}' for i in range(len(results['lasso_coef'][0]))])
# results_df.set_index('date', inplace=True)
# results_df

In [None]:
# pre-cache windows
X_rolling = []
y_rolling = []
for i in range(window_size, len(data) - prediction_horizon):
    start_idx = i - window_size
    end_idx = i
    _X = data[start_idx:end_idx, lagged_indices]
    _y = data[start_idx:end_idx, focal_indices[0]]
    X_rolling.append(_X)
    y_rolling.append(_y)
# ...

In [None]:
# Scratch inspection of the state left behind by the most recent
# rolling-window loop. In a notebook only the last expression of a cell is
# displayed; the bare tuples/names above it are leftovers from step-by-step
# debugging.
# `data` may have been rebound to a NumPy array (which has no `.index`), so the
# date of the last stored forecast is read back from `results` instead.
end_idx, prediction_horizon, results['date'][-1]
lasso_pred, ar_pred, y[-1], max(1000*lasso.coef_)
# All result lists should have the same length (one entry per stored window).
len(results['date']), len(results['lasso']), len(results['ar']), len(results['ret']), len(results['lasso_coef'])
date_df = pd.DataFrame(results['date'], columns=['date'])
lasso_df = pd.DataFrame(results['lasso'], columns=['lasso'])
ar_df = pd.DataFrame(results['ar'], columns=['ar'])
ret_df = pd.DataFrame(results['ret'], columns=['ret'])
date_df
lasso_df
ar_df
ret_df
# Side-by-side frame of the scalar results (the coefficient vectors are left out).
results_df = pd.concat([date_df, lasso_df, ar_df, ret_df], axis=1)
results_df

In [None]:
# Toy example of keeping only selected keys of a dict.
d = dict(x=[-2, -1, 0], y=[1, 2, 3], z=[4, 5, 6])
# get dict with only x and y
{key: d[key] for key in d if key in ('x', 'y')}


In [None]:
# Scalar results only: the per-window coefficient vectors are summarised
# separately below.
results_df = pd.DataFrame({k: v for k, v in results.items() if k in ['date', 'lasso', 'ar', 'ret']})
results_df.set_index('date', inplace=True)
results_df


# Per-window summary of the fitted LASSO coefficient vector: largest, smallest,
# largest in magnitude, and the counts of positive and negative coefficients.
lasso_results = {'date': [], 'max_coef': [], 'min_coef': [], 'abs_max_coef': [], 'n_coefs_gt_0': [], 'n_coefs_lt_0': []}
# Walk dates and coefficient vectors in lockstep. Looking each date up again
# with list.index() would be quadratic and would always return the first
# window when a date occurs more than once.
for date, coefs in zip(results['date'], results['lasso_coef']):
    coefs = np.asarray(coefs)
    lasso_results['date'].append(date)
    lasso_results['max_coef'].append(coefs.max())
    lasso_results['min_coef'].append(coefs.min())
    lasso_results['abs_max_coef'].append(np.abs(coefs).max())
    lasso_results['n_coefs_gt_0'].append(int((coefs > 0).sum()))
    lasso_results['n_coefs_lt_0'].append(int((coefs < 0).sum()))
lasso_results_df = pd.DataFrame(lasso_results)
lasso_results_df.set_index('date', inplace=True)

# Persist the summary both as CSV and as a pickle.
lasso_results_df.to_csv("lasso-results-df.csv")
with open('lasso_results_df.pickle', 'wb') as file:
    pickle.dump(lasso_results_df, file)

In [None]:
# NOTE(review): expects X to be the coefficient-summary frame with 'min_coef'
# and 'max_coef' columns; X is assigned that frame in the cell that reloads
# lasso_results_df, so this cell depends on notebook run order -- confirm.
X['abs_min_coef'] = X['min_coef'].abs()
# set biggest coef to max(X['max_coef'], X['abs_min_coef'])
# This overwrites 'max_coef' in place with the row-wise larger of the two columns.
X['max_coef'] = X[['max_coef', 'abs_min_coef']].max(axis=1)
X

In [None]:
# examine max lasso coefs
# Reload the per-window coefficient summary from its pickle file.
with open('lasso_results_df.pickle', 'rb') as file:
    lasso_results_df = pickle.load(file)

# plot distribution of biggest magnitude lasso coefs
X = lasso_results_df.copy()
# Threshold treated as "numerically zero": rows whose largest |coefficient|
# is at or below it are dropped before plotting.
zero = 0.0000000001
X = X[X['abs_max_coef'] > zero]
plt.figure(figsize=(10, 6))
plt.hist(X['abs_max_coef'], bins=100)
plt.title('Distribution of biggest magnitude LASSO coefficients')
plt.xlabel('Coefficient')
plt.ylabel('Frequency')
plt.show()
# Cell output: the full (unfiltered) summary frame.
lasso_results_df

In [None]:
# compare distributions of max and min coefs
# The three histograms share one axes, so each is drawn semi-transparent and
# labelled; without that the later ones hide the earlier ones and the series
# cannot be told apart.
plt.figure(figsize=(10, 6))
for column in ('abs_max_coef', 'max_coef', 'min_coef'):
    plt.hist(X[column], bins=100, alpha=0.5, label=column)

plt.title('Distribution of max, min and biggest magnitude LASSO coefficients')
plt.xlabel('Coefficient')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Load a previously saved forecast table.
# NOTE(review): no visible cell writes "results_df.pickle"; presumably it was
# saved in an earlier session -- confirm it matches the current results layout.
fp = "results_df.pickle"
with open(fp, "rb") as files:
    results_df = pickle.load(files)
# Cell output: the loaded frame.
results_df


In [None]:

# Evaluate and compare predictions
def evaluate_forecasts(results_df, true_values):
    """
    Score the LASSO and AR forecasts against realized values.

    Parameters
    ----------
    results_df: pd.DataFrame
        Must have 'lasso' and 'ar' columns. Forecast dates are read from a
        'date' column when present, otherwise from the index (the form
        produced by ``set_index('date')``).
    true_values: pd.Series
        Realized values indexed by date; it is looked up with ``.loc`` at the
        forecast dates, so every forecast date must be present in its index.

    Returns
    -------
    dict
        ``{'lasso_r2': ..., 'ar_r2': ...}`` where each value is the squared
        Pearson correlation between the forecasts and the realized values, or
        NaN when there are fewer than two forecasts.
    """
    lasso_preds = results_df['lasso']
    ar_preds = results_df['ar']
    # Accept the dates either as a column or as the index.
    if 'date' in results_df.columns:
        dates = results_df['date']
    else:
        dates = results_df.index
    true_vals = true_values.loc[dates]

    evaluation = {
        'lasso_r2': _squared_correlation(lasso_preds, true_vals),
        'ar_r2': _squared_correlation(ar_preds, true_vals),
    }

    return evaluation


def _squared_correlation(preds, actuals):
    """Return the squared Pearson correlation of two equal-length sequences (NaN below two points)."""
    if len(preds) <= 1:
        return np.nan
    return np.corrcoef(preds, actuals)[0, 1] ** 2

# Realized returns of the focal ticker, taken from the feature matrix.
true_values = return_features_df[f'{focal_ticker}_return']
# Score the forecasts in results_df against those realized returns.
evaluation_results = evaluate_forecasts(results_df, true_values)

print(evaluation_results)
