In [186]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

from lightgbm import LGBMRegressor
import torch
import torch.nn as nn

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error, median_absolute_error

In [140]:
target_col = 'close'

In [168]:
# Data created with data_import.ipynb
data_raw = pd.read_csv('data/data.csv')
data = data_raw.copy()

  data_raw = pd.read_csv('data/data.csv')


In [169]:
# Convert 'date' column to datetime
data['datetime'] = pd.to_datetime(data['datetime'])

# Sort dataframe by date (if not already sorted)
data = data.sort_values(['ticker', 'datetime'])

data['Founded'] = data['Founded'].astype(str)

# Get founded year from messy Founded column, using regex
ticker_founded = (data.groupby('ticker')['Founded']
                  .apply(lambda x: x.str.extractall(r'(\d{4,})').astype(int))
                  .reset_index()
                  .groupby('ticker')
                  .agg({0:'min'})
                  .rename(columns={0:'founded_regex'})
                  .reset_index())

data = data.merge(ticker_founded, on='ticker', how='left')

# Remove records that are before the founded date
data = data[data['datetime'].dt.year >= data['founded_regex']]

In [170]:
data['close'] = np.log(data['close'])

In [171]:
# Calculate EWMA for each term
data['ewma_extr_short'] = data['close'].ewm(alpha=0.1).mean()
data['ewma_short'] = data['close'].ewm(alpha=0.3).mean()
data['ewma_mid'] = data['close'].ewm(alpha=0.5).mean()
data['ewma_extr_mid'] = data['close'].ewm(alpha=0.7).mean()
data['ewma_long'] = data['close'].ewm(alpha=0.9).mean()
data['ewma_extr_long'] = data['close'].ewm(alpha=0.99).mean()

In [172]:
import pandas as pd

def calculate_smoothed_rsi(prices, window=14):
    deltas = prices.diff().dropna()
    gain = deltas * 0
    loss = deltas * 0

    gain[deltas > 0] = deltas[deltas > 0]
    loss[deltas < 0] = -deltas[deltas < 0]

    avg_gain = gain.ewm(span=window, min_periods=window).mean()
    avg_loss = loss.ewm(span=window, min_periods=window).mean()

    rs = avg_gain / avg_loss
    smoothed_rsi = 100 - (100 / (1 + rs))

    return smoothed_rsi

# Group by 'ticker' and calculate smoothed RSI for each group
smoothed_rsi = data.groupby('ticker')['close'].apply(calculate_smoothed_rsi)

# Combine the results into a DataFrame
smoothed_rsi_df = smoothed_rsi.reset_index(level=0, drop=True).rename('smoothed_rsi')

# Merge the smoothed RSI back to the original DataFrame
data = data.merge(smoothed_rsi_df, left_index=True, right_index=True)

In [173]:
data.head()

Unnamed: 0,datetime,open,high,low,close,volume,ticker,Security,GICS Sector,GICS Sub-Industry,...,CIK,Founded,founded_regex,ewma_extr_short,ewma_short,ewma_mid,ewma_extr_mid,ewma_long,ewma_extr_long,smoothed_rsi
2,1999-11-19,28.93,28.97,26.82,3.303217,10897100.0,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,...,1090872.0,1999,1999.0,3.34407,3.33873,3.331965,3.32312,3.311057,3.304071,
3,1999-11-22,27.84,29.65,26.99,3.389462,4705200.0,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,...,1090872.0,1999,1999.0,3.36082,3.361895,3.364821,3.370848,3.381692,3.388608,
4,1999-11-23,28.64,29.39,26.95,3.293983,4274400.0,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,...,1090872.0,1999,1999.0,3.341385,3.335084,3.327041,3.316603,3.302746,3.29493,
5,1999-11-24,27.04,28.26,26.95,3.320349,3464400.0,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,...,1090872.0,1999,1999.0,3.336248,3.329771,3.323587,3.319232,3.318589,3.320095,
6,1999-11-26,27.54,27.96,27.46,3.323236,1237100.0,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,...,1090872.0,1999,1999.0,3.333471,3.327549,3.323409,3.322037,3.322771,3.323204,


In [16]:
#TODO Fibonacci Levels

MemoryError: Unable to allocate 11.2 MiB for an array with shape (1472997,) and data type float64

In [191]:
target_col = 'close'
features = ['GICS Sector', 'ewma_extr_short', 'ewma_short', 'ewma_mid', 'ewma_extr_mid', 'ewma_long', 'ewma_extr_long', 'smoothed_rsi']

data['smoothed_rsi'] = data['smoothed_rsi'].fillna(0)

X = data.copy()[features]
y = data.copy()[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=X['GICS Sector'])

In [192]:
def create_lgbm_pipeline(X):
    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=['number']).columns
    categorical_cols = X.select_dtypes(exclude=['number']).columns

    # Define preprocessing steps for numerical and categorical columns
    numerical_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combine preprocessing steps using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Create the pipeline with preprocessor and LightGBM Regressor
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor())
    ])

    return pipeline


In [193]:
X

Unnamed: 0,GICS Sector,ewma_extr_short,ewma_short,ewma_mid,ewma_extr_mid,ewma_long,ewma_extr_long,smoothed_rsi
2,Health Care,3.344070,3.338730,3.331965,3.323120,3.311057,3.304071,0.000000
3,Health Care,3.360820,3.361895,3.364821,3.370848,3.381692,3.388608,0.000000
4,Health Care,3.341385,3.335084,3.327041,3.316603,3.302746,3.294930,0.000000
5,Health Care,3.336248,3.329771,3.323587,3.319232,3.318589,3.320095,0.000000
6,Health Care,3.333471,3.327549,3.323409,3.322037,3.322771,3.323204,0.000000
...,...,...,...,...,...,...,...,...
3484729,Health Care,5.105036,5.055426,5.031758,5.017114,5.010062,5.009123,13.755351
3484730,Health Care,5.097589,5.047968,5.031163,5.026532,5.028518,5.030354,27.615805
3484731,Health Care,5.090179,5.040624,5.027325,5.024400,5.023989,5.023554,26.023649
3484732,Health Care,5.084276,5.037784,5.029241,5.029130,5.030440,5.031081,30.995173


In [194]:
pipeline = create_lgbm_pipeline(X_train)
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1807
[LightGBM] [Info] Number of data points in the train set: 2722626, number of used features: 18
[LightGBM] [Info] Start training from score 3.176239


In [195]:
# Calculate evaluation metrics on the test set
mse = mean_squared_error(y_test, preds)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

# Print the evaluation metrics
print("Mean Squared Error (MSE) on Test Set:", mse)
print("Mean Absolute Error (MAE) on Test Set:", mae)
print("R-squared (R2) on Test Set:", r2)

Mean Squared Error (MSE) on Test Set: 0.002641141648477203
Mean Absolute Error (MAE) on Test Set: 0.010221992922250879
R-squared (R2) on Test Set: 0.9986953527663955
