## Adding parameters to the data model
**ema20** = exponential moving average 20 days.
**ema50** = exponential moving average 50 days.
**openHigher** = If Open > (1.01 * Previous day Close) then 1 else 0.
**strongClose** = Let midpoint = (High + Low) / 2. If Close > midpoint then 1 else 0. Maybe not necessary. Let's try the model with and without the strongClose parameter.
**averageVolume** = Average volume from past 200 days. 200 simple moving average for volume.
**strongVolume** = If Volume > (2 * averageVolume) then 1 else 0.
**strongVolume6MoPrior** = If strongVolume is equal to 1 more than 3 times in the past 126 days then 1 else 0.
**strongVolumeAfterFiling** = If strongVolume is equal to 1 more than 2 times in the 5 days following financialRelease = 1 then 1 else 0.
**accVolume** = If strongVolume = 1 and strongVolume6MoPrior = 1 and strongVolumeAfterFiling = 1 then 1 else 0.
**uptrend** = If ema50 > (ema50 50 days prior) then 1 else 0.

In [85]:
import requests
import pandas as pd
# from datetime import datetime, timedelta, date
import datetime
import time
from polygon import RESTClient
import logging
import signal
import sys
import pickle
import lz4.frame  # type: ignore
import concurrent.futures
import os
import pandas as pd
import numpy as np
import glob
import nbimporter
from readRawAggs import main

In [86]:
# # Define the path to your existing CSV file
# csv_file_path = r"C:\Users\SamuliMustonen\Documents\Ready Solutions\Docs\StockTrading\Data\ntnx_data_raw.csv"
# # Load the CSV file into a DataFrame
# df = pd.read_csv(csv_file_path, delimiter=';', header=0)

In [87]:
# Function to calculate EMA
def calculate_ema(data, window):
    """Calculate the per-symbol Exponential Moving Average (EMA) of 'close'.

    The EMA is computed independently inside every 'symbol' group with
    ``ewm(span=window, adjust=False)``, in the row order of ``data``.

    Args:
        data: DataFrame with at least the columns 'symbol' and 'close'.
            Side effect: the 'close' column is cast to float in place.
        window: Span of the EMA, in rows.

    Returns:
        A float Series named 'close' that carries the same index as ``data``.
        All rows of a symbol that has fewer than ``window`` rows are NaN,
        because that symbol does not have enough history for this span.
    """
    # Ensure the 'close' column is of type float
    data['close'] = data['close'].astype(float)

    closes = data.groupby('symbol')['close']
    # transform() keeps the caller's index, so the result can be assigned
    # straight back as a new column.
    ema = closes.transform(lambda s: s.ewm(span=window, adjust=False).mean())

    # The history check must be made per symbol: the total row count of a
    # multi-symbol frame says nothing about how much data one symbol has.
    rows_per_symbol = closes.transform('size')
    return ema.where(rows_per_symbol >= window)

In [88]:
# Function to calculate openHigher.
def calculate_open_higher(data, multiplier=1.01):
    """Flag rows whose open is above ``multiplier`` times the previous close.

    The previous close is taken from the preceding row of the same 'symbol',
    so rows are compared in the order they appear in ``data``.

    Args:
        data: DataFrame with at least the columns 'symbol', 'open' and
            'close'. Side effect: 'open' and 'close' are cast to float in
            place. No other column is added to or removed from ``data``.
        multiplier: Factor applied to the previous close before comparing.
            Defaults to 1.01, i.e. an open more than 1 % above that close.

    Returns:
        An int Series named 'openHigher' (1 or 0) with the same index as
        ``data``. The first row of every symbol has no previous close and is
        therefore 0.
    """
    # Ensure the 'open' and 'close' columns are of type float
    data['open'] = data['open'].astype(float)
    data['close'] = data['close'].astype(float)

    # Keep the shifted close in a local Series: writing it into ``data`` as a
    # helper column would leave that column behind in the caller's frame.
    prev_close = data.groupby('symbol')['close'].shift(1)

    # Comparisons against NaN are False, which yields 0 for first rows.
    open_higher = (data['open'] > (multiplier * prev_close)).astype(int)
    return open_higher.rename('openHigher')

In [89]:
# Function to calculate average volume
def calculate_avg_volume(data, window):
    """Calculate the per-symbol simple moving average of 'volume'.

    Args:
        data: DataFrame with at least the columns 'symbol' and 'volume'.
            Side effect: the 'volume' column is cast to float in place.
        window: Number of rows in the rolling window.

    Returns:
        A float Series named 'volume' with the same index as ``data``. The
        first ``window - 1`` rows of every symbol are NaN because the rolling
        window is not yet full; a symbol with fewer than ``window`` rows is
        NaN throughout.
    """
    # Ensure the 'volume' column is of type float
    data['volume'] = data['volume'].astype(float)

    # transform() already returns the result on the caller's index. It must
    # not be reset: dropping the only index level would renumber the rows
    # 0..n-1 and misalign the values when they are assigned back to a frame
    # whose index is not in that order (e.g. after sort_values).
    return data.groupby('symbol')['volume'].transform(
        lambda volumes: volumes.rolling(window).mean()
    )

In [90]:
# Function to calculate strong volume
def calculate_strong_volume(data, min_rows=100):
    """Flag rows whose volume is more than twice the average volume.

    Args:
        data: DataFrame with at least the columns 'volume' and
            'averageVolume'. Side effect: both columns are cast to float in
            place.
        min_rows: Minimum number of rows ``data`` must contain for the flag to
            be considered meaningful. Defaults to 100, the value that was
            previously hard-coded.

    Returns:
        A Series with the same index as ``data``. With at least ``min_rows``
        rows it is an int Series of 1/0 (rows with a NaN average are 0). With
        fewer rows it is a float Series filled with NaN.
    """
    # Ensure the 'volume' and 'averageVolume' columns are of type float
    data['volume'] = data['volume'].astype(float)
    data['averageVolume'] = data['averageVolume'].astype(float)

    # Strong volume: volume greater than 2 times averageVolume
    strong_volume = (data['volume'] > 2 * data['averageVolume']).astype(int)

    if len(data) < min_rows:
        # Cast before blanking so the NaN assignment does not depend on
        # pandas silently upcasting an int Series.
        strong_volume = strong_volume.astype(float)
        strong_volume[:] = float('nan')

    return strong_volume

In [91]:
# Function to calculate strong volume 6 months prior of openHigher
def calculate_strong_volume_prior(data, window):
    """Count strong-volume rows in the ``window`` rows before each row.

    For every row the result is the sum of 'strongVolume' over up to
    ``window`` preceding rows of the same 'symbol'; the current row itself is
    excluded.

    Args:
        data: DataFrame with at least the columns 'symbol' and
            'strongVolume'. Side effect: 'strongVolume' has NaN replaced by 0
            and is cast to int in place.
        window: Number of preceding rows to look back over.

    Returns:
        A float Series named 'strongVolume' with the same index as ``data``.
        The first row of every symbol is NaN because it has no preceding row.
    """
    # NaN cannot be cast to int, so missing flags are treated as 0 first.
    data['strongVolume'] = data['strongVolume'].fillna(0).astype(int)

    # Roll and shift inside each symbol. Shifting the combined result instead
    # would carry the last count of one symbol into the first row of the next.
    return data.groupby('symbol')['strongVolume'].transform(
        lambda flags: flags.rolling(window=window, min_periods=1).sum().shift(1)
    )

In [92]:
# Function to calculate strong volume after openHigher
# def calculate_strong_volume_after(data):
#     """Calculate the Strong Volume After Open Higher."""
    
#     # Ensure that 'strongVolume' and 'openHigher' are of type int to handle NaNs
#     data['strongVolume'] = data['strongVolume'].astype(int)
#     data['openHigher'] = data['openHigher'].astype(int)
    
#     # Initialize the new column with default values
#     data['strongVolumeAfterOpenHigher'] = 0

#     # Loop through each group of 'symbol'
#     for symbol, group in data.groupby('symbol'):
#         # Create a temporary column for strongVolume counts
#         group['strong_count'] = group['strongVolume'].rolling(window=4).sum().shift(-4)
        
#         # Check for openHigher and set newColumn
#         for index in group.index:
#             if group.at[index, 'openHigher'] == 1 and group.at[index, 'strong_count'] >= 2:
#                 data.at[index, 'strongVolumeAfterOpenHigher'] = 1
    
#     return data

In [93]:
# Function to calculate accumulating volume that includes all previous volume functions
def calculate_accumulating_volume(data):
    """Combine the strong-volume columns into a single 1/0 signal.

    A row is 1 when both 'strongVolume' and 'strongVolume6MoPrior' equal 1
    after missing values are replaced by 0, otherwise 0.

    Side effect: both columns of ``data`` are overwritten in place with their
    NaN-filled, int-cast versions.

    NOTE(review): the comparison is ``== 1`` on 'strongVolume6MoPrior'. If
    that column holds a count rather than a 0/1 flag, only rows with exactly
    one prior strong-volume day qualify -- confirm this is intended.
    """
    # Normalise both inputs: missing -> 0, then integer dtype.
    for column in ('strongVolume', 'strongVolume6MoPrior'):
        data[column] = data[column].fillna(0).astype(int)

    strong_today = data['strongVolume'].eq(1)
    strong_before = data['strongVolume6MoPrior'].eq(1)
    return (strong_today & strong_before).astype(int)

In [94]:
# Function to calculate uptrend from ema50.
def calculate_uptrend(data, lookback=22):
    """Flag rows whose ema50 is above its value ``lookback`` rows earlier.

    The comparison is made inside each 'symbol' group, in the row order of
    ``data``.

    Args:
        data: DataFrame with at least the columns 'symbol' and 'ema50'.
            ``data`` is not modified.
        lookback: How many rows back to compare against. Defaults to 22, the
            value that was previously hard-coded.

    Returns:
        An int Series named 'ema50' (1 or 0) with the same index as ``data``.
        Rows where either the current or the earlier ema50 is missing are 0.
    """
    # Work on a local float copy. Missing values are deliberately left as
    # NaN: replacing them with 0 would make any real EMA look "higher" than
    # an unknown earlier value, and writing the result back would overwrite
    # the caller's ema50 column.
    ema50 = data['ema50'].astype(float)

    # Comparisons against NaN are False, so rows without history become 0.
    return ema50.groupby(data['symbol']).transform(
        lambda values: (values > values.shift(lookback)).astype(int)
    )

In [95]:
# Explicitly format columns
# df[['close', 'open', 'high', 'low', 'volume']] = df[['close', 'open', 'high', 'low', 'volume']].astype('float64')
# df[['symbol', 'timestamp']] = df[['symbol', 'timestamp']].astype('object')

In [96]:
# Load the raw data through main(), imported from readRawAggs.
# NOTE(review): main() is assumed to return a DataFrame with 'symbol', 'timestamp',
# 'open', 'close' and 'volume' columns, as used below -- confirm against readRawAggs.
df = main()
# Sort by symbol, then timestamp, so the shift/rolling steps below walk each symbol's rows in order.
df = df.sort_values(by=['symbol', 'timestamp'])

# Calculate EMA20 and EMA50 of the close price.
df['ema20'] = calculate_ema(df, 20)
df['ema50'] = calculate_ema(df, 50)

# Flag rows where the stock opens higher than the previous row's close (same symbol).
df['openHigher'] = calculate_open_higher(df)

# Calculate the average volume over a 100-row rolling window.
df['averageVolume'] = calculate_avg_volume(df, 100)

# Calculate strong volume. 1 if over 2x average volume. Needs 'averageVolume' from the step above.
df['strongVolume'] = calculate_strong_volume(df)

# Rolling 126-row sum of strongVolume, shifted by one row. 126 trading days is 6 months.
df['strongVolume6MoPrior'] = calculate_strong_volume_prior(df, 126)

# Calculate strong volume after opening higher. Two strong volume days within 5 day period.
# (Currently disabled: the helper function is commented out as well.)
# df['strongVolumeAfterOpenHigher'] = calculate_strong_volume_after(df)

# Combine the active strong volume indicators (strongVolume and strongVolume6MoPrior) into one signal.
df['accVolume'] = calculate_accumulating_volume(df)

# Flag rows where EMA50 is above its value 22 rows earlier (about one month of trading days).
df['uptrend'] = calculate_uptrend(df)

In [None]:
# Preview the first rows of the DataFrame to eyeball the newly added columns.
print(df.head())

In [100]:
# Define the directory where the output file will be saved
save_path = "C:\\Users\\SamuliMustonen\\Documents\\Ready Solutions\\Docs\\StockTrading\\Data\\rawComplete"

# Ensure the save directory exists (no error if it is already there)
os.makedirs(save_path, exist_ok=True)

# Date range used only to build the filename; it does not filter df
start_date = "2022-01-01"
end_date = "2024-09-30"

# Build the target path of the gzip-compressed pickle (.pickle.gz)
filename = os.path.join(save_path, f"all_raw_data_{start_date}_to_{end_date}.pickle.gz")

try:
    # Save the DataFrame using pandas with gzip compression
    df.to_pickle(filename, compression='gzip')
    # NOTE(review): no logging configuration is visible in this file; if the root logger
    # is still at its default level this INFO message is not shown -- confirm.
    logging.info(f"Saved to {filename}")
except Exception as e:
    # Best-effort save: a failure is logged and execution continues.
    logging.error(f"Serialization Error: {e}")