## Adding parameters to the data model
**exceptionalCandleSize** = Candle size as calculated (high-low). If the candle size is 2 standard deviations larger than average candle size then 1 else 0.
**ema20** = exponential moving average 20 days.
**ema50** = exponential moving average 50 days.
**openHigher** = The Gap-Up indicator: day opens more than 3% above the previous close (the code uses a 1.03 multiplier). If Open > (1.03 * Previous day Close) then 1 else 0.
**averageVolume** = Average volume from past 100 market-days.
**strongVolume** = If Volume > (2 * averageVolume) then 1 else 0.
**strongVolume6MoPrior** = If strongVolume was equal to 1 at least 3 times in the 100 market-days (5-6 months) before the current day then 1 else 0.
**accVolume** = If strongVolume = 1 and strongVolume6MoPrior = 1 then 1 else 0.
**uptrend** = The stock is uptrending over the past 5 market-days. If ema50 > (ema50 5 market-days prior) then 1 else 0.
**closeHigh** = On a gap-up day the close is higher than the open, thus producing a green candle. If openHigher = 1 and Close > Open then 1 else 0.

In [1]:
import requests
import pandas as pd
# from datetime import datetime, timedelta, date
import datetime
import time
from polygon import RESTClient
import logging
import signal
import sys
import pickle
import lz4.frame  # type: ignore
import concurrent.futures
import os
import pandas as pd
import numpy as np
import glob
import nbimporter
from readRawAggs import main

In [2]:
# # Define the path to your existing CSV file
# csv_file_path = r"C:\Users\SamuliMustonen\Documents\Ready Solutions\Docs\StockTrading\Data\ntnx_data_raw.csv"
# # Load the CSV file into a DataFrame
# df = pd.read_csv(csv_file_path, delimiter=';', header=0)

In [3]:
# Function to calculate EMA
def calculate_ema(data, window):
    """Return the per-symbol exponential moving average of ``close``.

    ``data`` needs ``symbol`` and ``close`` columns. ``close`` is cast to
    float in place on ``data``. The average is computed with ``span=window``
    and ``adjust=False`` and restarts for every symbol.

    Returns a Series labelled with the row labels of ``data``. When the whole
    frame holds fewer rows than ``window`` every value is blanked to NaN.
    """
    data['close'] = data['close'].astype(float)

    closes_by_symbol = data.groupby('symbol')['close']
    smoothed = closes_by_symbol.ewm(span=window, adjust=False).mean()

    # groupby().ewm() yields a (symbol, row-label) MultiIndex; drop the symbol
    # level so the values line up with the frame's own row labels.
    smoothed = smoothed.reset_index(level=0, drop=True)

    # The guard looks at the total row count, not at each symbol's history.
    frame_too_short = len(data) < window
    if frame_too_short:
        smoothed.iloc[:window] = None

    return smoothed

In [4]:
# Function to calculate high-low
def calculate_candle_size(data):
    """Return the candle range (``high`` minus ``low``) for every row.

    Both price columns are cast to float in place on ``data`` before the
    subtraction.
    """
    for price_col in ('high', 'low'):
        data[price_col] = data[price_col].astype(float)

    return data['high'] - data['low']

In [5]:
def calculate_avg_candle_size(data, window):
    """Return the per-symbol rolling mean of ``candleSize``.

    ``data`` needs ``symbol`` and ``candleSize`` columns; ``candleSize`` is
    cast to float in place on ``data``.

    Returns a Series that keeps the row labels of ``data``, so it can be
    assigned straight back as a column whatever the frame's index looks like.
    Rows with fewer than ``window`` observations so far within their symbol
    are NaN, because ``rolling(window)`` needs a full window by default.
    """
    data['candleSize'] = data['candleSize'].astype(float)

    # transform() already returns a result aligned to the original rows.
    # Resetting its index would swap the real labels for 0..n-1 and misalign
    # the values on assignment.
    return data.groupby('symbol')['candleSize'].transform(
        lambda sizes: sizes.rolling(window).mean()
    )

In [6]:
# Function to calculate volatility (ma14CandleSize relative to close)
def calculate_volatility(data):
    """Return ``ma14CandleSize`` divided by ``close`` for every row.

    Both input columns are cast to float in place on ``data``. When the whole
    frame holds fewer than 14 rows the result is blanked to NaN.
    """
    for column in ('ma14CandleSize', 'close'):
        data[column] = data[column].astype(float)

    # Average candle range expressed as a fraction of the closing price.
    volatility = (data['ma14CandleSize'] / data['close']).astype(float)

    # The guard looks at the total row count of the frame.
    frame_too_short = len(data) < 14
    if frame_too_short:
        volatility[:14] = None

    return volatility

In [7]:
def calculate_std_dev_candle_size(data, window):
    """Return the per-symbol rolling standard deviation of ``candleSize``.

    ``data`` needs ``symbol`` and ``candleSize`` columns.

    Returns a Series that keeps the row labels of ``data``, so it can be
    assigned straight back as a column whatever the frame's index looks like.
    Rows with fewer than ``window`` observations so far within their symbol
    are NaN, because ``rolling(window)`` needs a full window by default.
    """
    # transform() already returns a result aligned to the original rows.
    # Resetting its index would swap the real labels for 0..n-1 and misalign
    # the values on assignment.
    return data.groupby('symbol')['candleSize'].transform(
        lambda sizes: sizes.rolling(window).std()
    )

In [8]:
def calculate_exceptional_candle_size(data):
    """Flag rows whose candle is at least two standard deviations above average.

    Reads ``candleSize``, ``ma100CandleSize`` and ``stdDevCandleSize`` from
    ``data`` and returns 1 where
    ``candleSize >= ma100CandleSize + 2 * stdDevCandleSize``, else 0. A row
    whose threshold is NaN compares as False and therefore yields 0.
    """
    threshold = data['ma100CandleSize'] + 2 * data['stdDevCandleSize']
    is_exceptional = data['candleSize'] >= threshold

    return is_exceptional.astype(int)

In [9]:
# Function to calculate openHigher.
def calculate_open_higher(data, multiplier=1.03):
    """Flag rows that open above the previous close of the same symbol.

    Args:
        data: Frame with ``symbol``, ``open`` and ``close`` columns. ``open``
            and ``close`` are cast to float in place.
        multiplier: Gap threshold applied to the previous close. The default
            of 1.03 means the open must be more than 3% above it.

    Returns:
        Series of 1/0 flags. The first row of each symbol has no previous
        close and is therefore 0.

    Side effects:
        Writes ``prev_close`` and ``openHigher`` columns onto ``data``. The
        ``prev_close`` helper column is left in place for the caller to drop.
    """
    # Ensure the 'open' and 'close' columns are of type float
    data['open'] = data['open'].astype(float)
    data['close'] = data['close'].astype(float)

    # Shift within each symbol so one symbol's close never leaks into the next
    data['prev_close'] = data.groupby('symbol')['close'].shift(1)

    # A comparison against a NaN previous close is False, i.e. 0
    data['openHigher'] = (data['open'] > (multiplier * data['prev_close'])).astype(int)

    # Return an independent copy of the flag column; copying the whole frame
    # just to read one column back is unnecessary.
    return data['openHigher'].copy()

In [10]:
# Function to calculate average volume
def calculate_avg_volume(data, window):
    """Return the per-symbol rolling mean of ``volume``.

    ``data`` needs ``symbol`` and ``volume`` columns; ``volume`` is cast to
    float in place on ``data``.

    Returns a Series that keeps the row labels of ``data``, so it can be
    assigned straight back as a column whatever the frame's index looks like.
    Rows with fewer than ``window`` observations so far within their symbol
    are NaN, because ``rolling(window)`` needs a full window by default.
    """
    data['volume'] = data['volume'].astype(float)

    # transform() already returns a result aligned to the original rows.
    # Resetting its index would swap the real labels for 0..n-1 and misalign
    # the values on assignment.
    return data.groupby('symbol')['volume'].transform(
        lambda volumes: volumes.rolling(window).mean()
    )

In [11]:
# Function to calculate strong volume
def calculate_strong_volume(data, multiplier=2):
    """Flag rows whose volume exceeds a multiple of the average volume.

    Args:
        data: Frame with ``volume`` and ``averageVolume`` columns; both are
            cast to float in place.
        multiplier: Factor applied to ``averageVolume``. The default of 2
            flags volume greater than twice the average.

    Returns:
        Integer Series of 1/0 flags. Rows whose ``averageVolume`` is NaN
        compare as False and yield 0, so the result never contains NaN and
        can be cast to int safely by later steps.
    """
    data['volume'] = data['volume'].astype(float)
    data['averageVolume'] = data['averageVolume'].astype(float)

    # No special case for short frames: writing None into an integer Series
    # relies on a deprecated upcast and introduces NaN into a 0/1 flag.
    return (data['volume'] > multiplier * data['averageVolume']).astype(int)

In [12]:
# Function to calculate strong volume 6 months prior of openHigher
def calculate_strong_volume_prior(data, window):
    """Flag rows preceded by at least 3 strong-volume days within ``window``.

    For each row the count covers the ``window`` rows of the same symbol that
    end on the previous row; the current row is excluded by the one-row shift.

    Args:
        data: Frame with ``symbol`` and ``strongVolume`` columns. Missing
            ``strongVolume`` values are treated as 0 and the column is
            rewritten as int in place.
        window: Number of prior rows per symbol to count over.

    Returns:
        Series of 1/0 flags labelled with the row labels of ``data``. The
        first row of every symbol has no history and yields 0.
    """
    # Treat missing flags as 0 so the integer cast cannot fail on NaN.
    data['strongVolume'] = data['strongVolume'].fillna(0).astype(int)

    # Roll and shift inside each symbol. Shifting the concatenated result
    # would hand each symbol's first row the last count of the symbol before.
    prior_counts = data.groupby('symbol')['strongVolume'].transform(
        lambda flags: flags.rolling(window=window, min_periods=1).sum().shift(1)
    )

    # A NaN count (no history yet) compares as False, i.e. 0.
    return (prior_counts >= 3).astype(int)

In [13]:
# Function to calculate strong volume after openHigher
# def calculate_strong_volume_after(data):
#     """Calculate the Strong Volume After Open Higher."""
    
#     # Ensure that 'strongVolume' and 'openHigher' are of type int to handle NaNs
#     data['strongVolume'] = data['strongVolume'].astype(int)
#     data['openHigher'] = data['openHigher'].astype(int)
    
#     # Initialize the new column with default values
#     data['strongVolumeAfterOpenHigher'] = 0

#     # Loop through each group of 'symbol'
#     for symbol, group in data.groupby('symbol'):
#         # Create a temporary column for strongVolume counts
#         group['strong_count'] = group['strongVolume'].rolling(window=4).sum().shift(-4)
        
#         # Check for openHigher and set newColumn
#         for index in group.index:
#             if group.at[index, 'openHigher'] == 1 and group.at[index, 'strong_count'] >= 2:
#                 data.at[index, 'strongVolumeAfterOpenHigher'] = 1
    
#     return data

In [14]:
# Function to calculate accumulating volume that includes all previous volume functions
def calculate_accumulating_volume(data):
    """Flag rows where both strong-volume indicators are set.

    ``strongVolume`` and ``strongVolume6MoPrior`` are rewritten in place on
    ``data`` with NaN replaced by 0 and cast to int. The result is 1 where
    both columns equal 1, else 0.
    """
    for flag_col in ('strongVolume', 'strongVolume6MoPrior'):
        data[flag_col] = data[flag_col].fillna(0).astype(int)

    strong_today = data['strongVolume'] == 1
    strong_before = data['strongVolume6MoPrior'] == 1

    return (strong_today & strong_before).astype(int)

In [15]:
# Function to calculate uptrend from ema50.
def calculate_uptrend(data):
    """Flag rows whose ``ema50`` is above its value five rows earlier.

    ``ema50`` is rewritten in place on ``data`` with NaN replaced by 0 and
    cast to float. The comparison is made within each symbol; the first five
    rows of a symbol have nothing to compare against and yield 0.
    """
    data['ema50'] = data['ema50'].fillna(0).astype(float)

    def _above_five_rows_back(ema):
        # Strictly greater than the value five rows back in the same symbol.
        return (ema > ema.shift(5)).astype(int)

    # transform() keeps the result aligned with the rows of ``data``.
    return data.groupby('symbol')['ema50'].transform(_above_five_rows_back)

In [16]:
# Function to flag gap-up days that also close above their open
def calculate_close_high(data):
    """Flag gap-up rows that also close above their open.

    ``openHigher`` is rewritten in place on ``data`` with NaN replaced by 0
    and cast to int. The result is 1 where ``openHigher`` equals 1 and
    ``close`` is greater than ``open``, else 0.
    """
    data['openHigher'] = data['openHigher'].fillna(0).astype(int)

    gapped_up = data['openHigher'] == 1
    green_candle = data['close'] > data['open']

    return (gapped_up & green_candle).astype(int)

In [17]:
# Function to calculate the PEG alert from all boolean indicators
def calculate_model_peg_alert(data):
    """Flag rows where every indicator of the model is set.

    The indicator columns ``exceptionalCandleSize``, ``openHigher``,
    ``accVolume``, ``uptrend`` and ``closeHigh`` are rewritten in place on
    ``data`` with NaN replaced by 0 and cast to int. The result is 1 where
    all five equal 1, else 0.
    """
    indicator_cols = (
        'exceptionalCandleSize',
        'openHigher',
        'accVolume',
        'uptrend',
        'closeHigh',
    )
    for column in indicator_cols:
        data[column] = data[column].fillna(0).astype(int)

    # AND the indicators together one column at a time.
    all_set = data[indicator_cols[0]] == 1
    for column in indicator_cols[1:]:
        all_set = all_set & (data[column] == 1)

    return all_set.astype(int)

In [18]:
# # Sample DataFrame
# datatest = pd.DataFrame({
#     'symbol': ['AAPL'] * 3 + ['MSFT'] * 3,
#     'open': [10, 11, 12] * 2,
#     'close': [9.4, 12, 10.7] * 2,
#     'openHigher': [1, 1, 1] * 2
# })

In [19]:
# Calculate candle size from high minus low.
# datatest['candleSize'] = calculate_candle_size(datatest)

# # Calculate average candle size from high minus low.
# datatest['avgCandleSize'] = calculate_avg_candle_size(datatest, 3)

# # Calculate std dev candle size from high minus low.
# datatest['stdDevCandleSize'] = calculate_std_dev_candle_size(datatest, 3)

# # Calculate if candle size is 2 std dev higher than average candle size. Boolean.
# datatest['exceptionalCandleSize'] = calculate_exceptional_candle_size(datatest)

# test closeHigh
# datatest['closeHigh'] = calculate_close_high(datatest)

# print(datatest)

In [33]:
# Load the raw aggregates with main() from readRawAggs.
df = main()
# Sort by symbol, then date. ignore_index=True relabels the rows 0..n-1:
# the helper results are assigned back by row label, so the labels must match
# the sorted row order.
df = df.sort_values(by=['symbol', 'timestamp'], ignore_index=True)

# Candle size: high minus low.
df['candleSize'] = calculate_candle_size(df)

# 100-row rolling average of the candle size, per symbol.
df['ma100CandleSize'] = calculate_avg_candle_size(df, 100)

# 14-row rolling average of the candle size, per symbol.
df['ma14CandleSize'] = calculate_avg_candle_size(df, 14)

# Volatility: 14-row average candle size as a fraction of the close.
df['volatility'] = calculate_volatility(df)

# 100-row rolling standard deviation of the candle size, per symbol.
df['stdDevCandleSize'] = calculate_std_dev_candle_size(df, 100)

# Candle standard deviation relative to the current candle size.
# NOTE(review): candleSize is 0 when high == low, which makes this ratio
# inf/NaN and carries into trailingStop - confirm that is acceptable.
df['stdDevCandleSizePer'] = (df['stdDevCandleSize'] / df['candleSize'])

# Trailing stop price is (1 - ((1 + std%) * volatility)) * high.
df['trailingStop'] = (1 - (( 1 + df['stdDevCandleSizePer'] ) * df['volatility'])) * df['high']

# 1 if the candle size is at least 2 std dev above the 100-row average, else 0.
df['exceptionalCandleSize'] = calculate_exceptional_candle_size(df)

# Calculate EMA20 and EMA50.
df['ema20'] = calculate_ema(df, 20)
df['ema50'] = calculate_ema(df, 50)

# 1 when the stock opens above the previous day's close by the gap threshold.
df['openHigher'] = calculate_open_higher(df)

# Average volume over the past 100 rows, per symbol.
df['averageVolume'] = calculate_avg_volume(df, 100)

# Strong volume: 1 if volume is over 2x the average volume.
df['strongVolume'] = calculate_strong_volume(df)

# Strong volume in the 100 market-days (5-6 months) before the current day.
df['strongVolume6MoPrior'] = calculate_strong_volume_prior(df, 100)

# Calculate strong volume after opening higher. Two strong volume days within 5 day period.
# df['strongVolumeAfterOpenHigher'] = calculate_strong_volume_after(df)

# 1 if all strong volume indicators give a positive signal.
df['accVolume'] = calculate_accumulating_volume(df)

# 1 if EMA50 is above its value 5 rows earlier.
df['uptrend'] = calculate_uptrend(df)

# 1 if the day opens higher than the previous close and also closes above its open.
df['closeHigh'] = calculate_close_high(df)

# PEG alert: 1 if all the boolean evaluations return 1.
df['pegAlert'] = calculate_model_peg_alert(df)

In [34]:
# Prices, both EMAs and the trailing stop are kept to 2 decimals.
two_decimal_cols = ['close', 'open', 'high', 'low', 'ema20', 'ema50', 'trailingStop']
df[two_decimal_cols] = df[two_decimal_cols].round(2)

# These two columns go through round(4).
four_decimal_cols = ['exceptionalCandleSize', 'volatility']
df[four_decimal_cols] = df[four_decimal_cols].round(4)

# Snap volume to the nearest hundred and store it as an integer.
df['volume'] = ((df['volume'] / 100).round() * 100).astype(int)

# Remove the intermediate working columns from the frame.
df.drop(
    columns=[
        'candleSize',
        'ma100CandleSize',
        'prev_close',
        'ma14CandleSize',
        'stdDevCandleSize',
        'stdDevCandleSizePer',
    ],
    inplace=True,
)

In [35]:
# Spot check: one symbol on one date, restricted to rows that have a volatility value.
df_non_nan = df.loc[df['volatility'].notna()]
df2 = df_non_nan.loc[df_non_nan['symbol'] == 'AKRO']
df3 = df2.loc[df2['timestamp'] == '2023-01-03']

# Show the single filtered row.
print(df3)

      symbol  close  open   high    low  volume   timestamp  volatility  \
18551   AKRO  49.49  54.8  54.88  49.47  904100  2023-01-03      0.0668   

       trailingStop  exceptionalCandleSize  ema20  ema50  openHigher  \
18551         50.37                      1  47.21  43.03           0   

       averageVolume  strongVolume  strongVolume6MoPrior  accVolume  uptrend  \
18551     1276425.75             0                     1          0        1   

       closeHigh  pegAlert  
18551          0         0  


In [36]:
# Preview the first rows of the finished frame.
print(df.head())

  symbol  close   open   high    low   volume   timestamp  volatility  \
0   AACT  10.12  10.10  10.15  10.10   603100  2023-06-12         NaN   
1   AACT  10.11  10.14  10.14  10.11     5500  2023-06-13         NaN   
2   AACT  10.10  10.11  10.13  10.10   143900  2023-06-14         NaN   
3   AACT  10.11  10.11  10.11  10.10  2061100  2023-06-15         NaN   
4   AACT  10.11  10.12  10.12  10.11   251000  2023-06-16         NaN   

   trailingStop  exceptionalCandleSize  ema20  ema50  openHigher  \
0           NaN                      0  10.12  10.12           0   
1           NaN                      0  10.12  10.12           0   
2           NaN                      0  10.12  10.12           0   
3           NaN                      0  10.12  10.12           0   
4           NaN                      0  10.12  10.12           0   

   averageVolume  strongVolume  strongVolume6MoPrior  accVolume  uptrend  \
0            NaN             0                     0          0        0   


In [24]:
# Directory that receives the output file; created if it does not exist yet.
save_path = "C:\\Users\\SamuliMustonen\\Documents\\Ready Solutions\\Docs\\StockTrading\\Data\\rawComplete"
os.makedirs(save_path, exist_ok=True)

# Date range that is embedded in the file name.
start_date = "2022-01-01"
end_date = "2024-09-30"

# The frame is written as a gzip-compressed pickle (.pickle.gz).
archive_name = f"all_raw_data_{start_date}_to_{end_date}.pickle.gz"
filename = os.path.join(save_path, archive_name)

# Any failure while writing is logged instead of raised.
# NOTE(review): no logging configuration is visible in this file - confirm
# that the INFO message below is actually emitted.
try:
    df.to_pickle(filename, compression='gzip')
    logging.info(f"Saved to {filename}")
except Exception as e:
    logging.error(f"Serialization Error: {e}")