Ticker,Name,Industry,Sub_Industry,Date,Adj Close,Close,High,Low,Open,Volume
A,Agilent Technologies,Health Care,Life Sciences Tools & Services,2024-09-16,137.8354034423828,138.30999755859375,139.77999877929688,137.5,138.27999877929688,887000
A,Agilent Technologies,Health Care,Life Sciences Tools & Services,2024-09-17,137.87525939941406,138.35000610351562,139.77999877929688,137.38999938964844,138.66000366210938,1210200
A,Agilent Technologies,Health Care,Life Sciences Tools & Services,2024-09-18,138.53298950195312,139.00999450683594,141.0,137.42999267578125,138.5,1390500

In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [39]:
# Load the per-ticker price histories (A, AAPL, ACN) in a fixed order.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"../data/input/A.csv",
        r"../data/input/AAPL.csv",
        r"../data/input/ACN.csv",
    )
)

In [40]:
# Stack the three per-ticker frames row-wise in one pass; each frame keeps
# its own row labels, so the index of the result is not unique.
merged_df = pd.concat([df1, df2, df3])

In [41]:
merged_df

Unnamed: 0,Ticker,Name,Industry,Sub_Industry,Date,Adj Close,Close,High,Low,Open,Volume
0,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,2023-03-20,132.649078,134.539993,135.380005,132.729996,133.139999,1704500
1,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,2023-03-21,135.084366,137.009995,137.419998,135.220001,135.559998,1392300
2,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,2023-03-22,131.850464,133.729996,137.539993,133.600006,136.960007,1268500
3,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,2023-03-23,129.287018,131.130005,135.070007,129.369995,134.660004,3297900
4,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,2023-03-24,130.105362,131.960007,131.979996,128.220001,130.139999,1571600
...,...,...,...,...,...,...,...,...,...,...,...
495,ACN,Accenture,Information Technology,IT Consulting & Other Services,2025-03-11,327.790009,327.790009,334.429993,325.549988,333.079987,3390000
496,ACN,Accenture,Information Technology,IT Consulting & Other Services,2025-03-12,324.329987,324.329987,330.000000,322.890015,328.500000,3086200
497,ACN,Accenture,Information Technology,IT Consulting & Other Services,2025-03-13,317.070007,317.070007,323.690002,314.609985,323.690002,3667400
498,ACN,Accenture,Information Technology,IT Consulting & Other Services,2025-03-14,318.820007,318.820007,318.899994,314.290009,316.410004,4086000


In [42]:
# Keep one row per distinct (Ticker, Industry, Sub_Industry) combination.
static_features = (
    merged_df
    .loc[:, ['Ticker', "Industry", "Sub_Industry"]]
    .drop_duplicates()
)
static_features

Unnamed: 0,Ticker,Industry,Sub_Industry
0,A,Health Care,Life Sciences Tools & Services
0,AAPL,Information Technology,"Technology Hardware, Storage & Peripherals"
0,ACN,Information Technology,IT Consulting & Other Services


In [43]:
# Dense output so the encoded matrix can be wrapped in a DataFrame directly;
# categories unseen at fit time are ignored rather than raising.
encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)

In [44]:
# Fit the encoder on the static columns and one-hot encode them in one step.
# The result has one row per row of `static_features`.
encoded_static = encoder.fit_transform(static_features)

In [45]:
encoded_static

array([[1., 0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1., 1., 0., 0.]])

In [47]:
# Wrap the encoded matrix in a labelled DataFrame and append the ticker symbol
# as a lookup key. `.values` is used so the symbols are attached positionally
# instead of being aligned on the (non-unique) index of `static_features`.
static_columns = encoder.get_feature_names_out(["Ticker", "Industry", "Sub_Industry"])
static_features_df = pd.DataFrame(encoded_static, columns=static_columns).assign(
    Ticker=static_features["Ticker"].values
)

In [48]:
static_features_df

Unnamed: 0,Ticker_A,Ticker_AAPL,Ticker_ACN,Industry_Health Care,Industry_Information Technology,Sub_Industry_IT Consulting & Other Services,Sub_Industry_Life Sciences Tools & Services,"Sub_Industry_Technology Hardware, Storage & Peripherals",Ticker
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,A
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,AAPL
2,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,ACN


In [49]:
# Function to create sliding windows
def create_sliding_windows(df, window_size=10, holdout_size=30):
    """Build per-ticker sliding-window samples for next-step "Adj Close" prediction.

    For every ticker in ``df`` the six price/volume columns are standardized
    with a ``StandardScaler`` fitted only on the rows that precede the final
    ``holdout_size`` rows, so statistics of the held-out tail do not leak into
    the normalization. Windows are still generated over *all* rows of the
    ticker, including the held-out tail.

    Each sample is ``window_size`` consecutive rows of normalized features with
    the ticker's one-hot static vector (looked up in the module-level
    ``static_features_df``) repeated on every row. The target is the normalized
    "Adj Close" of the row immediately after the window.

    Rows are used in the order they appear within each ticker group; the
    function does not sort them (assumes the caller supplies them in
    chronological order -- TODO confirm for new inputs).

    Args:
        df: DataFrame with a "Ticker" column and the columns "Adj Close",
            "Close", "High", "Low", "Open" and "Volume". It is not modified.
        window_size: Number of consecutive rows per input sequence.
        holdout_size: Number of trailing rows per ticker excluded from the
            scaler fit. Values <= 0 fit the scaler on every row.

    Returns:
        Tuple ``(X, y, tickers)`` of numpy arrays: ``X`` holds the stacked
        sequences (sample, window_size, 6 + number of static columns), ``y``
        the normalized targets and ``tickers`` the ticker of each sample.

    Raises:
        ValueError: If a ticker has no rows left to fit the scaler on after
            removing the hold-out rows.
    """
    dynamic_features = ["Adj Close", "Close", "High", "Low", "Open", "Volume"]
    target_idx = dynamic_features.index("Adj Close")

    sequences, targets, tickers = [], [], []

    for ticker, ticker_df in df.groupby("Ticker"):
        features = ticker_df[dynamic_features]

        # Normalize using training data stats only (avoid data leakage).
        train_data = features.iloc[:-holdout_size] if holdout_size > 0 else features
        if train_data.empty:
            raise ValueError(
                f"Ticker {ticker!r} has {len(features)} rows, which leaves no data "
                f"to fit the scaler after holding out {holdout_size} rows."
            )

        scaler = StandardScaler()
        scaler.fit(train_data)

        # Work on the normalized array directly rather than writing it back
        # into the groupby sub-frame: no SettingWithCopyWarning and no
        # per-window DataFrame slicing.
        normalized = np.asarray(scaler.transform(features))

        # The static one-hot block is identical for every window of a ticker,
        # so build it once per ticker.
        static_vector = (
            static_features_df[static_features_df["Ticker"] == ticker]
            .drop(columns=["Ticker"])
            .values.flatten()
        )
        static_block = np.tile(static_vector, (window_size, 1))

        for start in range(len(normalized) - window_size):
            stop = start + window_size
            sequences.append(np.hstack([normalized[start:stop], static_block]))
            targets.append(normalized[stop, target_idx])
            tickers.append(ticker)

    return np.array(sequences), np.array(targets), np.array(tickers)

In [50]:
# Generate training data
# X: windowed feature sequences, y: normalized "Adj Close" of the row following
# each window, tickers: the ticker each sample was built from.
window_size = 10
X, y, tickers = create_sliding_windows(merged_df, window_size=window_size)

In [51]:
print(X.shape, y.shape, tickers.shape)

(1470, 10, 14) (1470,) (1470,)


In [46]:
print(X.shape, y.shape, tickers.shape)

(228, 10, 12) (228,) (228,)


In [33]:
tickers

array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'AAPL', 'AAPL',
       'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL',
       'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL',
       'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL',
       'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL',
       'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAPL', 'AAP

In [37]:
X[0].shape

(10, 12)

In [32]:
y.shape

(228,)

In [29]:
X.shape

(228, 10, 12)

In [53]:
# Save processed data
# Writes the three arrays into a single .npz archive under the keys
# "X", "y" and "tickers".
# NOTE(review): assumes ../data/processed/ already exists -- confirm.
np.savez("../data/processed/processed_data.npz", X=X, y=y, tickers=tickers)

## V2

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

###########################
# DATA PREPARATION
###########################

# Read the raw data once and keep an independent copy: `df` is later
# normalized, `df_original` keeps the original price levels for ROI
# calculations.
df = pd.read_csv("your_data.csv")
df_original = df.copy()

# Parse dates and order each frame by ticker, then chronologically.
df = df.assign(Date=pd.to_datetime(df["Date"])).sort_values(by=["Ticker", "Date"])
df_original = df_original.assign(
    Date=pd.to_datetime(df_original["Date"])
).sort_values(by=["Ticker", "Date"])

# Static Features (One-Hot Encoding)
# One row per distinct (Ticker, Industry, Sub_Industry) combination.
static_features = df[["Ticker", "Industry", "Sub_Industry"]].drop_duplicates()
# `sparse_output` is the current name of the dense/sparse switch; the older
# `sparse` keyword was deprecated in scikit-learn 1.2 and later removed.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_static = encoder.fit_transform(static_features)
static_columns = encoder.get_feature_names_out(["Ticker", "Industry", "Sub_Industry"])
static_features_df = pd.DataFrame(encoded_static, columns=static_columns)
# Attach the ticker symbol as a lookup key; `.values` assigns positionally
# instead of aligning on the index left over from `df`.
static_features_df["Ticker"] = static_features["Ticker"].values

# Function to create sliding windows and store per-company scalers
def create_sliding_windows(df, window_size=10, holdout_size=30):
    """Build per-ticker sliding-window samples and keep each ticker's scaler.

    For every ticker in ``df`` the six price/volume columns are standardized
    with a ``StandardScaler`` fitted only on the rows that precede the final
    ``holdout_size`` rows (reserved for test/validation). Windows are still
    generated over *all* rows of the ticker, including the held-out tail.

    Each sample is ``window_size`` consecutive rows of normalized features with
    the ticker's one-hot static vector (looked up in the module-level
    ``static_features_df``) repeated on every row. The target is the normalized
    "Adj Close" of the row immediately after the window.

    Rows are used in the order they appear within each ticker group; the
    function itself does not sort them.

    Args:
        df: DataFrame with a "Ticker" column and the columns "Adj Close",
            "Close", "High", "Low", "Open" and "Volume". It is not modified.
        window_size: Number of consecutive rows per input sequence.
        holdout_size: Number of trailing rows per ticker excluded from the
            scaler fit. Values <= 0 fit the scaler on every row.

    Returns:
        Tuple ``(X, y, tickers, scalers_dict)``: ``X`` holds the stacked
        sequences (sample, window_size, 6 + number of static columns), ``y``
        the normalized targets, ``tickers`` the ticker of each sample, and
        ``scalers_dict`` maps each ticker to its fitted ``StandardScaler``.

    Raises:
        ValueError: If a ticker has no rows left to fit the scaler on after
            removing the hold-out rows.
    """
    # Define the dynamic features to be normalized
    dynamic_features = ["Adj Close", "Close", "High", "Low", "Open", "Volume"]
    target_idx = dynamic_features.index("Adj Close")

    sequences, targets, tickers = [], [], []
    scalers_dict = {}  # fitted scaler for each ticker

    for ticker, ticker_df in df.groupby("Ticker"):
        features = ticker_df[dynamic_features]

        # Fit on the training portion only so hold-out statistics do not leak.
        train_data = features.iloc[:-holdout_size] if holdout_size > 0 else features
        if train_data.empty:
            raise ValueError(
                f"Ticker {ticker!r} has {len(features)} rows, which leaves no data "
                f"to fit the scaler after holding out {holdout_size} rows."
            )

        scaler = StandardScaler()
        scaler.fit(train_data)
        scalers_dict[ticker] = scaler

        # Keep the normalized values in their own float array instead of
        # writing them back with `.loc`: assigning floats into an integer
        # column such as "Volume" relies on implicit upcasting, which recent
        # pandas versions deprecate.
        normalized = np.asarray(scaler.transform(features))

        # The static one-hot block is identical for every window of a ticker,
        # so build it once per ticker.
        static_vector = (
            static_features_df[static_features_df["Ticker"] == ticker]
            .drop(columns=["Ticker"])
            .values.flatten()
        )
        static_block = np.tile(static_vector, (window_size, 1))

        # Create sliding windows for this ticker
        for start in range(len(normalized) - window_size):
            stop = start + window_size
            sequences.append(np.hstack([normalized[start:stop], static_block]))
            # The target is the normalized "Adj Close" of the row after the window
            targets.append(normalized[stop, target_idx])
            tickers.append(ticker)

    return np.array(sequences), np.array(targets), np.array(tickers), scalers_dict

# Generate sliding window samples and store scalers
# `scalers_dict` maps each ticker to the StandardScaler fitted on its rows.
window_size = 10
X, y, tickers_array, scalers_dict = create_sliding_windows(df, window_size=window_size)

# Save processed data if needed (optional)
# Only the arrays are written (keys "X", "y", "tickers"); `scalers_dict` is
# not persisted by this call.
np.savez("processed_data.npz", X=X, y=y, tickers=tickers_array)