### Imports & Functions


In [199]:
# src/data_preparation.py
import pandas as pd
import os

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score


def DISPLAY_DF(df):
  print(f'Shape: {df.shape}')
  return df.head()

### Read Input Datasets

### üìä Initial Stock Data Features: Description & Usefulness

These are the basic features retrieved for each stock ticker:

| Column     | Description                                      | Usefulness in Prediction                                                                 |
|------------|--------------------------------------------------|------------------------------------------------------------------------------------------|
| **date**   | The trading date for the stock entry.            | Used for sorting, filtering, and time-based features (e.g., moving averages, trends).   |
| **open**   | Stock price at the beginning of the trading day. | Can indicate early market sentiment compared to the close price.                         |
| **high**   | Highest price reached during the day.            | Useful for understanding daily volatility and computing technical indicators.            |
| **low**    | Lowest price during the trading day.             | Helps analyze price range and compute volatility-related features.                       |
| **close**  | Final trading price of the day.                  | Most commonly used price in analysis and model training, used for returns and target.    |
| **volume** | Number of shares traded during the day.          | Indicates market activity and interest, useful for understanding liquidity and momentum. |

---

### Quick Descriptions:
- **Close** is the most critical for trend analysis and setting the prediction target.
- **Volume** helps gauge interest or unusual market activity.
- **High/Low/Open** support more complex features like candle patterns or volatility.
- **Date** ensures chronological order and enables time-based aggregations.


In [200]:
# Iterate over all files in 'data' folder to load stock data
combined = []
COLS_REQUIRED = ["date", "open", "high", "low", "close", "volume", "ticker"]
# COLS_REQUIRED = ["date", "open", "high", "low", "close", "volume"]

for file in os.listdir("data"):
    if 'csv' in file:
        try:
            # Read the CSV file for each stock
            df = pd.read_csv(f"data/{file}")
            ticker = file.split('_')[0]  # Assuming stock ticker is part of the filename like 'AAPL_historical.csv'
        except Exception as e:
            print(f'Issue while loading data from file {file}: {e}')
            continue

        # df = compute_features(df, ticker)
        print(f'Loaded from: {file} (Ticker: {ticker})')
        df["ticker"] = ticker
        df = df[COLS_REQUIRED].drop_duplicates()

        combined.append(df)

# Combine all stock data into a single DataFrame
full_df = pd.concat(combined)


DISPLAY_DF(full_df)

Loaded from: AAPL_historical_data.csv (Ticker: AAPL)
Loaded from: META_historical_data.csv (Ticker: META)
Loaded from: MSFT_historical_data.csv (Ticker: MSFT)
Loaded from: GOOGL_historical_data.csv (Ticker: GOOGL)
Loaded from: AMZN_historical_data.csv (Ticker: AMZN)
Loaded from: IBM_historical_data.csv (Ticker: IBM)
Loaded from: TSLA_historical_data.csv (Ticker: TSLA)
Loaded from: NVDA_historical_data.csv (Ticker: NVDA)
Loaded from: NFLX_historical_data.csv (Ticker: NFLX)
Shape: (49926, 7)


Unnamed: 0,date,open,high,low,close,volume,ticker
0,1999-11-01,80.0,80.69,77.37,77.62,2487300.0,AAPL
1,1999-11-02,78.0,81.69,77.31,80.25,3564600.0,AAPL
2,1999-11-03,81.62,83.25,81.0,81.5,2932700.0,AAPL
3,1999-11-04,82.06,85.37,80.62,83.62,3384700.0,AAPL
4,1999-11-05,84.62,88.37,84.0,88.31,3721500.0,AAPL


#### EDA

In [202]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49926 entries, 0 to 5759
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    49926 non-null  object 
 1   open    49926 non-null  float64
 2   high    49926 non-null  float64
 3   low     49926 non-null  float64
 4   close   49926 non-null  float64
 5   volume  49926 non-null  float64
 6   ticker  49926 non-null  object 
dtypes: float64(5), object(2)
memory usage: 3.0+ MB


In [203]:
def summarize_stock_data(df):
    summary = []

    for ticker in df['ticker'].unique():
        subset = df[df['ticker'] == ticker]
        summary.append({
            "Ticker": ticker,
            "Unique Dates": subset['date'].nunique(),
            "Start Date": subset['date'].min(),
            "End Date": subset['date'].max(),
            "Total Records": len(subset),
            "Missing Values": subset.isnull().sum().sum(),
            "Average Volume": round(subset['volume'].mean(), 2),
            "Average Close Price": round(subset['close'].mean(), 2),
            "Std. Dev. of Close": round(subset['close'].std(), 2)
        })

    summary_df = pd.DataFrame(summary)
    print(summary_df.sort_values(by="Ticker").to_string(index=False))


summarize_stock_data(full_df)

Ticker  Unique Dates Start Date   End Date  Total Records  Missing Values  Average Volume  Average Close Price  Std. Dev. of Close
  AAPL          6401 1999-11-01 2025-04-11           6401               0     33516986.30               175.30              145.92
  AMZN          6401 1999-11-01 2025-04-11           6401               0     11616651.19               586.70              921.92
 GOOGL          5196 2004-08-19 2025-04-11           5196               0      9551435.71               768.25              612.94
   IBM          6401 1999-11-01 2025-04-11           6401               0      5879340.87               133.62               38.62
  META          3244 2012-05-18 2025-04-11           3244               0     29054446.50               195.10              144.25
  MSFT          6401 1999-11-01 2025-04-11           6401               0     44772358.64                99.98              111.95
  NFLX          5760 2002-05-23 2025-04-11           5760               0      5115

###  üß† Feature Engineering

---

These are derived features created to capture market behavior patterns and help the model make more informed predictions.

#### I. **Daily Return Percentage**

| Attribute | Description |
|----------|-------------|
| **Name** | Daily Return (%) |
| **Purpose** | Measures the percentage change in price between consecutive days. |
| **Impact** | Helps detect short-term momentum or volatility in price movement. |
| **Significance** | Positive returns may indicate upward momentum; negative returns may signal decline. |

In [205]:
full_df["return"] = full_df["close"].pct_change()

### II. **Moving Averages**

#### **5-Day Moving Average**
| **Attribute** | **Description** |
|---------------|-----------------|
| **Name**      | 5-Day Moving Average |
| **Purpose**   | Smooths recent price fluctuations over a short window. |
| **Impact**    | Highlights short-term trends and reduces noise in price. |
| **Significance** | Can help identify early reversals or momentum in the near-term. |

---

#### **10-Day Moving Average**
| **Attribute** | **Description** |
|---------------|-----------------|
| **Name**      | 10-Day Moving Average |
| **Purpose**   | Captures a slightly longer short-term trend in price movements. |
| **Impact**    | Used to compare with shorter moving averages (like 5-day) for crossover strategies. |
| **Significance** | When the 5-day moving average crosses the 10-day, it can signal a trend reversal. |

---

#### **20-Day Moving Average**
| **Attribute** | **Description** |
|---------------|-----------------|
| **Name**      | 20-Day Moving Average |
| **Purpose**   | Captures medium-term trend behavior. |
| **Impact**    | Acts as a support/resistance level and is used in many trading strategies. |
| **Significance** | Frequently used in combination with shorter moving averages (e.g., 5-day and 10-day) to detect momentum shifts. |

---


In [206]:
full_df["ma5"] = full_df["close"].rolling(window=5).mean()     # short-term trend
full_df["ma10"] = full_df["close"].rolling(window=10).mean()   # medium trend
full_df["ma20"] = full_df["close"].rolling(window=20).mean()   # longer trend

#### III. **Daily Volume Change Percentage**

| Attribute | Description |
|----------|-------------|
| **Name** | Daily Volume Change (%) |
| **Purpose** | Measures the percentage change in trading volume from one day to the next. |
| **Impact** | Sudden spikes or drops in volume may indicate unusual market interest or news impact. |
| **Significance** | Useful to detect breakouts or trend confirmations when combined with price movement. |


In [207]:
full_df["vol_chg"] = full_df["volume"].pct_change()

#### IV. **Price Volatility (High - Low)**

| Attribute | Description |
|----------|-------------|
| **Name** | Daily Price Range |
| **Purpose** | Captures the day's price volatility by subtracting the low from the high. |
| **Impact** | High values represent more price movement during the day‚Äîpotentially more risk or opportunity. |
| **Significance** | Useful for understanding intraday uncertainty or pressure in stock price. |

In [208]:
# Volatility - difference between high and low shows intraday price range
full_df["volatility"] = full_df["high"] - full_df["low"]

In [209]:
!pip install ta



In [210]:
import ta

In [211]:
# üîÅ New features
full_df["rsi"] = ta.momentum.RSIIndicator(full_df["close"]).rsi()
full_df["macd"] = ta.trend.MACD(full_df["close"]).macd()
full_df["momentum"] = ta.momentum.ROCIndicator(full_df["close"]).roc()

# Ensure 'date' column is datetime
full_df["date"] = pd.to_datetime(full_df["date"])

# Add time-based features
full_df["dayofweek"] = full_df["date"].dt.dayofweek
full_df["month"] = full_df["date"].dt.month
full_df["day"] = full_df["date"].dt.day
full_df["is_month_start"] = full_df["date"].dt.is_month_start.astype(int)
full_df["is_month_end"] = full_df["date"].dt.is_month_end.astype(int)

# üéØ Regression Target: Next Day Close
full_df["target"] = full_df["close"].shift(-1)
full_df = full_df.dropna()

In [212]:
DISPLAY_DF(full_df)

Shape: (49900, 22)


Unnamed: 0,date,open,high,low,close,volume,ticker,return,ma5,ma10,...,volatility,rsi,macd,momentum,dayofweek,month,day,is_month_start,is_month_end,target
25,1999-12-07,116.56,118.0,114.0,117.81,3973400.0,AAPL,0.015603,112.412,103.705,...,4.0,86.406493,7.915651,31.455032,1,12,7,0,0,110.06
26,1999-12-08,116.25,117.87,109.5,110.06,3681700.0,AAPL,-0.065784,113.812,105.43,...,8.37,68.509417,7.771596,19.061013,2,12,8,0,0,105.25
27,1999-12-09,111.0,111.0,100.87,105.25,7635700.0,AAPL,-0.043703,112.824,106.486,...,10.13,60.178293,7.186463,16.144339,3,12,9,0,0,103.0
28,1999-12-10,105.31,109.25,99.0,103.0,5694300.0,AAPL,-0.021378,110.424,107.28,...,10.25,56.704582,6.466642,10.97942,4,12,10,0,0,99.0
29,1999-12-13,102.39,102.5,98.94,99.0,4731800.0,AAPL,-0.038835,107.024,107.724,...,3.56,51.06158,5.509897,4.551695,0,12,13,0,0,94.87


### Saved the Final Prepared Data

In [213]:
full_df.to_parquet("data/training_data.gzip")