# Data extraction

In [4]:
import yfinance as yf
from datetime import datetime, timedelta

ticker = yf.Ticker("AAPL")
end_date = datetime(2024, 6, 1)
start_date = end_date - timedelta(days=50)
data = ticker.history(start=start_date, end=end_date)
closing_price = data['Close']
print(closing_price)




Date
2024-04-12 00:00:00-04:00    175.490158
2024-04-15 00:00:00-04:00    171.653305
2024-04-16 00:00:00-04:00    168.363190
2024-04-17 00:00:00-04:00    166.991470
2024-04-18 00:00:00-04:00    166.037231
2024-04-19 00:00:00-04:00    164.009491
2024-04-22 00:00:00-04:00    164.844437
2024-04-23 00:00:00-04:00    165.898056
2024-04-24 00:00:00-04:00    168.005356
2024-04-25 00:00:00-04:00    168.870117
2024-04-26 00:00:00-04:00    168.283661
2024-04-29 00:00:00-04:00    172.458450
2024-04-30 00:00:00-04:00    169.307495
2024-05-01 00:00:00-04:00    168.283661
2024-05-02 00:00:00-04:00    171.991272
2024-05-03 00:00:00-04:00    182.279144
2024-05-06 00:00:00-04:00    180.619186
2024-05-07 00:00:00-04:00    181.305008
2024-05-08 00:00:00-04:00    181.642990
2024-05-09 00:00:00-04:00    183.462021
2024-05-10 00:00:00-04:00    182.197922
2024-05-13 00:00:00-04:00    185.412888
2024-05-14 00:00:00-04:00    186.557510
2024-05-15 00:00:00-04:00    188.836868
2024-05-16 00:00:00-04:00    188.95

# Testing

In [None]:
balance_sheet = ticker.quarterly_balance_sheet

Unnamed: 0,2025-03-31,2024-12-31,2024-09-30,2024-06-30,2024-03-31,2023-12-31,2023-09-30
Treasury Shares Number,,,,,,0.0,0.0
Ordinary Shares Number,14939315000.0,15037874000.0,15116786000.0,15222259000.0,15337686000.0,,
Share Issued,14939315000.0,15037874000.0,15116786000.0,15222259000.0,15337686000.0,,
Net Debt,70024000000.0,66500000000.0,76686000000.0,75739000000.0,71895000000.0,,
Total Debt,98186000000.0,96799000000.0,106629000000.0,101304000000.0,104590000000.0,,
...,...,...,...,...,...,...,...
Cash Cash Equivalents And Short Term Investments,48498000000.0,53775000000.0,65171000000.0,61801000000.0,67150000000.0,,
Other Short Term Investments,20336000000.0,23476000000.0,35228000000.0,36236000000.0,34455000000.0,,
Cash And Cash Equivalents,28162000000.0,30299000000.0,29943000000.0,25565000000.0,32695000000.0,,
Cash Equivalents,3101000000.0,3226000000.0,2744000000.0,2699000000.0,4468000000.0,,


# Feature Construction

In [6]:
import pandas as pd
import numpy as np

# Daily Return
daily_return = closing_price.pct_change()
daily = daily_return['AAPL']

# 5-Day Return
ret_5d = closing_price.pct_change(5)

# 10-Day Return
ret_10d = closing_price.pct_change(10)

# 5-Day Volatility (Std Dev of Returns)
vol_5d = closing_price.pct_change().rolling(window=5).std()

# 10-Day Volatility
vol_10d = closing_price.pct_change().rolling(window=10).std()

# Momentum (10d)
momentum_10d = closing_price - closing_price.shift(10)

# SMA_10/SMA_50 Ratio
sma_10 = closing_price.rolling(window=10).mean()
sma_50 = closing_price.rolling(window=50).mean()
sma_ratio = sma_10/sma_50

# Z-score (20d)
rolling_mean = closing_price.rolling(window=20).mean()
rolling_std = closing_price.rolling(window=20).std()
z_score_20d = (closing_price - rolling_mean)/rolling_std

# RSI (14d)
delta = closing_price.diff()
gain = delta.where(delta > 0, 0.0)
loss = -delta.where(delta < 0, 0.0)

avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()

rs = avg_gain/avg_loss
rsi_14 = 100 - (100 / (1 + rs))

# Features DataFrame

features = pd.concat([
    daily.rename('ret_1d'),
    ret_5d.rename('ret_5d'),
    ret_10d.rename('ret_10d'),
    vol_5d.rename('vol_5d'),
    vol_10d.rename('vol_10d'),
    momentum_10d.rename('momentum_10d'),
    sma_ratio.rename('sma_ratio_10_50'),
    z_score_20d.rename('zscore_20d'),
    rsi_14.rename('rsi_14')
], axis=1).dropna()


print(features)

KeyError: 'AAPL'

In [None]:
def build_features(closing_price: pd.Series) -> pd.DataFrame:
    daily_return = closing_price.pct_change()
    ret_5d = closing_price.pct_change(5)
    ret_10d = closing_price.pct_change(10)
    vol_5d = closing_price.pct_change().rolling(window=5).std()
    vol_10d = closing_price.pct_change().rolling(window=10).std()
    momentum_10d = closing_price - closing_price.shift(10)

    sma_10 = closing_price.rolling(window=10).mean()
    sma_50 = closing_price.rolling(window=50).mean()
    sma_ratio = sma_10 / sma_50

    rolling_mean = closing_price.rolling(window=20).mean()
    rolling_std = closing_price.rolling(window=20).std()
    z_score_20d = (closing_price - rolling_mean) / rolling_std

    delta = closing_price.diff()
    gain = delta.where(delta > 0, 0.0)
    loss = -delta.where(delta < 0, 0.0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss
    rsi_14 = 100 - (100 / (1 + rs))

    features = pd.concat([
    daily_return.rename('ret_1d'),
    ret_5d.rename('ret_5d'),
    ret_10d.rename('ret_10d'),
    vol_5d.rename('vol_5d'),
    vol_10d.rename('vol_10d'),
    momentum_10d.rename('momentum_10d'),
    sma_ratio.rename('sma_ratio_10_50'),
    z_score_20d.rename('zscore_20d'),
    rsi_14.rename('rsi_14')
    ], axis=1)

    # Check which columns have NaNs
    print(features.isna().sum())

    # Then drop rows where **any** feature is NaN
    features = features.dropna()


    return features

features.head()

Unnamed: 0_level_0,ret_1d
Date,Unnamed: 1_level_1
2024-04-15,-0.021863
2024-04-16,-0.019167
2024-04-17,-0.008147
2024-04-18,-0.005714
2024-04-19,-0.012213
