# Data extraction

In [2]:
import yfinance as yf
from datetime import datetime, timedelta

tickers = yf.Tickers("AAPL")
end_date = datetime(2024, 6, 1)
start_date = end_date - timedelta(days=50)
data = tickers.history(start=start_date, end=end_date)
closing_price = data['Close']
print(closing_price)



[*********************100%***********************]  1 of 1 completed

Ticker            AAPL
Date                  
2024-04-12  175.490143
2024-04-15  171.653320
2024-04-16  168.363205
2024-04-17  166.991486
2024-04-18  166.037231
2024-04-19  164.009491
2024-04-22  164.844421
2024-04-23  165.898071
2024-04-24  168.005356
2024-04-25  168.870117
2024-04-26  168.283691
2024-04-29  172.458450
2024-04-30  169.307495
2024-05-01  168.283691
2024-05-02  171.991287
2024-05-03  182.279160
2024-05-06  180.619186
2024-05-07  181.305038
2024-05-08  181.643005
2024-05-09  183.462006
2024-05-10  182.197906
2024-05-13  185.412872
2024-05-14  186.557526
2024-05-15  188.836868
2024-05-16  188.956314
2024-05-17  188.986176
2024-05-20  190.150726
2024-05-21  191.454636
2024-05-22  190.011383
2024-05-23  186.010086
2024-05-24  189.095657
2024-05-28  189.105621
2024-05-29  189.404205
2024-05-30  190.399567
2024-05-31  191.355103





# Feature Construction

In [19]:
import pandas as pd
import numpy as np

# Daily Return
daily_return = closing_price.pct_change()
daily = daily_return['AAPL']

# 5-Day Return
ret_5d = closing_price.pct_change(5)

# 10-Day Return
ret_10d = closing_price.pct_change(10)

# 5-Day Volatility (Std Dev of Returns)
vol_5d = closing_price.pct_change().rolling(window=5).std()

# 10-Day Volatility
vol_10d = closing_price.pct_change().rolling(window=10).std()

# Momentum (10d)
momentum_10d = closing_price - closing_price.shift(10)

# SMA_10/SMA_50 Ratio
sma_10 = closing_price.rolling(window=10).mean()
sma_50 = closing_price.rolling(window=50).mean()
sma_ratio = sma_10/sma_50

# Z-score (20d)
rolling_mean = closing_price.rolling(window=20).mean()
rolling_std = closing_price.rolling(window=20).std()
z_score_20d = (closing_price - rolling_mean)/rolling_std

# RSI (14d)
delta = closing_price.diff()
gain = delta.where(delta > 0, 0.0)
loss = -delta.where(delta < 0, 0.0)

avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()

rs = avg_gain/avg_loss
rsi_14 = 100 - (100 / (1 + rs))

# Features DataFrame

features = pd.concat([
    daily.rename('ret_1d'),
    ret_5d.rename('ret_5d'),
    ret_10d.rename('ret_10d'),
    vol_5d.rename('vol_5d'),
    vol_10d.rename('vol_10d'),
    momentum_10d.rename('momentum_10d'),
    sma_ratio.rename('sma_ratio_10_50'),
    z_score_20d.rename('zscore_20d'),
    rsi_14.rename('rsi_14')
], axis=1).dropna()


print(features)

TypeError: 'str' object is not callable

In [23]:
def build_features(closing_price: pd.Series) -> pd.DataFrame:
    daily_return = closing_price.pct_change()
    ret_5d = closing_price.pct_change(5)
    ret_10d = closing_price.pct_change(10)
    vol_5d = closing_price.pct_change().rolling(window=5).std()
    vol_10d = closing_price.pct_change().rolling(window=10).std()
    momentum_10d = closing_price - closing_price.shift(10)

    sma_10 = closing_price.rolling(window=10).mean()
    sma_50 = closing_price.rolling(window=50).mean()
    sma_ratio = sma_10 / sma_50

    rolling_mean = closing_price.rolling(window=20).mean()
    rolling_std = closing_price.rolling(window=20).std()
    z_score_20d = (closing_price - rolling_mean) / rolling_std

    delta = closing_price.diff()
    gain = delta.where(delta > 0, 0.0)
    loss = -delta.where(delta < 0, 0.0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss
    rsi_14 = 100 - (100 / (1 + rs))

    features = pd.concat([
    daily_return.rename('ret_1d'),
    ret_5d.rename('ret_5d'),
    ret_10d.rename('ret_10d'),
    vol_5d.rename('vol_5d'),
    vol_10d.rename('vol_10d'),
    momentum_10d.rename('momentum_10d'),
    sma_ratio.rename('sma_ratio_10_50'),
    z_score_20d.rename('zscore_20d'),
    rsi_14.rename('rsi_14')
    ], axis=1)

    # Check which columns have NaNs
    print(features.isna().sum())

    # Then drop rows where **any** feature is NaN
    features = features.dropna()


    return features

features.head()

Unnamed: 0_level_0,ret_1d
Date,Unnamed: 1_level_1
2024-04-15,-0.021863
2024-04-16,-0.019167
2024-04-17,-0.008147
2024-04-18,-0.005714
2024-04-19,-0.012213
