# Data extraction

In [30]:
import yfinance as yf
from datetime import datetime, timedelta

tickers = yf.Tickers("AAPL")
end_date = datetime(2024, 6, 1)
start_date = end_date - timedelta(days=10000)
data = tickers.history(start=start_date, end=end_date)
closing_price = data['Close']
print(closing_price)



[*********************100%***********************]  1 of 1 completed

Ticker            AAPL
Date                  
1997-01-14    0.134305
1997-01-15    0.129609
1997-01-16    0.125852
1997-01-17    0.125852
1997-01-20    0.127261
...                ...
2024-05-24  189.095657
2024-05-28  189.105621
2024-05-29  189.404205
2024-05-30  190.399567
2024-05-31  191.355103

[6891 rows x 1 columns]





# Feature Construction

In [31]:
import pandas as pd
import numpy as np

# Daily Return
daily_return = closing_price.pct_change()
daily_return.columns = ['ret_1d']

# 5-Day Return
ret_5d = closing_price.pct_change(5)
ret_5d.columns = ['ret_5d']

# 10-Day Return
ret_10d = closing_price.pct_change(10)
ret_10d.columns=['ret_10d']
# 5-Day Volatility (Std Dev of Returns)
vol_5d = closing_price.pct_change().rolling(window=5).std()
vol_5d.columns=['vol_5d']

# 10-Day Volatility
vol_10d = closing_price.pct_change().rolling(window=10).std()
vol_10d.columns=['vol_10d']

# Momentum (10d)
momentum_10d = closing_price - closing_price.shift(10)
momentum_10d.columns=['momentum_10d']

# SMA_10/SMA_50 Ratio
sma_10 = closing_price.rolling(window=10).mean()
sma_50 = closing_price.rolling(window=50).mean()
sma_ratio = sma_10/sma_50
sma_ratio.columns = ['sma_ratio_10_50']

# Z-score (20d)
rolling_mean = closing_price.rolling(window=20).mean()
rolling_std = closing_price.rolling(window=20).std()
z_score_20d = (closing_price - rolling_mean)/rolling_std
z_score_20d.columns = ['z_score_20d']

# RSI (14d)
delta = closing_price.diff()
gain = delta.where(delta > 0, 0.0)
loss = -delta.where(delta < 0, 0.0)

avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()

rs = avg_gain/avg_loss
rsi_14 = 100 - (100 / (1 + rs))
rsi_14.columns = ['rsi_14']


# Features DataFrame

y =  closing_price.pct_change(-1)*100
y.columns = ['y']
features = pd.concat([
    daily_return,
    ret_5d,
    ret_10d,
    vol_5d,
    vol_10d,
    sma_ratio,
    z_score_20d,
    rsi_14,
    y

],axis=1).dropna()
features.head()
print(features.size)
print(features.shape)
print(features)



61569
(6841, 9)
              ret_1d    ret_5d   ret_10d    vol_5d   vol_10d  sma_ratio_10_50  \
Date                                                                            
1997-03-25  0.000000  0.015383  0.007633  0.039462  0.027440         0.994721   
1997-03-26  0.015157  0.038764  0.030774  0.039140  0.027581         0.999093   
1997-03-27  0.111940  0.079712  0.137410  0.056437  0.044132         1.011002   
1997-03-31 -0.020139  0.097738  0.101887  0.053022  0.045430         1.019333   
1997-04-01 -0.041093  0.060609  0.060609  0.059133  0.048194         1.024426   
...              ...       ...       ...       ...       ...              ...   
2024-05-23 -0.021058 -0.015592  0.013889  0.011595  0.011112         1.076088   
2024-05-24  0.016588  0.000579  0.037858  0.014665  0.011626         1.077923   
2024-05-28  0.000053 -0.005496  0.019916  0.014294  0.010579         1.077875   
2024-05-29  0.001579 -0.010710  0.015259  0.013751  0.010478         1.077442   
2024-05-30  

# ML Model - Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Features df
X = features.drop(columns=['y'])

# Target Columns
y = features['y']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Train a Linear Regression model (least squared)
model = LinearRegression()
model.fit(X_train, y_train) # Find A in y=Ax

# Predict and evaluate
y_pred = model.predict(X_test) # Predict y_test using the A matrix found and X_test
mse = mean_squared_error(y_test,y_pred) # Compare the found y_test (y_prep) to the true y_test
print('Mean Squared Error', mse)

print('Min of y:', y.min())
print('Max of y:', y.max())

range_y = y.max() - y.min()
print('range : ', range_y)

pour_error = ((mse**(1/2)/range_y)*100)
print('error :', pour_error, '%')

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
r2 = r2_score(y_test, y_pred)
rmse = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(r2,rmse,mse)

q1 = y.quantile(0.25)
q3 = y.quantile(0.75)
iqr = q3 - q1
nRMSE = (mse*(1/2)) / iqr
print(nRMSE) # <0.5 excellent, 0.5-1.0 good, 1.0-1.5 moderate, >1.5 Weak


Mean Squared Error 6.340311585380458
Min of y: -24.940721380047126
Max of y: 107.76720459871676
range :  132.70792597876388
error : 1.897398001101363 %
0.003858515654681205 1.7318529146834947 6.340311585380458
1.3045963314304694
