In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression
import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline

In [2]:
ticker="TSLA"
data =yf.download(ticker, start="2023-01-01", end="2024-02-02",auto_adjust=False)
data=data.rename(columns=str.title)

[*********************100%***********************]  1 of 1 completed


In [3]:
data.head()


Price,Adj Close,Close,High,Low,Open,Volume
Ticker,Tsla,Tsla,Tsla,Tsla,Tsla,Tsla
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2023-01-03,108.099998,108.099998,118.800003,104.639999,118.470001,231402800
2023-01-04,113.639999,113.639999,114.589996,107.519997,109.110001,180389000
2023-01-05,110.339996,110.339996,111.75,107.160004,110.510002,157986300
2023-01-06,113.059998,113.059998,114.389999,101.809998,103.0,220911100
2023-01-09,119.769997,119.769997,123.519997,117.110001,118.959999,190284000


In [4]:
data.isna().sum()


Price      Ticker
Adj Close  Tsla      0
Close      Tsla      0
High       Tsla      0
Low        Tsla      0
Open       Tsla      0
Volume     Tsla      0
dtype: int64

In [5]:
(data.isna().mean()*100).round(2)

Price      Ticker
Adj Close  Tsla      0.0
Close      Tsla      0.0
High       Tsla      0.0
Low        Tsla      0.0
Open       Tsla      0.0
Volume     Tsla      0.0
dtype: float64

In [6]:
data["oc_diff"]=data["Close"]-data["Open"]
data["hl_range"]= data["High"] - data ["Low"]
data["vol_chg_pct"]= data["Volume"].pct_change()
#Polynomial feature
data["hl_range_sq"] = data["hl_range"]**2

In [7]:
data.head()


Price,Adj Close,Close,High,Low,Open,Volume,oc_diff,hl_range,vol_chg_pct,hl_range_sq
Ticker,Tsla,Tsla,Tsla,Tsla,Tsla,Tsla,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2023-01-03,108.099998,108.099998,118.800003,104.639999,118.470001,231402800,-10.370003,14.160004,,200.505704
2023-01-04,113.639999,113.639999,114.589996,107.519997,109.110001,180389000,4.529999,7.07,-0.220455,49.984896
2023-01-05,110.339996,110.339996,111.75,107.160004,110.510002,157986300,-0.170006,4.589996,-0.124191,21.068066
2023-01-06,113.059998,113.059998,114.389999,101.809998,103.0,220911100,10.059998,12.580002,0.398293,158.256446
2023-01-09,119.769997,119.769997,123.519997,117.110001,118.959999,190284000,0.809998,6.409996,-0.13864,41.088049


In [8]:
data["y_next_close"]=data["Close"].shift(-1)

In [9]:
data.head()


Price,Adj Close,Close,High,Low,Open,Volume,oc_diff,hl_range,vol_chg_pct,hl_range_sq,y_next_close
Ticker,Tsla,Tsla,Tsla,Tsla,Tsla,Tsla,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2023-01-03,108.099998,108.099998,118.800003,104.639999,118.470001,231402800,-10.370003,14.160004,,200.505704,113.639999
2023-01-04,113.639999,113.639999,114.589996,107.519997,109.110001,180389000,4.529999,7.07,-0.220455,49.984896,110.339996
2023-01-05,110.339996,110.339996,111.75,107.160004,110.510002,157986300,-0.170006,4.589996,-0.124191,21.068066,113.059998
2023-01-06,113.059998,113.059998,114.389999,101.809998,103.0,220911100,10.059998,12.580002,0.398293,158.256446,119.769997
2023-01-09,119.769997,119.769997,123.519997,117.110001,118.959999,190284000,0.809998,6.409996,-0.13864,41.088049,118.849998


In [13]:
features=["oc_diff","hl_range","vol_chg_pct","hl_range_sq"]
dfm = data[features + ["y_next_close","Close"]].dropna().copy()


In [14]:
dfm = dfm.sort_index()
assert isinstance(dfm.index, pd.DatetimeIndex)

In [25]:
full23= dfm.loc["2023-01-01":"2023-12-31"]
jan24= dfm.loc["2024-01-01":"2024-01-31"]


In [27]:
split_idx = int(len(full23) * 0.80)
train = full23.iloc[:split_idx]      
valid = full23.iloc[split_idx:]

In [29]:
X_tr, y_tr = train[features].values, train["y_next_close"].values
X_va, y_va = valid[features].values, valid["y_next_close"].values

In [32]:
X_24, y_24 = jan24[features].values, jan24["y_next_close"].values


In [33]:
lin = LinearRegression().fit(X_tr, y_tr)
pred_va_lin = lin.predict(X_va)
print("Linear 23-valid | R²=", r2_score(y_va, pred_va_lin),
      " MSE=", mean_squared_error(y_va, pred_va_lin))


Linear 23-valid | R²= -2.1855892005423025  MSE= 876.9222834085523


In [35]:
poly = Pipeline([
    ("sc",   StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lin",  LinearRegression())
]).fit(X_tr, y_tr)
pred_va_poly = poly.predict(X_va)
print("Poly d2 23-valid | R²=", r2_score(y_va, pred_va_poly),
      " MSE=", mean_squared_error(y_va, pred_va_poly))

Poly d2 23-valid | R²= -3.620107798611156  MSE= 1271.8135406982317


In [36]:
best = poly if r2_score(y_va, pred_va_poly) > r2_score(y_va, pred_va_lin) else lin

best.fit(full23[features].values, full23["y_next_close"].values)   # retrain on full 2023
pred_24 = best.predict(X_24)

print("Jan-2024 | R²=", r2_score(y_24, pred_24),
      " MSE=", mean_squared_error(y_24, pred_24))


Jan-2024 | R²= -0.4521848193912441  MSE= 545.6289337538589
