## Ticker

In [1]:
ticker_symbol = "NVDA"

## Data Collection

In [2]:
import pandas as pd

df = pd.read_csv(f'../data/{ticker_symbol}.csv')

## Data Collection

In [23]:
import yfinance as yf

ticker = yf.Ticker("NVDA")
df = ticker.history(period="3y")  # 1 year of daily data
df = df[["Open", "High", "Low", "Close", "Volume"]]
df.reset_index(inplace=True)


In [24]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2022-07-25 00:00:00-04:00,16.994482,17.099332,16.625016,16.999475,480748000
1,2022-07-26 00:00:00-04:00,16.864674,16.894631,16.454267,16.509188,397865000
2,2022-07-27 00:00:00-04:00,17.007466,17.910165,16.888638,17.764376,569776000
3,2022-07-28 00:00:00-04:00,17.94911,18.113871,17.41488,17.958096,474646000
4,2022-07-29 00:00:00-04:00,17.787341,18.21772,17.666515,18.136837,435460000


In [25]:
df.size

4512

## Feature Engineering

### Technical Indicators

In [26]:
# RSI (Relative Strength Index)

import pandas as pd


def compute_rsi(prices: pd.Series, period: int = 14) -> pd.Series:
    delta = prices.diff()

    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window=period).mean()
    avg_loss = loss.rolling(window=period).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    return rsi


In [27]:
# EMA (Exponential Moving Averages)

def compute_ema(series: pd.Series, span: int) -> pd.Series:
    return series.ewm(span=span, adjust=False).mean()

In [28]:
#  MACD (Moving Average Convergence Divergence)

def compute_macd(prices: pd.Series):
    ema_12 = compute_ema(prices, span=12)
    ema_26 = compute_ema(prices, span=26)

    macd_line = ema_12 - ema_26
    signal_line = compute_ema(macd_line, span=9)
    histogram = macd_line - signal_line

    return macd_line, signal_line, histogram

In [29]:
# Bollinger Bands

def compute_bollinger_bands(prices: pd.Series, window: int = 20, num_std: int = 2):
    sma = prices.rolling(window=window).mean()
    std = prices.rolling(window=window).std()
    
    bb_upper = sma + num_std * std
    bb_lower = sma - num_std * std
    
    return bb_upper, sma, bb_lower

In [30]:
df["RSI"] = compute_rsi(df["Close"])
df["MACD"], df["MACD_signal"], df["MACD_hist"] = compute_macd(df["Close"])
df["BB_upper"], df["BB_middle"], df["BB_lower"] = compute_bollinger_bands(df["Close"])

### Lag Features

In [31]:
for lag in range(1, 6):
    df[f"Close_lag_{lag}"] = df["Close"].shift(lag)

### Rolling Statistics

In [32]:
df["Close_rolling_mean_5"] = df["Close"].rolling(window=5).mean()

## Data Cleaning

In [33]:
df.dropna(inplace=True)

In [35]:
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,RSI,MACD,MACD_signal,MACD_hist,BB_upper,BB_middle,BB_lower,Close_lag_1,Close_lag_2,Close_lag_3,Close_lag_4,Close_lag_5,Close_rolling_mean_5
747,2025-07-17 00:00:00-04:00,172.020004,174.160004,170.830002,173.0,160841100,78.758823,7.910958,7.155343,0.755615,175.514337,158.282001,141.049664,171.369995,170.699997,164.070007,164.919998,164.100006,168.812
748,2025-07-18 00:00:00-04:00,173.639999,174.25,171.259995,172.410004,146456400,75.17174,7.982884,7.320851,0.662033,176.857465,159.628501,142.399538,173.0,171.369995,170.699997,164.070007,164.919998,170.310001
749,2025-07-21 00:00:00-04:00,172.75,173.380005,171.0,171.380005,123126100,72.383849,7.866098,7.4299,0.436197,177.299776,161.005001,144.710226,172.410004,173.0,171.369995,170.699997,164.070007,171.772
750,2025-07-22 00:00:00-04:00,171.339996,171.389999,164.580002,167.029999,193114300,73.21612,7.337947,7.41151,-0.073562,176.570008,162.148001,147.725994,171.380005,172.410004,173.0,171.369995,170.699997,171.038
751,2025-07-23 00:00:00-04:00,169.529999,171.259995,167.970001,170.779999,153077000,73.033734,7.139677,7.357143,-0.217467,176.53709,163.292001,150.046913,167.029999,171.380005,172.410004,173.0,171.369995,170.920001


In [36]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'RSI', 'MACD',
       'MACD_signal', 'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower',
       'Close_lag_1', 'Close_lag_2', 'Close_lag_3', 'Close_lag_4',
       'Close_lag_5', 'Close_rolling_mean_5'],
      dtype='object')

## Modeling

### Prepare Features and Target

In [37]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["Date", "Close"])  # Use engineered features
y = df["Close"].shift(-1).dropna()      # Predict next day's close

X = X.iloc[:-1]  # match length with y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

### Train a Regressor

In [42]:
from lightgbm import LGBMRegressor

model = LGBMRegressor(n_estimators=500, learning_rate=0.05)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3331
[LightGBM] [Info] Number of data points in the train set: 585, number of used features: 17
[LightGBM] [Info] Start training from score 62.236957


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


## Evaluation

In [43]:
from sklearn.metrics import root_mean_squared_error

preds = model.predict(X_test)
rmse = root_mean_squared_error(y_test, preds)
print(f"RMSE: {rmse:.2f}")

RMSE: 9.68


## Predict Next Day

In [40]:
X.tail(1)

Unnamed: 0,Open,High,Low,Volume,RSI,MACD,MACD_signal,MACD_hist,BB_upper,BB_middle,BB_lower,Close_lag_1,Close_lag_2,Close_lag_3,Close_lag_4,Close_lag_5,Close_rolling_mean_5
750,171.339996,171.389999,164.580002,193114300,73.21612,7.337947,7.41151,-0.073562,176.570008,162.148001,147.725994,171.380005,172.410004,173.0,171.369995,170.699997,171.038


In [44]:
latest = X.tail(1)
next_day_price = model.predict(latest)[0]
print(f"Predicted next close: {next_day_price:.2f}")


Predicted next close: 141.74
