In [1]:
import yfinance as yf
import pandas as pd
import ta

def dataset(
    ticker,
    lags,
    start_date,
    end_date,
    sma_window=20,
    ema_window=20,
    rsi_window=14,
    macd_fast=12,
    macd_slow=26,
    macd_signal=9,
    bb_window=20,
    bb_std=2,
    atr_window=14,
):
    """Build a feature table for ``ticker`` from price data, TA indicators and VIX.

    The asset and ``^VIX`` are fetched with a single ``yf.download`` call.
    The downloaded frame is expected to carry two column levels with the
    ticker symbol on level 1 (that is how it is indexed below).

    Parameters
    ----------
    ticker : str
        Symbol of the asset, as it appears in the downloaded column index.
    lags : int
        Number of lagged-return columns (``rend_lag_1`` .. ``rend_lag_<lags>``).
    start_date, end_date
        Forwarded to ``yf.download`` as ``start`` / ``end``.
    sma_window, ema_window, rsi_window : int
        Windows for the SMA, EMA and RSI indicators.
    macd_fast, macd_slow, macd_signal : int
        Fast / slow / signal windows for MACD.
    bb_window, bb_std
        Window and deviation multiplier for the Bollinger Bands.
    atr_window : int
        Window for the Average True Range.

    Returns
    -------
    pandas.DataFrame
        One row per asset date, with columns ``Close``, ``rend_lag_*``,
        ``SMA``, ``EMA``, ``RSI``, ``MACD``, ``MACD_signal``, ``MACD_hist``,
        ``BB_upper``, ``BB_lower``, ``ATR``, ``OBV``, ``VWAP``, ``VIX`` and a
        ``close_<indicator>_ratio`` column for each entry of ``ratio_cols``.
        Rows containing any NaN or infinite value are removed.
    """
    # 1) Download the asset and VIX together so they share one date index.
    raw = yf.download(
        [ticker, "^VIX"],
        start=start_date,
        end=end_date,
        threads=False,
    )

    # 2) Asset OHLCV. The combined frame uses the union of both calendars, so
    #    dates on which only ^VIX has a quote show up as all-NaN asset rows.
    #    Drop them before computing returns / rolling indicators; otherwise
    #    the gaps leak into the windows.
    ohlcv_cols = ["Open", "High", "Low", "Close", "Volume"]
    price_df = raw.xs(ticker, axis=1, level=1)[ohlcv_cols].dropna(how="all")
    close = price_df["Close"]
    high = price_df["High"]
    low = price_df["Low"]
    volume = price_df["Volume"]

    # 3) VIX close only.
    vix = raw["Close"]["^VIX"].rename("VIX")

    # 4) Working frame, starting from the asset close.
    df = pd.DataFrame(index=price_df.index)
    df["Close"] = close

    # 5) Lagged *returns* (not closes): percent change shifted by 1..lags rows.
    returns = close.pct_change()
    for lag in range(1, lags + 1):
        df[f"rend_lag_{lag}"] = returns.shift(lag)

    # 6) Technical indicators, computed on the asset series only.
    df["SMA"] = ta.trend.sma_indicator(close, window=sma_window)
    df["EMA"] = ta.trend.ema_indicator(close, window=ema_window)
    df["RSI"] = ta.momentum.rsi(close, window=rsi_window)

    macd = ta.trend.MACD(
        close,
        window_slow=macd_slow,
        window_fast=macd_fast,
        window_sign=macd_signal,
    )
    df["MACD"] = macd.macd()
    df["MACD_signal"] = macd.macd_signal()
    df["MACD_hist"] = macd.macd_diff()

    bb = ta.volatility.BollingerBands(close, window=bb_window, window_dev=bb_std)
    df["BB_upper"] = bb.bollinger_hband()
    df["BB_lower"] = bb.bollinger_lband()

    atr = ta.volatility.AverageTrueRange(
        high=high,
        low=low,
        close=close,
        window=atr_window,
    )
    df["ATR"] = atr.average_true_range()

    df["OBV"] = ta.volume.on_balance_volume(close, volume)
    # No window is passed here, so the library's default VWAP window applies.
    df["VWAP"] = ta.volume.volume_weighted_average_price(high, low, close, volume)

    # 7) VIX, aligned on the asset's dates by index.
    df["VIX"] = vix

    # 8) Close-to-indicator ratios.
    ratio_cols = [
        "SMA",
        "EMA",
        "RSI",
        "MACD",
        "MACD_hist",
        "BB_upper",
        "BB_lower",
        "ATR",
        "OBV",
        "VWAP",
    ]
    for col in ratio_cols:
        df[f"close_{col.lower()}_ratio"] = df["Close"] / df[col]

    # 9) Several denominators (MACD, MACD_hist, OBV) can be exactly zero, which
    #    yields +/-inf. dropna() ignores infinities, so convert them to NaN
    #    first and then drop every incomplete row.
    infinities = [float("inf"), float("-inf")]
    return df.replace(infinities, float("nan")).dropna()


In [2]:
# The name `dataset` is rebound below from the builder function to the
# DataFrame it returns. Keep a handle on the function first so this cell can
# be re-run without failing with "'DataFrame' object is not callable".
if callable(dataset):
    build_dataset = dataset

dataset = build_dataset(
    'AMZN',
    5,
    '2000-01-01',
    '2025-05-01',
    # indicator params
    sma_window=20,
    ema_window=20,
    rsi_window=14,
    macd_fast=12,
    macd_slow=26,
    macd_signal=9,
    bb_window=20,
    bb_std=2,
    atr_window=14,
)

dataset

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,Close,rend_lag_1,rend_lag_2,rend_lag_3,rend_lag_4,rend_lag_5,SMA,EMA,RSI,MACD,...,close_sma_ratio,close_ema_ratio,close_rsi_ratio,close_macd_ratio,close_macd_hist_ratio,close_bb_upper_ratio,close_bb_lower_ratio,close_atr_ratio,close_obv_ratio,close_vwap_ratio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-02-18,3.237500,-0.023873,-0.042337,-0.008396,-0.022970,0.000000,3.601094,3.623622,39.993808,-0.049002,...,0.899032,0.893443,0.080950,-66.069027,-69.664312,0.766352,1.087274,11.396963,-3.190806e-09,0.845697
2000-02-22,3.178125,-0.061594,-0.023873,-0.042337,-0.008396,-0.022970,3.584687,3.581194,38.885394,-0.079041,...,0.886584,0.887448,0.081731,-40.208780,-51.922237,0.750619,1.082700,11.512726,-2.816613e-09,0.826704
2000-02-23,3.521875,-0.018340,-0.061594,-0.023873,-0.042337,-0.008396,3.587656,3.575545,47.889806,-0.074253,...,0.981665,0.984990,0.073541,-47.430848,-78.025876,0.831562,1.197892,12.332522,-3.670432e-09,0.913677
2000-02-24,3.421875,0.108161,-0.018340,-0.061594,-0.023873,-0.042337,3.596719,3.560909,45.776816,-0.077633,...,0.951388,0.960955,0.074751,-44.077711,-88.161375,0.809042,1.154519,11.976591,-3.177488e-09,0.916368
2000-02-25,3.456250,-0.028394,0.108161,-0.018340,-0.061594,-0.023873,3.602187,3.550942,46.648241,-0.076654,...,0.959486,0.973333,0.074092,-45.088965,-114.188246,0.817442,1.161279,12.332261,-3.528579e-09,0.935398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-24,186.539993,0.042846,0.035023,-0.030647,-0.009866,-0.029289,182.066499,182.806470,50.464030,-5.759657,...,1.024571,1.020423,3.696494,-32.387342,161.553053,0.929701,1.141002,19.671074,6.229353e-09,1.051291
2025-04-25,188.990005,0.032890,0.042846,0.035023,-0.030647,-0.009866,181.448000,183.395378,52.138596,-4.591284,...,1.041566,1.030506,3.624762,-41.162774,101.693255,0.953799,1.147122,20.714725,6.303504e-09,1.057590
2025-04-28,187.699997,0.013134,0.032890,0.042846,0.035023,-0.030647,181.196999,183.805342,51.157969,-3.726477,...,1.035889,1.021189,3.669028,-50.369291,86.156573,0.951101,1.137275,21.203048,6.267423e-09,1.043884
2025-04-29,187.389999,-0.006826,0.013134,0.032890,0.042846,0.035023,181.053500,184.146738,50.910170,-3.031183,...,1.034998,1.017612,3.680797,-61.820743,81.505512,0.951552,1.134486,21.967883,6.265789e-09,1.036825


In [3]:
# Write the feature table to disk; index=True keeps the Date index as a column.
dataset.to_csv(path_or_buf='amzn_dataset2.csv', index=True)