In [10]:
import os
import pandas as pd
from pandas_datareader import data as pdr

In [11]:
# Default date bounds for the downloader (ISO yyyy-mm-dd strings)
START = "2015-01-01"
END = "2025-10-01"

# Symbols to download; each one is fetched with its own request
TICKERS = ["SPY", "QQQ"]

In [12]:
def get_stooq(tk, start=START, end=END) -> pd.DataFrame:
    """Download daily OHLCV bars for one ticker from Stooq as a tidy frame.

    Parameters
    ----------
    tk : str
        Ticker symbol forwarded to the Stooq reader (e.g. "SPY").
    start, end : str
        Date bounds forwarded unchanged to ``pdr.DataReader``. The executed
        run of this notebook contains a row dated exactly ``END``, so treat
        ``end`` as inclusive. The defaults are captured when this cell is
        executed; re-run the cell after editing ``START``/``END``.

    Returns
    -------
    pd.DataFrame
        Columns ``Ticker, Date, Open, High, Low, Close, Volume``, ordered by
        ascending date, with a fresh 0..N-1 row index.

    Raises
    ------
    KeyError
        If the downloaded frame lacks any of the expected columns.
    """
    # Fixed column subset/order shared by every ticker so frames concatenate cleanly
    columns = ["Ticker", "Date", "Open", "High", "Low", "Close", "Volume"]

    raw = pdr.DataReader(tk, "stooq", start=start, end=end)

    # Stooq returns rows in descending date order; sort ascending and make Date a column
    tidy = raw.sort_index().reset_index()
    # Add a Ticker column so we can concatenate multiple tickers later
    tidy["Ticker"] = tk

    # Fail with an actionable message instead of an opaque column-selection error
    missing = [name for name in columns if name not in tidy.columns]
    if missing:
        raise KeyError(
            f"Stooq response for {tk!r} is missing expected columns {missing}; "
            "the download may be empty or throttled"
        )

    return tidy[columns]

In [13]:
# Download each ticker once; one tidy frame per symbol, in TICKERS order
frames = [get_stooq(tk) for tk in TICKERS]

# Raw dataset: stack all frames into a single DataFrame; reindex rows 0..N-1
frames_cvs = pd.concat(frames, ignore_index=True)

# Working dataset: the same rows ordered by (Ticker, Date), built from the
# already-stacked frame instead of concatenating a second time.
# sort_values returns a new DataFrame, so columns added to `prices` later do
# not appear in `frames_cvs`; row labels from `frames_cvs` are carried over.
prices = frames_cvs.sort_values(["Ticker", "Date"])

In [14]:
# Preview the stacked raw frame (rows appear in download order, one ticker after another)
frames_cvs

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume
0,SPY,2015-01-02,172.834,173.274,171.016,172.069,144993270
1,SPY,2015-01-05,171.006,171.148,168.662,168.957,202423620
2,SPY,2015-01-06,169.254,169.791,166.556,167.378,249677041
3,SPY,2015-01-07,168.718,169.791,168.249,169.435,149593226
4,SPY,2015-01-08,170.862,172.652,170.842,172.469,175671912
...,...,...,...,...,...,...,...
5401,QQQ,2025-09-25,592.200,595.115,588.500,593.530,70920209
5402,QQQ,2025-09-26,594.350,596.300,591.060,595.970,54337416
5403,QQQ,2025-09-29,599.110,602.050,597.410,598.730,48332934
5404,QQQ,2025-09-30,598.430,600.710,596.100,600.370,46533814


In [15]:
# Preview the (Ticker, Date)-sorted frame; its row labels come from the stacked
# frame, so they do not start at 0
prices

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume
2703,QQQ,2015-01-02,95.7833,96.1811,94.5548,95.0215,33920142
2704,QQQ,2015-01-05,94.6011,94.7069,93.3638,93.6168,39552414
2705,QQQ,2015-01-06,93.7599,93.9255,91.9571,92.3698,71694249
2706,QQQ,2015-01-07,92.9902,93.7795,92.7642,93.5619,40703138
2707,QQQ,2015-01-08,94.3568,95.5470,94.2520,95.3706,43549097
...,...,...,...,...,...,...,...
2698,SPY,2025-09-25,657.9400,659.4056,654.4050,658.0500,89622067
2699,SPY,2025-09-26,659.5100,662.3700,657.8800,661.8200,69179209
2700,SPY,2025-09-29,664.3600,665.2800,661.8600,663.6800,73499015
2701,SPY,2025-09-30,662.9300,666.6500,661.6100,666.1800,86288028


In [16]:
# NOTE: a per-ticker 1-day return column ("ret_1d", pct_change of Close) is
# disabled and not computed in this cell.

# Next row's Close within each ticker; NaN on each ticker's last row (no next day)
next_close = prices.groupby("Ticker")["Close"].shift(-1)

# True if next day's Close ≥ today's Close (per ticker).
# A comparison against NaN evaluates to False, hence the masking below.
up = next_close >= prices["Close"]

# Convert to 0/1 with nullable Int64; rows without a next day -> <NA> instead of 0
prices["label_up_next"] = up.mask(next_close.isna()).astype("Int64")

In [17]:
# Re-display prices to inspect the newly added label_up_next column
prices

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,label_up_next
2703,QQQ,2015-01-02,95.7833,96.1811,94.5548,95.0215,33920142,0
2704,QQQ,2015-01-05,94.6011,94.7069,93.3638,93.6168,39552414,0
2705,QQQ,2015-01-06,93.7599,93.9255,91.9571,92.3698,71694249,1
2706,QQQ,2015-01-07,92.9902,93.7795,92.7642,93.5619,40703138,1
2707,QQQ,2015-01-08,94.3568,95.5470,94.2520,95.3706,43549097,0
...,...,...,...,...,...,...,...,...
2698,SPY,2025-09-25,657.9400,659.4056,654.4050,658.0500,89622067,1
2699,SPY,2025-09-26,659.5100,662.3700,657.8800,661.8200,69179209,1
2700,SPY,2025-09-29,664.3600,665.2800,661.8600,663.6800,73499015,1
2701,SPY,2025-09-30,662.9300,666.6500,661.6100,666.1800,86288028,1


In [18]:
# Destination files: the raw stacked download and the sorted, labeled frame
out_raw = "Raw Datasets/frames_spy_qqq_2015_2025_stooq.csv"
out_processed = "Processed Datasets/prices_spy_qqq_2015_2025_stooq.csv"

# Create each target folder if needed, then write the CSV without the row index
for frame, target in ((frames_cvs, out_raw), (prices, out_processed)):
    os.makedirs(os.path.dirname(target), exist_ok=True)
    frame.to_csv(target, index=False)

# Summary of the raw file
print("Saved ->", out_raw)
print("shape:", frames_cvs.shape)
print("-------------------------------------------------")

# Summary of the processed file: shape, tickers present, and covered dates
first_day = prices["Date"].min()
last_day = prices["Date"].max()
print("Saved ->", out_processed)
print("shape:", prices.shape)
print("tickers:", prices["Ticker"].unique().tolist())
print("date range:", first_day, "->", last_day)


Saved -> Raw Datasets/frames_spy_qqq_2015_2025_stooq.csv
shape: (5406, 7)
-------------------------------------------------
Saved -> Processed Datasets/prices_spy_qqq_2015_2025_stooq.csv
shape: (5406, 8)
tickers: ['QQQ', 'SPY']
date range: 2015-01-02 00:00:00 -> 2025-10-01 00:00:00
