<a href="https://colab.research.google.com/github/AlexMilekhin/Volatility-Surface-Modelling-ML-Forecasting/blob/main/data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Open Colab's browser file picker and upload local files into the runtime.
# YahooClient (defined below) looks for its cookie at /content/yahoo_cookie.txt,
# so the cookie file should be uploaded under the name `yahoo_cookie.txt`.
# NOTE(review): the cookie is a session credential; keep it out of version
# control and out of saved cell outputs.
from google.colab import files
uploaded = files.upload()

Saving yahoo_cookie.txt to yahoo_cookie.txt


In [None]:
# Mount Google Drive at /content/drive; the CSV export cell further down
# writes under /content/drive/MyDrive/, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import requests
import os
import numpy as np, pandas as pd
from datetime import datetime, timezone
from math import sqrt, exp, log
from scipy.stats import norm
from scipy.optimize import brentq
from scipy.interpolate import griddata
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

In [None]:
class YahooClient:
    def __init__(self, load_browser_cookies=True, cookie_str: str | None = None):
        self.s = requests.Session()
        self.s.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/127.0 Safari/537.36",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        })
        self.crumb = None

        # Allow passing a manual cookie string if browser extraction fails
        if not cookie_str:
            cookie_str = os.environ.get("YAHOO_COOKIE") or None
        if not cookie_str:
            cookie_path = "/content/yahoo_cookie.txt"
            if os.path.isfile(cookie_path):
              with open(cookie_path, "r", encoding="utf-8") as f:
                cookie_str = f.read().strip()
            else:
              print("No cookie found...")
        if cookie_str:
            for kv in cookie_str.split(";"):
                if "=" in kv:
                    k, v = kv.split("=", 1)
                    self.s.cookies.set(k.strip(), v.strip(), domain=".yahoo.com")

        if load_browser_cookies and not cookie_str:
            try:
                import browser_cookie3
            except Exception:
                browser_cookie3 = None
            if browser_cookie3:
                for loader in ("chrome", "edge", "firefox"):
                    try:
                        cj = getattr(browser_cookie3, loader)(domain_name=".yahoo.com")
                        self.s.cookies.update(cj)
                        break
                    except Exception:
                        continue

    def warmup(self, ticker="JPM"):
        pages = [
            f"https://finance.yahoo.com/quote/{ticker}/options?p={ticker}",
            f"https://finance.yahoo.com/quote/{ticker}?p={ticker}",
            "https://finance.yahoo.com/",
        ]
        for u in pages:
            try:
                r = self.s.get(u, timeout=12, allow_redirects=True)
                if r.status_code == 200:
                    break
            except Exception:
                pass

        for cu in (
            "https://query1.finance.yahoo.com/v1/test/getcrumb",
            "https://query2.finance.yahoo.com/v1/test/getcrumb",
        ):
            try:
                c = self.s.get(cu, timeout=12)
                if c.ok and c.text.strip():
                    self.crumb = c.text.strip()
                    break
            except Exception:
                continue

    def _get_json(self, base_url, params=None, referer=None, retries=2):
        for _ in range(retries + 1):
            p = dict(params or {})
            if self.crumb:
                p["crumb"] = self.crumb
            headers = {"Accept": "application/json, text/javascript, */*; q=0.01"}
            if referer:
                headers["Referer"] = referer

            r = self.s.get(base_url, params=p, headers=headers, timeout=12, allow_redirects=True)

            if r.status_code in (401, 403, 404, 999):
                self.warmup()
                continue

            r.raise_for_status()
            return r.json()
        raise RuntimeError(f"Yahoo request failed after retries: {base_url}")

    def get_expirations(self, ticker: str) -> list[str]:
        t = normalize_ticker(ticker)
        self.warmup(t)
        ref = f"https://finance.yahoo.com/quote/{t}/options?p={t}"
        for host in ("https://query2.finance.yahoo.com", "https://query1.finance.yahoo.com"):
            try:
                j = self._get_json(f"{host}/v7/finance/options/{t}", referer=ref)
                res = j.get("optionChain", {}).get("result", [])
                if res:
                    exp_unix = res[0].get("expirationDates", [])
                    return [pd.to_datetime(ts, unit="s").strftime("%Y-%m-%d") for ts in exp_unix]
            except Exception:
                continue
        return []

    def get_chain(self, ticker: str, expiry: str):
        t = normalize_ticker(ticker)
        ref = f"https://finance.yahoo.com/quote/{t}/options?p={t}"
        exps = self.get_expirations(t)
        if not exps:
            raise ValueError("No expirations returned by Yahoo (blocked or symbol not optionable).")
        exp_map = {pd.to_datetime(e).strftime("%Y-%m-%d"): int(pd.Timestamp(e).timestamp()) for e in exps}
        if expiry not in exp_map:
            raise ValueError(f"Requested expiry {expiry} not in Yahoo list.")
        epoch = exp_map[expiry]

        j = None
        for host in ("https://query2.finance.yahoo.com", "https://query1.finance.yahoo.com"):
            try:
                j = self._get_json(f"{host}/v7/finance/options/{t}", params={"date": epoch}, referer=ref)
                break
            except Exception:
                continue
        if j is None:
            raise RuntimeError("Failed to fetch chain from Yahoo.")

        result = j.get("optionChain", {}).get("result", [])
        if not result:
            raise ValueError("Empty optionChain result.")
        opts = result[0].get("options", [])
        if not opts:
            return pd.DataFrame()

        o = opts[0]
        calls = pd.DataFrame(o.get("calls", []))
        puts  = pd.DataFrame(o.get("puts", []))
        if calls.empty and puts.empty:
            return pd.DataFrame()
        calls["type"] = "call"
        puts["type"]  = "put"
        df = pd.concat([calls, puts], ignore_index=True)
        if "lastTradeDate" in df.columns:
            df["lastTradeDate"] = pd.to_datetime(df["lastTradeDate"], unit="s", errors="coerce")

        # Convenience columns
        for col in ["bid", "ask", "lastPrice", "strike", "impliedVolatility"]:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")
        if {"bid","ask"}.issubset(df.columns):
            df["mid"] = (df["bid"] + df["ask"]) / 2
            df["spread"] = df["ask"] - df["bid"]
        if "impliedVolatility" in df.columns:
            df["IV_%"] = df["impliedVolatility"] * 100

        # Order columns nicely
        order = [
            "type","contractSymbol","strike","lastPrice","bid","ask","mid","spread",
            "change","percentChange","volume","openInterest","IV_%","impliedVolatility",
            "inTheMoney","lastTradeDate","contractSize","currency"
        ]
        cols = [c for c in order if c in df.columns] + [c for c in df.columns if c not in order]
        return df[cols] if not df.empty else df


def normalize_ticker(t: str) -> str:
    """Return ``t`` trimmed and upper-cased, with dots turned into dashes.

    Symbols ending in ``.HK`` keep their dots; a falsy ``t`` yields ``""``.
    """
    symbol = (t or "").strip().upper()
    keep_as_is = symbol.endswith(".HK") or "." not in symbol
    return symbol if keep_as_is else symbol.replace(".", "-")


def fetch_all_option_chains(ticker: str) -> pd.DataFrame:
    """Download the option chain for every listed expiry and stack them.

    A fresh ``YahooClient`` is created, its expirations are listed, and
    ``get_chain`` is called once per expiry. Each non-empty chain gets an
    ``expiry`` column holding the expiry string it was requested with.

    Expiries whose download raises are skipped, as before, but they are now
    reported through a single ``RuntimeWarning`` so a partial result is not
    mistaken for the complete surface.

    Args:
        ticker: Symbol to fetch.

    Returns:
        Combined DataFrame with ``expiry`` first, followed by the preferred
        column order and any remaining columns; an empty DataFrame when there
        are no expirations or no chain could be fetched.
    """
    import warnings  # local import keeps this definition self-contained

    yc = YahooClient()
    expirations = yc.get_expirations(ticker)
    if not expirations:
        return pd.DataFrame()

    frames: list[pd.DataFrame] = []
    failures: list[str] = []
    for expiry in expirations:
        try:
            df = yc.get_chain(ticker, expiry)
        except Exception as err:
            failures.append(f"{expiry} ({type(err).__name__}: {err})")
            continue
        if not df.empty:
            # assign() returns a new frame, leaving the fetched one untouched.
            frames.append(df.assign(expiry=expiry))

    if failures:
        warnings.warn(
            f"Skipped {len(failures)} of {len(expirations)} expiries for {ticker}: "
            + "; ".join(failures),
            RuntimeWarning,
            stacklevel=2,
        )
    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    # Columns already normalized in get_chain; ensure order with expiry first
    order = [
        "expiry","type","contractSymbol","strike","lastPrice","bid","ask","mid","spread",
        "change","percentChange","volume","openInterest","IV_%","impliedVolatility",
        "inTheMoney","lastTradeDate","contractSize","currency"
    ]
    cols = [c for c in order if c in combined.columns] + [c for c in combined.columns if c not in order]
    return combined[cols]


In [None]:

# === Basic usage ===
# Demonstrates the two YahooClient calls on a single symbol: list the
# expirations, then fetch the chain for the first one.

# Choose your symbol
# NOTE: TICKER is a notebook-global; later cells (fetch-all, realized
# volatility, price history) read it too.
TICKER = "AAPL"

yc = YahooClient()

# 1) List expirations
# Returns 'YYYY-MM-DD' strings, or [] when Yahoo gives no result.
exps = yc.get_expirations(TICKER)
print("First few expirations:", exps[:5])

# 2) Pick one expiry (use first if unsure)
if exps:
    EXPIRY = exps[0]
    chain = yc.get_chain(TICKER, EXPIRY)
    display(chain.head(10))
else:
    # EXPIRY and chain stay undefined on this path.
    print("No expirations returned. Yahoo may be blocking requests or symbol is not optionable.")


First few expirations: ['2025-10-24', '2025-10-31', '2025-11-07', '2025-11-14', '2025-11-21']


Unnamed: 0,type,contractSymbol,strike,lastPrice,bid,ask,mid,spread,change,percentChange,volume,openInterest,IV_%,impliedVolatility,inTheMoney,lastTradeDate,contractSize,currency,expiration
0,call,AAPL251024C00110000,110.0,146.65,0.0,0.0,0.0,0.0,0.0,0.0,4.0,14,0.001,1e-05,True,2025-10-22 17:37:02,REGULAR,USD,1761264000
1,call,AAPL251024C00120000,120.0,128.65,0.0,0.0,0.0,0.0,0.0,0.0,25.0,26,0.001,1e-05,True,2025-10-15 19:54:40,REGULAR,USD,1761264000
2,call,AAPL251024C00125000,125.0,133.49,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3,0.001,1e-05,True,2025-10-03 19:39:45,REGULAR,USD,1761264000
3,call,AAPL251024C00140000,140.0,116.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0.001,1e-05,True,2025-10-01 15:02:45,REGULAR,USD,1761264000
4,call,AAPL251024C00150000,150.0,111.22,0.0,0.0,0.0,0.0,0.0,0.0,1.0,51,0.001,1e-05,True,2025-10-22 14:19:40,REGULAR,USD,1761264000
5,call,AAPL251024C00155000,155.0,105.21,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.001,1e-05,True,2025-10-23 18:14:36,REGULAR,USD,1761264000
6,call,AAPL251024C00160000,160.0,94.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.001,1e-05,True,2025-10-09 19:24:23,REGULAR,USD,1761264000
7,call,AAPL251024C00165000,165.0,94.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5,0.001,1e-05,True,2025-10-23 15:20:06,REGULAR,USD,1761264000
8,call,AAPL251024C00170000,170.0,87.77,0.0,0.0,0.0,0.0,0.0,0.0,2.0,6,0.001,1e-05,True,2025-10-22 19:49:40,REGULAR,USD,1761264000
9,call,AAPL251024C00175000,175.0,85.1,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3,0.001,1e-05,True,2025-10-23 19:46:18,REGULAR,USD,1761264000


In [None]:

# === Fetch all expirations for a ticker and combine into one DataFrame ===
# fetch_all_option_chains builds its own YahooClient and returns an empty
# DataFrame when nothing could be fetched.

ALL = fetch_all_option_chains(TICKER)
print(f"Combined rows: {len(ALL)}")
# NOTE(review): head(1000) asks for far more rows than the rendered output
# shows (it is truncated to the first and last few); a smaller head() or
# sample() would express the intent more directly.
display(ALL.head(1000))


Combined rows: 2027


Unnamed: 0,expiry,type,contractSymbol,strike,lastPrice,bid,ask,mid,spread,change,percentChange,volume,openInterest,IV_%,impliedVolatility,inTheMoney,lastTradeDate,contractSize,currency,expiration
0,2025-10-24,call,AAPL251024C00110000,110.0,146.65,0.0,0.0,0.0,0.0,0.0,0.0,4.0,14,0.001000,0.000010,True,2025-10-22 17:37:02,REGULAR,USD,1761264000
1,2025-10-24,call,AAPL251024C00120000,120.0,128.65,0.0,0.0,0.0,0.0,0.0,0.0,25.0,26,0.001000,0.000010,True,2025-10-15 19:54:40,REGULAR,USD,1761264000
2,2025-10-24,call,AAPL251024C00125000,125.0,133.49,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3,0.001000,0.000010,True,2025-10-03 19:39:45,REGULAR,USD,1761264000
3,2025-10-24,call,AAPL251024C00140000,140.0,116.10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0.001000,0.000010,True,2025-10-01 15:02:45,REGULAR,USD,1761264000
4,2025-10-24,call,AAPL251024C00150000,150.0,111.22,0.0,0.0,0.0,0.0,0.0,0.0,1.0,51,0.001000,0.000010,True,2025-10-22 14:19:40,REGULAR,USD,1761264000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2026-04-17,put,AAPL260417P00140000,140.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,4.0,73,12.500875,0.125009,False,2025-10-22 19:22:00,REGULAR,USD,1776384000
996,2026-04-17,put,AAPL260417P00145000,145.0,0.70,0.0,0.0,0.0,0.0,0.0,0.0,1.0,45,12.500875,0.125009,False,2025-10-13 14:17:42,REGULAR,USD,1776384000
997,2026-04-17,put,AAPL260417P00150000,150.0,0.50,0.0,0.0,0.0,0.0,0.0,0.0,1.0,51,12.500875,0.125009,False,2025-10-21 15:58:48,REGULAR,USD,1776384000
998,2026-04-17,put,AAPL260417P00155000,155.0,0.68,0.0,0.0,0.0,0.0,0.0,0.0,3.0,92,12.500875,0.125009,False,2025-10-22 15:23:23,REGULAR,USD,1776384000


In [None]:
# Save the combined DataFrame to a CSV file in Google Drive
# The target lives under /content/drive, so the Drive-mount cell must have run.
# NOTE(review): the destination folder is assumed to exist already; to_csv is
# not asked to create it here — confirm before running on a fresh Drive.
try:
    ALL.to_csv('/content/drive/MyDrive/Colab Notebooks/VolSurf_ML/combined_options_data.csv', index=False)
    print("Successfully saved combined_options_data.csv to Google Drive.")
except Exception as e:
    # Any failure (not only a missing mount) is reported here and the cell
    # continues, so check the printed message rather than relying on an error.
    print(f"Error saving file to Google Drive: {e}")
    print("Make sure you have mounted your Google Drive correctly.")

Successfully saved combined_options_data.csv to Google Drive.


In [None]:
# Download historical data for the last year
# NOTE(review): `yf` is not imported in any cell visible in this notebook
# (presumably `import yfinance as yf`); on a fresh kernel this line raises
# NameError until that import is added to the imports cell.
# NOTE(review): the recorded output below reports GOOGL while TICKER is set to
# "AAPL" above — the saved output appears to come from a different kernel state.
ticker_data = yf.download(TICKER, period="1y")

# Calculate daily returns
# The tuple key selects from two-level columns — assumes yf.download returned
# (field, ticker) MultiIndex columns; TODO confirm for the installed version.
# This adds a column to ticker_data in place.
ticker_data['Daily_Return'] = ticker_data[('Close', TICKER)].pct_change()

# Calculate realized volatility (annualized)
# Assuming 252 trading days in a year
# i.e. standard deviation of daily returns scaled by sqrt(252).
realized_volatility = ticker_data['Daily_Return'].std() * np.sqrt(252)

print(f"Realized Volatility for {TICKER} over the last year: {realized_volatility:.4f}")

#ticker_data.to_csv('/content/drive/MyDrive/Colab Notebooks/VolSurf_ML/ticker_data.csv', index=False)

  ticker_data = yf.download(TICKER, period="1y")
[*********************100%***********************]  1 of 1 completed

Realized Volatility for GOOGL over the last year: 0.3246





In [None]:
# Fetch one year of price history for TICKER through the Ticker.history API.
# NOTE(review): relies on `yf`, which no visible cell imports (presumably
# yfinance) — confirm the import exists before Restart & Run All.
hist_price = yf.Ticker(TICKER).history(period="1y")
# Prints the frame's plain-text repr inside the message (wide frames wrap, as
# in the recorded output).
print(f'Hist price for {TICKER} over 1 year {hist_price}')


Hist price for GOOGL over 1 year                                  Open        High         Low       Close  \
Date                                                                        
2024-10-18 00:00:00-04:00  162.473513  163.986844  162.363996  162.702499   
2024-10-21 00:00:00-04:00  162.234547  163.777745  161.905994  163.349640   
2024-10-22 00:00:00-04:00  162.264412  165.042171  162.264412  164.414932   
2024-10-23 00:00:00-04:00  164.036579  165.091938  161.219003  162.065277   
2024-10-24 00:00:00-04:00  162.115072  162.612877  160.303056  162.005554   
...                               ...         ...         ...         ...   
2025-10-13 00:00:00-04:00  240.210007  244.500000  239.710007  244.149994   
2025-10-14 00:00:00-04:00  241.229996  247.119995  240.509995  245.449997   
2025-10-15 00:00:00-04:00  247.250000  252.110001  245.990005  251.029999   
2025-10-16 00:00:00-04:00  251.770004  256.959991  250.100006  251.460007   
2025-10-17 00:00:00-04:00  250.759995  254.