In [None]:
!pip install git+https://github.com/jkirkby3/fypy.git

## Options data is collected from https://data.binance.vision/?prefix=data/option/daily/EOHSummary/BTCUSDT/ and the underlying BTCUSDT spot prices (1h klines) from https://data.binance.vision/?prefix=data/spot/daily/klines/BTCUSDT/1h/

In [None]:
import requests, zipfile, io, gc, pandas as pd
from datetime import datetime, timedelta

def daterange(s, e):
    """Yield every day from ``s`` through ``e``, both ends included.

    Nothing is yielded when ``s`` is later than ``e``.
    """
    one_day = timedelta(days=1)
    current = s
    while current <= e:
        yield current
        current = current + one_day

# Inclusive date window to download. `e` is pulled back below if the option
# archive stops earlier, so the spot download that follows covers the same days.
s = datetime.strptime("2023-05-18", "%Y-%m-%d").date()
e = datetime.strptime("2023-10-23", "%Y-%m-%d").date()
dfs = []
fetched_dates = 0
for day in daterange(s, e):
    day_str = day.strftime('%Y-%m-%d')
    # The timeout (seconds) keeps one stalled connection from hanging the cell.
    r = requests.get(
        f"https://data.binance.vision/data/option/daily/EOHSummary/BTCUSDT/BTCUSDT-EOHSummary-{day_str}.zip",
        timeout=60,
    )
    if r.status_code != 200:
        print(f"Error fetching option data for {day_str}")
        # Stop at the first day that cannot be fetched and shrink the window to
        # the last day that was downloaded.
        e = day - timedelta(days=1)
        break
    fetched_dates += 1
    # Context managers close the archive and its member once the CSV is parsed.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        csv_name = [n for n in z.namelist() if n.lower().endswith(".csv")][0]
        with z.open(csv_name) as f:
            dfs.append(pd.read_csv(f))
gc.collect()

if not dfs:
    # pd.concat would raise the same exception type with a less helpful message.
    raise ValueError(f"No option data could be downloaded starting from {s}")
df = pd.concat(dfs, ignore_index=True)
print(f"Options data collected, preliminary shape - {df.shape}\n{fetched_dates=}")

# Column layout of the kline CSVs, which are read without a header row.
# Only open_time and close are needed downstream.
cols = ['open_time','open','high','low','close','volume','close_time','quote_volume','count','taker_buy_base_asset_volume','taker_buy_quote_asset_volume','ignore']
klines = []
for day in daterange(s, e):
    day_str = day.strftime('%Y-%m-%d')
    # The timeout (seconds) keeps one stalled connection from hanging the cell.
    r = requests.get(
        f"https://data.binance.vision/data/spot/daily/klines/BTCUSDT/1h/BTCUSDT-1h-{day_str}.zip",
        timeout=60,
    )
    if r.status_code != 200:
        # Unlike the option download, a missing spot file is fatal.
        raise Exception(f"Spot kline file missing for {day_str}")
    # Context managers close the archive and its member once the CSV is parsed.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        csv_name = [n for n in z.namelist() if n.lower().endswith(".csv")][0]
        with z.open(csv_name) as f:
            klines.append(pd.read_csv(f, header=None, names=cols, usecols=['open_time', 'close']))
gc.collect()

# One row per kline open time: coerce to numeric, drop unparseable or duplicated
# rows, and index by open_time so the option rows can be joined against it.
spot = pd.concat(klines, ignore_index=True)
spot['open_time'] = pd.to_numeric(spot['open_time'], errors='coerce').astype('Int64')
spot['underlying_price'] = pd.to_numeric(spot['close'], errors='coerce')
spot = spot[['open_time','underlying_price']].dropna().drop_duplicates(subset=['open_time']).set_index('open_time')
gc.collect()

# Key every option row by the epoch-millisecond start of its (date, hour) bucket
# and left-join the spot close on it. The join assumes the kline open_time index
# is expressed in epoch milliseconds as well.
df['hour'] = df['hour'].astype(int)
hour_start = pd.to_datetime(df['date'], format='%Y-%m-%d', utc=True) + pd.to_timedelta(df['hour'], unit='h')
# Floor-dividing by a 1 ms Timedelta gives integer milliseconds regardless of the
# datetime resolution pandas chose, without building one Timestamp per row.
df['open_time_ms'] = (hour_start - pd.Timestamp("1970-01-01", tz="UTC")) // pd.Timedelta(milliseconds=1)
df = df.merge(spot, left_on='open_time_ms', right_index=True, how='left')
df = df.drop(columns=['open_time_ms'])

df = df.sort_values(by=["date", "hour"])
from src.utils.config import OUTPUTS_DIR
df.to_csv(OUTPUTS_DIR / "binance_dump.csv", index=False)
print("Final shape:", df.shape)
# Release the large frames; the dump is read back from disk later in the notebook.
del df, spot
gc.collect()

Error fetching option data for 2023-09-08
Options data collected, preliminary shape - (614310, 26)
fetched_dates=113
Final shape: (614310, 27)


0

In [None]:
import gc, re, pandas as pd, numpy as np
from datetime import datetime

from fypy.volatility.implied.ImpliedVolCalculator import ImpliedVolCalculator_Black76
from fypy.termstructures.DiscountCurve import DiscountCurve_ConstRate
from fypy.termstructures.EquityForward import EquityForward

# Unbind any `df` still present in the kernel before loading the dump again.
try:
    del df
except NameError:
    # No `df` is bound (e.g. fresh kernel): nothing to release.
    pass
from src.utils.config import OUTPUTS_DIR
df = pd.read_csv(OUTPUTS_DIR / "binance_dump.csv")

# Columns that must be numeric; entries that cannot be parsed become NaN.
numcols = ['close', 'underlying_price',
           'best_bid_price', 'mark_price', 'best_sell_iv', 'mark_iv',
           'openinterest_contracts', 'openinterest_usdt']
for c in numcols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

def extract_expiry(row):
    """Parse the expiry date encoded at the start of a row's ``strike`` field.

    The field must begin with six digits read as ``YYMMDD`` followed by a
    hyphen; the two-digit year is offset by 2000.

    Args:
        row: Object exposing ``get('strike')``, such as a DataFrame row or a dict.

    Returns:
        pd.Timestamp: 00:00:00 UTC on the parsed date.

    Raises:
        ValueError: If the field does not start with ``YYMMDD-`` or the digits
            do not form a valid calendar date.
    """
    raw = str(row.get('strike'))
    m = re.match(r'(\d{6})-', raw)
    if m is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError(f"Cannot parse expiry from strike field {raw!r}")
    dd = m.group(1)
    yy = int(dd[0:2]); mm = int(dd[2:4]); ddn = int(dd[4:6])
    yyyy = 2000 + yy
    # NOTE(review): expiry is pinned to midnight UTC; confirm this matches the
    # exchange's actual settlement time, since it feeds the time-to-maturity.
    return pd.Timestamp(datetime(yyyy, mm, ddn, 0, 0, 0), tz='UTC')
# Expiry, observation time and time to maturity (in years of 365.25 days).
df['expiry'] = df.apply(extract_expiry, axis=1)
df['current_time'] = pd.to_datetime(df['date'], format='%Y-%m-%d', utc=True) + pd.to_timedelta(df['hour'], unit='h')
df['ttm'] = (df['expiry'] - df['current_time']).dt.total_seconds() / (365.25 * 24 * 3600)
# The raw strike field is hyphen-separated; keep the integer after the last
# hyphen. This must run after the expiry has been read from the same field.
df['strike'] = df['strike'].str.rsplit('-', n=1).str[-1].astype(int)
# A symbol ending in 'C' marks a call.
df['is_call'] = df['symbol'].str.endswith('C')

# NOTE(review): 'best_sell_iv' is exposed as 'bid_IV' and the mark columns as
# 'mid_*'; confirm these mappings are intended.
df = df.rename(columns=
    {
        'best_bid_price': 'bid_price', 'mark_price': 'mid_price',
        'best_sell_iv':   'bid_IV',    'mark_iv':    'mid_IV',
    })
required = ['current_time', 'is_call', 'ttm', 'strike', 'close', 'underlying_price',
            'openinterest_contracts', 'openinterest_usdt',
            'bid_price', 'mid_price', 'bid_IV', 'mid_IV']
# Chain the sort onto the column selection so it yields a fresh frame instead of
# mutating a slice in place.
df = df[required].sort_values(by=["current_time"])
gc.collect()


def _positive_rows_mask(frame, cols):
    """Return a row mask that is True where every column in ``cols`` is non-null and > 0."""
    subset = frame[cols]
    return subset.notna().all(axis=1) & subset.gt(0).all(axis=1)


# Stage 1: keep rows whose pricing inputs are all present and strictly positive.
required = ['ttm', 'strike', 'close', 'underlying_price']
mask_valid = _positive_rows_mask(df, required)
df = df[mask_valid].reset_index(drop=True)
gc.collect()
print(f"Dataset shape after preliminary postprocessing: {df.shape}")
df[['current_time', 'is_call'] + required].to_csv(OUTPUTS_DIR / "binance_dump_with_valid_defaults.csv", index=False)


# Stage 2: additionally require strictly positive open interest.
required += ['openinterest_contracts', 'openinterest_usdt']
mask_valid = _positive_rows_mask(df, required)
df = df[mask_valid].reset_index(drop=True)
gc.collect()
print(f"Dataset shape with valid volumes: {df.shape}")
df[['current_time', 'is_call'] + required].to_csv(OUTPUTS_DIR / "binance_dump_with_valid_volumes.csv", index=False)


def impl_vol(price, strike, is_call, ttm, S):
    """Imply a volatility for one option quote with fypy's ImpliedVolCalculator_Black76.

    The discount curve uses a constant zero rate and the forward curve is built
    from spot ``S`` with ``r=0`` and ``q=0``.

    Args:
        price: Option price to invert.
        strike: Option strike.
        is_call: True for a call, False for a put.
        ttm: Time to maturity.
        S: Spot price of the underlying.

    Returns:
        Whatever ``imply_vol`` returns, or ``np.nan`` if the calculator raises.
    """
    disc_curve = DiscountCurve_ConstRate(rate=0.0)
    fwd_curve = EquityForward.from_rates(S0=S, r=0.0, q=0.0)
    try:
        vol = ImpliedVolCalculator_Black76(fwd_curve, disc_curve).imply_vol(price, strike, is_call, ttm)
    except Exception:
        # Best effort: a failed inversion is recorded as NaN. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running loop over this function can be stopped.
        vol = np.nan
    return vol

# Imply a volatility from every close price. Building the column in a single
# assignment avoids per-row iterrows/df.at overhead and creates the column even
# when df is empty, so the filter below cannot fail on a missing column.
df['close_IV_calculated'] = [
    impl_vol(price, strike, is_call, ttm, S)
    for price, strike, is_call, ttm, S in zip(
        df['close'], df['strike'], df['is_call'], df['ttm'], df['underlying_price']
    )
]

# Keep only rows for which a strictly positive implied volatility was found.
mask_valid = df['close_IV_calculated'].notna() & (df['close_IV_calculated'] > 0)
df = df[mask_valid].reset_index(drop=True)
print(f"Dataset shape with valid volumes & arbitrage-free: {df.shape}")
df[['current_time', 'is_call', 'close_IV_calculated'] + required].to_csv(OUTPUTS_DIR / "binance_dump_with_valid_volumes_arbitrage_free.csv", index=False)
# The implied vol was only needed for filtering and the export above.
df = df.drop(columns='close_IV_calculated')
gc.collect()