In [27]:
import os
import time
from datetime import datetime

import pandas as pd
import requests

# Credentials are read from the environment so the secret never lives in the
# notebook source or its saved outputs.
# NOTE(review): a key was previously hardcoded in this cell; treat that key as
# leaked and rotate it.
API_KEY = os.environ.get("KALSHI_API_KEY", "")

# Only send an Authorization header when a key is actually configured, so an
# unset variable does not produce a malformed "Bearer " header.
HEADERS = {"accept": "application/json"}
if API_KEY:
    HEADERS["Authorization"] = f"Bearer {API_KEY}"

BASE_URL = "https://api.elections.kalshi.com/trade-api/v2"

# Lower-case words and phrases whose presence marks a market as sports-related.
# Entries are grouped by theme purely for readability; the set is unordered.
SPORTS_KEYWORDS = {
    # League / organisation abbreviations
    "nfl", "nba", "mlb", "nhl", "mls",
    "wnba", "pga", "lpga", "ufc", "fifa",
    # Soccer competitions
    "premier league", "champions league", "la liga",
    "bundesliga", "serie a",
    # Sport names
    "football", "soccer", "basketball", "baseball",
    "hockey", "tennis", "golf", "mma", "boxing",
    "racing", "cricket", "rugby", "volleyball",
    # Championships and marquee events
    "super bowl", "world series", "stanley cup",
    "nba finals", "world cup", "march madness",
    "masters", "wimbledon", "us open",
    "playoffs", "olympics",
}

def is_sports_market(market, keywords=None, min_matches=2):
    """Return True when a market's text mentions enough sports keywords.

    The ``title``, ``series_ticker`` and ``event_ticker`` values of ``market``
    are lower-cased and joined, every non-alphanumeric character is replaced
    by a space, and each keyword is then looked up as a whole word or whole
    phrase. Splitting on whitespace alone can never match multi-word keywords
    such as "super bowl", nor words with attached punctuation such as "nfl?".

    Args:
        market: Mapping describing one market. Missing fields and fields set
            to ``None`` are treated as empty strings.
        keywords: Iterable of lower-case keywords or phrases (words separated
            by single spaces). Defaults to the module-level
            ``SPORTS_KEYWORDS``.
        min_matches: Minimum number of distinct keywords that must be found.

    Returns:
        bool: True if at least ``min_matches`` distinct keywords are present.
    """
    if keywords is None:
        keywords = SPORTS_KEYWORDS

    # `or ''` guards against keys that exist but hold None.
    raw_text = ' '.join(
        str(market.get(field) or '').lower()
        for field in ('title', 'series_ticker', 'event_ticker')
    )

    # Turn punctuation into spaces, collapse runs of whitespace, and pad both
    # ends so that " keyword " only matches on word boundaries.
    cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in raw_text)
    padded = f" {' '.join(cleaned.split())} "

    found_keywords = {kw for kw in keywords if f" {kw} " in padded}
    return len(found_keywords) >= min_matches

def fetch_filtered_markets(max_retries=5, retry_delay=5):
    """Page through settled markets and keep those that look sports-related.

    Requests ``{BASE_URL}/markets`` repeatedly, following the ``cursor`` field
    of each response until it is empty, and collects every market for which
    ``is_sports_market`` returns True. Progress is printed after each page.

    Args:
        max_retries: Maximum number of consecutive timeouts / connection
            errors tolerated for a single page before giving up. Without a
            bound, a persistent outage would make the loop spin forever.
        retry_delay: Seconds to wait before retrying after a connection issue.

    Returns:
        list: The matching market dicts gathered so far. The list is partial
        if the loop stopped early on a non-200 status or exhausted retries.
    """
    markets = []
    cursor = None
    batch_count = 0
    consecutive_failures = 0
    # Monotonic clock: elapsed time cannot jump if the wall clock is adjusted.
    start_time = time.monotonic()
    # Fix the upper bound once so every page is queried with the same window.
    max_close_ts = int(datetime.now().timestamp())

    while True:
        params = {
            "limit": 1000,
            "cursor": cursor,
            "min_close_ts": 0,
            "max_close_ts": max_close_ts,
            "status": "settled"
        }

        try:
            response = requests.get(
                f"{BASE_URL}/markets",
                headers=HEADERS,
                params=params,
                timeout=10
            )
        except (requests.Timeout, requests.ConnectionError):
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                print(f"Stopped early: {max_retries} consecutive connection failures")
                break
            print("Connection issue, retrying...")
            time.sleep(retry_delay)
            # `cursor` is unchanged, so the same page is requested again.
            continue

        consecutive_failures = 0

        if response.status_code != 200:
            print(f"Stopped early: Status {response.status_code}")
            break

        data = response.json()
        batch = [m for m in data.get('markets', []) if is_sports_market(m)]
        markets.extend(batch)

        batch_count += 1
        elapsed = time.monotonic() - start_time
        print(f"Batch {batch_count}: +{len(batch)} markets ({elapsed:.1f}s)")

        cursor = data.get('cursor')
        if not cursor:
            break

        # Brief pause between pages, with a longer one every tenth page.
        time.sleep(0.5 if batch_count % 10 == 0 else 0.1)

    return markets

def safe_datetime_conversion(s):
    """Parse ``s`` into UTC datetimes, yielding NaT for unparseable values."""
    parse_options = {"utc": True, "errors": "coerce"}
    return pd.to_datetime(s, **parse_options)

if __name__ == "__main__":
    # Download every settled market that passes the sports filter.
    print("Starting optimized data fetch...")
    sports_markets = fetch_filtered_markets()
    df = pd.DataFrame(sports_markets)

    # Convert whichever timestamp columns are present to UTC datetimes.
    datetime_cols = ['open_time', 'close_time', 'expiration_time']
    for col in datetime_cols:
        if col not in df.columns:
            continue
        df[col] = safe_datetime_conversion(df[col])

    # Report the total and persist the table.
    print(f"\nFinal count: {len(df)} sports markets")
    df.to_parquet("sports_markets.parquet", index=False)
    print("Data saved to sports_markets.parquet")

    # Show one example row when anything was collected.
    if not df.empty:
        print("\nSample market:")
        print(df.iloc[0][['ticker', 'title', 'result']])

Starting optimized data fetch...
Batch 1: +0 markets (0.4s)
Batch 2: +0 markets (0.8s)
Batch 3: +0 markets (1.1s)
Batch 4: +0 markets (1.5s)
Batch 5: +0 markets (1.9s)
Batch 6: +0 markets (2.2s)
Batch 7: +0 markets (2.6s)
Batch 8: +0 markets (2.9s)
Batch 9: +0 markets (3.2s)
Batch 10: +0 markets (3.6s)
Batch 11: +0 markets (4.3s)
Batch 12: +0 markets (4.7s)
Batch 13: +0 markets (5.0s)
Batch 14: +0 markets (5.3s)
Batch 15: +0 markets (5.6s)
Batch 16: +0 markets (5.9s)
Batch 17: +0 markets (6.3s)
Batch 18: +0 markets (6.6s)
Batch 19: +0 markets (6.9s)
Batch 20: +0 markets (7.2s)
Batch 21: +0 markets (8.0s)
Batch 22: +0 markets (8.4s)
Batch 23: +0 markets (8.7s)
Batch 24: +0 markets (9.0s)
Batch 25: +0 markets (9.3s)
Batch 26: +0 markets (9.6s)
Batch 27: +0 markets (10.2s)
Batch 28: +0 markets (10.6s)
Batch 29: +0 markets (10.9s)
Batch 30: +0 markets (11.2s)
Batch 31: +0 markets (11.9s)
Batch 32: +0 markets (12.3s)
Batch 33: +0 markets (12.7s)
Batch 34: +0 markets (13.0s)
Batch 35: +0 mar

In [31]:
import json
import pandas as pd

def json_to_dataframe(json_file_path):
    """Load a JSON file into a DataFrame and normalise known column dtypes.

    The parsed JSON is passed straight to ``pd.DataFrame`` (presumably a list
    of market dicts; confirm against the file that is actually loaded). Known
    timestamp columns are parsed as ISO 8601 into UTC datetimes and known
    numeric columns are converted with ``pd.to_numeric``. In both cases values
    that cannot be converted become NaT / NaN, and columns that are absent
    are skipped.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        pandas.DataFrame: The loaded data with converted columns.
    """
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # which would mangle non-ASCII text on non-UTF-8 locales.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame(data)

    datetime_cols = ['open_time', 'close_time', 'expected_expiration_time',
                     'expiration_time', 'latest_expiration_time']
    for col in datetime_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format='ISO8601', utc=True, errors='coerce')

    numeric_cols = ['floor_strike', 'volume', 'volume_24h', 'liquidity',
                    'open_interest', 'risk_limit_cents']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df

if __name__ == "__main__":
    # Load the saved market dump and normalise its column dtypes.
    json_path = "kalshi_sports_markets.json"
    markets_df = json_to_dataframe(json_path)

    # Quick sanity check: first rows of the identifying columns.
    preview_columns = ['ticker', 'close_time']
    print(markets_df[preview_columns].head())

    # Persist the cleaned table without the index column.
    markets_df.to_csv("cleaned_markets.csv", index=False)

                    ticker                close_time
0  KXCPICOREYOY-24NOV-T3.4 2024-12-11 13:25:00+00:00
1  KXCPICOREYOY-24NOV-T3.3 2024-12-11 13:25:00+00:00
2  KXCPICOREYOY-24NOV-T3.2 2024-12-11 13:25:00+00:00
3  KXCPICOREYOY-24NOV-T3.1 2024-12-11 13:25:00+00:00
4  KXCPICOREYOY-24NOV-T3.0 2024-12-11 13:25:00+00:00
