In [None]:
import os
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel
from datetime import timedelta

# -----------------------------
# 1) Schema for structured output
# -----------------------------
class ResearchPaperExtraction(BaseModel):
    expected_return: float


# -----------------------------
# 2) Prompts
# -----------------------------
def make_system_prompt() -> str:
    return (
        "You are a model that predicts stock returns.\n"
        "Given a time-series of daily returns as percentage change from the past month,\n"
        "predict the average daily return (as a percentage change) for the next month.\n"
        "Output ONLY valid JSON matching the provided schema."
    )

def make_user_prompt(ticker: str, pct_change_list: list[float]) -> str:
    return f"Ticker: {ticker}\npct_change: {pct_change_list}"


# -----------------------------
# 3) Main (single-call test)
# -----------------------------
def main():
    # Fail fast if API key is missing
    # api_key = os.environ.get("OPENAI_API_KEY")
    api_key = ""
    if not api_key:
        raise RuntimeError("Missing OPENAI_API_KEY env var. Set it before running.")

    client = OpenAI(api_key=api_key)

    # Create directories if they don't exist
    os.makedirs("yfinance", exist_ok=True)
    os.makedirs("responses", exist_ok=True)

    # Load your S&P 500 table (kept because you had it; not strictly needed for the smoke test)
    sp500_table = pd.read_csv("sp500-master/2024-sp500-ticker-list.csv")

    # Pick ONE month only (first month in your intended range)
    current_date = pd.Timestamp("2024-06-01")  # change if you want
    month_start = current_date.strftime("%Y-%m-%d")
    month_end = (current_date + pd.DateOffset(months=1) - timedelta(days=1)).strftime("%Y-%m-%d")

    print(f"Using month: {month_start} -> {month_end}")

    # Read prices data you previously saved
    data_path = f"yfinance/data_2024-07-01_2024-07-31.csv"
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"Missing {data_path}. Either generate it with yfinance or point to the right file."
        )

    data = pd.read_csv(data_path, index_col=0, parse_dates=True)

    # Compute returns
    returns = data.pct_change().iloc[1:]
    returns_path = f"yfinance/returns_{month_start}_{month_end}.csv"
    returns.to_csv(returns_path)
    print(f"Saved returns to: {returns_path}")

    # Pick ONE ticker only (first column)
    sp500_tickers = list(returns.columns)
    if not sp500_tickers:
        raise RuntimeError("No tickers found in returns file/columns.")

    ticker = sp500_tickers[0]
    pct_change = returns[ticker].dropna().astype(float).tolist()

    # Optional: reduce prompt size (keeps last ~21 trading days)
    pct_change = pct_change[-21:]

    system_prompt = make_system_prompt()
    user_prompt = make_user_prompt(ticker, pct_change)

    schema = ResearchPaperExtraction.model_json_schema()

    # ---- ONE API CALL ----
    # Preferred: Responses API + structured parsing when available.
    # Structured Outputs with json_schema is supported for gpt-4o-mini snapshots and later. :contentReference[oaicite:0]{index=0}
    # Responses API is the recommended API for new projects. :contentReference[oaicite:1]{index=1}
    try:
        # If your SDK supports .responses.parse, this gives you a typed Pydantic result directly.
        response = client.responses.parse(
            model="gpt-4o-mini-2024-07-18",
            input=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            text_format=ResearchPaperExtraction,
            timeout=60,
        )
        parsed = response.output_parsed
        expected_return = float(parsed.expected_return)
        raw_obj = {"expected_return": expected_return}
        print("OK (responses.parse). expected_return =", expected_return)

    except AttributeError:
        # Fallback if your installed SDK version doesn't expose responses.parse:
        completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {"name": "ResearchPaperExtraction", "schema": schema, "strict": True},
            },
            timeout=60,
        )
        raw_obj = json.loads(completion.choices[0].message.content)
        expected_return = float(raw_obj["expected_return"])
        print("OK (chat.completions fallback). expected_return =", expected_return)

    # Save response (same structure you used, but only one ticker / one value)
    out = {
        ticker: {
            "ticker": ticker,
            "pct_change": pct_change,
            "expected_return": [expected_return],  # keep list to match your later pipeline
        }
    }
    out_path = f"responses/gpt_{month_start}_{month_end}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Saved response to: {out_path}")


def main():
    # Fail fast if API key is missing
    api_key = ""
    if not api_key:
        raise RuntimeError("Missing OPENAI_API_KEY env var. Set it before running.")

    client = OpenAI(api_key=api_key)

    # Create directories if they don't exist
    os.makedirs("yfinance", exist_ok=True)
    os.makedirs("responses", exist_ok=True)

    # Load your S&P 500 table (kept because you had it; not strictly needed for the smoke test)
    sp500_table = pd.read_csv("sp500-master/2024-sp500-ticker-list.csv")

    # Pick ONE month only (first month in your intended range)
    current_date = pd.Timestamp("2024-06-01")  # change if you want
    month_start = current_date.strftime("%Y-%m-%d")
    month_end = (current_date + pd.DateOffset(months=1) - timedelta(days=1)).strftime("%Y-%m-%d")

    print(f"Using month: {month_start} -> {month_end}")

    # Read prices data you previously saved
    data_path = f"yfinance/data_{month_start}_{month_end}.csv"
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"Missing {data_path}. Either generate it with yfinance or point to the right file."
        )

    data = pd.read_csv(data_path, index_col=0, parse_dates=True)

    # Compute returns
    returns = data.pct_change().iloc[1:]
    returns_path = f"yfinance/returns_{month_start}_{month_end}.csv"
    returns.to_csv(returns_path)
    print(f"Saved returns to: {returns_path}")

    # Pick ONE ticker only (first column)
    sp500_tickers = list(returns.columns)
    if not sp500_tickers:
        raise RuntimeError("No tickers found in returns file/columns.")

    ticker = sp500_tickers[0]
    pct_change = returns[ticker].dropna().astype(float).tolist()

    # Optional: reduce prompt size (keeps last ~21 trading days)
    pct_change = pct_change[-21:]

    system_prompt = make_system_prompt()
    user_prompt = make_user_prompt(ticker, pct_change)

    schema = ResearchPaperExtraction.model_json_schema()

    # ---- ONE API CALL ----
    # Preferred: Responses API + structured parsing when available.
    # Structured Outputs with json_schema is supported for gpt-4o-mini snapshots and later. :contentReference[oaicite:0]{index=0}
    # Responses API is the recommended API for new projects. :contentReference[oaicite:1]{index=1}
    try:
        # If your SDK supports .responses.parse, this gives you a typed Pydantic result directly.
        response = client.responses.parse(
            model="gpt-4o-mini-2024-07-18",
            input=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            text_format=ResearchPaperExtraction,
            timeout=60,
        )
        parsed = response.output_parsed
        expected_return = float(parsed.expected_return)
        raw_obj = {"expected_return": expected_return}
        print("OK (responses.parse). expected_return =", expected_return)

    except AttributeError:
        # Fallback if your installed SDK version doesn't expose responses.parse:
        completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {"name": "ResearchPaperExtraction", "schema": schema, "strict": True},
            },
            timeout=60,
        )
        raw_obj = json.loads(completion.choices[0].message.content)
        expected_return = float(raw_obj["expected_return"])
        print("OK (chat.completions fallback). expected_return =", expected_return)

    # Save response (same structure you used, but only one ticker / one value)
    out = {
        ticker: {
            "ticker": ticker,
            "pct_change": pct_change,
            "expected_return": [expected_return],  # keep list to match your later pipeline
        }
    }
    out_path = f"responses/gpt_{month_start}_{month_end}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Saved response to: {out_path}")

main()

Using month: 2024-06-01 -> 2024-06-30
Saved returns to: yfinance/returns_2024-06-01_2024-06-30.csv


  returns = data.pct_change().iloc[1:]


OK (responses.parse). expected_return = -0.9174411971347645
Saved response to: responses/gpt_2024-06-01_2024-06-30.json


In [6]:
import os
import pandas as pd
# Read prices data you previously saved
data_path = f"yfinance/data_2024-07-01_2024-07-31.csv"
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Missing {data_path}. Either generate it with yfinance or point to the right file."
    )

data = pd.read_csv(data_path, index_col=0, parse_dates=True)

# Compute returns
returns = data.pct_change().iloc[1:]
returns_path = data_path
returns.to_csv(returns_path)
print(f"Saved returns to: {returns_path}")


# Pick ONE ticker only (first column)
sp500_tickers = list(returns.columns)
if not sp500_tickers:
    raise RuntimeError("No tickers found in returns file/columns.")

ticker = sp500_tickers[0].replace("'", "")
pct_change = returns[ticker].dropna().astype(float).tolist()

# Optional: reduce prompt size (keeps last ~21 trading days)
pct_change = pct_change[-21:]

system_prompt = make_system_prompt()
user_prompt = make_user_prompt(ticker, pct_change)

Saved returns to: yfinance/data_2024-07-01_2024-07-31.csv


  returns = data.pct_change().iloc[1:]


In [8]:
data_path = f"yfinance/data_2024-07-01_2024-07-31.csv"
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Missing {data_path}. Either generate it with yfinance or point to the right file."
    )

data = pd.read_csv(data_path, index_col=0, parse_dates=True)

# Compute returns
returns = data.pct_change().iloc[1:]

  returns = data.pct_change().iloc[1:]


In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
# from scipy.optimize import minimize
from tqdm import tqdm
import json
import os
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm
import argparse
from datetime import datetime, timedelta
from keys import gpt_key
class ResearchPaperExtraction(BaseModel):
    expected_return: float

# make model_name arg parser
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="gpt")
args = parser.parse_args()

"""
model
"""
model_name = args.model_name
if model_name == 'gpt':
    client = OpenAI(
        api_key=gpt_key
    )

"""
data
"""
# Create date range from June 2024 to November 2024
date_range = pd.date_range(start="2024-06-01", end="2024-12-01", freq='MS')

# Create directories if they don't exist
os.makedirs('yfinance', exist_ok=True)
os.makedirs('responses', exist_ok=True)

# 1. S&P500 ticker data
# url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# tables = pd.read_html(url)[0]  # Wikipedia table data
# sp500_table = tables[['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry']]

sp500_table = pd.read_csv('yfinance/filtered_sp500_data.csv')[['tic', 'conm', 'gics', 'sic', 'stock_ret']]

for current_date in tqdm(date_range):
    month_start = current_date.strftime('%Y-%m-%d')
    month_end = (current_date + pd.DateOffset(months=1) - timedelta(days=1)).strftime('%Y-%m-%d')
    print(f"Processing data for {month_start} - {month_end}")
    
    # # Download data using yfinance
    # data = yf.download(sp500_table['Symbol'].tolist(), start=month_start, end=month_end)['Close']
    # Retrieve stock_ret from sp500_table for the given month
    mask = (sp500_table['date_key'] >= month_start) & (sp500_table['date_key'] <= month_end)
    data = sp500_table.loc[mask, ['tic', 'date_key', 'stock_ret']]

    # Save raw data
    data.to_csv(f'yfinance/v2_data_{month_start}_{month_end}.csv')
    
    # Process returns
    returns = data.pivot(index='date_key', columns='tic', values='stock_ret')
    returns.to_csv(f'yfinance/v2_returns_{month_start}_{month_end}.csv')
    sp500_tickers = returns.columns

    # Organize data
    data_dict = {}
    for ticker in sp500_tickers:
        data_dict[ticker] = {
            'ticker': ticker,
            'Security': sp500_table[sp500_table['tic'] == ticker]['conm'].values[0],
            'GICS Sector': sp500_table[sp500_table['tic'] == ticker]['gics'].values[0],
            'GICS Sub-Industry': sp500_table[sp500_table['tic'] == ticker]['sic'].values[0],
            'pct_change': returns[ticker].tolist()
        }

    def make_system_prompt():
        return """You are a language model designed to predict stock returns. Given a time-series of daily returns as percentage change from the past month, along with the following company information in a dictionary:

        - symbol: The stock symbol
        - company name: The name of the company
        - GICS sector: The Global Industry Classification Standard (GICS) sector
        - GICS sub-industry: The GICS sub-industry

        You should predict the average daily return as a percentage change for the next month. Return a single float value representing the predicted average daily return for the next month. Do not include any additional commentary, explanations, or information. Only output the float value.
        """

    def make_user_prompt(ticker, data_dict):
        return f"""Ticker: {ticker}
        Security: {data_dict[ticker]['Security']}
        Sector: {data_dict[ticker]['GICS Sector']}
        Sub-Industry: {data_dict[ticker]['GICS Sub-Industry']}
        pct_change: {data_dict[ticker]['pct_change']}"""

    for ticker in tqdm(sp500_tickers):
        print("="*80)
        print(ticker)
        print("="*80)
        security = data_dict[ticker]['Security']
        sector = data_dict[ticker]['GICS Sector']
        sub_industry = data_dict[ticker]['GICS Sub-Industry']
        pct_change = data_dict[ticker]['pct_change']
        system_prompt = make_system_prompt()
        user_prompt = make_user_prompt(ticker, data_dict)
        if model_name == 'gemma':   
            # concat system prompt and user prompt (cuz gemma does not support system prompt)
            user_prompt = system_prompt + "\n\n" + user_prompt

        # Get responses 30 times, and use the variance of those values as the confidence of the view.
        answers = []
        for _ in range(30):
            if model_name == 'gpt':
                completion = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt},
                    ],
                    extra_body={"guided_json": ResearchPaperExtraction.model_json_schema()}
                )
            try:
                expected_return_dict = eval(completion.choices[0].message.content)
                print(expected_return_dict)
            except:
                # print(completion.choices[0].message.content)
                continue
            answers.append(expected_return_dict['expected_return'])
        data_dict[ticker]['expected_return'] = answers

    # Save responses for this month
    with open(f'responses/{model_name}_{month_start}_{month_end}.json', 'w') as f:
        json.dump(data_dict, f)
