In [1]:
# File: generate_rag_text_all.py
import os
import re
import pandas as pd

base_path = "../../../datasets/processed"
tickers = ["AAPL", "TSLA", "GOOG", "MSFT", "AMZN", "NFLX", "NIO", "COIN"]

# Compiled once at module level instead of once per merged row.
# Matches segments shaped like:
#   [<Role> Analyst]: (action: <word>) <free text>(Confidence: <number>)
ANALYST_PATTERN = re.compile(
    r"\[(?P<role>.*?) Analyst\]: \(action: (?P<action>\w+)\) (?P<reasoning>.*?)\(Confidence: (?P<confidence>[\d.]+)\)",
    re.DOTALL,
)


def parse_analyst_reasoning(text):
    """Split a combined reasoning string into per-analyst fields.

    Args:
        text: The raw ``reasoning`` cell. Anything that is not a string
            (for example the NaN pandas produces for an empty CSV cell)
            is treated as "no analyst output".

    Returns:
        A dict with ``<role>_action``, ``<role>_confidence`` (float) and
        ``<role>_reasoning`` entries for every segment matched by
        ``ANALYST_PATTERN``; ``<role>`` is the lower-cased role name with
        spaces replaced by underscores. Empty when nothing matches.

    Raises:
        ValueError: If a matched confidence is not a valid float literal
            (the pattern allows digit/dot runs such as ``1.2.3``).
    """
    if not isinstance(text, str):
        return {}

    fields = {}
    for match in ANALYST_PATTERN.finditer(text):
        key = match.group("role").strip().lower().replace(" ", "_")
        fields[f"{key}_action"] = match.group("action").strip()
        fields[f"{key}_confidence"] = float(match.group("confidence").strip())
        fields[f"{key}_reasoning"] = match.group("reasoning").strip()
    return fields


def build_rag_text(row, ticker):
    """Render one merged row as the text block that is later embedded.

    Args:
        row: Mapping-like row (``pandas.Series`` or ``dict``). ``date`` is
            required; every other field is optional.
        ticker: Symbol written into the header line.

    Returns:
        The multi-line RAG text with surrounding whitespace stripped.
        Fields that are absent, ``None`` or NaN are rendered as an empty
        string rather than the literal text ``nan``.
    """

    def field(name):
        value = row.get(name, "")
        # Rows where an analyst was not matched carry NaN in that column.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return value

    lines = [
        f"Date: {row['date']} | Ticker: {ticker}",
        "",
        f"[Technical Analyst Reasoning]: {field('technical_reasoning')}",
        f"[Sentiment Analyst Reasoning]: {field('sentiment_reasoning')}",
        f"[Macro Analyst Reasoning]: {field('macro_reasoning')}",
        f"[Risk Analyst Reasoning]: {field('risk_reasoning')}",
        "",
        "[Market Info]:",
        f"- Open: {field('open')}, High: {field('high')}, Low: {field('low')}, Close: {field('close')}",
        f"- Volume: {field('volume')}, VIX: {field('vix')}, Turbulence: {field('turbulence')}",
        f"- Sentiment Summary: {field('news_summary')}",
        f"- Macro Summary: {field('macro_summary')}",
        f"- Risk Tag: {field('risk_tag')}",
    ]
    return "\n".join(lines).strip()


# For every ticker: join the market data with the agent analysis on date,
# expand the combined reasoning into per-analyst columns, and write the
# result (plus the rendered `rag_text` column) to rag_text.csv.
for ticker in tickers:
    print(f"\n[🔧] Processing {ticker}")
    data_path = os.path.join(base_path, ticker, "financial_with_news_macro_summary.csv")
    agent_path = os.path.join(base_path, ticker, "cio_analysis_results.csv")

    if not os.path.exists(data_path) or not os.path.exists(agent_path):
        print(f"[❌] Missing files for {ticker}, skipping.")
        continue

    df_original = pd.read_csv(data_path)
    df_agent = pd.read_csv(agent_path)

    # Reduce both sides to plain calendar dates so the join keys compare equal.
    df_original["date"] = pd.to_datetime(df_original["date"]).dt.date
    df_agent["date"] = pd.to_datetime(df_agent["date"]).dt.date
    df_agent = df_agent[["date", "reasoning"]]

    df_merged = pd.merge(df_original, df_agent, on="date", how="inner")
    if df_merged.empty:
        # Nothing to render; avoid building and saving a column-less frame.
        print(f"[⚠️] No overlapping dates for {ticker}, skipping.")
        continue

    # Parsed analyst fields override same-named source columns, if any.
    new_rows = [
        {**record, **parse_analyst_reasoning(record["reasoning"])}
        for record in df_merged.to_dict("records")
    ]
    df_with_roles = pd.DataFrame(new_rows)

    df_with_roles["rag_text"] = df_with_roles.apply(build_rag_text, axis=1, args=(ticker,))
    save_path = os.path.join(base_path, ticker, "rag_text.csv")
    df_with_roles.to_csv(save_path, index=False)
    print(f"[✅] RAG text saved to {save_path}")



[🔧] Processing AAPL
[✅] RAG text saved to ../../../datasets/processed/AAPL/rag_text.csv

[🔧] Processing TSLA
[✅] RAG text saved to ../../../datasets/processed/TSLA/rag_text.csv

[🔧] Processing GOOG
[✅] RAG text saved to ../../../datasets/processed/GOOG/rag_text.csv

[🔧] Processing MSFT
[✅] RAG text saved to ../../../datasets/processed/MSFT/rag_text.csv

[🔧] Processing AMZN
[✅] RAG text saved to ../../../datasets/processed/AMZN/rag_text.csv

[🔧] Processing NFLX
[✅] RAG text saved to ../../../datasets/processed/NFLX/rag_text.csv

[🔧] Processing NIO
[✅] RAG text saved to ../../../datasets/processed/NIO/rag_text.csv

[🔧] Processing COIN
[✅] RAG text saved to ../../../datasets/processed/COIN/rag_text.csv


In [2]:
# File: embed_rag_text_all.py

import os
import pandas as pd
import time
from tqdm import tqdm
from openai import OpenAI
import my_config

# Dataset root and the ticker sub-folders processed by this cell.
base_path = "../../../datasets/processed"
tickers = ["AAPL", "TSLA", "GOOG", "MSFT", "AMZN", "NFLX", "NIO", "COIN"]

# Initialise the OpenAI client; the API key is read from my_config.
client = OpenAI(api_key=my_config.api_key, base_url="https://api.openai.com/v1")

def get_embedding(text, max_retries=3, retry_delay=2.0):
    """Request an embedding vector for ``text`` with simple retries.

    Args:
        text: Text to embed. Non-string values and blank strings are
            skipped without calling the API.
        max_retries: Maximum number of API attempts.
        retry_delay: Seconds to wait between consecutive attempts.

    Returns:
        The embedding taken from the first item of the API response, or
        ``None`` when the input is skipped or every attempt failed.
    """
    if not isinstance(text, str) or not text.strip():
        return None  # skip empty text

    cleaned = text.strip()
    for attempt in range(1, max_retries + 1):
        try:
            resp = client.embeddings.create(
                model="text-embedding-ada-002",  # can be swapped for text-embedding-3-small
                input=cleaned,
            )
            return resp.data[0].embedding
        except Exception as e:
            # Best-effort boundary: any failure is reported and retried so a
            # single bad row cannot abort the whole batch.
            print(f"[⚠️ Error] Attempt {attempt}: {e}")
            # No point in waiting once the last attempt has been used up.
            if attempt < max_retries:
                time.sleep(retry_delay)
    return None  # still failing after all retries

# Walk every ticker folder and embed its RAG text.
for ticker in tickers:
    print(f"\n[🔧] Embedding for {ticker}")
    rag_path = os.path.join(base_path, ticker, "rag_text.csv")
    output_path = os.path.join(base_path, ticker, "rag_text_with_embedding.csv")

    if not os.path.exists(rag_path):
        print(f"[❌] RAG text not found for {ticker}")
        continue

    # Resume support: the output file is only written after a ticker has been
    # embedded, so its presence means the (slow, paid) API calls for this
    # ticker can be skipped. Delete the file to force re-embedding.
    if os.path.exists(output_path):
        print(f"[⏭️] Already embedded: {output_path}")
        continue

    df = pd.read_csv(rag_path)

    if "embedding" in df.columns:
        print(f"[⏭️] Already embedded: {output_path}")
        continue

    tqdm.pandas(desc=f"Embedding {ticker}")
    df["embedding"] = df["rag_text"].progress_apply(get_embedding)

    # Rows whose embedding could not be obtained are dropped; say how many so
    # the loss is visible instead of silent.
    failed = int(df["embedding"].isna().sum())
    if failed:
        print(f"[⚠️] Dropping {failed} row(s) without embedding for {ticker}")
    df = df.dropna(subset=["embedding"])

    df.to_csv(output_path, index=False)
    print(f"[✅] Saved to {output_path}")


KeyboardInterrupt: 

In [None]:
# File: rag_inference_all.py
import os
import json
import faiss
import numpy as np
import pandas as pd
from datetime import timedelta
from tqdm import tqdm
from openai import OpenAI
import my_config
from tools import function_schema

# Dataset root and the ticker sub-folders processed by this cell.
base_path = "../../../datasets/processed"
tickers = ["AAPL", "TSLA", "GOOG", "MSFT", "AMZN", "NFLX", "NIO", "COIN"]

# Chat client pointed at the DeepSeek endpoint; the API key is read from my_config.
client = OpenAI(api_key=my_config.api_key, base_url="https://api.deepseek.com")

def generate_suggestion(target_row, df_range, top_k=5):
    """Ask the chat model for a decision using FAISS-retrieved analogs.

    The embedding of ``target_row`` is searched against the embeddings in
    ``df_range`` (exact L2 search); the ``rag_text`` of the nearest rows is
    placed in the prompt as historical context.

    Args:
        target_row: Row providing ``embedding``, ``rag_text`` and ``date``.
        df_range: Candidate rows providing ``embedding``, ``rag_text`` and a
            datetime-like ``date`` column. It is not modified.
        top_k: Maximum number of similar rows to retrieve.

    Returns:
        The dict decoded from the arguments of the first tool call, or a
        fallback "hold" dict when the model returned no tool call.

    Raises:
        ValueError: If ``df_range`` has no rows, or an embedding string is
            not a valid Python literal.
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
    """
    # Local import keeps this function self-contained within the cell.
    import ast

    def parse_embedding(raw):
        # Embeddings read back from CSV are the text form of a list.
        # literal_eval only accepts literals, so unlike eval() it cannot
        # execute code that ends up in the file.
        if isinstance(raw, str):
            raw = ast.literal_eval(raw)
        return np.asarray(raw, dtype="float32")

    if len(df_range) == 0:
        raise ValueError("df_range must contain at least one row")

    query_vec = parse_embedding(target_row["embedding"]).reshape(1, -1)
    # Parse into a local matrix rather than writing back into df_range, which
    # is normally a slice of the caller's frame.
    corpus = np.ascontiguousarray(
        np.vstack([parse_embedding(raw) for raw in df_range["embedding"]])
    )

    index = faiss.IndexFlatL2(corpus.shape[1])
    index.add(corpus)

    # Asking for more neighbours than there are rows makes FAISS pad the
    # result with -1, which iloc would treat as "last row". Cap k and drop any
    # padding that still appears.
    k = min(top_k, len(df_range))
    distances, indices = index.search(query_vec, k)
    similar_contexts = "\n\n".join(
        df_range.iloc[i]["rag_text"] for i in indices[0] if i >= 0
    )
    current_context = target_row["rag_text"]
    start_date = df_range["date"].min().strftime("%Y-%m-%d")
    end_date = df_range["date"].max().strftime("%Y-%m-%d")

    prompt = (
        f"You are acting as a Chief Investment Officer (CIO) at a top-tier financial institution.\n"
        f"Your role is to make informed and professional investment decisions based on current market data and historical analogs.\n\n"
        f"**Current Date**: {target_row['date']}\n\n"
        f"**Current Market Context Summary (from RAG text)**:\n"
        f"{current_context}\n\n"
        f"**Most Relevant Historical Market Cases (from FAISS-matched RAG texts between {start_date} and {end_date})**:\n"
        f"{similar_contexts}\n\n"
        f"Please follow these steps:\n"
        f"1. Carefully analyze the **Current Market Context**, extracting key financial signals, trends, risks, or sentiments.\n"
        f"2. Compare these elements with the **Historical Cases** provided.\n"
        f"3. Identify patterns, analogies, and lessons from history that can help assess the current situation.\n"
        f"4. Formulate a reasoned investment decision based on your analysis.\n\n"
        f"Output Format:\n"
        f"- `action`: one of ['buy', 'sell', 'hold']\n"
        f"- `confidence`: float between 0 and 1, indicating how strongly you support this action\n"
        f"- `reason`: a concise but professional justification based on evidence from both current and historical contexts\n\n"
        f"\"Respond by calling the 'stock_decision' function according to the schema.\""
    )

    # Force the model to answer through the first declared function when a
    # schema is available; otherwise let it choose freely.
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a financial CIO giving investment decisions."},
            {"role": "user", "content": prompt}
        ],
        tools=[{
            "type": "function",
            "function": function_schema[0]
        }] if function_schema else [],

        tool_choice={"type": "function", "function": {"name": function_schema[0]["name"]}} if function_schema else "auto"
    )

    message = response.choices[0].message

    if message.tool_calls:
        tool_call = message.tool_calls[0]
        args = tool_call.function.arguments
        return json.loads(args)

    # The message below reports that tool_calls was empty, i.e. no function
    # call was triggered.
    print("[⚠️ tool_calls 为空，未触发函数调用]")
    # NOTE(review): the prompt asks the model for a `reason` field while this
    # fallback uses `reasoning`; confirm which key the schema in `tools`
    # defines so the saved CSV does not end up with two separate columns.
    return {
        "action": "hold",
        "confidence": 0.5,
        "reasoning": "Model did not return tool call"
    }

# For every ticker: walk its dates in order, retrieve historical context for
# each date and collect the model's suggestion into rag_agent_suggestions.csv.
for ticker in tickers:
    print(f"\n[🧠] RAG inference for {ticker}")
    df_path = os.path.join(base_path, ticker, "rag_text_with_embedding.csv")
    output_path = os.path.join(base_path, ticker, "rag_agent_suggestions.csv")

    if not os.path.exists(df_path):
        print(f"[❌] Missing embedding file: {df_path}")
        continue

    df = pd.read_csv(df_path)
    df["date"] = pd.to_datetime(df["date"])
    # Depending on the pandas version, Series.unique() may return raw numpy
    # datetime64 values, which have neither .date() nor .strftime(). Going
    # through DatetimeIndex guarantees pandas Timestamps in every version.
    all_dates = sorted(pd.DatetimeIndex(df["date"].dropna().unique()))

    train_start = pd.to_datetime("2022-01-03")
    train_end = pd.to_datetime("2022-10-04")

    results = []

    for current_date in tqdm(all_dates):
        # First row for this date; always present because the date was taken
        # from this same frame.
        row = df[df["date"] == current_date].iloc[0]

        if current_date <= train_end:
            # NOTE(review): inside the training window the retrieval pool is
            # the whole window, so it contains the current row itself and
            # later dates — confirm this in-sample look-ahead is intended.
            in_range = (df["date"] >= train_start) & (df["date"] <= train_end)
        else:
            # Past the training window only strictly earlier dates are used.
            in_range = (df["date"] >= train_start) & (df["date"] < current_date)

        # Copy so the helper works on its own frame and cannot write into a
        # slice of `df`.
        df_range = df[in_range].copy()

        if df_range.empty:
            print(f"[⏭️] No context for {current_date.date()}")
            continue

        try:
            suggestion = generate_suggestion(row, df_range)
            results.append({
                "date": current_date.strftime("%Y-%m-%d"),
                **suggestion
            })
        except Exception as e:
            # Top-level boundary: report the failing date and keep going so
            # one bad request does not discard the rest of the run.
            print(f"[❌] Error on {current_date.date()}: {e}")
            continue

    pd.DataFrame(results).to_csv(output_path, index=False)
    print(f"[✅] Saved: {output_path}")
