# Instruction

1. Enter your base URL, API key, and the name of the model you want to use in the "Basic setting" block, then run that block.

2. Select the data combination you want to use (raw, indicator, news, or fake news), find the corresponding block, and change the input and output paths in it to your own.

3. Run the block. 

## Basic setting

In [None]:
from openai import OpenAI
import pandas as pd
import csv
import io
import os
import time
from tenacity import retry, stop_after_attempt, wait_exponential

# Connection settings: replace each placeholder string with your own value.
BASE_URL = "Enter your base url here"
API_KEY = "Enter your api key here"
MODEL_NAME = "Enter your model name here"

# Build the client object that the forecasting cells pass to the API helpers.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)


## Raw data

In [None]:

# Raw-data forecasting: for every CSV in the input folder, send the price
# history to the model, parse the CSV it returns, and write all forecasts to
# one combined output file.

# input and output paths
input_folder = "raw_data"
output_file = "raw_forecasts_claude-sonnet-4.5.csv"

csv_files = [f for f in os.listdir(input_folder) if f.endswith(".csv")]
all_predictions = []

for file_name in csv_files:
    stock_name = os.path.splitext(file_name)[0]  # assume file name is stock name
    file_path = os.path.join(input_folder, file_name)

    print(f"\nüìà Processing {stock_name}...")

    try:
        # read historical data
        df = pd.read_csv(file_path)

        # the whole table is embedded in the prompt as CSV text
        data_str = df.to_csv(index=False)

        prompt = f"""
        You are a financial data analyst.
        You are given the past 1-year daily stock price data of {stock_name} in CSV format below.
        Please perform the following tasks, do not include any explanations or extra text ‚Äî only output the CSV
        
        Tasks:
        1. Analyze the trend and volatility of the data.
        2. Predict the **closing price** of {stock_name} for 3, 5, and 10 trading days after the last date in the dataset, the exact dates are 2025/11/12, 2025/11/14, and 2025/11/21.
        3. Return your predictions **strictly as a CSV table** with columns:
           Date, Predicted_Close
        4. Do not include any explanations or extra text ‚Äî only output the CSV.
        
        The data columns are:
        - Date: Trading date
        - Open: Opening price
        - High: Highest price
        - Low: Lowest price
        - Close: Closing price
        - Volume: Trading volume

        Here is the historical data:
        {data_str}
        """

        # call the OpenAI API
        response = client.chat.completions.create(
            # Use the configured model name, not the literal string "MODEL_NAME".
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a professional financial analyst skilled in stock trend forecasting."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.4
        )
        csv_output = response.choices[0].message.content.strip()

        # Parsing failures are reported separately from request failures so the
        # offending model output can be shown.
        try:
            pred_df = pd.read_csv(io.StringIO(csv_output))
            if not {"Date", "Predicted_Close"}.issubset(pred_df.columns):
                raise ValueError("Missing required columns in GPT output.")
            pred_df["Stock"] = stock_name 
            all_predictions.append(pred_df)
            print(f"‚úÖ {stock_name} prediction completed.")
        except Exception as e:
            print(f"‚ö†Ô∏è {stock_name} prediction failed: invalid CSV format. Error: {e}")
            print("Model output was:\n", csv_output[:300], "...\n")

    except Exception as e:
        print(f"‚ùå Failed to process {stock_name}: {e}")

# combine all predictions and save to CSV
if all_predictions:
    final_df = pd.concat(all_predictions, ignore_index=True)
    final_df = final_df[["Stock", "Date", "Predicted_Close"]]
    final_df.to_csv(output_file, index=False)
    print(f"\n‚úÖ All forecasts saved to {output_file}")
else:
    print("\n‚ùå No valid forecasts generated.")

## Indicator data

In [None]:
# Folder holding one input CSV per stock, and the combined forecast file to write.
input_folder = "indicator_data"
output_file = "indicator_forecasts_claude-sonnet-4.5_sub.csv"

# Every *.csv entry in the input folder is treated as one stock to process.
csv_files = [name for name in os.listdir(input_folder) if name.endswith(".csv")]
# Collects one forecast DataFrame per successfully processed stock.
all_predictions = list()

def validate_prediction_result(csv_output, stock_name):
    """Parse the model's reply into a validated 3-row forecast table.

    Args:
        csv_output: Raw text returned by the model; expected to contain a CSV
            table with ``Date`` and ``Predicted_Close`` columns.
        stock_name: Value written to the added ``Stock`` column.

    Returns:
        A ``(pred_df, message)`` pair. On success ``pred_df`` is the parsed
        DataFrame with a ``Stock`` column added and ``message`` is
        ``"Verified"``. On any failure ``pred_df`` is ``None`` and ``message``
        describes the problem.
    """
    expected_dates = ['2025/11/12', '2025/11/14', '2025/11/21']

    try:
        # Keep header/data rows only; skip blank lines and Markdown code fences.
        csv_lines = []
        for raw_line in csv_output.split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('```'):
                continue
            if line.startswith('Date') or ',' in line:
                csv_lines.append(line)

        if not csv_lines:
            # Always return a 2-tuple so callers can unpack the result.
            return None, "No CSV content found in model output"

        pred_df = pd.read_csv(io.StringIO('\n'.join(csv_lines)), skipinitialspace=True)
        pred_df.columns = [str(col).strip() for col in pred_df.columns]

        # check number of columns
        if not {"Date", "Predicted_Close"}.issubset(pred_df.columns):
            if len(pred_df.columns) >= 2:
                # Treat the first two columns as Date / Predicted_Close; extra
                # columns are dropped so the rename cannot fail on length.
                pred_df = pred_df.iloc[:, :2].copy()
                pred_df.columns = ['Date', 'Predicted_Close']
            else:
                return None, f"Lack of columns, expect 2 columns, get {len(pred_df.columns)} columns"

        # check number of rows
        if len(pred_df) != 3:
            return None, f"Unexpected prediction number, expect 3, get {len(pred_df)}"

        # check specific dates (compared as dates, so 2025-11-12 matches 2025/11/12)
        actual_dates = pred_df['Date'].astype(str).str.strip().tolist()
        parsed_dates = {pd.to_datetime(d, errors='coerce') for d in actual_dates}
        if parsed_dates != {pd.Timestamp(d) for d in expected_dates}:
            return None, f"Unexpected prediction dates, expect {expected_dates}, get {actual_dates}"

        pred_df["Stock"] = stock_name

        return pred_df, "Verified"

    except Exception as e:
        return None, f"CSV read failed: {e}"

def predict_stock_with_retry(client, stock_name, df, max_retries=3):
    """Request a closing-price forecast from the model, retrying on failure.

    Args:
        client: Client object exposing ``chat.completions.create``.
        stock_name: Stock identifier inserted into the prompt and the result.
        df: Historical price and indicator data; sent to the model as CSV text.
        max_retries: Maximum number of request/validation attempts.

    Returns:
        The validated forecast DataFrame, or ``None`` if every attempt failed
        (request error or output rejected by ``validate_prediction_result``).
    """
    data_str = df.to_csv(index=False)
    
    for attempt in range(max_retries):
        try:
            print(f"  Prediction attempt {attempt + 1}/{max_retries}")
            
            prompt = f"""
            You are a financial data analyst.
            You are given the past 1-year daily stock price data and financial indicators of {stock_name} in CSV format below.
            Please perform the following tasks, do not include any explanations or extra text ‚Äî only output the CSV
            
            Tasks:
            1. Analyze the trend and volatility of the data.
            2. Predict the **closing price** of {stock_name} for 3, 5, and 10 trading days after the last date in the dataset, the exact dates are 2025/11/12, 2025/11/14, and 2025/11/21.
            3. Return your predictions **strictly as a CSV table** with columns:
               Date, Predicted_Close
            4. Do not include any explanations or extra text ‚Äî only output the CSV.
            
            The primary data includes:
            - Date: Trading date
            - Open: Opening price
            - High: Highest price
            - Low: Lowest price
            - Close: Closing price
            - Volume: Trading volume
            
            The financial indicators are:
            - Return: Daily return percentage
            - LogReturn: Daily log return percentage
            - SMA3, SMA5, SMA10, SMA30: 3, 5, 10, 30-day simple moving averages
            - RSI: 14-day relative strength index
            - MACD: Moving average convergence divergence
            - MACD_Signal: MACD signal line
            - BB-high, BB-low: Bollinger Bands high and low

            Here is the historical data and financial indicators:
            {data_str}
            """

            # Call API
            response = client.chat.completions.create(
                # Use the configured model name, not the literal string "MODEL_NAME".
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a professional financial analyst skilled in stock trend forecasting."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.4
            )

            # Get CSV output
            csv_output = response.choices[0].message.content.strip()
            print(f"  API response length: {len(csv_output)}")

            pred_df, validation_msg = validate_prediction_result(csv_output, stock_name)
            
            if pred_df is not None:
                print(f"‚úÖ {stock_name} verification passed.")
                return pred_df
            else:
                # Show the rejected output so the failure can be diagnosed.
                print(csv_output)
                print(f"‚ö†Ô∏è {stock_name} verification failed: {validation_msg}")
                if attempt < max_retries - 1:
                    print("  Retry waiting...")
                    time.sleep(2)
                else:
                    print(f"üí• {stock_name} all retries failed")
                    
        except Exception as e:
            # Request/parsing errors count as a failed attempt; pause before the next one.
            print(f"‚ö†Ô∏è {stock_name} No. {attempt + 1} retry failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
    
    return None

print(f"Find {len(csv_files)} stock files to process.")

# Names of the stocks that produced a forecast / that were skipped or failed.
processed_stocks = []
failed_stocks = []

for file_name in csv_files:
    # The file name without its extension is used as the stock name.
    stock_name = os.path.splitext(file_name)[0]
    file_path = os.path.join(input_folder, file_name)

    print(f"\nüìà Processing {stock_name}...")

    try:
        # Load this stock's history.
        df = pd.read_csv(file_path)
        print(f"  data shape: {df.shape}")

        # Fewer than 10 rows: skip the stock (and the delay below).
        if len(df) < 10:
            print(f"‚ö†Ô∏è {stock_name} no enough data, skipping")
            failed_stocks.append(stock_name)
            continue

        pred_df = predict_stock_with_retry(client, stock_name, df, max_retries=3)

        if pred_df is None:
            print(f"üí• {stock_name} prediction failed, skipping")
            failed_stocks.append(stock_name)
        else:
            all_predictions.append(pred_df)
            processed_stocks.append(stock_name)
            print(f"‚úÖ {stock_name} prediction success, prediction points: {len(pred_df)}")

    except Exception as e:
        print(f"‚ùå Process {stock_name} failed: {e}")
        failed_stocks.append(stock_name)

    # Pause between stocks to stay under API rate limits.
    time.sleep(2)

# Merge the per-stock forecasts and write them out, or report that none exist.
if not all_predictions:
    print("\n‚ùå No valid forecasts generated.")
else:
    final_df = pd.concat(all_predictions, ignore_index=True)
    final_df = final_df[["Stock", "Date", "Predicted_Close"]]
    final_df.to_csv(output_file, index=False)

    success_stocks = final_df['Stock'].nunique()
    total_predictions = len(final_df)

    print(f"\nüéâ Prediction finished!")
    print(f"‚úÖ Success prediction: {success_stocks}/{len(csv_files)} stocks")
    print(f"üìä Total prediction number: {total_predictions} records")
    print(f"üíæ Saved to: {output_file}")

## News data

In [None]:

# Input folders and the combined forecast file to write.
input_folder = "raw_data"  # historical stock price data file folder
news_folder = "news_data"  # news data file folder
output_file = "news_forecasts_claude_sub.csv"

# Every *.csv entry in the input folder is treated as one stock to process.
csv_files = [name for name in os.listdir(input_folder) if name.endswith(".csv")]
# Collects one forecast DataFrame per successfully processed stock.
all_predictions = list()


def load_news_data(stock_name):
    """Read ``<news_folder>/<stock_name>_news.csv`` into a DataFrame.

    Returns the DataFrame, or ``None`` when the file does not exist or cannot
    be read (a warning is printed in both cases).
    """
    news_file = os.path.join(news_folder, f"{stock_name}_news.csv")

    if os.path.exists(news_file):
        try:
            news_df = pd.read_csv(news_file)
        except Exception as e:
            print(f"‚ö†Ô∏è read news file failed: {e}")
            return None
        else:
            print(f"  Load news data: {len(news_df)} news")
            return news_df

    print(f"‚ö†Ô∏è Cannot find news file: {news_file}")
    return None

def format_news_for_prompt(news_df):
    """Convert news DataFrame to formatted string for prompt.

    Args:
        news_df: DataFrame of news items, or ``None``. The ``title``,
            ``source``, ``date`` and ``content`` columns are read when present.

    Returns:
        One text section per row, numbered from 1 in row order and each
        followed by a 50-dash rule; ``"No relevant news data."`` when
        ``news_df`` is ``None`` or empty.
    """
    if news_df is None or len(news_df) == 0:
        return "No relevant news data."

    def field(row, key):
        # Missing columns and empty (NaN/None) cells are both shown as 'N/A'.
        value = row.get(key, 'N/A')
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'N/A'
        return value

    news_content = []
    # Number items by position: index labels need not be the integers 0..n-1.
    for position, (_, row) in enumerate(news_df.iterrows(), start=1):
        news_item = f"News {position}:\n"
        news_item += f"Title: {field(row, 'title')}\n"
        news_item += f"Source: {field(row, 'source')}\n"
        news_item += f"Date: {field(row, 'date')}\n"
        news_item += f"Content: {field(row, 'content')}...\n"
        news_item += "-" * 50
        news_content.append(news_item)

    return "\n".join(news_content)

# retry mechanism for API requests
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def make_api_request(client, prompt):
    """Send one chat-completion request and return the response object.

    The ``retry`` decorator re-invokes this function when it raises, for at
    most 3 attempts, waiting between attempts.

    Args:
        client: Client object exposing ``chat.completions.create``.
        prompt: Text sent as the user message.

    Raises:
        Whatever exception the underlying request raised; it is logged here
        and re-raised so the decorator can decide whether to retry.
    """
    try:
        response = client.chat.completions.create(
            # Use the configured model name, not the literal string "MODEL_NAME".
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a professional financial analyst skilled in stock trend forecasting."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.4,
            timeout=60
        )
        return response
    except Exception as e:
        print(f"‚ö†Ô∏è API request failed: {e}")
        raise

def validate_prediction_result(csv_output, stock_name):
    """Parse the model's reply into a validated 3-row forecast table.

    Args:
        csv_output: Raw text returned by the model; expected to contain a CSV
            table with ``Date`` and ``Predicted_Close`` columns.
        stock_name: Value written to the added ``Stock`` column.

    Returns:
        A ``(pred_df, message)`` pair. On success ``pred_df`` is the parsed
        DataFrame with a ``Stock`` column added and ``message`` is
        ``"Verified"``. On any failure ``pred_df`` is ``None`` and ``message``
        describes the problem.
    """
    expected_dates = ['2025/11/12', '2025/11/14', '2025/11/21']

    try:
        # Keep header/data rows only; skip blank lines and Markdown code fences.
        csv_lines = []
        for raw_line in csv_output.split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('```'):
                continue
            if line.startswith('Date') or ',' in line:
                csv_lines.append(line)

        if not csv_lines:
            # Always return a 2-tuple so callers can unpack the result.
            return None, "No CSV content found in model output"

        pred_df = pd.read_csv(io.StringIO('\n'.join(csv_lines)), skipinitialspace=True)
        pred_df.columns = [str(col).strip() for col in pred_df.columns]

        # check number of columns
        if not {"Date", "Predicted_Close"}.issubset(pred_df.columns):
            if len(pred_df.columns) >= 2:
                # Treat the first two columns as Date / Predicted_Close; extra
                # columns are dropped so the rename cannot fail on length.
                pred_df = pred_df.iloc[:, :2].copy()
                pred_df.columns = ['Date', 'Predicted_Close']
            else:
                return None, f"Lack of columns, expect 2 columns, get {len(pred_df.columns)} columns"

        # check number of rows
        if len(pred_df) != 3:
            return None, f"Unexpected prediction number, expect 3, get {len(pred_df)}"

        # check specific dates (compared as dates, so 2025-11-12 matches 2025/11/12)
        actual_dates = pred_df['Date'].astype(str).str.strip().tolist()
        parsed_dates = {pd.to_datetime(d, errors='coerce') for d in actual_dates}
        if parsed_dates != {pd.Timestamp(d) for d in expected_dates}:
            return None, f"Unexpected prediction dates, expect {expected_dates}, get {actual_dates}"

        pred_df["Stock"] = stock_name

        return pred_df, "Verified"

    except Exception as e:
        return None, f"CSV read failed: {e}"

def predict_stock_with_retry(client, stock_name, df, news_content, max_retries=3):
    """Request a news-aware forecast for one stock, retrying on failure.

    Builds a prompt from the price history (as CSV text) and the formatted
    news text, sends it through ``make_api_request`` and checks the reply
    with ``validate_prediction_result``. Returns the validated DataFrame, or
    ``None`` once ``max_retries`` attempts have failed.
    """
    price_csv = df.to_csv(index=False)

    for attempt in range(max_retries):
        # True while at least one more attempt remains after this one.
        retry_left = attempt < max_retries - 1
        try:
            print(f"  Prediction attempt {attempt + 1}/{max_retries}")

            # Prompt combining the news text with the price history.
            prompt = f"""
            You are a financial data analyst.
            You are given two types of data for {stock_name}:

            1. PAST 1-YEAR STOCK PRICE DATA (in CSV format):
            - Date: Trading date
            - Open: Opening price
            - High: Highest price
            - Low: Lowest price
            - Close: Closing price
            - Volume: Trading volume

            2. RECENT NEWS DATA:
            {news_content}

            TASKS:
            1. Analyze both the historical price trend/volatility AND the information from recent news
            2. Consider how the news might impact future stock performance
            3. Predict the closing price of {stock_name} for 3, 5, and 10 trading days after the last date in the dataset
               Exact prediction dates: 2025/11/12, 2025/11/14, 2025/11/21
            4. Return your predictions strictly as a CSV table with columns:
               Date, Predicted_Close

            IMPORTANT:
            - Combine both technical analysis (price data) and fundamental analysis (news)
            - Do not include any explanations or extra text ‚Äî only output the CSV
            - Ensure the output has exactly 3 rows for the 3 prediction dates
            - Format must be valid CSV with headers: Date, Predicted_Close

            HISTORICAL PRICE DATA:
            {price_csv}
            """

            # Send the request and pull the text out of the first choice.
            reply = make_api_request(client, prompt)
            csv_output = reply.choices[0].message.content.strip()
            print(f"  API response length: {len(csv_output)}")

            # Accept the reply only if it validates.
            pred_df, validation_msg = validate_prediction_result(csv_output, stock_name)
            if pred_df is not None:
                print(f"‚úÖ {stock_name} verification passed.")
                return pred_df

            print(f"‚ö†Ô∏è {stock_name} verification failed: {validation_msg}")
            print(csv_output)
            if retry_left:
                print("  Waiting for retry...")
                time.sleep(2)
            else:
                print(f"üí• {stock_name} all retries failed")

        except Exception as e:
            print(f"‚ö†Ô∏è {stock_name} No. {attempt + 1} attempt failed: {e}")
            if retry_left:
                time.sleep(2)

    return None


print(f"Find {len(csv_files)} stock files to process.")

# record processed and failed stocks
processed_stocks = []
failed_stocks = []

for file_name in csv_files:
    stock_name = os.path.splitext(file_name)[0]  # assume file name is stock name
    file_path = os.path.join(input_folder, file_name)

    print(f"\nüìà Processing {stock_name}...")

    try:
        # read historical data
        df = pd.read_csv(file_path)
        print(f"  data shape: {df.shape}")

        # check data sufficiency
        if len(df) < 10:
            print(f"‚ö†Ô∏è {stock_name} no enough data, skipping")
            failed_stocks.append(stock_name)
            continue

        # Load and format this stock's news; a missing or unreadable news file
        # yields the "No relevant news data." placeholder instead of failing.
        news_df = load_news_data(stock_name)
        news_content = format_news_for_prompt(news_df)

        # news_content is a required argument of predict_stock_with_retry.
        pred_df = predict_stock_with_retry(client, stock_name, df, news_content, max_retries=3)
        
        if pred_df is not None:
            all_predictions.append(pred_df)
            processed_stocks.append(stock_name)
            print(f"‚úÖ {stock_name} prediction success, prediction points: {len(pred_df)}")
        else:
            print(f"üí• {stock_name} prediction failed, skipping")
            failed_stocks.append(stock_name)

    except Exception as e:
        print(f"‚ùå Process {stock_name} failed: {e}")
        failed_stocks.append(stock_name)
    
    # add delay between requests to avoid rate limits
    time.sleep(2)

# combine all predictions and save to CSV
if all_predictions:
    final_df = pd.concat(all_predictions, ignore_index=True)
    final_df = final_df[["Stock", "Date", "Predicted_Close"]]
    final_df.to_csv(output_file, index=False)

    success_stocks = final_df['Stock'].nunique()
    total_predictions = len(final_df)
    
    print(f"\nüéâ Prediction finished!")
    print(f"‚úÖ Success prediction: {success_stocks}/{len(csv_files)} stocks")
    print(f"üìä Total prediction number: {total_predictions} records")
    print(f"üíæ Saved to: {output_file}")
    
else:
    print("\n‚ùå No valid forecasts generated.")

## Fake news data

In [None]:
# Input folders and the combined forecast file to write.
input_folder = "raw_data"  # historical stock price data file folder
news_folder = "fake_news_data"  # FAKE news data file folder
output_file = "fake_news_forecasts_claude.csv"

# Every *.csv entry in the input folder is treated as one stock to process.
csv_files = [name for name in os.listdir(input_folder) if name.endswith(".csv")]
# Collects one forecast DataFrame per successfully processed stock.
all_predictions = list()


def load_news_data(stock_name):
    """Read ``<news_folder>/<stock_name>_news.csv`` into a DataFrame.

    Returns the DataFrame, or ``None`` when the file does not exist or cannot
    be read (a warning is printed in both cases).
    """
    news_file = os.path.join(news_folder, f"{stock_name}_news.csv")

    if os.path.exists(news_file):
        try:
            news_df = pd.read_csv(news_file)
        except Exception as e:
            print(f"‚ö†Ô∏è read news file failed: {e}")
            return None
        else:
            print(f"  Load news data: {len(news_df)} news")
            return news_df

    print(f"‚ö†Ô∏è Cannot find news file: {news_file}")
    return None

def format_news_for_prompt(news_df):
    """Convert news DataFrame to formatted string for prompt.

    Args:
        news_df: DataFrame of news items, or ``None``. The ``title``,
            ``source``, ``date`` and ``content`` columns are read when present.

    Returns:
        One text section per row, numbered from 1 in row order and each
        followed by a 50-dash rule; ``"No relevant news data."`` when
        ``news_df`` is ``None`` or empty.
    """
    if news_df is None or len(news_df) == 0:
        return "No relevant news data."

    def field(row, key):
        # Missing columns and empty (NaN/None) cells are both shown as 'N/A'.
        value = row.get(key, 'N/A')
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'N/A'
        return value

    news_content = []
    # Number items by position: index labels need not be the integers 0..n-1.
    for position, (_, row) in enumerate(news_df.iterrows(), start=1):
        news_item = f"News {position}:\n"
        news_item += f"Title: {field(row, 'title')}\n"
        news_item += f"Source: {field(row, 'source')}\n"
        news_item += f"Date: {field(row, 'date')}\n"
        news_item += f"Content: {field(row, 'content')}...\n"
        news_item += "-" * 50
        news_content.append(news_item)

    return "\n".join(news_content)

# retry mechanism for API requests
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def make_api_request(client, prompt):
    """Send one chat-completion request and return the response object.

    The ``retry`` decorator re-invokes this function when it raises, for at
    most 3 attempts, waiting between attempts.

    Args:
        client: Client object exposing ``chat.completions.create``.
        prompt: Text sent as the user message.

    Raises:
        Whatever exception the underlying request raised; it is logged here
        and re-raised so the decorator can decide whether to retry.
    """
    try:
        response = client.chat.completions.create(
            # Use the configured model name, not the literal string "MODEL_NAME".
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a professional financial analyst skilled in stock trend forecasting."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.4,
            timeout=60
        )
        return response
    except Exception as e:
        print(f"‚ö†Ô∏è API request failed: {e}")
        raise

def validate_prediction_result(csv_output, stock_name):
    """Parse the model's reply into a validated 3-row forecast table.

    Args:
        csv_output: Raw text returned by the model; expected to contain a CSV
            table with ``Date`` and ``Predicted_Close`` columns.
        stock_name: Value written to the added ``Stock`` column.

    Returns:
        A ``(pred_df, message)`` pair. On success ``pred_df`` is the parsed
        DataFrame with a ``Stock`` column added and ``message`` is
        ``"Verified"``. On any failure ``pred_df`` is ``None`` and ``message``
        describes the problem.
    """
    expected_dates = ['2025/11/12', '2025/11/14', '2025/11/21']

    try:
        # Keep header/data rows only; skip blank lines and Markdown code fences.
        csv_lines = []
        for raw_line in csv_output.split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('```'):
                continue
            if line.startswith('Date') or ',' in line:
                csv_lines.append(line)

        if not csv_lines:
            # Always return a 2-tuple so callers can unpack the result.
            return None, "No CSV content found in model output"

        pred_df = pd.read_csv(io.StringIO('\n'.join(csv_lines)), skipinitialspace=True)
        pred_df.columns = [str(col).strip() for col in pred_df.columns]

        # check number of columns
        if not {"Date", "Predicted_Close"}.issubset(pred_df.columns):
            if len(pred_df.columns) >= 2:
                # Treat the first two columns as Date / Predicted_Close; extra
                # columns are dropped so the rename cannot fail on length.
                pred_df = pred_df.iloc[:, :2].copy()
                pred_df.columns = ['Date', 'Predicted_Close']
            else:
                return None, f"Lack of columns, expect 2 columns, get {len(pred_df.columns)} columns"

        # check number of rows
        if len(pred_df) != 3:
            return None, f"Unexpected prediction number, expect 3, get {len(pred_df)}"

        # check specific dates (compared as dates, so 2025-11-12 matches 2025/11/12)
        actual_dates = pred_df['Date'].astype(str).str.strip().tolist()
        parsed_dates = {pd.to_datetime(d, errors='coerce') for d in actual_dates}
        if parsed_dates != {pd.Timestamp(d) for d in expected_dates}:
            return None, f"Unexpected prediction dates, expect {expected_dates}, get {actual_dates}"

        pred_df["Stock"] = stock_name

        return pred_df, "Verified"

    except Exception as e:
        return None, f"CSV read failed: {e}"

def predict_stock_with_retry(client, stock_name, df, news_content, max_retries=3):
    """Request a news-aware forecast for one stock, retrying on failure.

    Builds a prompt from the price history (as CSV text) and the formatted
    news text, sends it through ``make_api_request`` and checks the reply
    with ``validate_prediction_result``. Returns the validated DataFrame, or
    ``None`` once ``max_retries`` attempts have failed.
    """
    price_csv = df.to_csv(index=False)

    for attempt in range(max_retries):
        # True while at least one more attempt remains after this one.
        retry_left = attempt < max_retries - 1
        try:
            print(f"  Prediction attempt {attempt + 1}/{max_retries}")

            # Prompt combining the news text with the price history.
            prompt = f"""
            You are a financial data analyst.
            You are given two types of data for {stock_name}:

            1. PAST 1-YEAR STOCK PRICE DATA (in CSV format):
            - Date: Trading date
            - Open: Opening price
            - High: Highest price
            - Low: Lowest price
            - Close: Closing price
            - Volume: Trading volume

            2. RECENT NEWS DATA:
            {news_content}

            TASKS:
            1. Analyze both the historical price trend/volatility AND the information from recent news
            2. Consider how the news might impact future stock performance
            3. Predict the closing price of {stock_name} for 3, 5, and 10 trading days after the last date in the dataset
               Exact prediction dates: 2025/11/12, 2025/11/14, 2025/11/21
            4. Return your predictions strictly as a CSV table with columns:
               Date, Predicted_Close

            IMPORTANT:
            - Combine both technical analysis (price data) and fundamental analysis (news)
            - Do not include any explanations or extra text ‚Äî only output the CSV
            - Ensure the output has exactly 3 rows for the 3 prediction dates
            - Format must be valid CSV with headers: Date, Predicted_Close

            HISTORICAL PRICE DATA:
            {price_csv}
            """

            # Send the request and pull the text out of the first choice.
            reply = make_api_request(client, prompt)
            csv_output = reply.choices[0].message.content.strip()
            print(f"  API response length: {len(csv_output)}")

            # Accept the reply only if it validates.
            pred_df, validation_msg = validate_prediction_result(csv_output, stock_name)
            if pred_df is not None:
                print(f"‚úÖ {stock_name} verification passed.")
                return pred_df

            print(f"‚ö†Ô∏è {stock_name} verification failed: {validation_msg}")
            print(csv_output)
            if retry_left:
                print("  Waiting for retry...")
                time.sleep(2)
            else:
                print(f"üí• {stock_name} all retries failed")

        except Exception as e:
            print(f"‚ö†Ô∏è {stock_name} No. {attempt + 1} attempt failed: {e}")
            if retry_left:
                time.sleep(2)

    return None


print(f"Find {len(csv_files)} stock files to process.")

# record processed and failed stocks
processed_stocks = []
failed_stocks = []

for file_name in csv_files:
    stock_name = os.path.splitext(file_name)[0]  # assume file name is stock name
    file_path = os.path.join(input_folder, file_name)

    print(f"\nüìà Processing {stock_name}...")

    try:
        # read historical data
        df = pd.read_csv(file_path)
        print(f"  data shape: {df.shape}")

        # check data sufficiency
        if len(df) < 10:
            print(f"‚ö†Ô∏è {stock_name} no enough data, skipping")
            failed_stocks.append(stock_name)
            continue

        # Load and format this stock's news; a missing or unreadable news file
        # yields the "No relevant news data." placeholder instead of failing.
        news_df = load_news_data(stock_name)
        news_content = format_news_for_prompt(news_df)

        # news_content is a required argument of predict_stock_with_retry.
        pred_df = predict_stock_with_retry(client, stock_name, df, news_content, max_retries=3)
        
        if pred_df is not None:
            all_predictions.append(pred_df)
            processed_stocks.append(stock_name)
            print(f"‚úÖ {stock_name} prediction success, prediction points: {len(pred_df)}")
        else:
            print(f"üí• {stock_name} prediction failed, skipping")
            failed_stocks.append(stock_name)

    except Exception as e:
        print(f"‚ùå Process {stock_name} failed: {e}")
        failed_stocks.append(stock_name)
    
    # add delay between requests to avoid rate limits
    time.sleep(2)

# combine all predictions and save to CSV
if all_predictions:
    final_df = pd.concat(all_predictions, ignore_index=True)
    final_df = final_df[["Stock", "Date", "Predicted_Close"]]
    final_df.to_csv(output_file, index=False)

    success_stocks = final_df['Stock'].nunique()
    total_predictions = len(final_df)
    
    print(f"\nüéâ Prediction finished!")
    print(f"‚úÖ Success prediction: {success_stocks}/{len(csv_files)} stocks")
    print(f"üìä Total prediction number: {total_predictions} records")
    print(f"üíæ Saved to: {output_file}")
    
else:
    print("\n‚ùå No valid forecasts generated.")

## Raw + indicator + news

In [None]:
# Input folders and the combined forecast file to write.
# NOTE(review): the prompt in this cell describes indicator columns, but
# input_folder points at "raw_data" - confirm these files contain indicators.
input_folder = "raw_data"  # historical stock price data file folder
news_folder = "news_data"  # news data file folder
output_file = "news_forecasts_claude.csv"

# Every *.csv entry in the input folder is treated as one stock to process.
csv_files = [name for name in os.listdir(input_folder) if name.endswith(".csv")]
# Collects one forecast DataFrame per successfully processed stock.
all_predictions = list()

def load_news_data(stock_name):
    """Read ``<news_folder>/<stock_name>_news.csv`` into a DataFrame.

    Returns the DataFrame, or ``None`` when the file does not exist or cannot
    be read (a warning is printed in both cases).
    """
    news_file = os.path.join(news_folder, f"{stock_name}_news.csv")

    if os.path.exists(news_file):
        try:
            news_df = pd.read_csv(news_file)
        except Exception as e:
            print(f"‚ö†Ô∏è Load news data failed: {e}")
            return None
        else:
            print(f"  Load news data: {len(news_df)} news")
            return news_df

    print(f"‚ö†Ô∏è Cannot find news file: {news_file}")
    return None

def format_news_for_prompt(news_df):
    """Convert news DataFrame to formatted string for prompt.

    Args:
        news_df: DataFrame of news items, or ``None``. The ``title``,
            ``source``, ``date`` and ``content`` columns are read when present.

    Returns:
        One text section per row, numbered from 1 in row order and each
        followed by a 50-dash rule; ``"No relevant news data."`` when
        ``news_df`` is ``None`` or empty.
    """
    if news_df is None or len(news_df) == 0:
        return "No relevant news data."

    def field(row, key):
        # Missing columns and empty (NaN/None) cells are both shown as 'N/A'.
        value = row.get(key, 'N/A')
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'N/A'
        return value

    news_content = []
    # Number items by position: index labels need not be the integers 0..n-1.
    for position, (_, row) in enumerate(news_df.iterrows(), start=1):
        news_item = f"News {position}:\n"
        news_item += f"Title: {field(row, 'title')}\n"
        # Label spelled "Source" (was "Sourch") so the prompt text is correct.
        news_item += f"Source: {field(row, 'source')}\n"
        news_item += f"Date: {field(row, 'date')}\n"
        news_item += f"Content: {field(row, 'content')}...\n"
        news_item += "-" * 50
        news_content.append(news_item)

    return "\n".join(news_content)

# Retry mechanism for API requests
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def make_api_request(client, prompt):
    """Send one chat-completion request and return the response object.

    The ``retry`` decorator re-invokes this function when it raises, for at
    most 3 attempts, waiting between attempts.

    Args:
        client: Client object exposing ``chat.completions.create``.
        prompt: Text sent as the user message.

    Raises:
        Whatever exception the underlying request raised; it is logged here
        and re-raised so the decorator can decide whether to retry.
    """
    try:
        response = client.chat.completions.create(
            # Use the configured model name, not the literal string "MODEL_NAME".
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a professional financial analyst skilled in stock trend forecasting."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.4,
            timeout=60
        )
        return response
    except Exception as e:
        print(f"‚ö†Ô∏è API request failed: {e}")
        raise


print(f"Find {len(csv_files)} stock files to process.")

for file_name in csv_files:
    stock_name = os.path.splitext(file_name)[0]  # assume file name is stock name
    file_path = os.path.join(input_folder, file_name)

    print(f"\nüìà Processing {stock_name}...")

    try:
        # Load historical data
        df = pd.read_csv(file_path)
        print(f"  Stock data shape: {df.shape}")

        # Check data sufficiency
        if len(df) < 10:
            print(f"‚ö†Ô∏è {stock_name} data not sufficient, skipping")
            continue

        # load and format news data
        news_df = load_news_data(stock_name)
        news_content = format_news_for_prompt(news_df)

        # the whole table is embedded in the prompt as CSV text
        data_str = df.to_csv(index=False)

        # Prompt construction with indicator and news analysis
        prompt = f"""
        You are a financial data analyst.
        You are given 2 types of data for {stock_name}:

        1. PAST 1-YEAR STOCK PRICE DATA and financial indicators (in CSV format):
        - Date: Trading date
        - Open: Opening price
        - High: Highest price
        - Low: Lowest price
        - Close: Closing price
        - Volume: Trading volume
        
        The financial indicators including:
        - Return: Daily return percentage
        - LogReturn: Daily log return percentage
        - SMA3, SMA5, SMA10, SMA30: 3, 5, 10, 30-day simple moving averages
        - RSI: 14-day relative strength index
        - MACD: Moving average convergence divergence
        - MACD_Signal: MACD signal line
        - BB-high, BB-low: Bollinger Bands high and low
        
        2. RECENT NEWS DATA:
        {news_content}

        TASKS:
        1. Analyze both the historical price trend/volatility AND the information from recent news
        2. Consider how the news might impact future stock performance
        3. Predict the closing price of {stock_name} for 3, 5, and 10 trading days after the last date in the dataset
           Exact prediction dates: 2025/11/12, 2025/11/14, 2025/11/21
        4. Return your predictions strictly as a CSV table with columns:
           Date, Predicted_Close

        IMPORTANT:
        - Combine both technical analysis (price data) and fundamental analysis (news)
        - Do not include any explanations or extra text ‚Äî only output the CSV
        - Ensure the output has exactly 3 rows for the 3 prediction dates

        HISTORICAL PRICE DATA:
        {data_str}
        """

        # Call API with retry
        response = make_api_request(client, prompt)

        # Get CSV output
        csv_output = response.choices[0].message.content.strip()
        print(f"  API response length: {len(csv_output)}")

        try:
            # clean and parse CSV output: keep only the header and comma-separated rows
            lines = csv_output.split('\n')
            csv_lines = []
            for line in lines:
                if line.strip() and (line.startswith('Date') or ',' in line):
                    csv_lines.append(line)
            
            clean_csv = '\n'.join(csv_lines)
            
            pred_df = pd.read_csv(io.StringIO(clean_csv))
            
            # check columns
            if not {"Date", "Predicted_Close"}.issubset(pred_df.columns):
                if len(pred_df.columns) >= 2:
                    # Treat the first two columns as Date / Predicted_Close;
                    # extra columns are dropped so the rename cannot fail with
                    # a length mismatch.
                    pred_df = pred_df.iloc[:, :2].copy()
                    pred_df.columns = ['Date', 'Predicted_Close']
                else:
                    raise ValueError("Missing required columns in GPT output.")
            
            # check rows (a wrong row count is only reported; the forecast is still kept)
            if len(pred_df) != 3:
                print(f"‚ö†Ô∏è Unexpect prediction number: expect 3, get {len(pred_df)}")
                print("Model output was:\n", csv_output[:300], "...\n")
            
            pred_df["Stock"] = stock_name
            all_predictions.append(pred_df)
            print(f"‚úÖ {stock_name} prediction success. Prediction points: {len(pred_df)}")
            
        except Exception as e:
            print(f"‚ö†Ô∏è {stock_name} prediction failed: {e}")
            print("Model output:\n", csv_output, "\n")

    except Exception as e:
        print(f"‚ùå process {stock_name} failed: {e}")

    # add delay between requests to avoid rate limits
    time.sleep(3)

# combine all predictions and save to CSV
if all_predictions:
    final_df = pd.concat(all_predictions, ignore_index=True)
    final_df = final_df[["Stock", "Date", "Predicted_Close"]]
    final_df.to_csv(output_file, index=False)
    print(f"\n‚úÖ Finished predictino of {len(all_predictions)} stocks.")
    print(f"‚úÖ All prediction saved to {output_file}")

else:
    print("\n‚ùå No valid forecasts generated.")

## Raw + Indicator + Fake news

In [None]:
# Input folders and the combined forecast file to write.
# NOTE(review): output_file has the same name as in the fake-news-only cell,
# so running both cells overwrites one result - confirm this is intended.
input_folder = "processed_data"  # historical stock price data file folder
news_folder = "fake_news_data"  # FAKE news data file folder
output_file = "fake_news_forecasts_claude.csv"

# Every *.csv entry in the input folder is treated as one stock to process.
csv_files = [name for name in os.listdir(input_folder) if name.endswith(".csv")]
# Collects one forecast DataFrame per successfully processed stock.
all_predictions = list()

def load_news_data(stock_name):
    """Read the news CSV that belongs to ``stock_name``.

    The file is looked up as ``<news_folder>/<stock_name>_news.csv``.

    Returns:
        The news as a DataFrame, or ``None`` when the file does not exist
        or cannot be read (a warning is printed in both cases).
    """
    path_to_news = os.path.join(news_folder, f"{stock_name}_news.csv")

    # Guard clause: the happy path below only runs for an existing file
    file_found = os.path.exists(path_to_news)
    if file_found:
        try:
            loaded = pd.read_csv(path_to_news)
            print(f"  Load news data: {len(loaded)} news")
            return loaded
        except Exception as err:
            # Best effort: a broken news file must not abort the whole run
            print(f"‚ö†Ô∏è Load news data failed: {err}")
            return None

    print(f"‚ö†Ô∏è Cannot find news file: {path_to_news}")
    return None

def format_news_for_prompt(news_df):
    """Convert a news DataFrame to a formatted string for the prompt.

    Each row becomes one numbered entry with ``Title``, ``Source``, ``Date``
    and ``Content`` lines followed by a 50-dash separator; entries are joined
    with newlines.

    Args:
        news_df: DataFrame of news items, or ``None``. The columns ``title``,
            ``source``, ``date`` and ``content`` are read when present.

    Returns:
        The formatted text, or ``"No relevant news data."`` when ``news_df``
        is ``None`` or has no rows.
    """
    if news_df is None or len(news_df) == 0:
        return "No relevant news data."

    def field(row, key):
        # A missing column and an empty (NaN) cell both render as 'N/A'
        # instead of leaking the literal string "nan" into the prompt.
        value = row.get(key, 'N/A')
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'N/A'
        return value

    separator = "-" * 50
    news_content = []
    # Number entries by position, not by index label: labels need not start
    # at 0 (e.g. after filtering) and need not even be numeric.
    for position, (_, row) in enumerate(news_df.iterrows(), start=1):
        news_item = f"News {position}:\n"
        news_item += f"Title: {field(row, 'title')}\n"
        news_item += f"Source: {field(row, 'source')}\n"
        news_item += f"Date: {field(row, 'date')}\n"
        news_item += f"Content: {field(row, 'content')}...\n"
        news_item += separator
        news_content.append(news_item)

    return "\n".join(news_content)

# Retry mechanism for API requests
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def make_api_request(client, prompt):
    """Send ``prompt`` to the chat completions endpoint and return the response.

    The call is wrapped by tenacity's ``retry``: it is attempted up to 3 times
    with an exponential wait bounded between 4 and 10 seconds. Every failed
    attempt is logged and re-raised so the decorator can decide whether to
    retry; once the attempts are exhausted the failure propagates to the caller.

    Args:
        client: OpenAI client used to issue the request.
        prompt: Text sent as the user message.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    try:
        response = client.chat.completions.create(
            # Use the model configured in "Basic setting"; the quoted literal
            # "MODEL_NAME" would be sent to the API as the model id itself.
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a professional financial analyst skilled in stock trend forecasting."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.4,
            timeout=60
        )
        return response
    except Exception as e:
        print(f"‚ö†Ô∏è API request failed: {e}")
        raise


print(f"Find {len(csv_files)} stock files to process.")

# Main loop: for every stock CSV, build a prompt from its price/indicator data
# and its news, ask the model for three closing-price predictions, parse the
# CSV reply and collect it in all_predictions. A failure on one stock is
# reported and does not stop the remaining stocks.
for file_name in csv_files:
    stock_name = os.path.splitext(file_name)[0]  # assume file name is stock name
    file_path = os.path.join(input_folder, file_name)

    print(f"\nüìà Processing {stock_name}...")

    try:
        # Load historical data
        df = pd.read_csv(file_path)
        print(f"  Stock data shape: {df.shape}")

        # Check data sufficiency
        if len(df) < 10:
            print(f"‚ö†Ô∏è {stock_name} data not sufficient, skipping")
            continue

        # load and format news data
        news_df = load_news_data(stock_name)
        news_content = format_news_for_prompt(news_df)

        # The whole table is embedded in the prompt as CSV text
        data_str = df.to_csv(index=False)

        # Prompt construction with indicator and news analysis
        prompt = f"""
        You are a financial data analyst.
        You are given 2 types of data for {stock_name}:

        1. PAST 1-YEAR STOCK PRICE DATA and financial indicators (in CSV format):
        - Date: Trading date
        - Open: Opening price
        - High: Highest price
        - Low: Lowest price
        - Close: Closing price
        - Volume: Trading volume
        
        The financial indicators including:
        - Return: Daily return percentage
        - LogReturn: Daily log return percentage
        - SMA3, SMA5, SMA10, SMA30: 3, 5, 10, 30-day simple moving averages
        - RSI: 14-day relative strength index
        - MACD: Moving average convergence divergence
        - MACD_Signal: MACD signal line
        - BB-high, BB-low: Bollinger Bands high and low
        
        2. RECENT NEWS DATA:
        {news_content}

        TASKS:
        1. Analyze both the historical price trend/volatility AND the information from recent news
        2. Consider how the news might impact future stock performance
        3. Predict the closing price of {stock_name} for 3, 5, and 10 trading days after the last date in the dataset
           Exact prediction dates: 2025/11/12, 2025/11/14, 2025/11/21
        4. Return your predictions strictly as a CSV table with columns:
           Date, Predicted_Close

        IMPORTANT:
        - Combine both technical analysis (price data) and fundamental analysis (news)
        - Do not include any explanations or extra text ‚Äî only output the CSV
        - Ensure the output has exactly 3 rows for the 3 prediction dates

        HISTORICAL PRICE DATA:
        {data_str}
        """

        # Call API with retry
        response = make_api_request(client, prompt)

        # Get CSV output
        csv_output = response.choices[0].message.content.strip()
        print(f"  API response length: {len(csv_output)}")

        try:
            # clean and parse CSV output: keep only the header line and lines
            # that contain a comma, which drops code fences and plain prose
            lines = csv_output.split('\n')
            csv_lines = []
            for line in lines:
                if line.strip() and (line.startswith('Date') or ',' in line):
                    csv_lines.append(line)

            clean_csv = '\n'.join(csv_lines)

            pred_df = pd.read_csv(io.StringIO(clean_csv))

            # check columns
            if not {"Date", "Predicted_Close"}.issubset(pred_df.columns):
                if len(pred_df.columns) >= 2:
                    # Header names differ from the requested ones: treat the
                    # first two columns as Date / Predicted_Close. Extra columns
                    # are dropped first, because assigning two names to a wider
                    # frame raises a length-mismatch ValueError.
                    pred_df = pred_df.iloc[:, :2].copy()
                    pred_df.columns = ['Date', 'Predicted_Close']
                else:
                    raise ValueError("Missing required columns in GPT output.")

            # check rows (a wrong count is only reported, the rows are kept)
            if len(pred_df) != 3:
                print(f"‚ö†Ô∏è Unexpect prediction number: expect 3, get {len(pred_df)}")
                print("Model output was:\n", csv_output[:300], "...\n")

            pred_df["Stock"] = stock_name
            all_predictions.append(pred_df)
            print(f"‚úÖ {stock_name} prediction success. Prediction points: {len(pred_df)}")

        except Exception as e:
            # Parsing problems: show the raw reply so it can be inspected
            print(f"‚ö†Ô∏è {stock_name} prediction failed: {e}")
            print("Model output:\n", csv_output, "\n")

    except Exception as e:
        # Anything else (file read, API failure after retries, ...)
        print(f"‚ùå process {stock_name} failed: {e}")

    # add delay between requests to avoid rate limits
    time.sleep(3)

# Merge the per-stock prediction frames into one table and write it to disk.
# Nothing is written when no stock produced a usable prediction.
if not all_predictions:
    print("\n‚ùå No valid forecasts generated.")

else:
    final_df = pd.concat(all_predictions, ignore_index=True)
    # Fix the column order of the saved file: Stock, Date, Predicted_Close
    final_df = final_df[["Stock", "Date", "Predicted_Close"]]
    final_df.to_csv(output_file, index=False)
    print(f"\n‚úÖ Finished predictino of {len(all_predictions)} stocks.")
    print(f"‚úÖ All prediction saved to {output_file}")