### 1. Install Dependencies

In [1]:
!pip install pandas gdown huggingface-hub numpy matplotlib scikit-learn transformers torch tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

### 2. Imports

In [3]:
import os
import json
import sys
import argparse
import re
from typing import List, Union
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from google.colab import drive
from datetime import datetime

from abc import ABC, abstractmethod

import torch
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [4]:
torch.cuda.is_available()

True

### 3. Connect to Google Drive

In [5]:
drive.mount("/content/drive",force_remount=True)
os.chdir("/content/drive/My Drive")

Mounted at /content/drive


### 4. Utils Code

In [6]:
def format_time_difference(seconds):
    minutes = seconds // 60
    hours = minutes // 60
    days = hours // 24

    if days > 0:
        if hours % 24 > 0.1:
            return f"{days} days-{hours % 24} hours"
        else:
            return f"{days} days"
    elif hours > 0:
        if minutes % 60 > 0.1:
            return f"{hours} hours-{minutes % 60} minutes"
        else:
            return f"{hours} hours"
    elif minutes > 0:
        if seconds % 60 > 0.1:
            return f"{minutes} minutes-{seconds % 60} seconds"
        else:
            return f"{minutes} minutes"
    else:
        return f"{seconds} seconds"

def save_to_json(data, save_path):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, "w") as f:
        json.dump(data, f, indent=4)

def plot_series(filename, input_ts, output_ts, predicted_ts, save_folder):
    plt.figure(figsize=(10, 5))
    plt.plot(range(len(input_ts)), input_ts, label="Input Time Series", marker='o')
    plt.plot(range(len(input_ts), len(input_ts) + len(output_ts)), output_ts, label="Ground Truth", marker='o')
    plt.plot(range(len(input_ts), len(input_ts) + len(predicted_ts)), predicted_ts, label="Predicted", linestyle='dashed')
    plt.legend()
    plt.title(f"Prediction for {filename}")
    plt.xlabel("Time Steps")
    plt.ylabel("Value")
    plt.grid()
    plt.savefig(os.path.join(save_folder, filename.replace('.json', '.png')))
    plt.close()

def calculate_mape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)

    # Avoid division by zero
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100


def calculate_acc(result_list, regrouped_labels = None):
    if regrouped_labels is None:
        correct_pred = sum(1 for result in result_list if result["ground_truth"] in result["predict"])
    else:
        correct_pred = 0
        for result in result_list:
            gt_group = regrouped_labels[result['ground_truth']]
            for original_label in regrouped_labels.keys():
                if original_label in result['predict']:
                    predict_group = regrouped_labels[original_label]
                    if gt_group == predict_group:
                        correct_pred += 1
                        break

    total_pred = len(result_list)
    accuracy = correct_pred / total_pred

    return accuracy


def calculate_correlation_acc(result_list):
    model_predictions = {"total": 0, "exact_correct": 0, "brief_correct": 0}
    positive_correlations = ["Strong Positive Correlation", "Moderate Positive Correlation"]
    negative_correlations = ["Strong Negative Correlation", "Moderate Negative Correlation"]
    for result in result_list:
        prediction = result["predict"].strip()
        model_predictions["total"] += 1
        if prediction == result["ground_truth"]:
            model_predictions["exact_correct"] += 1

        # Brief accuracy
        pred_is_positive = prediction in positive_correlations
        pred_is_negative = prediction in negative_correlations
        truth_is_positive = result["ground_truth"] in positive_correlations
        truth_is_negative = result["ground_truth"] in negative_correlations

        if (pred_is_positive and truth_is_positive) or \
            (pred_is_negative and truth_is_negative) or \
            (prediction == result["ground_truth"]):
            model_predictions["brief_correct"] += 1

    # Calculate and format results
    total = model_predictions["total"]
    exact_accuracy = (model_predictions["exact_correct"] / total) * 100
    brief_accuracy = (model_predictions["brief_correct"] / total) * 100

    metric_results = {
        "exact_accuracy": f"{round(exact_accuracy, 2)}%",
        "brief_accuracy": f"{round(brief_accuracy, 2)}%",
        "total_samples": total
    }
    return metric_results


def calculate_mcqa_acc(result_list):
    correct = 0
    total = 0
    for result in result_list:
        predition = result["predict"].strip()
        predition = predition[0].upper()
        if predition == result["ground_truth"]:
            correct += 1

        total += 1

    accuracy = correct / total

    return accuracy * 100

### 5. Models Code

In [7]:
class BaseModel(ABC):
    @abstractmethod
    def inference(self, content: str) -> str:
        """
        Run inference on a given input prompt and return the generated output.
        """
        pass

In [8]:
class DeepSeekModel(BaseModel):
    def __init__(self, model_name: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", **kwargs):

        # The model is set in eval mode by default by using eval()
        # See: https://huggingface.co/docs/transformers/en/main_classes/model#transformers.PreTrainedModel
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto",
            **kwargs
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def inference(self, content: str) -> str:
        messages = [{"role": "user", "content": content}]

        chat_prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        tokenized_input = self.tokenizer([chat_prompt], return_tensors="pt").to(self.model.device)
        generated_output = self.model.generate(
            **tokenized_input,
            max_new_tokens=4096,
        )
        output_ids = generated_output[0][len(tokenized_input.input_ids[0]):].tolist()

        # parsing thinking content
        try:
            # rindex finding 151649 (</think>)
            index = len(output_ids) - output_ids[::-1].index(151649)
        except ValueError:
            index = 0
        outputs = self.tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

        return outputs

In [9]:
class LLaMAModel(BaseModel):
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", **kwargs):
        self.pipeline = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype="auto",
            device_map="auto",
            token="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",
            **kwargs
        )

    def inference(self, content: str) -> str:
        messages = [{"role": "user", "content": content}]

        outputs = self.pipeline(messages, max_new_tokens=1024)

        return outputs[0]["generated_text"][-1]["content"]

In [10]:
class ModelFactory:
    def __init__(self, config: dict):
        self.config = config

    @staticmethod
    def get_model(model_type: str, model_name: str, **kwargs) -> BaseModel:
        if model_type == "deepseek":
            return DeepSeekModel(model_name=model_name, **kwargs)
        elif model_type == "llama":
            return LLaMAModel(model_name=model_name, **kwargs)
        else:
            raise ValueError(f"Unsupported model type: {model_type}")

### 6. Experiment Code

In [11]:
def finance_mse_metaprompt_generation(
    text: str,
    prices: List[float],
    start_datetime: str,
    end_datetime: str,
    pred_end_datetime: str,
    granularity: str,
    prediction_length: int,
    mode: str,
) -> str:
    """
    Generates a meta-prompt for hypothetical stock price trend analysis
    based on given inputs.

    Args:
        text (str): News article content within the input time series range.
        prices (List[float]): Historical stock prices.
        start_datetime (str): Start datetime of the input time series.
        end_datetime (str): End datetime of the input time series.
        pred_end_datetime (str): End datetime of the hypothetical projection.
        granularity (str): Granularity of the input time series (e.g., daily, hourly).
        prediction_length (int): Number of future time steps to estimate.
        mode (str): Mode of estimation ("timeseries_only", "text_only", "combined").

    Returns:
        str: Meta-prompt for ChatGPT.
    """
    prompt = (
        f"You are an AI assistant trained in data analysis and modeling. "
        f"Your task is to conduct a research-based timeseries estimation for the next {prediction_length} time steps "
        f"based on provided historical price movements and/or related news articles. "
        f"This analysis aims to explore patterns in the given dataset and should not be considered financial advice. "
        f"The input time series spans from {start_datetime} to {end_datetime}, with a granularity of {granularity}. "
        f"The estimation period extends from {end_datetime} to {pred_end_datetime}, maintaining the same granularity."
    )

    if mode == "timeseries_only":
        prompt += (
            "You will analyze the numerical patterns in historical prices and extrapolate potential movements. "
            f"The input prices are: {prices}. "
        )
    elif mode == "text_only":
        prompt += (
            "You will analyze sentiment and potential market impacts from the following news article content: "
            f"{text}. "
        )
    elif mode == "combined":
        prompt += (
            "You will use both historical price movements and relevant news sentiment analysis "
            f"to explore hypothetical market trends. The input prices are: {prices}. The news article states: {text}. "
        )
    else:
        raise ValueError(
            "Invalid mode. Choose from 'timeseries_only', 'text_only', or 'combined'."
        )

    prompt += (
        "\n\nPlease return your estimated values in a structured format as a  list of float numbers. "
        "Ensure the output follows this format strictly: "
        "\nPredicted Prices: value1, value2, ..., valueN. "
        f"The number of estimated values should be exactly {prediction_length}. "
    )

    return prompt

def finance_macd_metaprompt_generation(
    text: str,
    prices: List[float],
    start_datetime: str,
    end_datetime: str,
    pred_end_datetime: str,
    granularity: str,
    prediction_length: int,
    mode: str,
) -> str:

    prompt = (
        f"You are an AI assistant trained in data analysis and modeling. "
        f"Your task is to Predict the future Moving Average Convergence Divergence (MACD) values for the next {prediction_length} time steps "
        f"based on provided historical timeseries movements and/or related news articles. "
        # f"This analysis aims to explore patterns in the given dataset and should not be considered financial advice. "
        f"The input time series spans from {start_datetime} to {end_datetime}, with a granularity of {granularity}. "
        f"The estimation period extends from {end_datetime} to {pred_end_datetime}, maintaining the same granularity."
    )

    if mode == "timeseries_only":
        prompt += (
            "You will analyze the numerical patterns in historical prices. "
            f"The input prices are: {prices}. "
        )
    elif mode == "text_only":
        prompt += (
            "You will analyze sentiment and potential market impacts from the following news article content: "
            f"{text}. "
        )
    elif mode == "combined":
        prompt += (
            "You will use both historical price movements and relevant text sentiment analysis "
            f"The input prices are: {prices}. The news article states: {text}. "
        )
    else:
        raise ValueError(
            "Invalid mode. Choose from 'timeseries_only', 'text_only', or 'combined'."
        )

    prompt += (
        "\n\nPlease return your predicted MACD values in a structured format as a list of float numbers. Please predict the real possible values, do not use the naive linear extrapolation or similar methods"
        "Ensure the output follows this format strictly: "
        "\nPredicted Prices: value1, value2, ..., valueN. "
        f"The number of predicted values should be exactly {prediction_length}. "
    )

    return prompt

def finance_bb_metaprompt_generation(
    text: str,
    prices: List[float],
    start_datetime: str,
    end_datetime: str,
    pred_end_datetime: str,
    granularity: str,
    prediction_length: int,
    mode: str,
) -> str:

    prompt = (
        f"You are an AI assistant trained in data analysis and modeling. "
        f"Your task is to Predict the future upper Bollinger Band (BB) values  for the next {prediction_length} time steps "
        f"based on provided historical price movements and/or related news articles. "
        # f"This analysis aims to explore patterns in the given dataset and should not be considered financial advice. "
        f"The input time series spans from {start_datetime} to {end_datetime}, with a granularity of {granularity}. "
        f"The estimation period extends from {end_datetime} to {pred_end_datetime}, maintaining the same granularity."
    )

    if mode == "timeseries_only":
        prompt += (
            "You will analyze the numerical patterns in historical prices. "
            f"The input prices are: {prices}. "
        )
    elif mode == "text_only":
        prompt += (
            "You will analyze sentiment and potential market impacts from the following news article content: "
            f"{text}. "
        )
    elif mode == "combined":
        prompt += (
            "You will use both historical price movements and relevant news sentiment analysis "
            f"to explore hypothetical market trends. The input prices are: {prices}. The news article states: {text}. "
        )
    else:
        raise ValueError(
            "Invalid mode. Choose from 'timeseries_only', 'text_only', or 'combined'."
        )

    prompt += (
        "\n\nPlease return your estimated upper Bollinger Band (BB) values values in a structured format as a list of float numbers. "
        "Ensure the output follows this format strictly: "
        "\nPredicted Prices: value1, value2, ..., valueN. "
        f"The number of estimated values should be exactly {prediction_length}. "
    )

    return prompt

def parse_val_prediction_response(response: str) -> Union[List[float], None]:
    """
    Decodes the predicted prices from a response string.

    Args:
        response (str): The response containing the predicted prices.

    Returns:
        List[float]: A list of float numbers extracted from the response.
        None: If extraction fails.
    """
    match = re.search(r"Predicted Prices:\s*([-\d.,\s]+)", response)

    if match:
        try:
            price_list = [float(value) for value in match.group(1).split(',')]
            return price_list
        except ValueError:
            pass  # If conversion fails, try another approach

    # Alternative approach: Find all potential numbers in the response
    possible_numbers = re.findall(r"-?\d+\.\d+", response)
    if possible_numbers:
        try:
            return [float(num) for num in possible_numbers]
        except ValueError:
            pass  # If conversion fails, return None

    return None  # Return None if extraction fails

def finance_classification_metaprompt_generation(text=None, timestamps=None, prices=None, mode=None):
    time_series_data = ", ".join([f"{price}" for price in  prices])

    if mode == "combined":
        meta_prompt = f"""
            You are a financial prediction expert with knowledge of advanced machine learning models and time-series analysis.
            Your goal is to predict the stock trend (rise, neutral, or fall) based on the following inputs:

            1. **Time Series Stock Price Data**:
            - This data includes stock prices recorded at 1-hour intervals over the last month from {timestamps[0]} to {timestamps[-1]}.
            - Example data format:
                {time_series_data}

            2. **News Data**:
            - This includes news headlines and summaries relevant to the stock's company or sector.
            - Example data format:
                {text}

            ### Task:
            Analyze the provided time-series data and news to identify future trends of the stock performance. Ensure that the news data is used to supplement the insights from the time-series analysis, focusing on combining both inputs for a more accurate prediction.

            ### Output:
            Provide a prediction for the stock trend categorized one of the following labels:
            - "<-4%"
            - "-2% ~ -4%"
            - "-2% ~ +2%"
            - "+2% ~ +4%"
            - ">+4%"

            please think step-by-step and briefly explain how the combination of time-series data and news data led to the prediction;
            then wrap your final answer in the final predicted label in the format ^^^label^^^
        """

    elif mode == "text_only":
        meta_prompt = f"""
            You are a financial prediction expert with knowledge of advanced machine learning models and time-series analysis.
            Your goal is to predict the stock trend with given labels based on the following input:

            **News Data**:
            - This includes news headlines and summaries relevant to the stock's company or sector.
            - Example data format:
                {text}

            ### Output:
            Provide a prediction for the stock trend categorized one of the following labels:
            - "<-4%"
            - "-2% ~ -4%"
            - "-2% ~ +2%"
            - "+2% ~ +4%"
            - ">+4%"

            ### Task:
            Analyze the news semantics to identify trends and patterns that could impact stock performance.
            Then wrap your final answer in the final predicted label in the format ^^^label^^^
        """

    elif mode == "timeseries_only":
        meta_prompt = f"""
            You are a financial prediction expert with knowledge of advanced machine learning models and time-series analysis.
            Your goal is to predict the stock trend with given labels based on the following input:

            1. **Time Series Stock Price Data**:
            - This data includes stock prices recorded at 1-hour intervals over the last month from {timestamps[0]} to {timestamps[-1]}.
            - Example data format:
                {time_series_data}

            ### Output:
            Provide a prediction for the stock trend categorized one of the following labels:
            - "<-4%"
            - "-2% ~ -4%"
            - "-2% ~ +2%"
            - "+2% ~ +4%"
            - ">+4%"

            ### Task:
            Analyze the provided time-series data to identify trends and patterns that could impact stock performance. Focus solely on the time-series data for making predictions.
             then wrap your final answer in the final predicted label in the format ^^^label^^^
        """

    return meta_prompt

def parse_cls_response(answer):
    try:
        return  re.findall(r'\^\^\^(.*?)\^\^\^', answer)[-1]
    except:
        return  re.findall(r'\^+(.*?)\^+', answer)[-1]



def finance_correlation_metaprompt_generation(setting, sticker, time1, time2, in_price, news, time_news):

    time_interval = "1 hour" if setting == "long" else "5 minutes"

    if setting == "long":
        system_prompt ="You are an expert in finance and stock market analysis. Based on the given 30-day historical stock price time series and a financial analysis published at the last timestamp of the time series, your task is to predict the correlation between the stock's price fluctuations in the next 7 days and the analysis sentiment (positive correlation indicates that positive analysis leads to price increase and negative analysis leads to price decrease). Take into account external factors or market conditions that might affect stock price movement."
    else:
        system_prompt = "You are an expert in finance and stock market analysis. Based on the given 7-day historical stock price time series and a financial analysis published at the last timestamp of the time series, your task is to predict the correlation between the stock's price fluctuations in the next 1 day and the analysis sentiment (positive correlation indicates that positive analysis leads to price increase and negative analysis leads to price decrease). Take into account external factors or market conditions that might affect stock price movement."
    question = "Return your answer in one of the following without any other words: Strong Positive Correlation, Moderate Positive Correlation, No Correlation, Moderate Negative Correlation, Strong Negative Correlation."
    query = f"stock price of {sticker} between {time1} to {time2}, time interval is {time_interval}: \
            {in_price}\
            News published at {time_news}: \
            {news}\
            {question} Answer:"
    prompt = f"{system_prompt}\n\n{query}"

    return prompt




def finance_mcqa_metaprompt_generation(setting, sticker, time1, time2, in_price, news, time_news, question):
    time_interval = "1 hour" if setting == "long" else "5 minutes"
    if setting  == "long":
        system_prompt ="You are an expert in finance and stock market analysis. Your task is to answer the question based on the given 30-day historical stock price time series and a financial analysis published at the last timestamp of the time series. Return your answer only in the letter (A, B, C, or D). "
    else:
        system_prompt ="You are an expert in finance and stock market analysis. Your task is to answer the question based on the given 7-day historical stock price time series and a financial analysis published at the last timestamp of the time series. Return your answer only in the letter (A, B, C, or D). "
    query = f"stock price of {sticker} between {time1} to {time2}, time interval is {time_interval}: \
            {in_price}\
            News published at {time_news}: \
            {news}\
            Question: {question}. Give your answer in the letter (A, B, C, or D) without any other words. Answer:"
    prompt = f"{system_prompt}\n\n{query}"
    return prompt

In [12]:
import sys

sys.argv = [
    "script_name",  # Placeholder for script name (ignored by argparse)
    "--dataset_folder", "./MTBench-Test/MTBench_finance_QA_short",
    "--save_path", "./MTBench-Test/deepseek/correlation_short",
    "--model_type", "deepseek",
    "--model", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "--setting", "short"
]

# import sys

# sys.argv = [
#     "script_name",  # Placeholder for script name (ignored by argparse)
#     "--dataset_path", "./MTBench-Test/MTBench_finance_aligned_pairs_short/train-00000-of-00001.parquet",
#     "--save_path", "./MTBench-Test/<model>/correlation_short",
#     "--model_type", "llama",
#     "--model", "meta-llama/Llama-3.2-1B-Instruct",
#     "--mode", "timeseries_only"
# ]

In [13]:
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_folder", type=str, help="path to the datasets")
parser.add_argument("--save_path", type=str, help="path to save the results")
parser.add_argument("--model_type",  type=str, help="deepseek or llama")
parser.add_argument("--model",  type=str, help="model name")
parser.add_argument("--setting",  type=str, help="short or long")
args = parser.parse_args()

data_list = []
directory_path = Path(args.dataset_folder)
for json_file in directory_path.glob("*.json"):
    with open(json_file, 'r') as file:
        data = json.load(file)
        sticker = json_file.name.split('_')[1].split('.')[0]
        extracted_data = {
            "filename": json_file.name,
            "sticker": sticker,
            "index": int(json_file.name.split('_')[0]),
            "input_timestamps": data.get("input_timestamps"),
            "input_window": data.get("input_window"),
            "output_timestamps": data.get("output_timestamps"),
            "output_window": data.get("output_window"),
            "correlation": data.get('news_price_correlation'),
            "text": data.get("text"),
            "published_utc": data.get("published_utc")
        }
        data_list.append(extracted_data)

data_list = data_list[:100]
os.makedirs(Path(args.save_path), exist_ok=True)

model = ModelFactory.get_model(model_type=args.model_type, model_name=args.model)

result_list = []
tot_samples = len(data_list)
print("Evaluating {} samples......".format(tot_samples))

for idx, sample in tqdm(enumerate(data_list), total=tot_samples):
    designed_prompt = finance_correlation_metaprompt_generation(
        setting=args.setting,
        sticker=sample["sticker"],
        time1=datetime.fromtimestamp(sample["input_timestamps"][0]),
        time2=datetime.fromtimestamp(sample["input_timestamps"][-1]),
        in_price=sample["input_window"],
        news=sample["text"],
        time_news=sample["published_utc"]
    )
    try:
        answer = model.inference(designed_prompt)
        answer = answer.strip().replace('"', '')
        res = {
            "cnt": len(result_list),
            "filename": sample["filename"],
            "ground_truth": sample["correlation"],
            "predict": answer,
        }
        result_list.append(res)
    except Exception as e:
        print(f"An error occurred: {e}")

    if (idx +1) % 20 == 0:
        save_to_json(result_list, save_path=f"{args.save_path}/results.json")


save_to_json(result_list, save_path=f"{args.save_path}/results.json")
metric_results = calculate_correlation_acc(result_list)
metric_results["model"] = args.model
save_to_json(metric_results, save_path=f"{args.save_path}/final_results.json")
print(f"Processing complete. Results saved to {args.save_path}/final_results.json")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Evaluating 100 samples......


  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  1%|          | 1/100 [04:27<7:21:08, 267.36s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  2%|▏         | 2/100 [05:21<3:52:07, 142.11s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  3%|▎         | 3/100 [06:01<2:33:57, 95.23s/it] Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  4%|▍         | 4/100 [07:02<2:11:03, 81.91s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  5%|▌         | 5/100 [07:46<1:47:54, 68.15s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  6%|▌         | 6/100 [08:51<1:45:02, 67.05s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  7%|▋         | 7/100 [09:14<1:21:27, 52.55s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  8%|▊         | 8/100 [13:27<2

An error occurred: CUDA out of memory. Tried to allocate 11.09 GiB. GPU 0 has a total capacity of 14.74 GiB of which 9.39 GiB is free. Process 2783 has 5.35 GiB memory in use. Of the allocated memory 4.99 GiB is allocated by PyTorch, and 241.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 11%|█         | 11/100 [18:29<2:19:00, 93.72s/it] Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 12%|█▏        | 12/100 [19:06<1:52:06, 76.44s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 13%|█▎        | 13/100 [19:33<1:29:14, 61.55s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 14%|█▍        | 14/100 [20:08<1:16:38, 53.47s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 15%|█▌        | 15/100 [20:32<1:03:14, 44.64s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 16%|█▌        | 16/100 [21:11<1:00:18, 43.07s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 17%|█▋        | 17/100 [21:58<1:00:49, 43.97s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 18%|█▊        | 18/100 [22:43<1:00:43, 44.43s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 19%|█▉        

An error occurred: CUDA out of memory. Tried to allocate 5.09 GiB. GPU 0 has a total capacity of 14.74 GiB of which 4.51 GiB is free. Process 2783 has 10.22 GiB memory in use. Of the allocated memory 9.27 GiB is allocated by PyTorch, and 848.06 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 32%|███▏      | 32/100 [32:38<43:49, 38.67s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 33%|███▎      | 33/100 [33:36<49:36, 44.43s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 34%|███▍      | 34/100 [34:54<1:00:00, 54.56s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 35%|███▌      | 35/100 [36:31<1:12:55, 67.32s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 36%|███▌      | 36/100 [36:32<50:22, 47.22s/it]  Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


An error occurred: CUDA out of memory. Tried to allocate 1.10 GiB. GPU 0 has a total capacity of 14.74 GiB of which 156.12 MiB is free. Process 2783 has 14.59 GiB memory in use. Of the allocated memory 12.84 GiB is allocated by PyTorch, and 1.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 37%|███▋      | 37/100 [37:23<50:49, 48.41s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 38%|███▊      | 38/100 [38:03<47:33, 46.02s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 39%|███▉      | 39/100 [39:01<50:13, 49.41s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 40%|████      | 40/100 [39:35<45:00, 45.01s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 41%|████      | 41/100 [44:40<2:00:45, 122.81s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 42%|████▏     | 42/100 [45:20<1:34:48, 98.07s/it] Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 43%|████▎     | 43/100 [46:15<1:20:43, 84.98s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 44%|████▍     | 44/100 [47:15<1:12:29, 77.67s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 45%|████▌     | 45/10

An error occurred: CUDA out of memory. Tried to allocate 6.45 GiB. GPU 0 has a total capacity of 14.74 GiB of which 3.17 GiB is free. Process 2783 has 11.57 GiB memory in use. Of the allocated memory 10.80 GiB is allocated by PyTorch, and 659.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 50%|█████     | 50/100 [50:28<27:56, 33.53s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 51%|█████     | 51/100 [51:02<27:21, 33.50s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 52%|█████▏    | 52/100 [52:19<37:12, 46.51s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 53%|█████▎    | 53/100 [53:03<35:56, 45.87s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 54%|█████▍    | 54/100 [53:53<36:10, 47.19s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 55%|█████▌    | 55/100 [56:05<54:19, 72.44s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 56%|█████▌    | 56/100 [56:46<46:14, 63.05s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 57%|█████▋    | 57/100 [57:23<39:32, 55.17s/it]Setting `pad_token_id` to `eos_t

An error occurred: CUDA out of memory. Tried to allocate 12.06 GiB. GPU 0 has a total capacity of 14.74 GiB of which 3.17 GiB is free. Process 2783 has 11.57 GiB memory in use. Of the allocated memory 5.10 GiB is allocated by PyTorch, and 6.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 66%|██████▌   | 66/100 [1:06:25<19:47, 34.92s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 67%|██████▋   | 67/100 [1:07:14<21:34, 39.24s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 68%|██████▊   | 68/100 [1:08:01<22:10, 41.56s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 69%|██████▉   | 69/100 [1:12:38<57:52, 112.03s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 70%|███████   | 70/100 [1:13:44<49:08, 98.29s/it] Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 71%|███████   | 71/100 [1:14:09<36:56, 76.44s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 72%|███████▏  | 72/100 [1:14:30<27:54, 59.82s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 73%|███████▎  | 73/100 [1:14:51<21:39, 48.13s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 74%|███████▍ 

An error occurred: CUDA out of memory. Tried to allocate 7.54 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.07 GiB is free. Process 2783 has 12.67 GiB memory in use. Of the allocated memory 12.03 GiB is allocated by PyTorch, and 525.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 93%|█████████▎| 93/100 [1:41:04<04:33, 39.02s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


An error occurred: CUDA out of memory. Tried to allocate 8.30 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.07 GiB is free. Process 2783 has 12.67 GiB memory in use. Of the allocated memory 4.65 GiB is allocated by PyTorch, and 7.89 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 94%|█████████▍| 94/100 [1:41:39<03:46, 37.79s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 95%|█████████▌| 95/100 [1:42:32<03:31, 42.24s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 96%|█████████▌| 96/100 [1:43:07<02:40, 40.15s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 97%|█████████▋| 97/100 [1:44:33<02:42, 54.05s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 98%|█████████▊| 98/100 [1:45:00<01:31, 45.84s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
 99%|█████████▉| 99/100 [1:45:46<00:45, 45.80s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
100%|██████████| 100/100 [1:46:14<00:00, 63.75s/it]

Processing complete. Results saved to ./MTBench-Test/deepseek/correlation_short/final_results.json



