In [1]:
import os
import re
import json
import base64
import tiktoken
import time
import fitz  # PyMuPDF
import pandas as pd
import openai
from tqdm.notebook import tqdm
# from openai import OpenAI
import anthropic
from anthropic import HUMAN_PROMPT, AI_PROMPT, Anthropic


In [2]:
# Token limit assumed for each model; the evaluation cells index this table
# when slicing the tokenized prompt.
MODEL_LIMITS = {
    "gpt-3.5-turbo-0125": 16385,
    "gpt-4-turbo-2024-04-09": 128000,
    "gpt-4o-2024-05-13": 128000,
    "gpt-4o-mini-2024-07-18": 128000,
    "claude-3-5-sonnet-20240620": 200000,
}

# Price of a single *input* token for each model (multiplied by
# `usage.input_tokens` in the evaluation cells).
MODEL_COST_PER_INPUT = {
    "gpt-3.5-turbo-0125": 5e-7,
    "gpt-4-turbo-2024-04-09": 1e-5,
    "gpt-4o-2024-05-13": 5e-6,
    "gpt-4o-mini-2024-07-18": 1.5e-7,
    "claude-3-5-sonnet-20240620": 3e-6,
}

# Price of a single *output* token for each model (multiplied by
# `usage.output_tokens` in the evaluation cells).
MODEL_COST_PER_OUTPUT = {
    "gpt-3.5-turbo-0125": 1.5e-6,
    "gpt-4-turbo-2024-04-09": 3e-5,
    "gpt-4o-2024-05-13": 1.5e-5,
    "gpt-4o-mini-2024-07-18": 6e-7,
    "claude-3-5-sonnet-20240620": 1.5e-5,
}



In [3]:
# If the question is a multi-choice question and you are unsure which one is correct, you must guess an option.  Please don't ask me any questions and give me the answer in the response.

def call_anthropic_v2(text, image, model, client):
    """Send one user turn, optionally with an image, via ``client.messages.create``.

    Args:
        text: Prompt text for the user message.
        image: Path of an image file to attach, or a falsy value for a
            text-only request.
        model: Model identifier forwarded to ``client.messages.create``.
        client: Object exposing ``messages.create(...)``.

    Returns:
        The value returned by ``client.messages.create``. If anything raises
        (encoding the image or the API call), the exception is printed, the
        function sleeps 5 seconds and returns ``None`` — callers must check
        for ``None`` before reading the response.
    """
    system_messages = "You are a data analyst. I will give  you a background introduction and data analysis question. You must answer the question. "
    # File extension -> media type declared for the base64 image block.
    # Previously every image was labelled "image/jpeg", which mislabels the
    # PNG files that find_jpg_files() also returns.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    try:
        if image:
            extension = os.path.splitext(str(image))[1].lower()
            # Unknown extensions keep the old "image/jpeg" default.
            media_type = media_types.get(extension, "image/jpeg")
            base64_image = encode_image(image)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": media_type,
                                "data": base64_image,
                            },
                        },
                        {"type": "text", "text": text},
                    ],
                }
            ]
        else:
            messages = [{"role": "user", "content": text}]

        response = client.messages.create(
            messages=messages,
            max_tokens=4096,
            model=model,
            temperature=0,
            top_p=1,
            system=system_messages,
        )
        return response
    except Exception as e:
        # Best-effort by design: report, back off briefly, signal failure.
        print(e)
        time.sleep(5)
        return None

In [4]:
# Load the benchmark samples: each line of ./data.json is evaluated into one
# record (later cells read sample["id"] and sample["questions"] from them).
samples = []
with open("./data.json", "r") as f:
    for line in f:
        # NOTE(review): eval() executes whatever Python expression the line
        # contains. Only run this on a trusted data file; if the lines are
        # plain literals, json.loads / ast.literal_eval would be safer —
        # confirm the file format before switching.
        samples.append(eval(line.strip()))
# Cell output: number of samples loaded.
len(samples)

43

In [5]:
def gpt_tokenize(string: str, encoding) -> int:
    """Count the tokens ``encoding.encode`` produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding: Object with an ``encode(str)`` method returning a sized
            sequence of tokens.

    Returns:
        Length of the encoded sequence.
    """
    token_ids = encoding.encode(string)
    return len(token_ids)

def claude_tokenize(string: str, api) -> int:
    """Return ``api.count_tokens(string)``.

    Args:
        string: Text whose tokens should be counted.
        api: Object exposing a ``count_tokens(str)`` method.

    Returns:
        Whatever ``api.count_tokens`` reports for ``string``.
    """
    return api.count_tokens(string)


def find_jpg_files(directory):
    """List the ``.jpg`` / ``.png`` file names found directly in ``directory``.

    The extension check is case-insensitive. Names are returned in sorted
    order: ``os.listdir`` yields entries in arbitrary order, and callers pick
    the first entry, so sorting makes that choice reproducible across runs
    and machines.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A sorted list of matching file names, or ``None`` when there is none.
    """
    image_suffixes = ('.jpg', '.png')
    matches = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(image_suffixes)
    )
    return matches or None

# Function to encode the image
def encode_image(image_path):
    """Read ``image_path`` in binary mode and return its base64 text.

    Args:
        image_path: Path of the file to encode.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode('utf-8')


def find_excel_files(directory):
    """List Excel workbooks in ``directory``, skipping answer files.

    A name matches when, case-insensitively, it ends with ``xlsx``, ``xlsb``
    or ``xlsm`` and does not contain ``answer``. Names are returned in sorted
    order: ``os.listdir`` yields entries in arbitrary order, and the caller
    concatenates the workbooks into the prompt in list order, so sorting
    keeps the prompt reproducible.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A sorted list of matching file names, or ``None`` when there is none.
    """
    excel_suffixes = ('xlsx', 'xlsb', 'xlsm')
    workbooks = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(excel_suffixes) and "answer" not in name.lower()
    )
    return workbooks or None

def read_excel(file_path):
    """Parse every sheet of an Excel workbook.

    The workbook is opened through a context manager so its file handle is
    released as soon as parsing finishes (previously it was never closed).

    Args:
        file_path: Path of the workbook, passed to ``pd.ExcelFile``.

    Returns:
        Dict mapping each sheet name, in workbook order, to the result of
        ``ExcelFile.parse`` for that sheet.
    """
    with pd.ExcelFile(file_path) as workbook:
        return {
            sheet_name: workbook.parse(sheet_name)
            for sheet_name in workbook.sheet_names
        }

def dataframe_to_text(df):
    """Render ``df`` as plain text via ``to_string``, omitting the index.

    Args:
        df: Object exposing ``to_string(index=...)`` (a pandas DataFrame).

    Returns:
        The string produced by ``df.to_string(index=False)``.
    """
    return df.to_string(index=False)

def combine_sheets_text(sheets):
    """Concatenate the text of every sheet into one string.

    Each sheet contributes ``"Sheet name: <name>\\n<text>\\n\\n"``, where
    ``<text>`` comes from ``dataframe_to_text``; sheets appear in the
    mapping's iteration order.

    Args:
        sheets: Mapping of sheet name to sheet data.

    Returns:
        The joined text, or ``""`` for an empty mapping.
    """
    sections = (
        f"Sheet name: {sheet_name}\n{dataframe_to_text(frame)}\n\n"
        for sheet_name, frame in sheets.items()
    )
    return "".join(sections)

def read_txt(path):
    """Return the full contents of the text file at ``path``.

    The file is opened in text mode with the platform's default encoding.

    Args:
        path: Path of the file to read.

    Returns:
        The file contents as a single string.
    """
    with open(path, "r") as handle:
        contents = handle.read()
    return contents

def truncate_text(text, max_tokens=128000):
    """Keep at most the last ``max_tokens`` whitespace-separated words of ``text``.

    "Tokens" here are words produced by ``str.split()``, not model tokens.
    When truncation happens the kept words are re-joined with single spaces,
    so the original whitespace is not preserved; text within the limit is
    returned unchanged.

    Args:
        text: Text to shorten.
        max_tokens: Maximum number of words to keep. Zero or negative keeps
            nothing.

    Returns:
        ``text`` itself if it is within the limit, otherwise the trailing
        ``max_tokens`` words joined by spaces (``""`` for a non-positive limit).
    """
    # Guard first: words[-0:] is the whole list and a negative limit would
    # slice from the wrong end, so the plain slice below is only valid for
    # positive limits.
    if max_tokens <= 0:
        return ""
    words = text.split()
    if len(words) <= max_tokens:
        return text
    return ' '.join(words[-max_tokens:])

In [6]:
import os
import anthropic

# Set the credential and endpoint as environment variables, create the client
# with no explicit arguments, then send one small request as a connectivity
# check.
# NOTE(review): this assignment overwrites any ANTHROPIC_AUTH_TOKEN already
# present in the environment with the placeholder literal below. Replace it
# locally and never commit a real token.
os.environ['ANTHROPIC_AUTH_TOKEN'] = 'your-token-id'
# NOTE(review): the saved output of this cell is a 404
# "Invalid URL (POST /v1/chat/messages)" — verify this base URL against the
# provider's documentation before relying on `client`.
os.environ['ANTHROPIC_BASE_URL'] = 'https://api.xiaoai.plus'
# Presumably picks up the two variables above — confirm against the SDK docs.
client = anthropic.Anthropic()
message = client.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=1000,
    temperature=0,
    system="You are a world-class poet. Respond only with short poems.",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Why is the ocean salty?"
                }
            ]
        }
    ]
)
print(message)

NotFoundError: Error code: 404 - {'error': {'message': 'Invalid URL (POST /v1/chat/messages)', 'type': 'invalid_request_error', 'param': '', 'code': ''}}

In [None]:
# your_api = ""
# client = Anthropic(api_key=your_api)
# model = "claude-3-5-sonnet-20240620"
# from transformers import GPT2TokenizerFast

# tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')


In [None]:
# Smoke test: one text-only request (the prompt is Chinese for "hello"), no image.
# NOTE(review): `model` is not assigned by any active cell in this notebook
# (only inside the commented-out cell above), so this raises NameError on a
# fresh kernel until `model` is defined.
response = call_anthropic_v2("你好",None, model, client)

In [None]:
# Display the result of the smoke-test call; call_anthropic_v2 returns None
# when the request failed.
response

In [None]:


# Resumable evaluation run.
#
# For every sample, build a prompt from its workbooks and introduction, ask the
# model each question, and append one JSON line per answer to
# <save_path>/<sample id>.json. On a re-run, the number of lines already in
# that file is the number of questions skipped.
#
# Requires `model`, `client` and `tokenizer` to be defined by an earlier cell;
# none of them is assigned in this cell.
data_path = './data'
total_cost = 0
# Attempts per question before the run is aborted. Aborting (instead of
# writing a placeholder or moving on) keeps "line N of the file" aligned with
# "question N", which the resume logic depends on.
max_attempts = 5

for sample in tqdm(samples):
    if len(sample["questions"]) == 0:
        continue

    sample_dir = os.path.join(data_path, sample["id"])

    # Optional image: the first match, if any, is attached to every question.
    image = find_jpg_files(sample_dir)
    if image:
        image = os.path.join(sample_dir, image[0])

    # Flatten every workbook of the sample into text.
    excel_content = ""
    excels = find_excel_files(sample_dir)
    if excels:
        for excel in excels:
            excel_file_path = os.path.join(sample_dir, excel)
            sheets = read_excel(excel_file_path)
            combined_text = combine_sheets_text(sheets)
            excel_content += f"The excel file {excel} is: " + combined_text

    introduction = read_txt(os.path.join(sample_dir, "introduction.txt"))
    questions = [
        read_txt(os.path.join(sample_dir, question_name + ".txt"))
        for question_name in sample["questions"]
    ]

    # Shared prompt prefix for all questions of this sample.
    text = ""
    if excel_content:
        text += f"The workbook is detailed as follows. {excel_content} \n"
    text += f"The introduction is detailed as follows. \n {introduction} \n"
    answers = []

    save_path = os.path.join("./modeloff/evaluation/save_process", model)
    os.makedirs(save_path, exist_ok=True)
    answer_file = os.path.join(save_path, sample['id'] + ".json")

    # Resume support: one line per already-answered question.
    line_num = 0
    if os.path.exists(answer_file):
        with open(answer_file, "r") as f:
            line_num = sum(1 for _ in f)
    print(f"existing {line_num} answers")

    # `with` closes the file even when a question ultimately fails.
    with open(answer_file, "a+") as save_f:
        for question in tqdm(questions[line_num:]):
            prompt = text + f"The questions are detailed as follows. \n {question}"

            # Keep only the last (MODEL_LIMITS[model] - 6000) tokens of the
            # prompt; every limit in MODEL_LIMITS is above 6000, so the slice
            # index is negative and trims from the front.
            cut_text = tokenizer.decode(tokenizer.encode(prompt)[6000-MODEL_LIMITS[model]:])

            # Retry inside the loop. Previously the `while True` sat inside the
            # `try`, so a failure left the loop, and the answer of the
            # *previous* question was written for this one (or NameError was
            # raised on the first question).
            ans = None
            for attempt in range(1, max_attempts + 1):
                started_at = time.time()
                try:
                    response = call_anthropic_v2(cut_text, image, model, client)
                    if response is None:
                        # call_anthropic_v2 already printed the underlying error.
                        raise RuntimeError("call_anthropic_v2 returned no response")
                    cost = (response.usage.output_tokens * MODEL_COST_PER_OUTPUT[model]
                            + response.usage.input_tokens * MODEL_COST_PER_INPUT[model])
                    ans = {"id": sample["id"], "model": response.model, "input": response.usage.input_tokens,
                           "output": response.usage.output_tokens, "cost": cost,
                           "time": time.time() - started_at, "response": response.content[0].text}
                    break
                except Exception as e:
                    print(f"error: {e}")
                    if attempt < max_attempts:
                        time.sleep(10)

            if ans is None:
                raise RuntimeError(
                    f"no answer for sample {sample['id']} after {max_attempts} attempts; "
                    "re-run this cell to resume from this question"
                )

            answers.append(ans)
            total_cost += cost
            print("Total cost: ", total_cost)

            # Persist immediately so an interruption loses at most one answer.
            json.dump(ans, save_f)
            save_f.write("\n")
            save_f.flush()

            

In [None]:
# Display the most recent value assigned to `response` (None when the last
# call_anthropic_v2 call failed).
response

In [None]:
# Output-token count of the most recent response (the original line was an
# unfinished `response.usage.o`, which raises AttributeError).
response.usage.output_tokens

In [None]:
# Inspect the most recent response. The field names follow what the evaluation
# cells read from call_anthropic_v2 results (usage.output_tokens,
# usage.input_tokens, content[0].text); the previous completion_tokens /
# prompt_tokens / choices names are not the ones used anywhere else in this
# notebook for these responses.
print(response.usage.output_tokens)
print(response.usage.input_tokens)
print(response.content[0].text)
print(response.model)

print(response)

In [None]:


# Non-resumable evaluation run: for every sample, ask the model each question
# and write all of that sample's answers to <save_path>/<sample id>.json at
# the end, one JSON object per line (the file is overwritten).
#
# NOTE(review): `model`, `client` and `tokenizer` are read here but not
# assigned in this cell; they must already exist in the kernel.
# NOTE(review): this cell largely duplicates the resumable loop earlier in the
# notebook but saves under "./evaluation/save_process" rather than
# "./modeloff/evaluation/save_process" — confirm which location is intended.
data_path = './data'
# model = "gpt-3.5-turbo-0125"
# model = "gpt-4o-mini-2024-07-18"
# total_cost = 65.39828
total_cost = 0
# encoding = tiktoken.encoding_for_model(model)
## record 3
# `id` shadows the builtin of the same name for the rest of the session.
for id in tqdm(range(0, len(samples))):
    # print(sample)
    sample =samples[id]
    if len(sample["questions"]) > 0:
        # First / last question names; `start` is overwritten with a
        # timestamp below and `end` is not read again in this cell.
        start = sample["questions"][0]
        end = sample["questions"][-1]
        # print(start)
        # print(end)
        # Optional image: the first match, if any, is sent with every question.
        image = find_jpg_files(os.path.join(data_path, sample["id"]))
        if image:
            image = os.path.join(data_path, sample["id"], image[0])
        
        # Flatten every workbook of the sample into text.
        excel_content = ""
        excels = find_excel_files(os.path.join(data_path, sample["id"]))
        if excels:
            for excel in excels:
                excel_file_path = os.path.join(data_path,  sample["id"], excel)
                # print(excel_file_path)
                sheets = read_excel(excel_file_path)
                combined_text = combine_sheets_text(sheets)
                excel_content += f"The excel file {excel} is: " + combined_text

        # Each question name in the sample maps to "<name>.txt" in its folder.
        introduction = read_txt(os.path.join(data_path, sample["id"], "introduction.txt"))
        questions = []
        for question_name in sample["questions"]:
            questions.append(read_txt(os.path.join(data_path, sample["id"], question_name+".txt")))
            
        # print(workbooks)
        
        # Shared prompt prefix for all questions of this sample.
        text = ""
        if excel_content:
            text += f"The workbook is detailed as follows. {excel_content} \n"
        text += f"The introduction is detailed as follows. \n {introduction} \n"
        answers = []
        for question in questions:
            prompt = text +  f"The questions are detailed as follows. \n {question}"
        
            # print(len(encoding.encode(prompt)))
            # Slice index is 6000 - limit; for the limits in MODEL_LIMITS it
            # is negative, keeping the last (limit - 6000) tokens of the prompt.
            cut_text = tokenizer.decode(tokenizer.encode(prompt)[6000-MODEL_LIMITS[model]:])
            # print(len(encoding.encode(prompt)))
            # print(prompt)
            # text = truncate_text(text, 20000)
            start = time.time()
            # NOTE(review): call_anthropic_v2 returns None on failure, so the
            # next line raises AttributeError in that case; answers collected
            # so far for this sample are not saved because the file is only
            # written after the question loop.
            response = call_anthropic_v2(cut_text, image, model, client)
            cost = response.usage.output_tokens * MODEL_COST_PER_OUTPUT[model] + response.usage.input_tokens * MODEL_COST_PER_INPUT[model]
            
            answers.append({"id": sample["id"], "model": response.model, "input": response.usage.input_tokens,
                            "output": response.usage.output_tokens, "cost": cost, "time": time.time()-start, "response": response.content[0].text})
            total_cost += cost
            print("Total cost: ", total_cost)
            # time.sleep(60)
            # break
        # Write all answers of this sample at once; mode "w" replaces any
        # existing file for the sample.
        save_path = os.path.join("./evaluation/save_process", model)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with open(os.path.join(save_path, sample['id']+".json"), "w") as f:
            for answer in answers:
                json.dump(answer, f)
                f.write("\n")
        # break
            