In [1]:
import os
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv

from transformers import AutoTokenizer
from openai import OpenAI
import tiktoken
from langchain_openai import ChatOpenAI

from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from src.dataimport import list_files_with_extension_directory, list_files_with_extension, load_text, list_files
from src.llmlib import num_tokens_from_string

# Loading files

In [2]:
TXT_FILES_PATH = 'data/original/brat-project-final/'
JSON_FILES_PATH = 'data/transformed/'

In [None]:
txt_files_directory_list = list_files_with_extension_directory(TXT_FILES_PATH, '.txt')
# txt_files_directory_list

json_files_directory_list = list_files_with_extension_directory(JSON_FILES_PATH, '.json')
# json_files_directory_list

print(f"Anzahl Text-Dateien: {len(txt_files_directory_list)}")
print(f"Anzahl Brat-Dateien: {len(json_files_directory_list)}")

Anzahl Text-Dateien: 402
Anzahl Brat-Dateien: 402


In [4]:
# create dataframe with file names
df = pd.DataFrame()
df['txt_path'] = txt_files_directory_list
df['json_path'] = json_files_directory_list
df['txt_file'] = df['txt_path'].apply(lambda x: os.path.basename(x))
df['json_file'] = df['json_path'].apply(lambda x: os.path.basename(x))
df['txt'] = df['txt_path'].apply(load_text)
df['json'] = df['json_path'].apply(load_text)

print(df.shape)
df.head()

# save to csv
#df.to_csv('dataframe.csv', index=False)
# load dataframe
# df = pd.read_csv('dataframe.csv')
# df.head()

(402, 6)


Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
0,data/original/brat-project-final/essay001.txt,data/transformed/essay001.json,essay001.txt,essay001.json,Should students be taught to compete or to coo...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
1,data/original/brat-project-final/essay002.txt,data/transformed/essay002.json,essay002.txt,essay002.json,More people are migrating to other countries t...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
2,data/original/brat-project-final/essay003.txt,data/transformed/essay003.json,essay003.txt,essay003.json,International tourism is now more common than ...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
3,data/original/brat-project-final/essay004.txt,data/transformed/essay004.json,essay004.txt,essay004.json,International tourism is now more common than ...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
4,data/original/brat-project-final/essay005.txt,data/transformed/essay005.json,essay005.txt,essay005.json,Living and studying overseas\n\nIt is every st...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."


# Train test split

In [5]:
# Split the dataframe into training and test sets
train_df, test_df = train_test_split(df, train_size=40, random_state=42)

# Display the first few rows of the training and test sets
print(f"Training DataFrame: {train_df.shape}")
print(f"\nTest DataFrame: {test_df.shape}")

Training DataFrame: (40, 6)

Test DataFrame: (362, 6)


In [None]:
# sort the dataframes
train_df = train_df.sort_values(by='txt_file')
train_df.head()

Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
20,data/original/brat-project-final/essay021.txt,data/transformed/essay021.json,essay021.txt,essay021.json,Advertisements affects on consumer goods\n\nEv...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
21,data/original/brat-project-final/essay022.txt,data/transformed/essay022.json,essay022.txt,essay022.json,Young people should go to university or not\n\...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
48,data/original/brat-project-final/essay049.txt,data/transformed/essay049.json,essay049.txt,essay049.json,Do modern communication technologies benefit a...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
50,data/original/brat-project-final/essay051.txt,data/transformed/essay051.json,essay051.txt,essay051.json,Universities should give money to sport activi...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
54,data/original/brat-project-final/essay055.txt,data/transformed/essay055.json,essay055.txt,essay055.json,Should teenagers learn all school subjects/foc...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."


In [None]:
test_df = test_df.sort_values(by='txt_file')
test_df.head()

Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
0,data/original/brat-project-final/essay001.txt,data/transformed/essay001.json,essay001.txt,essay001.json,Should students be taught to compete or to coo...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
1,data/original/brat-project-final/essay002.txt,data/transformed/essay002.json,essay002.txt,essay002.json,More people are migrating to other countries t...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
2,data/original/brat-project-final/essay003.txt,data/transformed/essay003.json,essay003.txt,essay003.json,International tourism is now more common than ...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
3,data/original/brat-project-final/essay004.txt,data/transformed/essay004.json,essay004.txt,essay004.json,International tourism is now more common than ...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."
4,data/original/brat-project-final/essay005.txt,data/transformed/essay005.json,essay005.txt,essay005.json,Living and studying overseas\n\nIt is every st...,"{\n ""MajorClaims"": [\n {\n ""ID"": ""MC1..."


# Prompt Templates

In [8]:
BUILDING_BLOCKS_PATH = 'prompts/building-blocks/'
PROMPTS_PATH = 'prompts/final-prompts/'

list_files(BUILDING_BLOCKS_PATH)

['chain-of-thought.txt',
 'output-structure.txt',
 'persona.txt',
 'task-description.txt']

In [9]:
# zero-shot prompt
task_description = load_text(BUILDING_BLOCKS_PATH + 'task-description.txt')
persona = load_text(BUILDING_BLOCKS_PATH + 'persona.txt')
cot = load_text(BUILDING_BLOCKS_PATH + 'chain-of-thought.txt')
output_structure = load_text(BUILDING_BLOCKS_PATH + 'output-structure.txt')

## Zero Shot (ZS)

In [10]:
zs = task_description
zs_persona = persona + task_description
zs_cot = task_description + '\n' + cot
zs_persona_cot = persona + task_description + '\n' + cot

# save prompts to files
with open(PROMPTS_PATH + 'zero-shot.txt', 'w') as f:
    f.write(zs)

with open(PROMPTS_PATH + 'zero-shot-persona.txt', 'w') as f:
    f.write(zs_persona)

with open(PROMPTS_PATH + 'zero-shot-cot.txt', 'w') as f:
    f.write(zs_cot)

with open(PROMPTS_PATH + 'zero-shot-persona-cot.txt', 'w') as f:
    f.write(zs_persona_cot)

## One-Shot (OS)

In [11]:
# one-shot prompt - 1 example from the training set
examples_1 = train_df.sample(1, random_state=42)

# extract the text and json from the row
os_txt = examples_1['txt'].values[0]
os_json = examples_1['json'].values[0]
os_example = f"## Input:\n{os_txt}\n## Output:\n{os_json}"

os = task_description + 'Here is one example of a text and its corresponding json data:\n' + os_example
os_persona = persona + task_description + '\n' + os_example
os_cot = task_description + '\n' + cot + '\n' + os_example
os_persona_cot = persona + task_description + '\n' + cot + '\n' + os_example

# save the prompts to files
with open(PROMPTS_PATH + 'one-shot.txt', 'w') as f:
    f.write(os)

with open(PROMPTS_PATH + 'one-shot-persona.txt', 'w') as f:
    f.write(os_persona)

with open(PROMPTS_PATH + 'one-shot-cot.txt', 'w') as f:
    f.write(os_cot)

with open(PROMPTS_PATH + 'one-shot-persona-cot.txt', 'w') as f:
    f.write(os_persona_cot)

## Few-Shot (FS)

## Test mit LangChain FewshotPromptTemplate
M.E nicht mehr notwendig, da bereits eigener Weg gefunden wurde um Modell Template zu erstellen.

In [71]:
# results = []

# for idx, row in examples_10.iterrows():
#     input = row['txt']
#     output = row['json']
#     results.append({'input': input, 'output': output})

# # save the results in a dataframe
# examples_df = pd.DataFrame(results)
# examples_df

In [72]:
# Beipsiele als Input-Output-Liste
# examples_list = [f"## Input: {row['txt']} \n## Output: {row['json']}" for idx, row in examples_10.iterrows()] 
# examples_list

In [73]:
# example_str_1, example_str_2, example_str_3, example_str_4, example_str_5, example_str_6, example_str_7, example_str_8, example_str_9, example_str_10 = examples_list
# print(example_str_1)

In [74]:
# from langchain_core.prompts import FewShotChatMessagePromptTemplate

# zero_shot = examples_df[examples_df['prompt_file'] == 'zero-shot']['prompt'].values[0] 

# # examples = [
# #     {"input": row['txt'], "output": row['json']} for idx, row in examples_10.iterrows()
# # ]
# example_prompt = ChatPromptTemplate.from_messages(
#     [('user', '{input}'), ('assistent', '{output}')] # user, system,
#     )

# few_shot_prompt = FewShotChatMessagePromptTemplate(
#     examples=examples_list,
#     # This is a ormpt template used to format each individual example
#     example_prompt=example_prompt,
#     # prefix = ""
#     # suffix = "Text: {input}\n Output:",
#     # input_variable_names = ['input'],
# )

# final_prompt = ChatPromptTemplate.from_messages(
#     [
#         ("system", zero_shot),
#         few_shot_prompt,
#         ("user", '{input}'),
#     ]
# )

# print(final_prompt.format(input=test_df_sample['txt'][25]))


# Quelle: https://python.langchain.com/api_reference/core/prompts/langchain_core.prompts.few_shot.FewShotChatMessagePromptTemplate.html

In [75]:
# # # invoke the chain
# few_shot_answer = final_prompt.invoke({"input": test_df_sample['txt'][25]})
# print(few_shot_answer)

## FS 10 - 40

In [12]:
# few-shot prompt - 10 examples from the training set
examples_10 = train_df.sample(10, random_state=42)

few_shot_examples_10 = f"\nHere are 10 examples of text and their corresponding json data:\n" # adding task description to the beginning of the prompt
example_counter = 1
for idx, row in examples_10.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_examples_10 += example_str
    example_counter += 1

fs = task_description + few_shot_examples_10
fs_persona = persona + task_description + few_shot_examples_10
fs_cot = task_description + '\n' + cot + few_shot_examples_10
fs_persona_cot = persona + task_description + '\n' + cot + few_shot_examples_10

# save the prompts to files
with open(PROMPTS_PATH + 'few-shot-10.txt', 'w') as f:
    f.write(fs)

with open(PROMPTS_PATH + 'few-shot-10-persona.txt', 'w') as f:
    f.write(fs_persona)

with open(PROMPTS_PATH + 'few-shot-10-cot.txt', 'w') as f:
    f.write(fs_cot)

with open(PROMPTS_PATH + 'few-shot-10-persona-cot.txt', 'w') as f:
    f.write(fs_persona_cot)

In [15]:
# few-shot prompt - 20 examples from the training set
examples_20 = train_df.sample(20, random_state=42)

few_shot_examples_20 = f"\nHere are 20 examples of text and their corresponding json data:\n" # adding task description to the beginning of the prompt
example_counter = 1
for idx, row in examples_20.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_examples_20 += example_str
    example_counter += 1

fs = task_description + few_shot_examples_20
fs_persona = persona + task_description + few_shot_examples_20
fs_cot = task_description + '\n' + cot + few_shot_examples_20
fs_persona_cot = persona + task_description + '\n' + cot + few_shot_examples_20

# save the prompts to files
with open(PROMPTS_PATH + 'few-shot-20.txt', 'w') as f:
    f.write(fs)

with open(PROMPTS_PATH + 'few-shot-20-persona.txt', 'w') as f:
    f.write(fs_persona)

with open(PROMPTS_PATH + 'few-shot-20-cot.txt', 'w') as f:
    f.write(fs_cot)

with open(PROMPTS_PATH + 'few-shot-20-persona-cot.txt', 'w') as f:
    f.write(fs_persona_cot)

In [16]:
# few-shot prompt - 40 examples from the training set
examples_40 = train_df.sample(40, random_state=42)

few_shot_str_40 = f"\nHere are 40 examples of text and their corresponding json data:\n" # adding task description to the beginning of the prompt
example_counter = 1
for idx, row in examples_40.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_str_40 += example_str
    example_counter += 1

fs = task_description + few_shot_str_40
fs_persona = persona + task_description + few_shot_str_40
fs_cot = task_description + '\n' + cot + few_shot_str_40
fs_persona_cot = persona + task_description + '\n' + cot + few_shot_str_40

# save the prompts to files
with open(PROMPTS_PATH + 'few-shot-40.txt', 'w') as f:
    f.write(fs)

with open(PROMPTS_PATH + 'few-shot-40-persona.txt', 'w') as f:
    f.write(fs_persona)

with open(PROMPTS_PATH + 'few-shot-40-cot.txt', 'w') as f:
    f.write(fs_cot)

with open(PROMPTS_PATH + 'few-shot-40-persona-cot.txt', 'w') as f:
    f.write(fs_persona_cot)

# list prompt files

In [17]:
# Eigentlich wird die os-Bibliothek bereits obengeladen. Da es aber vereinzelt zu Fehlermeldungen kam, wird sie hier nochmals geladen.
import os

prompt_files_directory_list = list_files_with_extension_directory(PROMPTS_PATH, '.txt')
prompt_files_directory_list
prompt_files_list = [os.path.basename(x) for x in prompt_files_directory_list]
# remove the .txt extension
prompt_names = [x.split('.')[0] for x in prompt_files_list]

prompt_df = pd.DataFrame()
# get the file name without the extension from prompt_files, 'str' object has no attribute 'path'
prompt_df['prompt_name'] = prompt_names
prompt_df['prompt_txt'] = prompt_files_directory_list
prompt_df['prompt_txt'] = prompt_df['prompt_txt'].apply(load_text)
print(F"Es gibt {prompt_df.shape[0]} Prompts")
#prompt_df

Es gibt 20 Prompts


# Berechnung der Tokenanzahl
Todo:
- explain how to get access to the model
- explain how to get Hugging Face token

## Hugging Face
Kann entfallen

In [80]:
# # get the API key from the .env file
# load_dotenv() 
# llama_api = os.getenv("HUGGINGFACE_TOKEN")

# #TO ggf. anderes Modell als Tokenizer verwenden, bspw. passend zum verwendeten Modell GPT-4o-mini
# model_id = "meta-llama/Llama-3.2-3B-Instruct"
# # model_id = "meta-llama/Llama-3.3-70B-Instruct" # requires HugginFace Pro subscription

In [81]:
# Function to calculate token count
# def calculate_token_count(prompt):
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     tokenized_prompt = tokenizer(prompt, return_tensors='pt') # pt for PyTorch tensors
#     return tokenized_prompt.input_ids.size(1)

# # Apply the function to the 'prompt' column and create a new column 'token_count'
# prompt_df['token_count_hf'] = prompt_df['prompt_txt'].apply(calculate_token_count)

# prompt_df = prompt_df.sort_values(by='token_count_hf')
# prompt_df

## Tokenanzahl bestimmen mit Tiktoken
Open-Source Tokenizer von OpenAI. Schneller als über AutoTokenizer von Hugging Face mit Llama 3.2-B

In [18]:
model = 'gpt-4o-mini'

# Beispiel zur Nachvollziehbarkeit der Tokenisierung
encoding = tiktoken.encoding_for_model(model)
print(encoding)

sample_txt = "This is a sample text."
# Get the token count for the sample text
token_integer = encoding.encode(sample_txt) # mit .decode() kann der Text wieder dekodiert werden. 
token_bytes = [encoding.decode_single_token_bytes(token) for token in token_integer] # Integer Token können mit wiederum in Bytes umgewandelt werden, die sie repräsentieren.
print(f"Beispieltext: {sample_txt}")
print(f"Encodierter Text (Integer): {token_integer}")
print(f"Encodierter Text (Bytes): {token_bytes}")


count_tokens = num_tokens_from_string(sample_txt, model)
print(f"Anzahl Tokens: {count_tokens}")

# Quelle: https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken

<Encoding 'o200k_base'>
Beispieltext: This is a sample text.
Encodierter Text (Integer): [2500, 382, 261, 10176, 2201, 13]
Encodierter Text (Bytes): [b'This', b' is', b' a', b' sample', b' text', b'.']
Anzahl Tokens: 6


In [19]:
prompt_df['token_count'] = prompt_df['prompt_txt'].apply(num_tokens_from_string, model_name=model)
prompt_df = prompt_df.sort_values(by='token_count')
prompt_df['max_lines_jsonl'] = round(20_000_000 / prompt_df['token_count'], 0) # Maximale Anzahl an Tokens pro JSONL-Datei (enqueued tokens)
prompt_df['#batches'] = round(test_df.shape[0] / prompt_df['max_lines_jsonl'], 2) # Anzahl der Batches, die für die Verarbeitung des Testdatensatzes benötigt werden
prompt_df

Unnamed: 0,prompt_name,prompt_txt,token_count,max_lines_jsonl,#batches
19,zero-shot,You will be given a text. Extract the argument...,82,243902.0,0.0
18,zero-shot-persona,You are a expert in Argument Mining and theref...,105,190476.0,0.0
16,zero-shot-cot,You will be given a text. Extract the argument...,480,41667.0,0.01
17,zero-shot-persona-cot,You are a expert in Argument Mining and theref...,503,39761.0,0.01
15,one-shot,You will be given a text. Extract the argument...,1780,11236.0,0.03
14,one-shot-persona,You are a expert in Argument Mining and theref...,1790,11173.0,0.03
12,one-shot-cot,You will be given a text. Extract the argument...,2166,9234.0,0.04
13,one-shot-persona-cot,You are a expert in Argument Mining and theref...,2189,9137.0,0.04
3,few-shot-10,You will be given a text. Extract the argument...,13848,1444.0,0.25
2,few-shot-10-persona,You are a expert in Argument Mining and theref...,13871,1442.0,0.25


## Schätzung der anfallenden Kosten

In [20]:
#TODO: Hier fehlen die Token für die Essays, die zusätzlich zum Input-Tokenanzahl anfallen.
# Schätzung der anfallenden Kosten anhand der Tokenanzahl
prompt_token_sum = prompt_df['token_count'].sum()
print(f"Input-Token:\nDie Summe der Input-Tokenanzahl aller Prompts beträgt: {prompt_token_sum:,} Tokens")
test_token_sum = prompt_token_sum * test_df.shape[0]
print(f"Multipliziert mit der Anzahl der Testdurchläufe ergibt das: {test_token_sum:,} Tokens für Input-Token")
max_enqueued_tokens = 20_000_000 # Maximale Anzahl an Tokens, die von der Batch API
print(f"Das entspicht bei einer maximalen Anzahl von {max_enqueued_tokens:,} Tokens pro JSONL-Datei: {test_token_sum/20_000_000:.2f} Batches") 
input_token_price = 0.15 # input token price per 1 Mio tokens
output_token_price = 0.6 # output token price per 1 Mio tokens
input_token_cost = input_token_price * test_token_sum/1_000_000
print(f"Die Kosten für die Input-Tokens betragen: {input_token_cost:.2f} $") 
#TODO: Ggf. wäre es genauer die JSON-Dateien zu verwenden
min_output_token_count = 300 # aufgerundeter Minimalwert für Tokenanzahl für ann-Dateien aus EDA. Umfang der Ausgabe des LLMs kann auch außerhalb des Bereichs liegen. 
max_output_token_count = 1_100 # aufgerundeter Maximalwert für Tokenanzahl für ann-Dateien aus EDA
min_output_token_sum = min_output_token_count * test_df.shape[0]
max_output_token_sum = max_output_token_count * test_df.shape[0]
print(f"\nOutput-Token:\nDie Output-Tokenanzahl multipliziert mit der Anzahl der Aufsätze im Testdatensaatz beträgt zwischen: {min_output_token_sum:,} und {max_output_token_sum:,} Tokens")
min_output_token_cost = output_token_price * min_output_token_sum/1_000_000
max_output_token_cost = output_token_price * max_output_token_sum/1_000_000
print(f"Die Kosten für die Output-Tokens betragen zwischen: {min_output_token_cost:.2f} $ und {max_output_token_cost:.2f} $")
total_cost_min = input_token_cost + min_output_token_cost
total_cost_max = input_token_cost + max_output_token_cost
print(f"\nGesamt:\nDie Gesamtkosten liegen schätzungsweise in einem Bereich von {total_cost_min:.2f} $ und {total_cost_max:.2f} $")
print(f"Bei der Anwendung der Batch-API gibt es einen Rabatt von 50% auf die Tokenpreise. Damit würden die Kosten zwischen {total_cost_min/2:.2f} $ und {total_cost_max/2:.2f} $ liegen.")

# Quelle für Tokenpreise: https://openai.com/api/pricing/

Input-Token:
Die Summe der Input-Tokenanzahl aller Prompts beträgt: 393,935 Tokens
Multipliziert mit der Anzahl der Testdurchläufe ergibt das: 142,604,470 Tokens für Input-Token
Das entspicht bei einer maximalen Anzahl von 20,000,000 Tokens pro JSONL-Datei: 7.13 Batches
Die Kosten für die Input-Tokens betragen: 21.39 $

Output-Token:
Die Output-Tokenanzahl multipliziert mit der Anzahl der Aufsätze im Testdatensaatz beträgt zwischen: 108,600 und 398,200 Tokens
Die Kosten für die Output-Tokens betragen zwischen: 0.07 $ und 0.24 $

Gesamt:
Die Gesamtkosten liegen schätzungsweise in einem Bereich von 21.46 $ und 21.63 $
Bei der Anwendung der Batch-API gibt es einen Rabatt von 50% auf die Tokenpreise. Damit würden die Kosten zwischen 10.73 $ und 10.81 $ liegen.


## Problem mit Llama 3.2-3B

Sofern die Summe aus Input und Output Token die Grenze von 4096 Token überschreiten landet die Abfrage im folgenden Error:
"""
422 Client Error: Unprocessable Entity for url: https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct

Input validation error: `inputs` tokens + `max_new_tokens` must be <= 4096. Given: 12479 `inputs` tokens and 256 `max_new_tokens`
"""

Die Verwendung von Speicher (Memory) um die Anzahl der Tokens pro Anfrage zu reduzieren und das Kontext-Fenster des LLM auszunutzen, hat nicht funktioniert und landet im gleichen Error.

```python	
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain

memory = ConversationBufferMemory(size=10)
buffer = ConversationChain(llm= llm, memory=memory)
buffer.invoke(intro_text)
buffer.invoke(example_str_1)
buffer.invoke(example_str_2)
buffer.invoke(example_str_3)
buffer.invoke(example_str_4)
buffer.invoke("Text: " test_text)

buffer.get_memory()
```

Laut Forenbeiträgen ist das ein Limit von der Hugging Face API (Quelle: https://huggingface.co/spaces/huggingchat/chat-ui/discussions/430). Ein Test mit Google Collab, bei dem das Modell heruntergeladen wurde anstatt die HuggingFace API zu verwenden, hat mit 6082 Input Tokens funktioniert. 


LangChain unterstützt die anwendung der Batch API nicht. 

# LLM Abfrage

In [2]:
load_dotenv()
openai_api = os.getenv("OPENAI_API_KEY")
                       
client = OpenAI(api_key=openai_api)

## Strukturierte Ausgabe des LLMs

In [86]:
# # structured output 
# from pydantic import BaseModel, Field
# from typing import List, Dict

# class ArgumentRelation(BaseModel):
#     """Argumentative relation between the origin and target"""
#     Origin: str = Field(description="ID of the origin (e.g.Claim or Premise)")
#     Relation: str = Field(description="Type of relation (e.g., 'For', 'Against', 'Support', 'Attack')")
#     Target: str = Field(description="ID of the target (e.g., MajorClaim, Claim or Premise)")

# class ArgumentMiningExtraction(BaseModel):
#     """Extraction of argument components and relations from a text"""
#     MajorClaims: Dict[str, str] = Field(description="Dictionary of major claims with their IDs as keys and text as values")
#     Claims: Dict[str, str] = Field(description="Dictionary of claims with their IDs as keys and text as values")
#     Premises: Dict[str, str] = Field(description="Dictionary of premises with their IDs as keys and text as values")
#     ArgumentativeRelations: List[ArgumentRelation] = Field(description="List of Dictionaries containing the argumentative relations between origin and target")


# response_format = ArgumentMiningExtraction.model_json_schema()


# response_format

# landet weiterhin im error: 
# "response": {"status_code": 400, "request_id": "249a9d8849ab386154c2eb12f27b0a19", "body": {"error": {"message": "Invalid schema for response_format 'ArgumentMiningExtraction': In context=(), 'additionalProperties' is required to be supplied and to be false.", "type": "invalid_request_error"
    
# Quellen Structured Outputs:
# - https://platform.openai.com/docs/guides/structured-outputs
# - https://cookbook.openai.com/examples/structured_outputs_intro
# - https://python.langchain.com/docs/concepts/structured_outputs/
# - https://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html

In [87]:
# response_format2 = {
#     "type": "object",
#     "properties": {
#         "MajorClaims": {
#             "type": "object",
#             "additionalProperties": {
#                 "type": "string"
#             }
#         },
#         "Claims": {
#             "type": "object",
#             "additionalProperties": {
#                 "type": "string"
#             }
#         },
#         "Premises": {
#             "type": "object",
#             "additionalProperties": {
#                 "type": "string"
#             }
#         },
#         "ArgumentativeRelations": {
#             "type": "array",
#             "items": {
#                 "type": "object",
#                 "properties": {
#                     "Origin": {
#                         "type": "string"
#                     },
#                     "Relation": {
#                         "type": "string",
#                         "enum": ["for", "against", "supports", "attacks"]
#                     },
#                     "Target": {
#                         "type": "string"
#                     }
#                 },
#                 "required": ["Origin", "Relation", "Target"],
#                 "additionalProperties": False
#             }
#         }
#     },
#     "required": ["MajorClaims", "Claims", "Premises", "ArgumentativeRelations"],
#     "additionalProperties": False
# }

# response_format2

In [88]:
# from openai.lib._pydantic import to_strict_json_schema

# response_format = to_strict_json_schema(ArgumentMiningExtraction)
# response_format

# Ansatz von: https://community.openai.com/t/structured-outputs-with-batch-processing/911076/6

## standard-api

In [89]:
# llm_seed = 123
# # Llama via HuggingFaceAPI
# # max_new_tokens = 1024  # standard 512. Orientiert an der Tokenanzahl der JSON-Dateien (Ground-Truth) 
# # llm = HuggingFaceEndpoint(repo_id=model_id,
# #                           huggingfacehub_api_token=llama_api,
# #                           max_new_tokens=max_new_tokens,
# #                           max_input_tokens=1024,
# #                           #top_k=, # standard None
# #                           #top_p=, # standard 0.95
# #                           temperature=0.1, # standard 0.8
# #                           )

# llm = ChatOpenAI(
#     model="gpt-4o-mini",
#     #max_tokens=1024,
#     #max_tokens_input=1024,
#     # timeout=None,
#     # max_retries=2,
#     api_key=openai_api,
#     temperature=0,
#     seed=llm_seed,
#     # system_fingerprint will be returned in the response
#     model_kwargs={"response_format": ArgumentMiningExtraction}
# )

# Quelle Verwendung OpenAI via LangChain: https://python.langchain.com/docs/integrations/chat/openai/
# Quelle Reproduzierbarkeit von LLM-Ausgaben: https://cookbook.openai.com/examples/reproducible_outputs_with_the_seed_parameter

In [90]:
# max_input_tokens = 4096 - max_new_tokens
# print(f"Der Input darf die Tokenanzahl von {max_input_tokens} Token nicht überschreiten.")

In [91]:
# template
# prompt_template = ChatPromptTemplate.from_messages(
#     [
#         ("system", "{system_message}"),
#         ("user", "Text: {argument_text}"),
#     ]
# )

# # output_parser = StrOutputParser() # turns the output into a string 

# # combine the prompt template, llm and output parser
# llm_chain = prompt_template | llm #| output_parser

# # invoke the chain
# one_shot_answer = llm_chain.invoke({"system_message": one_shot,
#                            "argument_text": test_df_sample[0]})
# print(one_shot_answer)

In [92]:
# das würde über die normale API gehen, aber nicht über die Batch-API

# # dataframe to store the input and output of the llm chain
# results_df = pd.DataFrame()

# # iterate over the zero-shot prompt dataframe
# for _, prompt_row in zero_shot_df.iterrows():
#     # iterate over the  dataframe with the test data
#     for _, test_df_row in test_df.iterrows():
#         # invoke the chain
#         try: 
#             answer = llm_chain.invoke({"system_message": prompt_row['prompt'],
#                                        "argument_text": test_df_row['txt']})
#             # store the input and output in the dataframe
#             new_row = pd.DataFrame({'prompt_file': [prompt_row['prompt_file']],
#                                     'txt_file': [test_df_row['txt_file']],
#                                     'json_file': [test_df_row['json_file']],
#                                     'ground_truth': [test_df_row['json']],
#                                     'answer': [answer]
#                                     })
#             results_df = pd.concat([results_df, new_row], ignore_index=True)
#         except Exception as e: # catch errors like HTTPError, HfHubHTTPError
#             new_row = pd.DataFrame({'prompt_file': [prompt_row['prompt_file']],
#                                     'txt_file': [test_df_row['txt_file']],
#                                     'json_file': [test_df_row['json_file']],
#                                     'ground_truth': [test_df_row['json']],
#                                     'answer': e})
#             results_df = pd.concat([results_df, new_row], ignore_index=True)
#         print(f"Finished {test_df_row['txt_file']} with prompt {prompt_row['prompt_file']}")

# # save the results to a csv file
# results_df.to_csv('results.csv', header=True index=False)


# # iterate over the one-shot prompt dataframe
# for _, prompt_row in one_shot_df.iterrows():
#     # iterate over the  dataframe with the test data
#     for _, test_df_row in test_df.iterrows():
#         # invoke the chain
#         try: 
#             answer = llm_chain.invoke({"system_message": prompt_row['prompt'],
#                                        "argument_text": test_df_row['txt']})
#             # store the input and output in the dataframe
#             new_row = pd.DataFrame({'prompt_file': [prompt_row['prompt_file']],
#                                     'txt_file': [test_df_row['txt_file']],
#                                     'json_file': [test_df_row['json_file']],
#                                     'ground_truth': [test_df_row['json']],
#                                     'answer': [answer]
#                                     })
#             results_df = pd.concat([results_df, new_row], ignore_index=True)
#         except Exception as e: # catch errors like HTTPError, HfHubHTTPError
#             new_row = pd.DataFrame({'prompt_file': [prompt_row['prompt_file']],
#                                     'txt_file': [test_df_row['txt_file']],
#                                     'json_file': [test_df_row['json_file']],
#                                     'ground_truth': [test_df_row['json']],
#                                     'answer': e})
#             results_df = pd.concat([results_df, new_row], ignore_index=True)
#         print(f"Finished {test_df_row['txt_file']} with prompt {prompt_row['prompt_file']}")

# # append the results to the csv file
# results_df.to_csv('results.csv', mode='a', header=False, index=False) # mode='a' for append



# # Create a batch api request for the openai api to reduce costs
# {custom_id: 'request_1', prompt: 'prompt_1', text: 'text_1'},


# test buffer memory und sequential chain

### sequential chain prompt

In [93]:
# from langchain.chains import SequentialChain
# from langchain.chains import LLMChain

# template = """
# You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.

# # Example
# ## Input:\nShould students be required to attend classes?\n\nThe issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes because it improves the quality of student life as well as their learning motivation and teaches important life skills.\nFirstly, a liberal policy is very feasible. Checking for attendance requires a lot of bureaucracy. Especially for larger classes it is impossible to check for everybody to attend. So an optional attendance saves time and money.\nSecondly, some students might learn better at home on their own, for instance, by reading the textbook. This problem occurs especially if the lecturer is lethargic. By letting students choose not to attend class you give them the opportunity to escape bad teaching. Thus they are able to save precious study time and dive into the course syllabus independently.\nIn addition, being free to stay away from classes improves flexibility and therefore quality of student life. Sometimes the wild party on Thursday night is too good to end already at midnight only because of a lecture on Friday in the morning. With a liberal policy students are able to postpone the learning to the afternoon which gives a feeling of freedom and improves time efficiency. Research has shown that the more satisfied the students are with those life aspects, the better they perform in academic areas.\nFinally, psychology knows two types of motivation. There is intrinsic motivation which comes from your own mindset. And there is extrinsic motivation which comes from the praise and laud of other people. Intrinsic motivation is known to be much more desirable because it leads to better learning and well-being. However, in order to gain intrinsic motivation students need to become aware of their strengths and aims. By giving students the freedom to choose about class attendance they might rather be thinking about why they decided to study and learn to motivate themselves. These are crucial skills for the duration of their study and their whole life time.\nTo conclude, it is clear that going to classes should be optional for students. I hold this belief due to the improvement of students current experience as well as the valuable skills and knowledge they obtain for their whole life afterwards.
# ## Output:\n{{\n  'MajorClaims': {{\n    'MC1': 'students should be free not to attend classes',\n    'MC2': 'it is clear that going to classes should be optional for students'\n  }},\n  'Claims': {{\n    'C1': 'it improves the quality of student life as well as their learning motivation and teaches important life skills',\n    'C2': 'I hold this belief due to the improvement of students\' current experience as well as the valuable skills and knowledge they obtain for their whole life afterwards',\n    'C3': 'some students might learn better at home on their own, for instance, by reading the textbook',\n    'C4': 'being free to stay away from classes improves flexibility and therefore quality of student life',\n    'C5': 'By giving students the freedom to choose about class attendance they might rather be thinking about why they decided to study and learn to motivate themselves'\n  }},\n  'Premises': {{\n    'P1': 'This problem occurs especially if the lecturer is lethargic',\n    'P2': 'By letting students choose not to attend class you give them the opportunity to escape bad teaching',\n    'P3': 'they are able to save precious study time and dive into the course syllabus independently',\n    'P4': 'Sometimes the wild party on Thursday night is too good to end already at midnight only because of a lecture on Friday in the morning',\n    'P5': 'With a liberal policy students are able to postpone the learning to the afternoon which gives a feeling of freedom and improves time efficiency',\n    'P6': 'Research has shown that the more satisfied the students are with those life aspects, the better they perform in academic areas',\n    'P7': 'Intrinsic motivation is known to be much more desirable because it leads to better learning and well-being',\n    'P8': 'These are crucial skills for the duration of their study and their whole life time',\n    'P9': 'in order to gain intrinsic motivation students need to become aware of their strengths and aims'\n  }},\n  'ArgumentativeRelations': [\n    {{\n      'Claim': 'C1',\n      'Relation': 'For',\n      'Target': 'MC'\n    }},\n    {{\n      'Claim': 'C2',\n      'Relation': 'For',\n      'Target': 'MC'\n    }},\n    {{\n      'Claim': 'C3',\n      'Relation': 'For',\n      'Target': 'MC'\n    }},\n    {{\n      'Claim': 'P2',\n      'Relation': 'supports',\n      'Target': 'P3'\n    }},\n    {{\n      'Claim': 'P3',\n      'Relation': 'supports',\n      'Target': 'C3'\n    }},\n    {{\n      'Claim': 'P1',\n      'Relation': 'supports',\n      'Target': 'C3'\n    }},\n    {{\n      'Claim': 'C4',\n      'Relation': 'For',\n      'Target': 'MC'\n    }},\n    {{\n      'Claim': 'P4',\n      'Relation': 'supports',\n      'Target': 'C4'\n    }},\n    {{\n      'Claim': 'P5',\n      'Relation': 'supports',\n      'Target': 'C4'\n    }},\n    {{\n      'Claim': 'P6',\n      'Relation': 'supports',\n      'Target': 'C4'\n    }},\n    {{\n      'Claim': 'C5',\n      'Relation': 'For',\n      'Target': 'MC'\n    }},\n    {{\n      'Claim': 'P9',\n      'Relation': 'supports',\n      'Target': 'C5'\n    }},\n    {{\n      'Claim': 'P7',\n      'Relation': 'supports',\n      'Target': 'C5'\n    }},\n    {{\n      'Claim': 'P8',\n      'Relation': 'supports',\n      'Target': 'C5'\n    }}\n  ]\n}}

# Text: {{Text}}
# """
# template = template.replace('{', '{{').replace('}', '}}')

# prompt_template = PromptTemplate(template=template, input_variables=['Text'])

# first_chain = prompt_template | llm | output_parser

# template2 = """{input}
# Refine this output by looking at these examples: """ + sample_str.replace('{', '{{').replace('}', '}}')

# prompt_template2 = PromptTemplate(template=template2, input_variables=['input'])
# second_chain = prompt_template2 | llm | output_parser 

# # invoke the chain
# first_chain_answer = first_chain.invoke({"Text": test_df_sample['txt'][25]})
# print(first_chain_answer)

# second_chain_answer = second_chain.invoke({"input": first_chain_answer})
# print(second_chain_answer)

### buffer memory

In [94]:
# intro_text = task_description + " You will be given a some examples of text input and the corresponding JSON output. Wait with your answer until you will be given a text to analyze."
# #intro_text
# example_text = "Here is an example: "
# example_str_1, example_str_2, example_str_3, example_str_4, example_str_5, example_str_6, example_str_7, example_str_8, example_str_9, example_str_10 = examples
# print(example_str_1)



In [95]:
# from langchain.memory import ConversationBufferMemory
# from langchain.chains import ConversationChain

# memory = ConversationBufferMemory(size=10)
# buffer = ConversationChain(llm= llm, memory=memory)
# buffer.invoke(intro_text)
# buffer.invoke(example_text + example_str_1)
# buffer.invoke(example_text + example_str_2)
# buffer.invoke(example_text + example_str_3)
# buffer.invoke(example_text + example_str_4)
# # buffer.invoke(example_text + example_str_5)
# #buffer.invoke("Text: " + test_df_sample['txt'][25])

In [96]:
#"Text: " + test_df_sample['txt'][25]

In [97]:
#buffer.memory.chat_memory.messages

In [98]:
# buffer_chain = prompt_template | buffer | output_parser

# Batch API anfragen 
Langchain hat bisher noch keine Möglichkeit die Batch API von OpenAI zu verwenden, sondern nur über die Standard API. Dies geht unter anderem aus dem folgenden Forenbeitrag hervor (Stand 03.01.24): https://github.com/langchain-ai/langchain/discussions/21643

Um die Kosten der Anfragen weiter zu reduzieren wurde der ursprüngliche Ansatz über LangChain verworfen.

In [22]:
# Ppromp_df in 4 Teile aufteilen, damit die Prompts nacheinander an das Modell übergeben werden können.
# Für den Fall, dass es zu Fehlermeldungen kommen sollte, bspw. aufgrund Tokenanzahl, muss man so nicht von vorne beginnen und reduziert eventuell anfallende Mehrkosten.
zs_prompt_df = prompt_df[prompt_df['prompt_name'].str.contains('zero-shot')]
os_prompt_df = prompt_df[prompt_df['prompt_name'].str.contains('one-shot')]
fs10_prompt_df = prompt_df[prompt_df['prompt_name'].str.contains('few-shot-10')]
fs20_prompt_df = prompt_df[prompt_df['prompt_name'].str.contains('few-shot-20')]
fs40_prompt_df = prompt_df[prompt_df['prompt_name'].str.contains('few-shot-40')]

print(f"{zs_prompt_df[['prompt_name', 'token_count', '#batches']]}")
print(f"\n{os_prompt_df[['prompt_name', 'token_count', '#batches']]}")
print(f"\n{fs10_prompt_df[['prompt_name', 'token_count', '#batches']]}")
print(f"\n{fs20_prompt_df[['prompt_name', 'token_count', '#batches']]}")
print(f"\n{fs40_prompt_df[['prompt_name', 'token_count', '#batches']]}")

              prompt_name  token_count  #batches
19              zero-shot           82      0.00
18      zero-shot-persona          105      0.00
16          zero-shot-cot          480      0.01
17  zero-shot-persona-cot          503      0.01

             prompt_name  token_count  #batches
15              one-shot         1780      0.03
14      one-shot-persona         1790      0.03
12          one-shot-cot         2166      0.04
13  one-shot-persona-cot         2189      0.04

               prompt_name  token_count  #batches
3              few-shot-10        13848      0.25
2      few-shot-10-persona        13871      0.25
0          few-shot-10-cot        14247      0.26
1  few-shot-10-persona-cot        14270      0.26

               prompt_name  token_count  #batches
7              few-shot-20        27681      0.50
6      few-shot-20-persona        27704      0.50
4          few-shot-20-cot        28080      0.51
5  few-shot-20-persona-cot        28103      0.51

           

In [23]:
# count rows in the dataframes
zs_rows = zs_prompt_df.shape[0]
os_rows = os_prompt_df.shape[0]
fs10_rows = fs10_prompt_df.shape[0]
fs20_rows = fs20_prompt_df.shape[0]
fs40_rows = fs40_prompt_df.shape[0]

# calculate the amount of combinations to be processed if all prompts are used for all test essays
combinations = test_df.shape[0] * (zs_rows + os_rows + fs10_rows + fs20_rows + fs40_rows)
print(f"Es gibt insgesamt {combinations} Kombinationen, die verarbeitet werden müssen.")
print(f"Davon entfallen {test_df.shape[0] * zs_rows} auf Zero-Shot-Prompts.")
print(f"Davon entfallen {test_df.shape[0] * os_rows} auf One-Shot-Prompts.")
print(f"Davon entfallen {test_df.shape[0] * fs10_rows} auf Few-Shot-Prompts mit 10 Beispielen.")
print(f"Davon entfallen {test_df.shape[0] * fs20_rows} auf Few-Shot-Prompts mit 20 Beispielen.")
print(f"Davon entfallen {test_df.shape[0] * fs40_rows} auf Few-Shot-Prompts mit 40 Beispielen.")

Es gibt insgesamt 7240 Kombinationen, die verarbeitet werden müssen.
Davon entfallen 1448 auf Zero-Shot-Prompts.
Davon entfallen 1448 auf One-Shot-Prompts.
Davon entfallen 1448 auf Few-Shot-Prompts mit 10 Beispielen.
Davon entfallen 1448 auf Few-Shot-Prompts mit 20 Beispielen.
Davon entfallen 1448 auf Few-Shot-Prompts mit 40 Beispielen.


In [24]:
# Berechnung des Tokenverbrauchs pro Batch. Das limit des verwendeten Clients liegt bei 2,000,000 enqueued tokens
zs_prompt_df_sum = zs_prompt_df['token_count'].sum()
os_prompt_df_sum = os_prompt_df['token_count'].sum()
fs10_df_sum = fs10_prompt_df['token_count'].sum()
fs20_df_sum = fs20_prompt_df['token_count'].sum()
fs40_df_sum = fs40_prompt_df['token_count'].sum()
requests = 1448 # Anzahl der Anfragen, die in einem Batch verarbeitet werden

# Anzahl der Batch-Anfragen
zs_batches = round(zs_prompt_df['#batches'].sum(), 2)
os_batches = round(os_prompt_df['#batches'].sum(), 2)
fs10_batches = round(fs10_prompt_df['#batches'].sum(), 2)
fs20_batches = round(fs20_prompt_df['#batches'].sum(), 2)
fs40_batches = round(fs40_prompt_df['#batches'].sum(), 2)

print(f"Zero-Shot-Prompts: {zs_batches} Batches mit {zs_prompt_df_sum} Tokens")
print(f"One-Shot-Prompts: {os_batches} Batches mit {os_prompt_df_sum} Tokens")
print(f"Few-Shot-Prompts mit 10 Beispielen: {fs10_batches} Batches mit {fs10_df_sum} Tokens")
print(f"Few-Shot-Prompts mit 20 Beispielen: {fs20_batches} Batches mit {fs20_df_sum} Tokens")
print(f"Few-Shot-Prompts mit 40 Beispielen: {fs40_batches} Batches mit {fs40_df_sum} Tokens")


Zero-Shot-Prompts: 0.02 Batches mit 1170 Tokens
One-Shot-Prompts: 0.14 Batches mit 7925 Tokens
Few-Shot-Prompts mit 10 Beispielen: 1.02 Batches mit 56236 Tokens
Few-Shot-Prompts mit 20 Beispielen: 2.02 Batches mit 111568 Tokens
Few-Shot-Prompts mit 40 Beispielen: 3.94 Batches mit 217036 Tokens


In [102]:
# Group by prompt_name and calculate the sum of #batches
zero_shot_batches_sum = prompt_df[prompt_df['prompt_name'].str.contains('zero-shot')]['#batches'].sum()
one_shot_batches_sum = prompt_df[prompt_df['prompt_name'].str.contains('one-shot')]['#batches'].sum()
few_shot_10_batches_sum = prompt_df[prompt_df['prompt_name'].str.contains('few-shot-10')]['#batches'].sum()
few_shot_20_batches_sum = prompt_df[prompt_df['prompt_name'].str.contains('few-shot-20')]['#batches'].sum()
few_shot_40_batches_sum = prompt_df[prompt_df['prompt_name'].str.contains('few-shot-40')]['#batches'].sum()

print(f"Sum of #batches for zero-shot prompts: {zero_shot_batches_sum}")
print(f"Sum of #batches for one-shot prompts: {one_shot_batches_sum}")
print(f"Sum of #batches for few-shot prompts with 10 examples: {few_shot_10_batches_sum:2f}")
print(f"Sum of #batches for few-shot prompts with 20 examples: {few_shot_20_batches_sum:2f}")
print(f"Sum of #batches for few-shot prompts with 40 examples: {few_shot_40_batches_sum:2f}")

Sum of #batches for zero-shot prompts: 0.02
Sum of #batches for one-shot prompts: 0.14
Sum of #batches for few-shot prompts with 10 examples: 1.020000
Sum of #batches for few-shot prompts with 20 examples: 2.020000
Sum of #batches for few-shot prompts with 40 examples: 3.940000


In [103]:
zs_prompt_df['#lines_jsonl'] = round(20_000_000 / zs_prompt_df['token_count'], 0)
zs_prompt_df['#batches'] = round(test_df.shape[0] / zs_prompt_df['#lines_jsonl'], 1)
print(f"ZS: {zs_prompt_df['#batches'].sum()}")

os_prompt_df['#lines_jsonl'] = round(20_000_000 / os_prompt_df['token_count'], 0)
os_prompt_df['#batches'] = round(test_df.shape[0] / os_prompt_df['#lines_jsonl'], 1)
print(f"OS: {os_prompt_df['#batches'].sum()}")

fs10_prompt_df['#lines_jsonl'] = round(20_000_000 / fs10_prompt_df['token_count'], 0)
fs10_prompt_df['#batches'] = round(test_df.shape[0] / fs10_prompt_df['#lines_jsonl'], 1)
print(f"FS10: {fs10_prompt_df['#batches'].sum()}")

fs20_prompt_df['#lines_jsonl'] = round(20_000_000 / fs20_prompt_df['token_count'], 0)
fs20_prompt_df['#batches'] = round(test_df.shape[0] / fs20_prompt_df['#lines_jsonl'], 1)
print(f"FS20: {fs20_prompt_df['#batches'].sum()}")

fs40_prompt_df['#lines_jsonl'] = round(20_000_000 / fs40_prompt_df['token_count'], 0)
fs40_prompt_df['#batches'] = round(test_df.shape[0] / fs40_prompt_df['#lines_jsonl'], 0)
print(f"FS40: {fs40_prompt_df['#batches'].sum()}")

ZS: 0.0
OS: 0.0
FS10: 1.2
FS20: 2.0
FS40: 4.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zs_prompt_df['#lines_jsonl'] = round(20_000_000 / zs_prompt_df['token_count'], 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zs_prompt_df['#batches'] = round(test_df.shape[0] / zs_prompt_df['#lines_jsonl'], 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  os_prompt_df['#lines_jsonl'] = round(2

In [104]:
response_format = {
    "type": "object",
    "properties": {
        "MajorClaims": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "ID": {
                        "type": "string"
                    },
                    "Text": {
                        "type": "string"
                    }
                },
                "required": ["ID", "Text"],
                "additionalProperties": False
            }
        },
        "Claims": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "ID": {
                        "type": "string"
                    },
                    "Text": {
                        "type": "string"
                    }
                },
                "required": ["ID", "Text"],
                "additionalProperties": False
            }
        },
        "Premises": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "ID": {
                        "type": "string"
                    },
                    "Text": {
                        "type": "string"
                    }
                },
                "required": ["ID", "Text"],
                "additionalProperties": False
            }
        },
        "ArgumentativeRelations": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "Origin": {
                        "type": "string"
                    },
                    "Relation": {
                        "type": "string",
                        "enum": ["for", "against", "supports", "attacks"]
                    },
                    "Target": {
                        "type": "string"
                    }
                },
                "required": ["Origin", "Relation", "Target"],
                "additionalProperties": False
            }
        }
    },
    "required": ["MajorClaims", "Claims", "Premises", "ArgumentativeRelations"],
    "additionalProperties": False
    }

## Batch API Input Dateien erstellen

In [105]:
def generate_batch_input(test_df: pd.DataFrame, prompt_df: pd.DataFrame, file_name: str):
    temperature = 0
    llm_seed = 123
    model = "gpt-4o-mini"

    # response format for structured output
    # response_format = {
    # "type": "object",
    # "properties": {
    #     "MajorClaims": {
    #         "type": "array",
    #         "items": {
    #             "type": "object",
    #             "properties": {
    #                 "ID": {
    #                     "type": "string"
    #                 },
    #                 "Text": {
    #                     "type": "string"
    #                 }
    #             },
    #             "required": ["ID", "Text"],
    #             "additionalProperties": False
    #         }
    #     },
    #     "Claims": {
    #         "type": "array",
    #         "items": {
    #             "type": "object",
    #             "properties": {
    #                 "ID": {
    #                     "type": "string"
    #                 },
    #                 "Text": {
    #                     "type": "string"
    #                 }
    #             },
    #             "required": ["ID", "Text"],
    #             "additionalProperties": False
    #         }
    #     },
    #     "Premises": {
    #         "type": "array",
    #         "items": {
    #             "type": "object",
    #             "properties": {
    #                 "ID": {
    #                     "type": "string"
    #                 },
    #                 "Text": {
    #                     "type": "string"
    #                 }
    #             },
    #             "required": ["ID", "Text"],
    #             "additionalProperties": False
    #         }
    #     },
    #     "ArgumentativeRelations": {
    #         "type": "array",
    #         "items": {
    #             "type": "object",
    #             "properties": {
    #                 "Origin": {
    #                     "type": "string"
    #                 },
    #                 "Relation": {
    #                     "type": "string",
    #                     "enum": ["for", "against", "supports", "attacks"]
    #                 },
    #                 "Target": {
    #                     "type": "string"
    #                 }
    #             },
    #             "required": ["Origin", "Relation", "Target"],
    #             "additionalProperties": False
    #         }
    #     }
    # },
    # "required": ["MajorClaims", "Claims", "Premises", "ArgumentativeRelations"],
    # "additionalProperties": False
    # }

    dict_list = []
    # iteration über Zero-Shot-Prompts
    for _, prompt_row in prompt_df.iterrows():
        # Iteration über Testdaten
        for _, test_df_row in test_df.iterrows():
            custom_id_str = prompt_row['prompt_name'] + "_" + test_df_row['txt_file']# + "_" + str(id_counter)
            # write batch input for jsonl file
            input_dict = {"custom_id": custom_id_str, 
                          "method": "POST", "url": "/v1/chat/completions",
                          "body": {"model": model,
                                   "messages": [{"role": "developer", "content": prompt_row['prompt_txt']}, # system Rolle wurde in developer umbenannt
                                                {"role": "user", "content": "Text: " + test_df_row['txt']}], # user Rolle für Eingaben des Nutzers wie bei ChatGPT 
                                                "temperature": temperature,
                                                "seed": llm_seed,
                                                "response_format": {
                                                    "type": "json_schema", # wichtig festzulegen, da sonst Fehlermeldung
                                                    "json_schema": {
                                                        "name": "ArgumentMiningExtraction", # wichtig festzulegen, da sonst Fehlermeldung
                                                        "schema": response_format,
                                                        "strict": True 
                                                    }
                                                    }
                                                }
                                     }
            dict_list.append(input_dict)

    jsonl_output = "\n".join(json.dumps(item) for item in dict_list)

    # Output in JSONL-Datei schreiben
    with open("batch_api/input/" + file_name + ".jsonl", 'w') as f:
        f.write(jsonl_output)

    return jsonl_output

# Quelle Batch API: https://platform.openai.com/docs/guides/batch?lang=python
# Quelle text generation: https://platform.openai.com/docs/guides/text-generation

# Quellen Structured Outputs:
# - https://platform.openai.com/docs/guides/structured-outputs
# - https://cookbook.openai.com/examples/structured_outputs_intro
# - https://python.langchain.com/docs/concepts/structured_outputs/
# - https://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html

In [106]:
def generate_batch_input_split(test_df: pd.DataFrame, prompt_df: pd.DataFrame, file_name: str, num_files: int):
    if num_files <= 0:
        raise ValueError("num_files muss größer als 0 sein.")

    temperature = 0
    llm_seed = 123
    model = "gpt-4o-mini"

    # response format for structured output
    # response_format = {
    # "type": "object",
    # "properties": {
    #     "MajorClaims": {
    #         "type": "array",
    #         "items": {
    #             "type": "object",
    #             "properties": {
    #                 "ID": {
    #                     "type": "string"
    #                 },
    #                 "Text": {
    #                     "type": "string"
    #                 }
    #             },
    #             "required": ["ID", "Text"],
    #             "additionalProperties": False
    #         }
    #     },
    #     "Claims": {
    #         "type": "array",
    #         "items": {
    #             "type": "object",
    #             "properties": {
    #                 "ID": {
    #                     "type": "string"
    #                 },
    #                 "Text": {
    #                     "type": "string"
    #                 }
    #             },
    #             "required": ["ID", "Text"],
    #             "additionalProperties": False
    #         }
    #     },
    #     "Premises": {
    #         "type": "array",
    #         "items": {
    #             "type": "object",
    #             "properties": {
    #                 "ID": {
    #                     "type": "string"
    #                 },
    #                 "Text": {
    #                     "type": "string"
    #                 }
    #             },
    #             "required": ["ID", "Text"],
    #             "additionalProperties": False
    #         }
    #     },
    #     "ArgumentativeRelations": {
    #         "type": "array",
    #         "items": {
    #             "type": "object",
    #             "properties": {
    #                 "Origin": {
    #                     "type": "string"
    #                 },
    #                 "Relation": {
    #                     "type": "string",
    #                     "enum": ["for", "against", "supports", "attacks"]
    #                 },
    #                 "Target": {
    #                     "type": "string"
    #                 }
    #             },
    #             "required": ["Origin", "Relation", "Target"],
    #             "additionalProperties": False
    #         }
    #     }
    # },
    # "required": ["MajorClaims", "Claims", "Premises", "ArgumentativeRelations"],
    # "additionalProperties": False
    # }

    dict_list = []
    # iteration über Zero-Shot-Prompts
    for _, prompt_row in prompt_df.iterrows():
        # Iteration über Testdaten
        for _, test_df_row in test_df.iterrows():
            custom_id_str = prompt_row['prompt_name'] + "_" + test_df_row['txt_file']# + "_" + str(id_counter)
            # write batch input for jsonl file
            input_dict = {"custom_id": custom_id_str, 
                          "method": "POST", "url": "/v1/chat/completions",
                          "body": {"model": model,
                                   "messages": [{"role": "developer", "content": prompt_row['prompt_txt']}, # system Rolle wurde in developer umbenannt
                                                {"role": "user", "content": "Text: " + test_df_row['txt']}], # user Rolle für Eingaben des Nutzers wie bei ChatGPT 
                                                "temperature": temperature,
                                                "seed": llm_seed,
                                                "response_format": {
                                                    "type": "json_schema", # wichtig festzulegen, da sonst Fehlermeldung
                                                    "json_schema": {
                                                        "name": "ArgumentMiningExtraction", # wichtig festzulegen, da sonst Fehlermeldung
                                                        "schema": response_format, # strukturiertes Output-Format von oben
                                                        "strict": True 
                                                    }
                                                    }
                                                }
                                     }
            dict_list.append(input_dict)

    
    chunk_size = len(dict_list) // num_files + (len(dict_list) % num_files > 0) # Ermittelt die größe der Chunks, indem die Anzahl der Elemente durch die Anzahl der Dateien geteilt wird. Bei einem Rest wird ein Chunk mehr erstellt.
    # Floor-Operator "//" dividiert und rundet auf die nächste ganze Zahl ab (Bsp.: 7 / 2 = 3.5 . 7 // 2 = 3). Modulo Operator "%" gibt den Rest der Division an. Wenn 
    chunks = [dict_list[i:i + chunk_size] for i in range(0, len(dict_list), chunk_size)] # Teilt die Liste in gleich große Teile auf. 
    # Beispiel für 7 Elemente (dict_list) und 2 Dateien (num_files): chunk_size = 7 // 2 + (7 % 2 > 0) = 4. cunks = [dict_list[0:4], dict_list[4:7]]. Intervall [0:4] = 0, 1, 2, 3. Intervall [4:7] = 4, 5, 6. Der letzte Index wird nicht mit eingeschlossen. 

    # Output in JSONL-Dateien schreiben
    for i, chunk in enumerate(chunks): # enumerate iteriert über die Chunks-Liste und gibt den Index und das Element zurück
        jsonl_output = "\n".join(json.dumps(item) for item in chunk) # Schreibt die Elemente in einzelne Zeilen in einen JSONL-String
        with open(f"batch_api/input/{file_name}_{i + 1}.jsonl", 'w') as f: # Schreibt den JSONL-String in eine Datei mit dem übergebenen Dateinamen und der fortlaufender Nummerierung
            f.write(jsonl_output)

    return [f"{file_name}_{i + 1}.jsonl" for i in range(num_files)] # Bezeichnet die Dateien fortlaufend anhand der Anzahl der Dateien


# Quelle Batch API: https://platform.openai.com/docs/guides/batch?lang=python
# Quelle text generation: https://platform.openai.com/docs/guides/text-generation

# Quellen Structured Outputs:
# - https://platform.openai.com/docs/guides/structured-outputs
# - https://cookbook.openai.com/examples/structured_outputs_intro
# - https://python.langchain.com/docs/concepts/structured_outputs/
# - https://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html

In [107]:
list = [0, 1, 2, 3, 4, 5, 6, 7]
chunk_size = 4
chunks = [list[i:i + chunk_size] for i in range(0, len(list), chunk_size)]
print(chunks)

for i , chunk in enumerate(chunks):
    output = "\n".join(str(item) for item in chunk)
    print(f"Output {i + 1}:\n{output}")


[[0, 1, 2, 3], [4, 5, 6, 7]]
Output 1:
0
1
2
3
Output 2:
4
5
6
7


## Test splitting full json file into smaller files using token limit of 20 Mio.

In [108]:
full_jsonl = generate_batch_input(test_df, prompt_df, "full_batch_input")

In [117]:
BATCH_INPUT_PATH = "batch_api/input/"
BATCH_OUTPUT_PATH = "batch_api/output/"

def split_jsonl_file(input_file, max_tokens=20_000_000):
    
    def write_to_file(data, file_index):
        with open(f"{BATCH_INPUT_PATH}batch_input_{file_index}.jsonl", 'w') as f:
            for item in data:
                f.write(json.dumps(item) + '\n')

    current_tokens = 0
    file_index = 1
    current_data = []

    with open(input_file, 'r') as f:
        for line in f:
            json_obj = json.loads(line)
            messages = json_obj.get("body", {}).get("messages", []) # get messages from body inside json_obj
            messages_str = json.dumps(messages)
            tokens = num_tokens_from_string(messages_str, model_name="gpt-4o-mini") # calculate tokens from messages

            if current_tokens + tokens > max_tokens: # wenn die Anzahl der bisherigen Tokens plus die Tokens des aktuellen Objekts größer als das Limit ist
                write_to_file(current_data, file_index) # schreibe die Daten in die Datei
                file_index += 1 # erhöhe den Dateiindex
                current_data = [] # setze die aktuellen Daten zurück
                current_tokens = 0 # setze die aktuellen Tokens zurück
            
            # wenn die Anzahl der bisherigen Tokens plus die Tokens des aktuellen Objekts kleiner als das Limit ist. Mit einem Else-Statement würde das letzte Objekt nicht in die Datei geschrieben werden.
            current_data.append(json_obj) # füge das aktuelle Objekt zu den aktuellen Daten hinzu
            current_tokens += tokens # erhöhe die Anzahl der aktuellen Tokens um die Tokens des aktuellen Objekts

    if current_data: # wenn es nach der Iteration noch Daten gibt, die noch nicht in eine Datei geschrieben wurden
        write_to_file(current_data, file_index) # schreibe die Daten in die Datei

# Example usage
split_jsonl_file("batch_api/input/full_batch_input.jsonl")

In [119]:
# count lines of all imput_batch files
BATCH_INPUT_PATH = "batch_api/input/"
input_files = ['full_batch_input.jsonl', 'batch_input_1.jsonl', 'batch_input_2.jsonl', 'batch_input_3.jsonl', 'batch_input_4.jsonl', 'batch_input_5.jsonl', 'batch_input_6.jsonl', 'batch_input_7.jsonl', 'batch_input_8.jsonl','batch_input_9.jsonl']
line_counts = [sum(1 for line in open(BATCH_INPUT_PATH + file)) for file in input_files]
print(line_counts)

[7240, 3816, 878, 622, 549, 323, 322, 320, 320, 90]


In [140]:
count=[3816, 878, 622, 549, 323, 322, 320, 320, 90]
print(sum(count))

7240


In [50]:
batch_input_files_list = list_files(BATCH_INPUT_PATH)
batch_input_files_list

['few-shot-10_1.jsonl',
 'few-shot-10_2.jsonl',
 'few-shot-20_1.jsonl',
 'few-shot-20_2.jsonl',
 'few-shot-20_3.jsonl',
 'few-shot-40_1.jsonl',
 'few-shot-40_2.jsonl',
 'few-shot-40_3.jsonl',
 'few-shot-40_4.jsonl',
 'few-shot-40_5.jsonl',
 'one-shot.jsonl',
 'zero-shot.jsonl']

## Batches hochladen und erstellen

In [3]:
BATCH_INPUT_PATH = "batch_api/input/"
BATCH_OUTPUT_PATH = "batch_api/output/"

# Hilfsfunktionen für die Batch-API
def upload_batch_file(filepath):
    response = client.files.create(
        file=open(filepath, 'rb'),
        purpose='batch'
    )
    return response


def create_batch(input_file_id, metadata_dict):
    batch = client.batches.create(
        input_file_id=input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata=metadata_dict
    )
    return batch

# Check the status of the batch
def check_batch_status(batch_id):
    batch = client.batches.retrieve(batch_id)
    print(f"Status: {batch.status}")

    if batch.status == "failed":
        print(f"Error: {batch.errors}")
    elif batch.status == "in_progress" or batch.status == "validating" or batch.status == "finalizing":
        print("Der Batch wird noch verarbeitet. Bitte warten und später erneut prüfen.")

    print(f"\nBeschreibung des Batches: {batch.metadata["description"]}")
    print(f"Anfragen gesamt: {batch.request_counts.total}")
    print(f"Davon erfolgreich: {batch.request_counts.completed}")
    print(f"Davon fehlerhaft: {batch.request_counts.failed}")
    if batch.output_file_id is not None:
        print(f"Erfolgreiche Abfragen können abgerufen werden mit ID: {batch.output_file_id}")
    else:
        print("Keine erfolgreichen Abfragen zum herunterladen vorhanden.")
    
    if batch.error_file_id is not None:
        print(f"Für weiter Informationen zum Fehler Abfrage an Error-File mit ID: {batch.error_file_id}\n")
    else:
        print("Keine fehlerhaften Abfragen zum herunterladen vorhanden.\n")
    
    return batch

# retrieving the results
def retrieve_and_save_batch_results(batch_file_id, file_name):
    file_response = client.files.content(batch_file_id)
    results = file_response.text
    with open(BATCH_OUTPUT_PATH + file_name + ".jsonl", 'w') as f:
        f.write(results)
    return results

### Batch 1

In [142]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_1 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_1.jsonl")
batch_file_1

FileObject(id='file-73X2eNMUnxhwLndGXvNdZ5', bytes=84798929, created_at=1736407855, filename='batch_input_1.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [143]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 1/9 for Argument Mining"}
batch_1 = create_batch(batch_file_1.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_1)

Batch(id='batch_677f7b3b1b48819086a5a27b7173c2c3', completion_window='24h', created_at=1736407867, endpoint='/v1/chat/completions', input_file_id='file-73X2eNMUnxhwLndGXvNdZ5', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736494267, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Batch 1/9 for Argument Mining'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [156]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
# Die Bearbeitung des Batches kann bis zu 24 Stunden dauern, funktioniert aber in der Regel schneller.
batch_1_status = check_batch_status(batch_1.id)
print(batch_1_status)

Status: completed

Beschreibung des Batches: Batch 1/9 for Argument Mining
Anfragen gesamt: 3816
Davon erfolgreich: 3816
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-Bwq5PPm4rnvL2jk7KFt2cL
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_677f7b3b1b48819086a5a27b7173c2c3', completion_window='24h', created_at=1736407867, endpoint='/v1/chat/completions', input_file_id='file-73X2eNMUnxhwLndGXvNdZ5', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736411765, error_file_id=None, errors=None, expired_at=None, expires_at=1736494267, failed_at=None, finalizing_at=1736411356, in_progress_at=1736407871, metadata={'description': 'Batch 1/9 for Argument Mining'}, output_file_id='file-Bwq5PPm4rnvL2jk7KFt2cL', request_counts=BatchRequestCounts(completed=3816, failed=0, total=3816))


In [157]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_1_results = retrieve_and_save_batch_results(batch_1_status.output_file_id, "output-batch-1")
print(batch_1_results[:1000])

{"id": "batch_req_677f88dcb8c881908e9275650fab34fa", "custom_id": "zero-shot_essay001.txt", "response": {"status_code": 200, "request_id": "82330270e662bb47722cf42d0c0ab869", "body": {"id": "chatcmpl-AnhIbjUujngIZN41Ogf87fCAje0x7", "object": "chat.completion", "created": 1736407877, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\"MajorClaims\":[{\"ID\":\"MC1\",\"Text\":\"We should attach more importance to cooperation during primary education.\"}],\"Claims\":[{\"ID\":\"C1\",\"Text\":\"Competition can effectively promote the development of economy.\"},{\"ID\":\"C2\",\"Text\":\"Cooperation helps children learn interpersonal skills.\"},{\"ID\":\"C3\",\"Text\":\"Competition makes society more effective.\"},{\"ID\":\"C4\",\"Text\":\"Victory in competition often requires cooperation.\"}],\"Premises\":[{\"ID\":\"P1\",\"Text\":\"Companies improve their products and services to survive in competition.\"},{\"ID\":\"P2\",\"Text\":\"Inter

### Batch 2

In [28]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_2 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_2.jsonl")
batch_file_2

FileObject(id='file-NyvUx7xYfyFu6C4QNZ6qYs', bytes=80171245, created_at=1736498896, filename='batch_input_2.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [29]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 2/9 for Argument Mining"}
batch_2 = create_batch(batch_file_2.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_2)

Batch(id='batch_6780ded3cd58819080b492f6647492b4', completion_window='24h', created_at=1736498899, endpoint='/v1/chat/completions', input_file_id='file-NyvUx7xYfyFu6C4QNZ6qYs', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736585299, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Batch 2/9 for Argument Mining'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [32]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
batch_2_status = check_batch_status(batch_2.id)
print(batch_2_status)

Status: completed

Beschreibung des Batches: Batch 2/9 for Argument Mining
Anfragen gesamt: 878
Davon erfolgreich: 878
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-Fg7gsDHnWPhPKX5BLzQhaH
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_6780ded3cd58819080b492f6647492b4', completion_window='24h', created_at=1736498899, endpoint='/v1/chat/completions', input_file_id='file-NyvUx7xYfyFu6C4QNZ6qYs', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736499712, error_file_id=None, errors=None, expired_at=None, expires_at=1736585299, failed_at=None, finalizing_at=1736499609, in_progress_at=1736498902, metadata={'description': 'Batch 2/9 for Argument Mining'}, output_file_id='file-Fg7gsDHnWPhPKX5BLzQhaH', request_counts=BatchRequestCounts(completed=878, failed=0, total=878))


In [33]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_2_results = retrieve_and_save_batch_results(batch_2_status.output_file_id, "output-batch-2")
print(batch_2_results[:1000])

{"id": "batch_req_6780e1bbdd408190b622fa829f21b366", "custom_id": "few-shot-10-cot_essay220.txt", "response": {"status_code": 200, "request_id": "7efa009d49bf99540b29b3fb45a5b4d6", "body": {"id": "chatcmpl-Ao4yrOhe57gU4dz9cROmIgRKN2NNL", "object": "chat.completion", "created": 1736498909, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"MajorClaims\": [\n    {\n      \"ID\": \"MC1\",\n      \"Text\": \"Learning to be independent is essential for young adults\"\n    },\n    {\n      \"ID\": \"MC2\",\n      \"Text\": \"staying longer with parents is a better choice\"\n    }\n  ],\n  \"Claims\": [\n    {\n      \"ID\": \"C1\",\n      \"Text\": \"staying with the parents for longer time does more benefits than disadvantages to the young adult\"\n    },\n    {\n      \"ID\": \"C2\",\n      \"Text\": \"the young adult can have more experience with his parents\"\n    },\n    {\n      \"ID\": \"C3\",\n      \"Text\": \"living at ho

### Batch 3

In [8]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_3 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_3.jsonl")
batch_file_3

FileObject(id='file-1VeM9QWs9utTGbREGjvauV', bytes=79782274, created_at=1736585156, filename='batch_input_3.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [9]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 3/9 for Argument Mining"}
batch_3 = create_batch(batch_file_3.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_3)

Batch(id='batch_67822fc8058c8190b864f42b92ad646d', completion_window='24h', created_at=1736585160, endpoint='/v1/chat/completions', input_file_id='file-1VeM9QWs9utTGbREGjvauV', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736671560, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Batch 3/9 for Argument Mining'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [16]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
batch_3_status = check_batch_status(batch_3.id)
print(batch_3_status)

Status: completed

Beschreibung des Batches: Batch 3/9 for Argument Mining
Anfragen gesamt: 622
Davon erfolgreich: 622
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-Dn8EPeVM9BjT1VpRaaAnvS
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_67822fc8058c8190b864f42b92ad646d', completion_window='24h', created_at=1736585160, endpoint='/v1/chat/completions', input_file_id='file-1VeM9QWs9utTGbREGjvauV', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736586694, error_file_id=None, errors=None, expired_at=None, expires_at=1736671560, failed_at=None, finalizing_at=1736586509, in_progress_at=1736585162, metadata={'description': 'Batch 3/9 for Argument Mining'}, output_file_id='file-Dn8EPeVM9BjT1VpRaaAnvS', request_counts=BatchRequestCounts(completed=622, failed=0, total=622))


In [17]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_3_results = retrieve_and_save_batch_results(batch_3_status.output_file_id, "output-batch-3")
print(batch_3_results[:1000])

{"id": "batch_req_6782350f50148190ae1ecbe23a044ce3", "custom_id": "few-shot-20_essay389.txt", "response": {"status_code": 200, "request_id": "429b27845229d782546ed0cbc4fb17c3", "body": {"id": "chatcmpl-AoRVz57qqhjznstY4shTiOqFnQRQC", "object": "chat.completion", "created": 1736585531, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"MajorClaims\": [\n    {\n      \"ID\": \"MC1\",\n      \"Text\": \"education plays an important role in the socioeconomic status of a country\"\n    },\n    {\n      \"ID\": \"MC2\",\n      \"Text\": \"education is the single most important factor in the development of a country\"\n    }\n  ],\n  \"Claims\": [\n    {\n      \"ID\": \"C1\",\n      \"Text\": \"education is undeniably an economic necessity\"\n    },\n    {\n      \"ID\": \"C2\",\n      \"Text\": \"not many can afford to send their children to school in a developing country\"\n    },\n    {\n      \"ID\": \"C3\",\n      \"Text\": \"

### Batch 4

In [10]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_4 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_4.jsonl")
batch_file_4

FileObject(id='file-CUxcAUKiZJZ64KkV8YAsG4', bytes=79467621, created_at=1736670809, filename='batch_input_4.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [11]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 4/9 for Argument Mining"}
batch_4 = create_batch(batch_file_4.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_4)

Batch(id='batch_67837fec4f04819088f396053ef790f2', completion_window='24h', created_at=1736671212, endpoint='/v1/chat/completions', input_file_id='file-CUxcAUKiZJZ64KkV8YAsG4', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736757612, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Batch 4/9 for Argument Mining'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [14]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
batch_4_status = check_batch_status(batch_4.id)
print(batch_4_status)

Status: completed

Beschreibung des Batches: Batch 4/9 for Argument Mining
Anfragen gesamt: 549
Davon erfolgreich: 549
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-CTPtvsw1e79hupNn1R1DXf
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_67837fec4f04819088f396053ef790f2', completion_window='24h', created_at=1736671212, endpoint='/v1/chat/completions', input_file_id='file-CUxcAUKiZJZ64KkV8YAsG4', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736673370, error_file_id=None, errors=None, expired_at=None, expires_at=1736757612, failed_at=None, finalizing_at=1736673312, in_progress_at=1736671215, metadata={'description': 'Batch 4/9 for Argument Mining'}, output_file_id='file-CTPtvsw1e79hupNn1R1DXf', request_counts=BatchRequestCounts(completed=549, failed=0, total=549))


In [15]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_4_results = retrieve_and_save_batch_results(batch_4_status.output_file_id, "output-batch-4")
print(batch_4_results[:1000])

{"id": "batch_req_67838820cb908190bb1db05670129112", "custom_id": "few-shot-20-cot_essay276.txt", "response": {"status_code": 200, "request_id": "856774c31c28b8df1dfb7b6a6a981df5", "body": {"id": "chatcmpl-AonoLHzczRc9vHXpuohv76KsztCIX", "object": "chat.completion", "created": 1736671237, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"MajorClaims\": [\n    {\n      \"ID\": \"MC1\",\n      \"Text\": \"dancing is an important part of culture\"\n    }\n  ],\n  \"Claims\": [\n    {\n      \"ID\": \"C1\",\n      \"Text\": \"dancing are significant part of culture that could show to something that people believe\"\n    },\n    {\n      \"ID\": \"C2\",\n      \"Text\": \"dancing can represent to civilization of that culture\"\n    },\n    {\n      \"ID\": \"C3\",\n      \"Text\": \"dancing is one of the ways people entertain themselves\"\n    }\n  ],\n  \"Premises\": [\n    {\n      \"ID\": \"P1\",\n      \"Text\": \"some cultur

### Batch 5

In [5]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_5 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_5.jsonl")
batch_file_5

FileObject(id='file-BAg7ZagvUiaPgNUeoPjxVc', bytes=79286180, created_at=1736757463, filename='batch_input_5.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [6]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 5/9 for Argument Mining"}
batch_5 = create_batch(batch_file_5.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_5)

Batch(id='batch_6784d168bb288190bda2b11e3b42da3c', completion_window='24h', created_at=1736757608, endpoint='/v1/chat/completions', input_file_id='file-BAg7ZagvUiaPgNUeoPjxVc', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736844008, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Batch 5/9 for Argument Mining'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [None]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
batch_5_status = check_batch_status(batch_5.id)
print(batch_5_status)

Status: completed

Beschreibung des Batches: Batch 5/9 for Argument Mining
Anfragen gesamt: 323
Davon erfolgreich: 323
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-XCidZa5LAvqwrL52vCSaS2
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_6784d168bb288190bda2b11e3b42da3c', completion_window='24h', created_at=1736757608, endpoint='/v1/chat/completions', input_file_id='file-BAg7ZagvUiaPgNUeoPjxVc', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736791620, error_file_id=None, errors=None, expired_at=None, expires_at=1736844008, failed_at=None, finalizing_at=1736791592, in_progress_at=1736757613, metadata={'description': 'Batch 5/9 for Argument Mining'}, output_file_id='file-XCidZa5LAvqwrL52vCSaS2', request_counts=BatchRequestCounts(completed=323, failed=0, total=323))


In [5]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_5_results = retrieve_and_save_batch_results(batch_5_status.output_file_id, "output-batch-5")
print(batch_5_results[:1000])

{"id": "batch_req_6785562970d48190890e1b3db0550e4e", "custom_id": "few-shot-40_essay081.txt", "response": {"status_code": 200, "request_id": "961d841d6ad27aec5dccc2ac0ef053e5", "body": {"id": "chatcmpl-ApJ1kDVOSD1DoKECkc3fctxXCi9xt", "object": "chat.completion", "created": 1736791232, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"MajorClaims\": [\n    {\n      \"ID\": \"MC1\",\n      \"Text\": \"artists must be given freedom so that they will produce some really marvelous masterpiece\"\n    },\n    {\n      \"ID\": \"MC2\",\n      \"Text\": \"there should not be any restrictions on artists' work\"\n    }\n  ],\n  \"Claims\": [\n    {\n      \"ID\": \"C1\",\n      \"Text\": \"if there is control over artists' ideas, they will definitely lose their sense of creativity in the long run\"\n    },\n    {\n      \"ID\": \"C2\",\n      \"Text\": \"it is every human's right to be able to voice out their opinions in any ways as lo

### Batch 6

In [32]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_6 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_6.jsonl")
batch_file_6

FileObject(id='file-58BfmzR7AzjHXBQKiRdeTG', bytes=79088105, created_at=1736788702, filename='batch_input_6.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [None]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 6/9 for Argument Mining"}
batch_6 = create_batch(batch_file_6.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_6)

Batch(id='batch_67860a74a03481909107e2a89f10a187', completion_window='24h', created_at=1736837748, endpoint='/v1/chat/completions', input_file_id='file-58BfmzR7AzjHXBQKiRdeTG', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736924148, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Batch 6/9 for Argument Mining'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [10]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
batch_6_status = check_batch_status(batch_6.id)
print(batch_6_status)

Status: completed

Beschreibung des Batches: Batch 6/9 for Argument Mining
Anfragen gesamt: 322
Davon erfolgreich: 322
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-11jo846QQ1pmGXxp5JDNyZ
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_67860a74a03481909107e2a89f10a187', completion_window='24h', created_at=1736837748, endpoint='/v1/chat/completions', input_file_id='file-58BfmzR7AzjHXBQKiRdeTG', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736839559, error_file_id=None, errors=None, expired_at=None, expires_at=1736924148, failed_at=None, finalizing_at=1736839526, in_progress_at=1736837752, metadata={'description': 'Batch 6/9 for Argument Mining'}, output_file_id='file-11jo846QQ1pmGXxp5JDNyZ', request_counts=BatchRequestCounts(completed=322, failed=0, total=322))


In [11]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_6_results = retrieve_and_save_batch_results(batch_6_status.output_file_id, "output-batch-6")
print(batch_6_results[:1000])

{"id": "batch_req_6786116710148190b712ba1d6fd42154", "custom_id": "few-shot-40-persona_essay037.txt", "response": {"status_code": 200, "request_id": "8dddee82cd827ecaaa9339f9b31859c8", "body": {"id": "chatcmpl-ApV8ul7jazbBrLlmszvK1SEYlAgly", "object": "chat.completion", "created": 1736837804, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"MajorClaims\": [\n    {\n      \"ID\": \"MC1\",\n      \"Text\": \"international sporting occasions are essential in easing international tensions\"\n    },\n    {\n      \"ID\": \"MC2\",\n      \"Text\": \"International sporting events will make the world more peaceful\"\n    }\n  ],\n  \"Claims\": [\n    {\n      \"ID\": \"C1\",\n      \"Text\": \"international sporting events are a good change to create a multi-nation community of fans having the same passion\"\n    },\n    {\n      \"ID\": \"C2\",\n      \"Text\": \"people around the world understand each other more\"\n    },\n    {\

### Batch 7

In [4]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_7 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_7.jsonl")
batch_file_7

FileObject(id='file-FVN2b1jfp2RS5JKC9ng7Em', bytes=79095032, created_at=1736924736, filename='batch_input_7.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [5]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 7/9 for Argument Mining"}
batch_7 = create_batch(batch_file_7.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_7)

Batch(id='batch_67875e439c0081909378f77faa6010ce', completion_window='24h', created_at=1736924739, endpoint='/v1/chat/completions', input_file_id='file-FVN2b1jfp2RS5JKC9ng7Em', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1737011139, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Batch 7/9 for Argument Mining'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [8]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
batch_7_status = check_batch_status(batch_7.id)
print(batch_7_status)

Status: completed

Beschreibung des Batches: Batch 7/9 for Argument Mining
Anfragen gesamt: 320
Davon erfolgreich: 320
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-SntzBR9MciREe9ZepXmZ6o
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_67875e439c0081909378f77faa6010ce', completion_window='24h', created_at=1736924739, endpoint='/v1/chat/completions', input_file_id='file-FVN2b1jfp2RS5JKC9ng7Em', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736925178, error_file_id=None, errors=None, expired_at=None, expires_at=1737011139, failed_at=None, finalizing_at=1736925160, in_progress_at=1736924742, metadata={'description': 'Batch 7/9 for Argument Mining'}, output_file_id='file-SntzBR9MciREe9ZepXmZ6o', request_counts=BatchRequestCounts(completed=320, failed=0, total=320))


In [9]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_7_results = retrieve_and_save_batch_results(batch_7_status.output_file_id, "output-batch-7")
print(batch_7_results[:1000])

{"id": "batch_req_67875fe848308190b163df1129df2b14", "custom_id": "few-shot-40-persona_essay395.txt", "response": {"status_code": 200, "request_id": "e075021751db2fedcaf1daf7e6408d10", "body": {"id": "chatcmpl-AprlxQdnDoYgdeBHR0ieSwayx0UsH", "object": "chat.completion", "created": 1736924793, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"MajorClaims\": [\n    {\n      \"ID\": \"MC1\",\n      \"Text\": \"these taxes are absolutely essential\"\n    },\n    {\n      \"ID\": \"MC2\",\n      \"Text\": \"taxes paying for state schools are necessary to be compulsory for all members of society no matter where their children enroll in\"\n    }\n  ],\n  \"Claims\": [\n    {\n      \"ID\": \"C1\",\n      \"Text\": \"affluent people effectively contribute to narrowing down the gap between rich and poor\"\n    },\n    {\n      \"ID\": \"C2\",\n      \"Text\": \"the tax reduction for parents of children studying in private schools wou

### Batch 8

In [None]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_8 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_8.jsonl")
batch_file_8

In [None]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 8/9 for Argument Mining"}
batch_8 = create_batch(batch_file_8.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_8)

In [None]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
batch_8_status = check_batch_status(batch_8.id)
print(batch_8_status)

In [None]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_8_results = retrieve_and_save_batch_results(batch_8_status.output_file_id, "output-batch-8")
print(batch_8_results[:1000])

### Batch 9

In [None]:
# Uploading der Batch Input-Datei auf die OpenAI-Plattform
batch_file_9 = upload_batch_file(BATCH_INPUT_PATH + "batch_input_9.jsonl")
batch_file_9

In [None]:
# Erstellen eines Batches
metadata_dict = {"description": "Batch 9/9 for Argument Mining"}
batch_9 = create_batch(batch_file_9.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(batch_9)

In [None]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
batch_9_status = check_batch_status(batch_9.id)
print(batch_9_status)

In [None]:
# Abrufen und Speichern der Batch-Ergebnisse (Output-Datei)
batch_9_results = retrieve_and_save_batch_results(batch_9_status.output_file_id, "output-batch-9")
print(batch_9_results[:1000])

## Ende Test

In [49]:

# # Erstellung der Batch-API-Dateien
# zs_batch_input = generate_batch_input(test_df=test_df, prompt_df=zs_prompt_df, file_name='zero-shot')
# os_batch_input = generate_batch_input(test_df=test_df, prompt_df=os_prompt_df, file_name='one-shot')
# fs10_batch_input = generate_batch_input_split(test_df=test_df, prompt_df=fs10_prompt_df, file_name='few-shot-10', num_files=2)
# fs20_batch_input = generate_batch_input_split(test_df=test_df, prompt_df=fs20_prompt_df, file_name='few-shot-20', num_files=3)
# fs40_batch_input = generate_batch_input_split(test_df=test_df, prompt_df=fs40_prompt_df, file_name='few-shot-40', num_files=4)

# # Ausgabe der ersten 1000 Zeichen der Batch-API-Dateien
# print(f"Zero-Shot-Batch-Input:\n{zs_batch_input[:1000]}")
# print(f"\nOne-Shot-Batch-Input:\n{os_batch_input[:1000]}")
# print(f"\nFew-Shot-10-Batch-Input:\n{fs10_batch_input[:1000]}")
# print(f"\nFew-Shot-20-Batch-Input:\n{fs20_batch_input[:1000]}")
# print(f"\nFew-Shot-40-Batch-Input:\n{fs40_batch_input[:1000]}")

Zero-Shot-Batch-Input:
{"custom_id": "zero-shot_essay001.txt", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o-mini", "messages": [{"role": "developer", "content": "You will be given a text. Extract the argumentative units major claim, claim, and premise as parts of the text. Also extract the argumentative relationships between the units. Claims can be for or against the major claims. Premises, on the other hand, can support or attack a claim or another premise. There may be several major claims. Return the argumentative units and the relationships between them as a JSON object."}, {"role": "user", "content": "Text: Should students be taught to compete or to cooperate?\n\nIt is always said that competition can effectively promote the development of economy. In order to survive in the competition, companies continue to improve their products and service, and as a result, the whole society prospers. However, when we discuss the issue of competition or cooperati

## Batches hochladen und erstellen

Die Batches wurden aufgeteilt, damit sie einzeln hochgeladen werden können. Das wurde gemacht, dass falls es vereinzelt zu Fehlern kommt, nicht alle Batches neu erstellt und hochgeladen werden müssen. Damit sollen die Kosten reduziert werden. 

Beschreibung der Status Codes:

| Status       | Description                                                                 |
|--------------|-----------------------------------------------------------------------------|
| validating   | the input file is being validated before the batch can begin                |
| failed       | the input file has failed the validation process                            |
| in_progress  | the input file was successfully validated and the batch is currently being run |
| finalizing   | the batch has completed and the results are being prepared                  |
| completed    | the batch has been completed and the results are ready                      |
| expired      | the batch was not able to be completed within the 24-hour time window       |
| cancelling   | the batch is being cancelled (may take up to 10 minutes)                    |
| cancelled    | the batch was cancelled                                                     |

Tabelle entnommen aus: https://platform.openai.com/docs/guides/batch/batch-api

### zero-shot batch

In [None]:
# Uploading the batch input file to OpenAI
zs_batch_file = upload_batch_file(BATCH_INPUT_PATH + "zero-shot.jsonl")
print(f"Zero-Shot-Batch-File:\n{zs_batch_file}")	

# creating a batch
metadata_dict = {"description": "Zero-shot prompts"}
zs_batch = create_batch(zs_batch_file.id, metadata_dict) # sofern die Batch Datei bereits hochgeladen wurde, aber nicht erfolgreich war, kann die Batch-ID erneut verwendet werden. Ein erneuter Uplaod ist nicht notwendig und würde der Datei eine neue ID zuweisen.
print(zs_batch)

Zero-Shot-Batch-File:
FileObject(id='file-6hoKEvaGuHd6z4DPf1ACLc', bytes=6835954, created_at=1736100157, filename='zero-shot.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_677ac93e91848190b7c82d16cddf2a30', completion_window='24h', created_at=1736100158, endpoint='/v1/chat/completions', input_file_id='file-6hoKEvaGuHd6z4DPf1ACLc', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736186558, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Zero-shot prompts'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [72]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
# Die Bearbeitung des Batches kann bis zu 24 Stunden dauern, funktioniert aber in der Regel schneller.
zs_batch_id = zs_batch.id
zs_batch_status = check_batch_status(zs_batch_id)
print(zs_batch_status)

Status: completed

Beschreibung des Batches: Zero-shot prompts
Anfragen gesamt: 1448
Davon erfolgreich: 1448
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-W59dkF32eVSB6pqiXp6zAF
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_677ac93e91848190b7c82d16cddf2a30', completion_window='24h', created_at=1736100158, endpoint='/v1/chat/completions', input_file_id='file-6hoKEvaGuHd6z4DPf1ACLc', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736102265, error_file_id=None, errors=None, expired_at=None, expires_at=1736186558, failed_at=None, finalizing_at=1736102103, in_progress_at=1736100160, metadata={'description': 'Zero-shot prompts'}, output_file_id='file-W59dkF32eVSB6pqiXp6zAF', request_counts=BatchRequestCounts(completed=1448, failed=0, total=1448))


In [89]:
# retrieve the results
zs_batch_results = retrieve_and_save_batch_results(zs_batch_status.output_file_id, "zero-shot")
print(zs_batch_results[:1000])

{"id": "batch_req_677ad0d784408190aaf56abea0bb14c8", "custom_id": "zero-shot_essay001.txt", "response": {"status_code": 200, "request_id": "8e5859b52be80ea2511383311d3c7c7a", "body": {"id": "chatcmpl-AmPFZZDNNOznxV2py3BY7O7iFRwh5", "object": "chat.completion", "created": 1736100169, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"MajorClaims\": [\n    {\n      \"ID\": \"MC1\",\n      \"Text\": \"We should attach more importance to cooperation during primary education.\"\n    }\n  ],\n  \"Claims\": [\n    {\n      \"ID\": \"C1\",\n      \"Text\": \"Competition can effectively promote the development of economy.\"\n    },\n    {\n      \"ID\": \"C2\",\n      \"Text\": \"Competition makes the society more effective.\"\n    },\n    {\n      \"ID\": \"C3\",\n      \"Text\": \"A more cooperative attitude towards life is more profitable in one's success.\"\n    }\n  ],\n  \"Premises\": [\n    {\n      \"ID\": \"P1\",\n      \"Tex

**Achtung**

Die Batches können nicht alle auf einmal erstellt werden, da es zu einem Fehler kommt. Es wird empfohlen die Batches einzeln in Auftrag zu geben, sobald der vorherige Batch abgeschlossen ist. Sofern eine Batch-Datei hochgeladen wurde und der Batch an sich jedoch fehlschlägt, kann die Batch-Datei weiterhin verwendet werden anhand der ID der Batch-Datei.


Es kommt derzeit der Fehler:
```
'Enqueued token limit reached for gpt-4o-mini in organization. Limit: 2,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.
```
Das Problem ist, dass keinerlei Batches noch im Status `in_progress` sind, wie anhand der Batch-Übersicht hervorgeht. Es wurde deshalb ein kleiner Batch erstellt, um zu sehen, ob das Problem weiterhin besteht. Es hat sich herausgestellt, dass es an der Größe der Batches liegt. Die Batches wurden deshalb weiter aufgeteilt. 

In [151]:
# # Testbatch, da enqueued token limit besteht, ohne in_progress batches.
# # Uploading the batch input file to OpenAI
# test_batch_file = upload_batch_file(BATCH_INPUT_PATH + "zero-shot.jsonl")
# print(f"Zero-Shot-Batch-File:\n{test_batch_file}")	

# # creating a batch
# metadata_dict = {"description": "TEST-Batch für enqueued token limit error"}
# test_batch = create_batch(test_batch_file.id, metadata_dict)
# print(test_batch)

Zero-Shot-Batch-File:
FileObject(id='file-XLZRkTjEt4eqasQXqeb1E3', bytes=6835954, created_at=1736108729, filename='zero-shot.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_677aeabb672c8190a426ef895e470861', completion_window='24h', created_at=1736108731, endpoint='/v1/chat/completions', input_file_id='file-XLZRkTjEt4eqasQXqeb1E3', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736195131, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'TEST-Batch für enqueued token limit error'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [163]:
# test_batch_id = test_batch.id
# test_batch_status = check_batch_status(test_batch_id)
# print(test_batch_status)

Status: in_progress
Der Batch wird noch verarbeitet. Bitte warten und später erneut prüfen.

Beschreibung des Batches: TEST-Batch für enqueued token limit error
Anfragen gesamt: 1448
Davon erfolgreich: 1368
Davon fehlerhaft: 0
Keine erfolgreichen Abfragen zum herunterladen vorhanden.
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_677aeabb672c8190a426ef895e470861', completion_window='24h', created_at=1736108731, endpoint='/v1/chat/completions', input_file_id='file-XLZRkTjEt4eqasQXqeb1E3', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736195131, failed_at=None, finalizing_at=None, in_progress_at=1736108733, metadata={'description': 'TEST-Batch für enqueued token limit error'}, output_file_id=None, request_counts=BatchRequestCounts(completed=1368, failed=0, total=1448))


### one-shot batch

In [148]:
# Uploading the batch input file to OpenAI
os_batch_file = upload_batch_file(BATCH_INPUT_PATH + "one-shot.jsonl")
print(f"One-Shot-Batch-File:\n{os_batch_file}")	

One-Shot-Batch-File:
FileObject(id='file-Be32Y3pStfTVfh4K4RfbAQ', bytes=17295582, created_at=1736108327, filename='one-shot.jsonl', object='file', purpose='batch', status='processed', status_details=None)


In [51]:
# creating a batch
metadata_dict = {"description": "One-shot prompts"}
os_batch = create_batch(os_batch_file.id, metadata_dict) 
print(os_batch)

Batch(id='batch_677d78e3f50c8190b0fcb8f6952d790e', completion_window='24h', created_at=1736276196, endpoint='/v1/chat/completions', input_file_id='file-Be32Y3pStfTVfh4K4RfbAQ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736362596, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'One-shot prompts'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [69]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
# Die Bearbeitung des Batches kann bis zu 24 Stunden dauern, funktioniert aber in der Regel schneller.
os_batch_id = os_batch.id
os_batch_status = check_batch_status(os_batch_id)
print(os_batch_status)

Status: completed

Beschreibung des Batches: One-shot prompts
Anfragen gesamt: 1448
Davon erfolgreich: 1448
Davon fehlerhaft: 0
Erfolgreiche Abfragen können abgerufen werden mit ID: file-43SEJ54s21fWrHYpCzLpWJ
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_677d78e3f50c8190b0fcb8f6952d790e', completion_window='24h', created_at=1736276196, endpoint='/v1/chat/completions', input_file_id='file-Be32Y3pStfTVfh4K4RfbAQ', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736276955, error_file_id=None, errors=None, expired_at=None, expires_at=1736362596, failed_at=None, finalizing_at=1736276761, in_progress_at=1736276197, metadata={'description': 'One-shot prompts'}, output_file_id='file-43SEJ54s21fWrHYpCzLpWJ', request_counts=BatchRequestCounts(completed=1448, failed=0, total=1448))


In [70]:
# retrieve the results
os_batch_results = retrieve_and_save_batch_results(os_batch_status.output_file_id, "one-shot")
print(os_batch_results[:1000])

{"id": "batch_req_677d7b1a53488190b868416d1ed544d4", "custom_id": "one-shot_essay001.txt", "response": {"status_code": 200, "request_id": "4eca3263efd58756290f8c00ec7c661e", "body": {"id": "chatcmpl-An92rPeQW7pbvzJn5N55rZXXqNvIe", "object": "chat.completion", "created": 1736276205, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"MajorClaims\": [\n    {\n      \"ID\": \"MC1\",\n      \"Text\": \"we should attach more importance to cooperation during primary education\"\n    },\n    {\n      \"ID\": \"MC2\",\n      \"Text\": \"a more cooperative attitude towards life is more profitable in one's success\"\n    }\n  ],\n  \"Claims\": [\n    {\n      \"ID\": \"C1\",\n      \"Text\": \"competition can effectively promote the development of economy\"\n    },\n    {\n      \"ID\": \"C2\",\n      \"Text\": \"competition makes the society more effective\"\n    },\n    {\n      \"ID\": \"C3\",\n      \"Text\": \"children can learn ab

### few-shot 10 batch 

### FS 10 - Teil 1 

In [54]:
# Uploading the batch input file to OpenAI
fs10_batch_file_1 = upload_batch_file(BATCH_INPUT_PATH + "few-shot-10_1.jsonl")
print(f"Few-Shot-10-Batch-File:\n{fs10_batch_file_1}")

Few-Shot-10-Batch-File:
FileObject(id='file-Fb3NLF3YvyEeJpaWownQjk', bytes=47485684, created_at=1736325559, filename='few-shot-10_1.jsonl', object='file', purpose='batch', status='processed', status_details=None)


In [62]:
# creating a batch
metadata_dict = {"description": "Few-shot-10 prompts, Teil 1/2"}
fs10_batch_1 = create_batch(fs10_batch_file_1.id, metadata_dict)
print(fs10_batch_1)

Batch(id='batch_677ebf42a2d08190a61ba2683e773e9f', completion_window='24h', created_at=1736359746, endpoint='/v1/chat/completions', input_file_id='file-Fb3NLF3YvyEeJpaWownQjk', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736446146, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Few-shot-10 prompts, Teil 1/2'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [63]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
# Die Bearbeitung des Batches kann bis zu 24 Stunden dauern, funktioniert aber in der Regel schneller.
fs10_batch_1_status = check_batch_status('batch_677e39c2095c8190bb51b6e828fecc21')#(fs10_batch_1.id)
print(fs10_batch_1_status)

Status: failed
Error: Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-1gP3awMqey1RnJpTBBhMoMPk. Limit: 1,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list')

Beschreibung des Batches: Few-shot-10 prompts, Teil 1/2
Anfragen gesamt: 0
Davon erfolgreich: 0
Davon fehlerhaft: 0
Keine erfolgreichen Abfragen zum herunterladen vorhanden.
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_677e39c2095c8190bb51b6e828fecc21', completion_window='24h', created_at=1736325570, endpoint='/v1/chat/completions', input_file_id='file-Fb3NLF3YvyEeJpaWownQjk', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-1gP3awMqey1RnJp

In [None]:
# retrieve the results
fs10_batch_1_results = retrieve_and_save_batch_results(fs10_batch_1_status.output_file_id, "few-shot-10_1")
print(fs10_batch_1_results[:1000])

### FS 10 - Teil 2 

### few-shot 20 batch

In [68]:
# Uploading the batch input file to OpenAI
fs20_batch_file = upload_batch_file(BATCH_INPUT_PATH + "few-shot-20.jsonl")
print(f"Few-Shot-20-Batch-File:\n{fs20_batch_file}")

# creating a batch
metadata_dict = {"description": "Few-shot-20 prompts"}
fs20_batch = create_batch(fs20_batch_file.id, metadata_dict)
print(fs20_batch)

Few-Shot-20-Batch-File:
FileObject(id='file-3MVA5GNc2ViHzsPZafh1u7', bytes=185953554, created_at=1736101926, filename='few-shot-20.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_677ad02cc7f48190baffdb04373f14c8', completion_window='24h', created_at=1736101932, endpoint='/v1/chat/completions', input_file_id='file-3MVA5GNc2ViHzsPZafh1u7', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736188332, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Few-shot-20 prompts'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [92]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
# Die Bearbeitung des Batches kann bis zu 24 Stunden dauern, funktioniert aber in der Regel schneller.
fs20_batch_id = fs20_batch.id
fs20_batch_status = check_batch_status(fs20_batch_id)
print(fs20_batch_status)

Status: failed
Error: Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-1gP3awMqey1RnJpTBBhMoMPk. Limit: 2,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list')

Beschreibung des Batches: Few-shot-20 prompts
Anfragen gesamt: 0
Davon erfolgreich: 0
Davon fehlerhaft: 0
Keine erfolgreichen Abfragen zum herunterladen vorhanden.
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_677ad02cc7f48190baffdb04373f14c8', completion_window='24h', created_at=1736101932, endpoint='/v1/chat/completions', input_file_id='file-3MVA5GNc2ViHzsPZafh1u7', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-1gP3awMqey1RnJpTBBhMoMPk.

In [None]:
# retrieve the results
fs20_batch_results = retrieve_and_save_batch_results(fs20_batch_status.output_file_id, "few-shot-20")
print(fs20_batch_results[:1000])

### few-shot 40 batch

In [69]:
# Uploading the batch input file to OpenAI
fs40_batch_file = upload_batch_file(BATCH_INPUT_PATH + "few-shot-40.jsonl")
print(f"Few-Shot-40-Batch-File:\n{fs40_batch_file}")

# creating a batch
metadata_dict = {"description": "Few-shot-40 prompts"}
fs40_batch = create_batch(fs40_batch_file.id, metadata_dict)
print(fs40_batch)

Few-Shot-40-Batch-File:
FileObject(id='file-QgaYQyBBzLraPmpqW8xMtD', bytes=356775562, created_at=1736102017, filename='few-shot-40.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_677ad08ba60c8190af7404af995ec827', completion_window='24h', created_at=1736102027, endpoint='/v1/chat/completions', input_file_id='file-QgaYQyBBzLraPmpqW8xMtD', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736188427, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Few-shot-40 prompts'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [93]:
# Status des Batches abfragen - Diese Funktion kann mehrfach aufgerufen werden, um den Status des Batches zu überprüfen, ohne Zusatzkosten zu verursachen.
# Die Bearbeitung des Batches kann bis zu 24 Stunden dauern, funktioniert aber in der Regel schneller.
fs40_batch_id = fs40_batch.id
fs40_batch_status = check_batch_status(fs40_batch_id)
print(fs40_batch_status)

Status: failed
Error: Errors(data=[BatchError(code='maximum_input_file_size_exceeded', line=None, message='The batch input file is larger than the 209715200 maximum for the gpt-4o-mini model. Please try again with a smaller batch.', param=None)], object='list')

Beschreibung des Batches: Few-shot-40 prompts
Anfragen gesamt: 0
Davon erfolgreich: 0
Davon fehlerhaft: 0
Keine erfolgreichen Abfragen zum herunterladen vorhanden.
Keine fehlerhaften Abfragen zum herunterladen vorhanden.

Batch(id='batch_677ad08ba60c8190af7404af995ec827', completion_window='24h', created_at=1736102027, endpoint='/v1/chat/completions', input_file_id='file-QgaYQyBBzLraPmpqW8xMtD', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='maximum_input_file_size_exceeded', line=None, message='The batch input file is larger than the 209715200 maximum for the gpt-4o-mini model. Please try again with a smaller batch.', param=Non

In [None]:
# retrieve the results
fs40_batch_results = retrieve_and_save_batch_results(fs40_batch_status.output_file_id, "few-shot-40")
print(fs40_batch_results[:1000])

### Übersicht Batches für Client

In [25]:
# List all batches
batches_data = client.batches.list().data
batches_data

[Batch(id='batch_6784d168bb288190bda2b11e3b42da3c', completion_window='24h', created_at=1736757608, endpoint='/v1/chat/completions', input_file_id='file-BAg7ZagvUiaPgNUeoPjxVc', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1736844008, failed_at=None, finalizing_at=None, in_progress_at=1736757613, metadata={'description': 'Batch 5/9 for Argument Mining'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=323)),
 Batch(id='batch_67837fec4f04819088f396053ef790f2', completion_window='24h', created_at=1736671212, endpoint='/v1/chat/completions', input_file_id='file-CUxcAUKiZJZ64KkV8YAsG4', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1736673370, error_file_id=None, errors=None, expired_at=None, expires_at=1736757612, failed_at=None, finalizing_at=1736673312, in_progress_at=1736671215, metadata={'descripti

In [147]:
# cancel_batch = [batch.id for batch in batches_data if batch.metadata["description"] == "Zero-shot prompts with 10 examples from the training set"]
# cancel_batch

# # Cancel the batch
# for batch_id in cancel_batch:
#     client.batches.cancel(batch_id) 

ConflictError: Error code: 409 - {'error': {'message': "Cannot cancel a batch with status 'completed'.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [26]:
# Extract the status and errors of batches_data
for batch in batches_data:
    #print(f"Batch ID: {batch.id}")
    print(f"Metadata: {batch.metadata['description']}")
    print(f"Status: {batch.status}")
    if batch.status == "failed":
        print(f"Errors: {batch.errors}")
    print("-------------------")

# batch_status = [batch.status for batch in batches_data]
# meta_data = [batch.metadata for batch in batches_data]
# if batch_status == "failed":
#     batch_errors = [batch.errors for batch in batches_data]
# else:
    


Metadata: Batch 5/9 for Argument Mining
Status: in_progress
-------------------
Metadata: Batch 4/9 for Argument Mining
Status: completed
-------------------
Metadata: Batch 3/9 for Argument Mining
Status: completed
-------------------
Metadata: Batch 2/9 for Argument Mining
Status: completed
-------------------
Metadata: Batch 1/9 for Argument Mining
Status: completed
-------------------
Metadata: Few-shot-10 prompts, Teil 1/2
Status: completed
-------------------
Metadata: Few-shot-10 prompts, Teil 1/2
Status: failed
Errors: Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-mini in organization org-1gP3awMqey1RnJpTBBhMoMPk. Limit: 1,000,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list')
-------------------
Metadata: One-shot prompts
Status: completed
-------------------
Metadata: TEST-Batch für enqueued token limit error
Status: completed
---------------

### Übersicht hochgeladener Dateien für Client

In [27]:
files_data = client.files.list().data
files_data

[FileObject(id='file-BAg7ZagvUiaPgNUeoPjxVc', bytes=79286180, created_at=1736757463, filename='batch_input_5.jsonl', object='file', purpose='batch', status='processed', status_details=None),
 FileObject(id='file-CTPtvsw1e79hupNn1R1DXf', bytes=2157139, created_at=1736673370, filename='batch_67837fec4f04819088f396053ef790f2_output.jsonl', object='file', purpose='batch_output', status='processed', status_details=None),
 FileObject(id='file-CUxcAUKiZJZ64KkV8YAsG4', bytes=79467621, created_at=1736670809, filename='batch_input_4.jsonl', object='file', purpose='batch', status='processed', status_details=None),
 FileObject(id='file-Dn8EPeVM9BjT1VpRaaAnvS', bytes=2470578, created_at=1736586694, filename='batch_67822fc8058c8190b864f42b92ad646d_output.jsonl', object='file', purpose='batch_output', status='processed', status_details=None),
 FileObject(id='file-1VeM9QWs9utTGbREGjvauV', bytes=79782274, created_at=1736585156, filename='batch_input_3.jsonl', object='file', purpose='batch', status='pro

In [140]:
# del_files_id = [file.id for file in files_data if file.filename.startswith("one-shot")]
# del_files_id

['file-UAhepbJUSWf38YwvKeVJJU', 'file-65feVvReKp8XvrT8cwNETh']

In [141]:
# Löschen einer hochgeladenen Datei
# client.files.delete("file-7PKrydrhPXnrzYdQd9GKSV")
# for file_id in del_files_id:
#     client.files.delete(file_id)