In [138]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from transformers import AutoTokenizer
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from src.dataimport import list_files_with_extension_directory, list_files_with_extension, load_text, list_files

# Loading files

In [139]:
# Example usage
TXT_FILES_PATH = 'data/original/brat-project-final/'
JSON_FILES_PATH = 'data/transformed/'

In [140]:
txt_files_directory_list = list_files_with_extension_directory(TXT_FILES_PATH, '.txt')
txt_files_directory_list

['data/original/brat-project-final/essay001.txt',
 'data/original/brat-project-final/essay002.txt',
 'data/original/brat-project-final/essay003.txt',
 'data/original/brat-project-final/essay004.txt',
 'data/original/brat-project-final/essay005.txt',
 'data/original/brat-project-final/essay006.txt',
 'data/original/brat-project-final/essay007.txt',
 'data/original/brat-project-final/essay008.txt',
 'data/original/brat-project-final/essay009.txt',
 'data/original/brat-project-final/essay010.txt',
 'data/original/brat-project-final/essay011.txt',
 'data/original/brat-project-final/essay012.txt',
 'data/original/brat-project-final/essay013.txt',
 'data/original/brat-project-final/essay014.txt',
 'data/original/brat-project-final/essay015.txt',
 'data/original/brat-project-final/essay016.txt',
 'data/original/brat-project-final/essay017.txt',
 'data/original/brat-project-final/essay018.txt',
 'data/original/brat-project-final/essay019.txt',
 'data/original/brat-project-final/essay020.txt',


In [141]:
json_files_directory_list = list_files_with_extension_directory(JSON_FILES_PATH, '.json')
json_files_directory_list

['data/transformed/essay001.json',
 'data/transformed/essay002.json',
 'data/transformed/essay003.json',
 'data/transformed/essay004.json',
 'data/transformed/essay005.json',
 'data/transformed/essay006.json',
 'data/transformed/essay007.json',
 'data/transformed/essay008.json',
 'data/transformed/essay009.json',
 'data/transformed/essay010.json',
 'data/transformed/essay011.json',
 'data/transformed/essay012.json',
 'data/transformed/essay013.json',
 'data/transformed/essay014.json',
 'data/transformed/essay015.json',
 'data/transformed/essay016.json',
 'data/transformed/essay017.json',
 'data/transformed/essay018.json',
 'data/transformed/essay019.json',
 'data/transformed/essay020.json',
 'data/transformed/essay021.json',
 'data/transformed/essay022.json',
 'data/transformed/essay023.json',
 'data/transformed/essay024.json',
 'data/transformed/essay025.json',
 'data/transformed/essay026.json',
 'data/transformed/essay027.json',
 'data/transformed/essay028.json',
 'data/transformed/e

In [142]:
print(f"Anzahl Text-Dateien: {len(txt_files_directory_list)}")
print(f"Anzahl Brat-Dateien: {len(json_files_directory_list)}")

Anzahl Text-Dateien: 402
Anzahl Brat-Dateien: 402


In [143]:
# create dataframe with file names
df = pd.DataFrame()
df['txt_path'] = txt_files_directory_list
df['json_path'] = json_files_directory_list
df['txt_file'] = df['txt_path'].apply(lambda x: os.path.basename(x))
df['json_file'] = df['json_path'].apply(lambda x: os.path.basename(x))
df['txt'] = df['txt_path'].apply(load_text)
df['json'] = df['json_path'].apply(load_text)

print(df.shape)
df.head()

(402, 6)


Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
0,data/original/brat-project-final/essay001.txt,data/transformed/essay001.json,essay001.txt,essay001.json,Should students be taught to compete or to coo...,"{\n ""MajorClaims"": {\n ""MC1"": ""we should a..."
1,data/original/brat-project-final/essay002.txt,data/transformed/essay002.json,essay002.txt,essay002.json,More people are migrating to other countries t...,"{\n ""MajorClaims"": {\n ""MC1"": ""they are ab..."
2,data/original/brat-project-final/essay003.txt,data/transformed/essay003.json,essay003.txt,essay003.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""it has cont..."
3,data/original/brat-project-final/essay004.txt,data/transformed/essay004.json,essay004.txt,essay004.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""this indust..."
4,data/original/brat-project-final/essay005.txt,data/transformed/essay005.json,essay005.txt,essay005.json,Living and studying overseas\n\nIt is every st...,"{\n ""MajorClaims"": {\n ""MC1"": ""one who stu..."


# Train test split

In [144]:
# Split the dataframe into training and test sets
train_df, test_df = train_test_split(df, test_size=0.5, random_state=42)

# Display the first few rows of the training and test sets
print(f"Training DataFrame: {train_df.shape}")
print(f"\nTest DataFrame: {test_df.shape}")

Training DataFrame: (201, 6)

Test DataFrame: (201, 6)


In [145]:
# sort the dataframes
train_df = train_df.sort_values(by='txt_file')
train_df.head()

Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
1,data/original/brat-project-final/essay002.txt,data/transformed/essay002.json,essay002.txt,essay002.json,More people are migrating to other countries t...,"{\n ""MajorClaims"": {\n ""MC1"": ""they are ab..."
2,data/original/brat-project-final/essay003.txt,data/transformed/essay003.json,essay003.txt,essay003.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""it has cont..."
4,data/original/brat-project-final/essay005.txt,data/transformed/essay005.json,essay005.txt,essay005.json,Living and studying overseas\n\nIt is every st...,"{\n ""MajorClaims"": {\n ""MC1"": ""one who stu..."
8,data/original/brat-project-final/essay009.txt,data/transformed/essay009.json,essay009.txt,essay009.json,Roommates quality and their importance\n\nMuch...,"{\n ""MajorClaims"": {\n ""MC1"": ""Considerati..."
11,data/original/brat-project-final/essay012.txt,data/transformed/essay012.json,essay012.txt,essay012.json,Advance in transportation and communication li...,"{\n ""MajorClaims"": {\n ""MC1"": ""technology ..."


In [146]:
test_df = test_df.sort_values(by='txt_file')
test_df.head()

Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
0,data/original/brat-project-final/essay001.txt,data/transformed/essay001.json,essay001.txt,essay001.json,Should students be taught to compete or to coo...,"{\n ""MajorClaims"": {\n ""MC1"": ""we should a..."
3,data/original/brat-project-final/essay004.txt,data/transformed/essay004.json,essay004.txt,essay004.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""this indust..."
5,data/original/brat-project-final/essay006.txt,data/transformed/essay006.json,essay006.txt,essay006.json,Studies abroad and the cultural aspect of the ...,"{\n ""MajorClaims"": {\n ""MC1"": ""studying ab..."
6,data/original/brat-project-final/essay007.txt,data/transformed/essay007.json,essay007.txt,essay007.json,Will newspapers become a thing of the past?\n\...,"{\n ""MajorClaims"": {\n ""MC1"": ""newspapers ..."
7,data/original/brat-project-final/essay008.txt,data/transformed/essay008.json,essay008.txt,essay008.json,"Government budget focus, young children or uni...","{\n ""MajorClaims"": {\n ""MC1"": ""a governmen..."


# Prompt Templates

## zero shot

In [150]:
# zero-shot prompt
task_description = load_text('prompts/zero-shot.txt')
task_description

'You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.'

In [151]:
task_description_with_structure = load_text('prompts/zero-shot-structure.txt')
task_description_with_structure

'You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.\n{\n    "MajorClaims": {\n        "MC1": "Text",\n        "MC2": "Text"\n    },\n    "Claims": {\n        "C1": "Text",\n        "C2": "Text"\n    },\n    "Premises": {\n        "P1": "Text",\n        "P2": "Text"\n    },\n    "ArgumentativeRelations": [\n        {"Claim": "C1", "Relation": "for", "Target": "MC"},\n        {"Claim": "C2", "Relation": "against", "Target": "MC"},\n        {"Premise": "P1", "Relation": "supports", "Target": "C1"},\n        {"Premise": "P2", "Relation": "attacks", "Target": "C2"}\n    ]\n}'

## one-shot

In [152]:
# one-shot prompt - 1 example
examples_1 = train_df.sample(1, random_state=42)

# extract the text and json from the row
one_shot_txt = examples_1['txt'].values[0]
one_shot_json = examples_1['json'].values[0]

one_shot = f"{task_description}\n# Example\n## Input:\n{one_shot_txt}\n## Output:\n{one_shot_json}"
print(one_shot)

# save the prompt to a file
with open('prompts/one-shot.txt', 'w') as f:
    f.write(one_shot)

You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.
# Example
## Input:
Should students be required to attend classes?

The issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes because 

## few-shot

In [None]:
# few-shot prompt - 2 examples
examples_2 = train_df.sample(2, random_state=42)

few_shot_str_2 = f"{task_description}\n"
example_counter = 1
for idx, row in examples_2.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_str_2 += example_str
    example_counter += 1

# print(few_shot_str_2)

# save the prompt to a file
with open('prompts/few-shot2.txt', 'w') as f:
    f.write(f"{few_shot_str_2}")

In [None]:
# few-shot prompt - 3 examples
examples_3 = train_df.sample(3, random_state=43)

few_shot_str_3 = f"{task_description}\n"
example_counter = 1
for idx, row in examples_3.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_str_3 += example_str
    example_counter += 1

# print(few_shot_str_3)

# save the prompt to a file
with open('prompts/few-shot3.txt', 'w') as f:
    f.write(f"{few_shot_str_3}")

In [153]:
# few-shot prompt - 5 examples
examples_5 = train_df.sample(5, random_state=42)

few_shot_str_5 = f"{task_description}\n"
example_counter = 1
for idx, row in examples_5.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_str_5 += example_str
    example_counter += 1

print(few_shot_str_5)

# save the prompt to a file
with open('prompts/few-shot5.txt', 'w') as f:
    f.write(f"{few_shot_str_5}")

You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.

# Example 1
## Input:
Should students be required to attend classes?

The issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes becau

In [154]:
# few-shot prompt - 10 examples
examples_10 = train_df.sample(10, random_state=42)

few_shot_str_10 = f"{task_description}\n" # adding task description to the beginning of the prompt
example_counter = 1
for idx, row in examples_10.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_str_10 += example_str
    example_counter += 1

print(few_shot_str_10)

# save the prompt to a file
with open('prompts/few-shot10.txt', 'w') as f:
    f.write(f"{few_shot_str_10}") 

You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.

# Example 1
## Input:
Should students be required to attend classes?

The issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes becau

# list prompt files

In [155]:
PROMPT_PATH = "prompts/"
prompt_files = list_files_with_extension_directory(PROMPT_PATH, 'txt')
prompt_files

['prompts/few-shot10.txt',
 'prompts/few-shot5.txt',
 'prompts/one-shot.txt',
 'prompts/zero-shot-structure.txt',
 'prompts/zero-shot.txt']

In [156]:
# load the prompt with the file name into a dataframe
prompt_df = pd.DataFrame()
prompt_df['prompt_file'] = [os.path.splitext(os.path.basename(file))[0] for file in prompt_files] # get the file name without the extension
prompt_df['prompt'] = [load_text(file) for file in prompt_files] # load the text from the file
prompt_df

Unnamed: 0,prompt_file,prompt
0,few-shot10,You will be given a text. Extract the argument...
1,few-shot5,You will be given a text. Extract the argument...
2,one-shot,You will be given a text. Extract the argument...
3,zero-shot-structure,You will be given a text. Extract the argument...
4,zero-shot,You will be given a text. Extract the argument...


In [157]:
# display an example prompt
prompt_df['prompt'][4]

'You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.'

# loading the model and tokenizer
Todo:
- explain how to get access to the model
- explain how to get Hugging Face token

In [None]:
# get the API key from the .env file
load_dotenv()
llama_api = os.getenv("HUGGINGFACE_TOKEN")

model_id = "meta-llama/Llama-3.2-3B-Instruct"
# model_id = "meta-llama/Llama-3.3-70B-Instruct" # requires HugginFace Pro subscription

In [158]:
# Function to calculate token count
def calculate_token_count(prompt):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')
    return tokenized_prompt.input_ids.size(1)

# Apply the function to the 'prompt' column and create a new column 'token_count'
prompt_df['token_count'] = prompt_df['prompt'].apply(calculate_token_count)

prompt_df.sort_values(by='token_count')

Unnamed: 0,prompt_file,prompt,token_count
4,zero-shot,You will be given a text. Extract the argument...,104
3,zero-shot-structure,You will be given a text. Extract the argument...,267
2,one-shot,You will be given a text. Extract the argument...,1398
1,few-shot5,You will be given a text. Extract the argument...,6081
0,few-shot10,You will be given a text. Extract the argument...,12135


In [176]:
token_count = train_df['json'].apply(calculate_token_count)
token_count.describe()

count     201.000000
mean      756.482587
std       173.849507
min       375.000000
25%       627.000000
50%       731.000000
75%       860.000000
max      1249.000000
Name: json, dtype: float64

In [178]:
df.max()

1249

Alles über 4096 Tokens landet im Error:
    
"422 Client Error: Unprocessable Entity for url: https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct (Request ID: UG-Io1JonTNrf9UstULen)

Input validation error: `inputs` tokens + `max_new_tokens` must be <= 4096. Given: 12479 `inputs` tokens and 256 `max_new_tokens`"


In [None]:

max_new_tokens = 1024  # standard 512. Orientiert an der Tokenanzahl der JSON-Dateien (Ground-Truth) 
llm = HuggingFaceEndpoint(repo_id=model_id,
                          huggingfacehub_api_token=llama_api,
                          max_new_tokens=max_new_tokens,
                          #top_k=, # standard None
                          #top_p=, # standard 0.95
                          temperature=0.8, # standard 0.8
                          )

In [180]:
max_input_tokens = 4096 - max_new_tokens
print(f"Der Input darf die Tokenanzahl von {max_input_tokens} Token nicht überschreiten.")

Der Input darf die Tokenanzahl von 3840 Token nicht überschreiten.


# Create Chat Prompt Templates

In [160]:
# template
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", "{system_message}"),
        ("user", "Text: {argument_text}"),
    ]
)

In [161]:
# user input texts
test_df_sample = test_df.sample(20, random_state=42)#['txt']#.values doenst work with .itterows()
test_df_sample['txt'][25]

"Prepared Food\n\nNowadays, more and more people begin to select prepared food as their daily meals, since it can effectively save time which is considered as money in our modern society. However it is obvious that prepared food can bring about some negative influence result from utilizing the artificial ingredients, ignoring the nutrition of food and modifying people's eating habits. In this essay, I would like to explain why this is not a good thing based on the three reasons above.\nFirst of all, to make their food easier to prepare and taste delicious, almost every producer adds a wide range of artificial ingredients in to the food that is now purchased by most people. Some ingredients being added have caused dire consequences. For instance, there are usually some articles in newspapers and magazines which report the relationship between certain chemical components in some food and diseases. Thus, easy-to-cook foods sometimes could be dangerous for human's health.\nNot cooking fres

In [162]:
# print example for one prompt template
print(
    prompt_template.invoke(
        {
            "system_message": one_shot,
            "argument_text": test_df_sample.values[0],
        }
    )
)

messages=[SystemMessage(content='You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.\n# Example\n## Input:\nShould students be required to attend classes?\n\nThe issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should 

In [163]:
output_parser = StrOutputParser() # turns the output into a string 
# combine the prompt template, llm and output parser
llm_chain = prompt_template | llm | output_parser

In [164]:
# # invoke the chain
# one_shot_answer = llm_chain.invoke({"system_message": one_shot,
#                            "argument_text": test_df_sample[0]})
# print(one_shot_answer)

In [165]:
# dataframe to store the input and output of the llm chain
results_df = pd.DataFrame()

# iterate over the test data
for idx, row in test_df_sample.iterrows():
    # iterate over the prompt dataframe
    for _, prompt_row in prompt_df.iterrows():
        # invoke the chain
        try: 
            answer = llm_chain.invoke({"system_message": prompt_row['prompt'],
                                       "argument_text": row['txt']})
            # store the input and output in the dataframe
            new_row = pd.DataFrame({'prompt_file': [prompt_row['prompt_file']],
                                    'txt_file': [row['txt_file']],
                                    'json_file': [row['json_file']],
                                    'ground_truth': [row['json']],
                                    'answer': [answer]})
            results_df = pd.concat([results_df, new_row], ignore_index=True)
        except Exception as e: # catch errors like HTTPError, HfHubHTTPError
            new_row = pd.DataFrame({'prompt_file': [prompt_row['prompt_file']],
                                    'txt_file': [row['txt_file']],
                                    'json_file': [row['json_file']],
                                    'ground_truth': [row['json']],
                                    'answer': e})
            results_df = pd.concat([results_df, new_row], ignore_index=True)
        print(f"Finished {row['txt_file']} with prompt {prompt_row['prompt_file']}")

Finished essay176.txt with prompt few-shot10
Finished essay176.txt with prompt few-shot5
Finished essay176.txt with prompt one-shot
Finished essay176.txt with prompt zero-shot-structure
Finished essay176.txt with prompt zero-shot
Finished essay026.txt with prompt few-shot10
Finished essay026.txt with prompt few-shot5
Finished essay026.txt with prompt one-shot
Finished essay026.txt with prompt zero-shot-structure
Finished essay026.txt with prompt zero-shot
Finished essay064.txt with prompt few-shot10
Finished essay064.txt with prompt few-shot5
Finished essay064.txt with prompt one-shot
Finished essay064.txt with prompt zero-shot-structure
Finished essay064.txt with prompt zero-shot
Finished essay319.txt with prompt few-shot10
Finished essay319.txt with prompt few-shot5
Finished essay319.txt with prompt one-shot
Finished essay319.txt with prompt zero-shot-structure
Finished essay319.txt with prompt zero-shot
Finished essay248.txt with prompt few-shot10
Finished essay248.txt with prompt f

In [170]:
results_df

Unnamed: 0,prompt_file,txt_file,json_file,ground_truth,answer
0,few-shot10,essay176.txt,essay176.json,"{\n ""MajorClaims"": {\n ""MC1"": ""a quintesse...",422 Client Error: Unprocessable Entity for url...
1,few-shot5,essay176.txt,essay176.json,"{\n ""MajorClaims"": {\n ""MC1"": ""a quintesse...",422 Client Error: Unprocessable Entity for url...
2,one-shot,essay176.txt,essay176.json,"{\n ""MajorClaims"": {\n ""MC1"": ""a quintesse...","Hence, it is very worthwhile to visit the mus..."
3,zero-shot-structure,essay176.txt,essay176.json,"{\n ""MajorClaims"": {\n ""MC1"": ""a quintesse...","Therefore, one can make a conclusion that vis..."
4,zero-shot,essay176.txt,essay176.json,"{\n ""MajorClaims"": {\n ""MC1"": ""a quintesse...","Therefore, visiting museums are very much in ..."
...,...,...,...,...,...
95,few-shot10,essay125.txt,essay125.json,"{\n ""MajorClaims"": {\n ""MC1"": ""the three m...",422 Client Error: Unprocessable Entity for url...
96,few-shot5,essay125.txt,essay125.json,"{\n ""MajorClaims"": {\n ""MC1"": ""the three m...",422 Client Error: Unprocessable Entity for url...
97,one-shot,essay125.txt,essay125.json,"{\n ""MajorClaims"": {\n ""MC1"": ""the three m...",This is the most important thing that the uni...
98,zero-shot-structure,essay125.txt,essay125.json,"{\n ""MajorClaims"": {\n ""MC1"": ""the three m...",\n\nNote that the major claim is not explicit...


In [171]:
# example of the results dataframe
print(results_df['answer'][0])

422 Client Error: Unprocessable Entity for url: https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct (Request ID: UG-Io1JonTNrf9UstULen)

Input validation error: `inputs` tokens + `max_new_tokens` must be <= 4096. Given: 12479 `inputs` tokens and 256 `max_new_tokens`


In [174]:
calculate_token_count(results_df['ground_truth'][0])

768

# Database ?

In [168]:
# import sqlite3

# # Connect to SQLite database (or create it if it doesn't exist)
# conn = sqlite3.connect('llm_output.db')
# cursor = conn.cursor()

# # Create a table to store the LLM output
# cursor.execute('''
# CREATE TABLE IF NOT EXISTS llm_output (
#     id INTEGER PRIMARY KEY AUTOINCREMENT,
#     argument_text TEXT,
#     answer TEXT
# )
# ''')

# # Insert the LLM output into the table
# cursor.execute('''
# INSERT INTO llm_output (argument_text, answer)
# VALUES (?, ?)
# ''', (argument_text, answer))

# # Commit the transaction and close the connection
# conn.commit()
# conn.close()

# Evluation

In [169]:
# JSON structure
data = {
    "ArgumentMining": {
        "MajorClaims": {
            "MC1": "Text",
            "MC2": "Text"
        },
        "Claims": {
            "C1": "Text",
            "C2": "Text"
        },
        "Premises": {
            "P1": "Text",
            "P2": "Text"
        },
        "ArgumentativeRelations": [
            {"Claim": "C1", "Relation": "for", "Target": "MC"},
            {"Claim": "C2", "Relation": "against", "Target": "MC"},
            {"Premise": "P1", "Relation": "supports", "Target": "C1"},
            {"Premise": "P2", "Relation": "attacks", "Target": "C2"}
        ]
    }
}

# Extract sections
major_claims = pd.DataFrame(list(data["ArgumentMining"]["MajorClaims"].items()), columns=["ID", "Text"])
claims = pd.DataFrame(list(data["ArgumentMining"]["Claims"].items()), columns=["ID", "Text"])
premises = pd.DataFrame(list(data["ArgumentMining"]["Premises"].items()), columns=["ID", "Text"])
relations = pd.DataFrame(data["ArgumentMining"]["ArgumentativeRelations"])

# Display dataframes
print("Major Claims:")
print(major_claims)
print("\nClaims:")
print(claims)
print("\nPremises:")
print(premises)
print("\nArgumentative Relations:")
print(relations)

Major Claims:
    ID  Text
0  MC1  Text
1  MC2  Text

Claims:
   ID  Text
0  C1  Text
1  C2  Text

Premises:
   ID  Text
0  P1  Text
1  P2  Text

Argumentative Relations:
  Claim  Relation Target Premise
0    C1       for     MC     NaN
1    C2   against     MC     NaN
2   NaN  supports     C1      P1
3   NaN   attacks     C2      P2
