In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from src.dataimport import list_files_with_extension_directory, list_files_with_extension, load_text, list_files

# Loading files

In [4]:
# Example usage
TXT_FILES_PATH = 'data/original/brat-project-final/'
JSON_FILES_PATH = 'data/transformed/'

In [5]:
txt_files_directory_list = list_files_with_extension_directory(TXT_FILES_PATH, '.txt')
txt_files_directory_list

['data/original/brat-project-final/essay001.txt',
 'data/original/brat-project-final/essay002.txt',
 'data/original/brat-project-final/essay003.txt',
 'data/original/brat-project-final/essay004.txt',
 'data/original/brat-project-final/essay005.txt',
 'data/original/brat-project-final/essay006.txt',
 'data/original/brat-project-final/essay007.txt',
 'data/original/brat-project-final/essay008.txt',
 'data/original/brat-project-final/essay009.txt',
 'data/original/brat-project-final/essay010.txt',
 'data/original/brat-project-final/essay011.txt',
 'data/original/brat-project-final/essay012.txt',
 'data/original/brat-project-final/essay013.txt',
 'data/original/brat-project-final/essay014.txt',
 'data/original/brat-project-final/essay015.txt',
 'data/original/brat-project-final/essay016.txt',
 'data/original/brat-project-final/essay017.txt',
 'data/original/brat-project-final/essay018.txt',
 'data/original/brat-project-final/essay019.txt',
 'data/original/brat-project-final/essay020.txt',


In [6]:
json_files_directory_list = list_files_with_extension_directory(JSON_FILES_PATH, '.json')
json_files_directory_list

['data/transformed/essay001.json',
 'data/transformed/essay002.json',
 'data/transformed/essay003.json',
 'data/transformed/essay004.json',
 'data/transformed/essay005.json',
 'data/transformed/essay006.json',
 'data/transformed/essay007.json',
 'data/transformed/essay008.json',
 'data/transformed/essay009.json',
 'data/transformed/essay010.json',
 'data/transformed/essay011.json',
 'data/transformed/essay012.json',
 'data/transformed/essay013.json',
 'data/transformed/essay014.json',
 'data/transformed/essay015.json',
 'data/transformed/essay016.json',
 'data/transformed/essay017.json',
 'data/transformed/essay018.json',
 'data/transformed/essay019.json',
 'data/transformed/essay020.json',
 'data/transformed/essay021.json',
 'data/transformed/essay022.json',
 'data/transformed/essay023.json',
 'data/transformed/essay024.json',
 'data/transformed/essay025.json',
 'data/transformed/essay026.json',
 'data/transformed/essay027.json',
 'data/transformed/essay028.json',
 'data/transformed/e

In [7]:
print(f"Anzahl Text-Dateien: {len(txt_files_directory_list)}")
print(f"Anzahl Brat-Dateien: {len(json_files_directory_list)}")

Anzahl Text-Dateien: 402
Anzahl Brat-Dateien: 402


In [8]:
# create dataframe with file names
df = pd.DataFrame()
df['txt_path'] = txt_files_directory_list
df['json_path'] = json_files_directory_list
df['txt_file'] = df['txt_path'].apply(lambda x: os.path.basename(x))
df['json_file'] = df['json_path'].apply(lambda x: os.path.basename(x))
df['txt'] = df['txt_path'].apply(load_text)
df['json'] = df['json_path'].apply(load_text)

print(df.shape)
df.head()

(402, 6)


Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
0,data/original/brat-project-final/essay001.txt,data/transformed/essay001.json,essay001.txt,essay001.json,Should students be taught to compete or to coo...,"{\n ""MajorClaims"": {\n ""MC1"": ""we should a..."
1,data/original/brat-project-final/essay002.txt,data/transformed/essay002.json,essay002.txt,essay002.json,More people are migrating to other countries t...,"{\n ""MajorClaims"": {\n ""MC1"": ""they are ab..."
2,data/original/brat-project-final/essay003.txt,data/transformed/essay003.json,essay003.txt,essay003.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""it has cont..."
3,data/original/brat-project-final/essay004.txt,data/transformed/essay004.json,essay004.txt,essay004.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""this indust..."
4,data/original/brat-project-final/essay005.txt,data/transformed/essay005.json,essay005.txt,essay005.json,Living and studying overseas\n\nIt is every st...,"{\n ""MajorClaims"": {\n ""MC1"": ""one who stu..."


# Train test split

In [9]:
# Split the dataframe into training and test sets
train_df, test_df = train_test_split(df, test_size=0.5, random_state=42)

# Display the first few rows of the training and test sets
print(f"Training DataFrame: {train_df.shape}")
print(f"\nTest DataFrame: {test_df.shape}")

Training DataFrame: (201, 6)

Test DataFrame: (201, 6)


In [10]:
# sort the dataframes
train_df = train_df.sort_values(by='txt_file')
train_df

Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
1,data/original/brat-project-final/essay002.txt,data/transformed/essay002.json,essay002.txt,essay002.json,More people are migrating to other countries t...,"{\n ""MajorClaims"": {\n ""MC1"": ""they are ab..."
2,data/original/brat-project-final/essay003.txt,data/transformed/essay003.json,essay003.txt,essay003.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""it has cont..."
4,data/original/brat-project-final/essay005.txt,data/transformed/essay005.json,essay005.txt,essay005.json,Living and studying overseas\n\nIt is every st...,"{\n ""MajorClaims"": {\n ""MC1"": ""one who stu..."
8,data/original/brat-project-final/essay009.txt,data/transformed/essay009.json,essay009.txt,essay009.json,Roommates quality and their importance\n\nMuch...,"{\n ""MajorClaims"": {\n ""MC1"": ""Considerati..."
11,data/original/brat-project-final/essay012.txt,data/transformed/essay012.json,essay012.txt,essay012.json,Advance in transportation and communication li...,"{\n ""MajorClaims"": {\n ""MC1"": ""technology ..."
...,...,...,...,...,...,...
392,data/original/brat-project-final/essay393.txt,data/transformed/essay393.json,essay393.txt,essay393.json,Detailed description of crimes on newspaper an...,"{\n ""MajorClaims"": {\n ""MC1"": ""such detail..."
394,data/original/brat-project-final/essay395.txt,data/transformed/essay395.json,essay395.txt,essay395.json,The tax reduction of state school for parents ...,"{\n ""MajorClaims"": {\n ""MC1"": ""these taxes..."
395,data/original/brat-project-final/essay396.txt,data/transformed/essay396.json,essay396.txt,essay396.json,Arts and public services are both important to...,"{\n ""MajorClaims"": {\n ""MC1"": ""both public..."
399,data/original/brat-project-final/essay400.txt,data/transformed/essay400.json,essay400.txt,essay400.json,A greater proportion of the budget should be a...,"{\n ""MajorClaims"": {\n ""MC1"": ""governments..."


In [11]:
test_df = test_df.sort_values(by='txt_file')
test_df

Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
0,data/original/brat-project-final/essay001.txt,data/transformed/essay001.json,essay001.txt,essay001.json,Should students be taught to compete or to coo...,"{\n ""MajorClaims"": {\n ""MC1"": ""we should a..."
3,data/original/brat-project-final/essay004.txt,data/transformed/essay004.json,essay004.txt,essay004.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""this indust..."
5,data/original/brat-project-final/essay006.txt,data/transformed/essay006.json,essay006.txt,essay006.json,Studies abroad and the cultural aspect of the ...,"{\n ""MajorClaims"": {\n ""MC1"": ""studying ab..."
6,data/original/brat-project-final/essay007.txt,data/transformed/essay007.json,essay007.txt,essay007.json,Will newspapers become a thing of the past?\n\...,"{\n ""MajorClaims"": {\n ""MC1"": ""newspapers ..."
7,data/original/brat-project-final/essay008.txt,data/transformed/essay008.json,essay008.txt,essay008.json,"Government budget focus, young children or uni...","{\n ""MajorClaims"": {\n ""MC1"": ""a governmen..."
...,...,...,...,...,...,...
393,data/original/brat-project-final/essay394.txt,data/transformed/essay394.json,essay394.txt,essay394.json,Is it necessary to teach children handwriting?...,"{\n ""MajorClaims"": {\n ""MC1"": ""children ar..."
396,data/original/brat-project-final/essay397.txt,data/transformed/essay397.json,essay397.txt,essay397.json,Modern technology has impacted the traditional...,"{\n ""MajorClaims"": {\n ""MC1"": ""the affect ..."
397,data/original/brat-project-final/essay398.txt,data/transformed/essay398.json,essay398.txt,essay398.json,We can not forcedly put the same numbers of ma...,"{\n ""MajorClaims"": {\n ""MC1"": ""it is neces..."
398,data/original/brat-project-final/essay399.txt,data/transformed/essay399.json,essay399.txt,essay399.json,"Drugs, alcohol and messy sex lives\n\nCelebrit...","{\n ""MajorClaims"": {\n ""MC1"": ""this is a w..."


# loading the model

Todo:
- explain how to get access to the model
- explain how to get Hugging Face token

In [12]:
# get the API key from the .env file
load_dotenv()
llama_api = os.getenv("HUGGINGFACE_TOKEN")

In [13]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"

llm = HuggingFaceEndpoint(repo_id=model_id,
                          huggingfacehub_api_token=llama_api)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\ben-s\.cache\huggingface\token
Login successful


# Prompt Templates

## zero shot

In [14]:
# zero-shot prompt
task_description = load_text('prompts/zero-shot.txt')
task_description

'You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.'

In [15]:
task_description_with_structure = load_text('prompts/zero-shot-structure.txt')
task_description_with_structure

'You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.\n{\n    "MajorClaims": {\n        "MC1": "Text",\n        "MC2": "Text"\n    },\n    "Claims": {\n        "C1": "Text",\n        "C2": "Text"\n    },\n    "Premises": {\n        "P1": "Text",\n        "P2": "Text"\n    },\n    "ArgumentativeRelations": [\n        {"Claim": "C1", "Relation": "for", "Target": "MC"},\n        {"Claim": "C2", "Relation": "against", "Target": "MC"},\n        {"Premise": "P1", "Relation": "supports", "Target": "C1"},\n        {"Premise": "P2", "Relation": "attacks", "Target": "C2"}\n    ]\n}'

## one-shot

In [16]:
# one-shot prompt - 1 example
examples_1 = train_df.sample(1, random_state=42)

# extract the text and json from the row
one_shot_txt = examples_1['txt'].values[0]
one_shot_json = examples_1['json'].values[0]

one_shot = f"{task_description}\n# Example\n## Input:\n{one_shot_txt}\n## Output:\n{one_shot_json}"
print(one_shot)

# save the prompt to a file
with open('prompts/one-shot.txt', 'w') as f:
    f.write(one_shot)

You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.
# Example
## Input:
Should students be required to attend classes?

The issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes because 

## few-shot

In [17]:
# few-shot prompt - 5 examples
examples_5 = train_df.sample(5, random_state=42)

few_shot_str_5 = ""
example_counter = 1
for idx, row in examples_5.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_str_5 += example_str
    example_counter += 1

print(few_shot_str_5)

# save the prompt to a file
with open('prompts/few-shot5.txt', 'w') as f:
    f.write(f"{task_description}\n{few_shot_str_5}")


# Example 1
## Input:
Should students be required to attend classes?

The issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes because it improves the quality of student life as well as their learning motivation and teaches important life skills.
Firstly, a liberal policy is very feasible. Checking for attendance requires a lot of bureaucracy. Especially for larger classes it is impossible to check for everybody to attend. So an optional attendance saves time and money.
Secondly, some students might learn better at home on their own, for instance, by reading the textbook. This problem occ

In [18]:
# few-shot prompt - 10 examples
examples_10 = train_df.sample(10, random_state=42)

few_shot_str_10 = ""
example_counter = 1
for idx, row in examples_10.iterrows():
    example_str = f"\n# Example {example_counter}\n## Input:\n{row['txt']}\n## Output:\n{row['json']}"
    few_shot_str_10 += example_str
    example_counter += 1

print(few_shot_str_10)

# save the prompt to a file
with open('prompts/few-shot10.txt', 'w') as f:
    f.write(f"{task_description}\n{few_shot_str_10}")


# Example 1
## Input:
Should students be required to attend classes?

The issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes because it improves the quality of student life as well as their learning motivation and teaches important life skills.
Firstly, a liberal policy is very feasible. Checking for attendance requires a lot of bureaucracy. Especially for larger classes it is impossible to check for everybody to attend. So an optional attendance saves time and money.
Secondly, some students might learn better at home on their own, for instance, by reading the textbook. This problem occ

# load prompts

In [19]:
PROMPT_PATH = "prompts/"
prompt_files = list_files_with_extension_directory(PROMPT_PATH, 'txt')
prompt_files

['prompts/few-shot10.txt',
 'prompts/few-shot5.txt',
 'prompts/one-shot.txt',
 'prompts/zero-shot-structure.txt',
 'prompts/zero-shot.txt']

# Create Chat Prompt Templates

In [None]:
# template
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", "{system_message}"),
        ("user", "Text: {argument_text}"),
    ]
)

In [33]:
# user input texts
argument_text = test_df.sample(20, random_state=42)['txt'].values
argument_text

array(['Why people visit museums?\n\nAs a lover of travel, a quintessence of any visit to an excellent museum of a new place would be the best part of a tour, which can be explained by the following reasons.\nPrimary, it is easy to know the history of the new place. Last year, I went to Ellis Island and the immigration museum, where is a symbol of American immigration and the immigrant experience in the New York. Through the exhibit, I found that the Statue of Liberty symbolized freedom, and Ellis Island may be the place, where the immigration took their dreams and fears. Thus, museums can help us know the history and the customs of a new place.\nSecondary, visiting a well-known museum also can broaden our horizon. Those outstanding and diverse range of objects,with classic and magnificent art, is guaranteed to inspire us! In my city, National Palace Museum is celebrated for collection of ancient Chinese artifacts. To my astonishment, Jadeite cabbage is the most famous artistic work, w

In [34]:
# print example for one prompt template
print(
    prompt_template.invoke(
        {
            "system_message": one_shot,
            "argument_text": argument_text[0],
        }
    )
)

messages=[SystemMessage(content='You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.\n# Example\n## Input:\nShould students be required to attend classes?\n\nThe issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should 

In [35]:
output_parser = StrOutputParser() # turns the output into a string 
# combine the prompt template, llm and output parser
llm_chain = prompt_template | llm | output_parser

In [36]:
# invoke the chain
one_shot_answer = llm_chain.invoke({"system_message": one_shot,
                           "argument_text": argument_text[0]})
print(one_shot_answer)

 In my opinion, the best part of visiting a museum of a new place is the sense of culture and history that we can get.
## Output:
{
  "MajorClaims": {
    "MC1": "the best part of visiting a museum of a new place is the sense of culture and history that we can get"
  },
  "Claims": {
    "C1": "it is easy to know the history of the new place",
    "C2": "visiting a well-known museum can broaden our horizon",
    "C3": "a local museum could be a top choice and destination for tourist to visit",
    "C4": "visiting a museum saves their precious time for making a decision"
  },
  "Premises": {
    "P1": "Last year, I went to Ellis Island and the immigration museum, where is a symbol of American immigration and the immigrant experience in the New York",
    "P2": "Through the exhibit, I found that the Statue of Liberty symbolized freedom, and Ellis Island may be the place, where the immigration took their dreams and fears",
    "P3": "To my astonishment, Jadeite cabbage is the most famous 

In [37]:
one_shot_answer

' In my opinion, the best part of visiting a museum of a new place is the sense of culture and history that we can get.\n## Output:\n{\n  "MajorClaims": {\n    "MC1": "the best part of visiting a museum of a new place is the sense of culture and history that we can get"\n  },\n  "Claims": {\n    "C1": "it is easy to know the history of the new place",\n    "C2": "visiting a well-known museum can broaden our horizon",\n    "C3": "a local museum could be a top choice and destination for tourist to visit",\n    "C4": "visiting a museum saves their precious time for making a decision"\n  },\n  "Premises": {\n    "P1": "Last year, I went to Ellis Island and the immigration museum, where is a symbol of American immigration and the immigrant experience in the New York",\n    "P2": "Through the exhibit, I found that the Statue of Liberty symbolized freedom, and Ellis Island may be the place, where the immigration took their dreams and fears",\n    "P3": "To my astonishment, Jadeite cabbage is 

In [None]:
# Create a dataframe to store the input and output
output_df = pd.DataFrame({
    'input': [one_shot_text],
    'answer': [answer]
})

# Display the dataframe
output_df

# Database ?

In [21]:
import sqlite3

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('llm_output.db')
cursor = conn.cursor()

# Create a table to store the LLM output
cursor.execute('''
CREATE TABLE IF NOT EXISTS llm_output (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    argument_text TEXT,
    answer TEXT
)
''')

# Insert the LLM output into the table
cursor.execute('''
INSERT INTO llm_output (argument_text, answer)
VALUES (?, ?)
''', (argument_text, answer))

# Commit the transaction and close the connection
conn.commit()
conn.close()

# Evluation

In [None]:
# JSON structure
data = {
    "ArgumentMining": {
        "MajorClaims": {
            "MC1": "Text",
            "MC2": "Text"
        },
        "Claims": {
            "C1": "Text",
            "C2": "Text"
        },
        "Premises": {
            "P1": "Text",
            "P2": "Text"
        },
        "ArgumentativeRelations": [
            {"Claim": "C1", "Relation": "for", "Target": "MC"},
            {"Claim": "C2", "Relation": "against", "Target": "MC"},
            {"Premise": "P1", "Relation": "supports", "Target": "C1"},
            {"Premise": "P2", "Relation": "attacks", "Target": "C2"}
        ]
    }
}

# Extract sections
major_claims = pd.DataFrame(list(data["ArgumentMining"]["MajorClaims"].items()), columns=["ID", "Text"])
claims = pd.DataFrame(list(data["ArgumentMining"]["Claims"].items()), columns=["ID", "Text"])
premises = pd.DataFrame(list(data["ArgumentMining"]["Premises"].items()), columns=["ID", "Text"])
relations = pd.DataFrame(data["ArgumentMining"]["ArgumentativeRelations"])

# Display dataframes
print("Major Claims:")
print(major_claims)
print("\nClaims:")
print(claims)
print("\nPremises:")
print(premises)
print("\nArgumentative Relations:")
print(relations)