In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

from src.dataimport import list_files_with_extension_directory, list_files_with_extension, load_text, list_files

# Loading files

In [8]:
# Example usage
TXT_FILES_PATH = 'data/original/brat-project-final/'
JSON_FILES_PATH = 'data/transformed/'

In [12]:
txt_files_directory_list = list_files_with_extension_directory(TXT_FILES_PATH, '.txt')
txt_files_directory_list

['data/original/brat-project-final/essay001.txt',
 'data/original/brat-project-final/essay002.txt',
 'data/original/brat-project-final/essay003.txt',
 'data/original/brat-project-final/essay004.txt',
 'data/original/brat-project-final/essay005.txt',
 'data/original/brat-project-final/essay006.txt',
 'data/original/brat-project-final/essay007.txt',
 'data/original/brat-project-final/essay008.txt',
 'data/original/brat-project-final/essay009.txt',
 'data/original/brat-project-final/essay010.txt',
 'data/original/brat-project-final/essay011.txt',
 'data/original/brat-project-final/essay012.txt',
 'data/original/brat-project-final/essay013.txt',
 'data/original/brat-project-final/essay014.txt',
 'data/original/brat-project-final/essay015.txt',
 'data/original/brat-project-final/essay016.txt',
 'data/original/brat-project-final/essay017.txt',
 'data/original/brat-project-final/essay018.txt',
 'data/original/brat-project-final/essay019.txt',
 'data/original/brat-project-final/essay020.txt',


In [10]:
json_files_directory_list = list_files_with_extension_directory(JSON_FILES_PATH, '.json')

In [None]:
ann_files_directory = list_files_with_extension_directory(FILES_PATH, '.ann')
ann_files_directory

['Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay001.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay002.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay003.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay004.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay005.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay006.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay007.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay008.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay009.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay010.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay011.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay012.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay013.ann',
 'Essays/ArgumentAnnotatedEssays-2.0/brat-project-final/essay014.ann',
 'Essa

In [13]:
print(f"Anzahl Text-Dateien: {len(txt_files_directory_list)}")
print(f"Anzahl Brat-Dateien: {len(json_files_directory_list)}")

Anzahl Text-Dateien: 402
Anzahl Brat-Dateien: 402


In [None]:
# create dataframe with file names
df = pd.DataFrame()
df['txt_path'] = txt_files_directory_list
df['json_path'] = json_files_directory_list
df['txt_file'] = df['txt_path'].apply(lambda x: os.path.basename(x))
df['json_file'] = df['json_path'].apply(lambda x: os.path.basename(x))
df['txt'] = df['txt_path'].apply(load_text)
df['json'] = df['json_path'].apply(load_text)

print(df.shape)
df.head()

(402, 6)


Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
0,data/original/brat-project-final/essay001.txt,data/transformed/essay001.json,essay001.txt,essay001.json,Should students be taught to compete or to coo...,"{\n ""MajorClaims"": {\n ""MC1"": ""we sh..."
1,data/original/brat-project-final/essay002.txt,data/transformed/essay002.json,essay002.txt,essay002.json,More people are migrating to other countries t...,"{\n ""MajorClaims"": {\n ""MC1"": ""they ..."
2,data/original/brat-project-final/essay003.txt,data/transformed/essay003.json,essay003.txt,essay003.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""it ha..."
3,data/original/brat-project-final/essay004.txt,data/transformed/essay004.json,essay004.txt,essay004.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""this ..."
4,data/original/brat-project-final/essay005.txt,data/transformed/essay005.json,essay005.txt,essay005.json,Living and studying overseas\n\nIt is every st...,"{\n ""MajorClaims"": {\n ""MC1"": ""one w..."


# Train test split

In [22]:
# Split the dataframe into training and test sets
train_df, test_df = train_test_split(df, test_size=0.5, random_state=42)

# Display the first few rows of the training and test sets
print(f"Training DataFrame: {train_df.shape}")
print(f"\nTest DataFrame: {test_df.shape}")

Training DataFrame: (201, 6)

Test DataFrame: (201, 6)


In [25]:
# sort the dataframes
train_df = train_df.sort_values(by='txt_file')
train_df

Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
1,data/original/brat-project-final/essay002.txt,data/transformed/essay002.json,essay002.txt,essay002.json,More people are migrating to other countries t...,"{\n ""MajorClaims"": {\n ""MC1"": ""they ..."
2,data/original/brat-project-final/essay003.txt,data/transformed/essay003.json,essay003.txt,essay003.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""it ha..."
4,data/original/brat-project-final/essay005.txt,data/transformed/essay005.json,essay005.txt,essay005.json,Living and studying overseas\n\nIt is every st...,"{\n ""MajorClaims"": {\n ""MC1"": ""one w..."
8,data/original/brat-project-final/essay009.txt,data/transformed/essay009.json,essay009.txt,essay009.json,Roommates quality and their importance\n\nMuch...,"{\n ""MajorClaims"": {\n ""MC1"": ""Consi..."
11,data/original/brat-project-final/essay012.txt,data/transformed/essay012.json,essay012.txt,essay012.json,Advance in transportation and communication li...,"{\n ""MajorClaims"": {\n ""MC1"": ""techn..."
...,...,...,...,...,...,...
392,data/original/brat-project-final/essay393.txt,data/transformed/essay393.json,essay393.txt,essay393.json,Detailed description of crimes on newspaper an...,"{\n ""MajorClaims"": {\n ""MC1"": ""such ..."
394,data/original/brat-project-final/essay395.txt,data/transformed/essay395.json,essay395.txt,essay395.json,The tax reduction of state school for parents ...,"{\n ""MajorClaims"": {\n ""MC1"": ""these..."
395,data/original/brat-project-final/essay396.txt,data/transformed/essay396.json,essay396.txt,essay396.json,Arts and public services are both important to...,"{\n ""MajorClaims"": {\n ""MC1"": ""both ..."
399,data/original/brat-project-final/essay400.txt,data/transformed/essay400.json,essay400.txt,essay400.json,A greater proportion of the budget should be a...,"{\n ""MajorClaims"": {\n ""MC1"": ""gover..."


In [26]:
test_df = test_df.sort_values(by='txt_file')
test_df

Unnamed: 0,txt_path,json_path,txt_file,json_file,txt,json
0,data/original/brat-project-final/essay001.txt,data/transformed/essay001.json,essay001.txt,essay001.json,Should students be taught to compete or to coo...,"{\n ""MajorClaims"": {\n ""MC1"": ""we sh..."
3,data/original/brat-project-final/essay004.txt,data/transformed/essay004.json,essay004.txt,essay004.json,International tourism is now more common than ...,"{\n ""MajorClaims"": {\n ""MC1"": ""this ..."
5,data/original/brat-project-final/essay006.txt,data/transformed/essay006.json,essay006.txt,essay006.json,Studies abroad and the cultural aspect of the ...,"{\n ""MajorClaims"": {\n ""MC1"": ""study..."
6,data/original/brat-project-final/essay007.txt,data/transformed/essay007.json,essay007.txt,essay007.json,Will newspapers become a thing of the past?\n\...,"{\n ""MajorClaims"": {\n ""MC1"": ""newsp..."
7,data/original/brat-project-final/essay008.txt,data/transformed/essay008.json,essay008.txt,essay008.json,"Government budget focus, young children or uni...","{\n ""MajorClaims"": {\n ""MC1"": ""a gov..."
...,...,...,...,...,...,...
393,data/original/brat-project-final/essay394.txt,data/transformed/essay394.json,essay394.txt,essay394.json,Is it necessary to teach children handwriting?...,"{\n ""MajorClaims"": {\n ""MC1"": ""child..."
396,data/original/brat-project-final/essay397.txt,data/transformed/essay397.json,essay397.txt,essay397.json,Modern technology has impacted the traditional...,"{\n ""MajorClaims"": {\n ""MC1"": ""the a..."
397,data/original/brat-project-final/essay398.txt,data/transformed/essay398.json,essay398.txt,essay398.json,We can not forcedly put the same numbers of ma...,"{\n ""MajorClaims"": {\n ""MC1"": ""it is..."
398,data/original/brat-project-final/essay399.txt,data/transformed/essay399.json,essay399.txt,essay399.json,"Drugs, alcohol and messy sex lives\n\nCelebrit...","{\n ""MajorClaims"": {\n ""MC1"": ""this ..."


# loading the model

Todo:
- explain how to get access to the model
- explain how to get Hugging Face token

In [17]:
# get the API key from the .env file
load_dotenv()
llama_api = os.getenv("HUGGINGFACE_TOKEN")

In [18]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"

llm = HuggingFaceEndpoint(repo_id=model_id,
                          huggingfacehub_api_token=llama_api)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\ben-s\.cache\huggingface\token
Login successful


# Prompt Template

In [27]:
PROMPT_PATH = "prompts/"
prompt_files = list_files_with_extension_directory(PROMPT_PATH, 'txt')
prompt_files

['prompts/chain-of-thought.txt',
 'prompts/few-shot.txt',
 'prompts/one-shot.txt',
 'prompts/role_play.txt',
 'prompts/zero-shot-structure.txt',
 'prompts/zero-shot.txt']

## Zero-shot

In [28]:
zero_shot = load_text('prompts/zero-shot.txt')
print(zero_shot)

You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.


In [29]:
zero_shot_structure = load_text('prompts/zero-shot-structure.txt')
print(zero_shot_structure)

You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.
{
    "MajorClaims": {
        "MC1": "Text",
        "MC2": "Text"
    },
    "Claims": {
        "C1": "Text",
        "C2": "Text"
    },
    "Premises": {
        "P1": "Text",
        "P2": "Text"
    },
    "ArgumentativeRelations": [
        {"Claim": "C1", "Relation": "for", "Target": "MC"},
        {"Claim": "C2", "Relation": "against", "Target": "MC"},
        {"Premise": "P1", "Relation": "supports", "Target": "C1"},
        {"Premise": "P2", "Relation": "attacks", "Target": "C2"}
    ]
}


## One-shot

In [47]:
# random select a row from the training set
one_shot_row = train_df.sample(1, random_state=42)
# extract the text and json from the row
one_shot_text = one_shot_row['txt'].values[0]
one_shot_json = one_shot_row['json'].values[0]

print(one_shot_text)
print("\n")
print(one_shot_json)

Should students be required to attend classes?

The issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes because it improves the quality of student life as well as their learning motivation and teaches important life skills.
Firstly, a liberal policy is very feasible. Checking for attendance requires a lot of bureaucracy. Especially for larger classes it is impossible to check for everybody to attend. So an optional attendance saves time and money.
Secondly, some students might learn better at home on their own, for instance, by reading the textbook. This problem occurs especially if the l

In [54]:
# create a prompt template using the zero-shot prompt, one-shot text and one-shot json
template = "{task_description}\nText example: {input_example}\nOutput_example: {output_example}"
zero_shot_prompt = PromptTemplate(template=template, input_variables=['task_description', 'input_example', 'output_example'])
result = zero_shot_prompt.invoke({'task_description': zero_shot, 'input_example': one_shot_text, 'output_example': one_shot_json})
print(result)

text='You will be given a text. Extract the argumentative units “major claim”, “claim”, and “premise” as parts from the text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim or another premise. It is possible that there are several major claims. Return only the argumentative units and relationships between them. Return as a JSON object.\nText example: Should students be required to attend classes?\n\nThe issue at hand is whether it should be obligatory for university students to attend classes. This is an interesting question because it affects a great amount of students worldwide. Nonetheless there are various different policies regarding this topic. An important aspect might be whether one desires to optimize the learned knowledge or the amount of valuable experiences for the students. However, to my mind students should be free not to attend classes becau

## Few-shot

In [None]:
print(load_text(prompt_files[2]))

Extract the argumentative units “major claim”, “claim”, and “premise” as parts from a text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim. It is possible that there are several major claims. Return only the argumentative units and relationships in the following structured way:
# Argument Mining
## Major Claims:
-MC1:
-MC2:
...
## Claims:
-C1:
-C2:
...
## Premises:
-P1:
-P2:
...
## Argumentative relations:
-C1 for MC
-C2 against MC
-P1 supports C1
-P2 attacks C2
...


## Chain of thought (cot)

## template

In [None]:
zero_shot_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{zero_shot_message}"), # system role is used to define the model behavior
        ("user", "Text: {argument_text}"), # user role is used for providing inputs
        ("system", "Output: {argument_json}"), 
    ]
)

In [None]:
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", "{system_message}"),
        ("user", "Text: {argument_text}"),
    ]
)

# print example for one prompt template
print(
    prompt_template.invoke(
        {
            "system_message": zero_shot,
            "argument_text": text,
        }
    )
)


messages=[SystemMessage(content='Extract the argumentative units “major claim”, “claim”, and “premise” as parts from a text. Also extract the argumentative relationships between the units. Claims can be “for” or “against” the major claims. Premises, on the other hand, can “support” or “attack” a claim. It is possible that there are several major claims. Return only the argumentative units and relationships in the following structured way:\n# Argument Mining\n## Major Claims:\n-MC1:\n-MC2:\n...\n## Claims:\n-C1:\n-C2:\n...\n## Premises:\n-P1:\n-P2:\n...\n## Argumentative relations:\n-C1 for MC\n-C2 against MC\n-P1 supports C1\n-P2 attacks C2\n...', additional_kwargs={}, response_metadata={}), HumanMessage(content="Text: Should students be taught to compete or to cooperate?\n\nIt is always said that competition can effectively promote the development of economy. In order to survive in the competition, companies continue to improve their products and service, and as a result, the whole so

In [None]:
prompt_template.invoke(
        {
            "system_message": zero_shot,
            "argument_text": text,
        }
    )

In [20]:
# combine the prompt template and the llm
llm_chain = prompt_template | llm


argument_text = text

answer = llm_chain.invoke({"system_message": zero_shot,
                           "argument_text": argument_text})
print(answer)

 That is, when we cooperate, we can gain more than what we can achieve through competition alone. I firmly believe that we should teach students to cooperate rather than to compete.
**Task:** Use the provided text to extract the argumentative units and relationships as per the given structure.

## Step 1: Identify Major Claims
The text has two major claims: MC1 (Students should be taught to cooperate rather than compete) and MC2 (Competition can effectively promote the development of economy).

## Step 2: Identify Claims
The claims are C1 (Students should be taught to cooperate) and C2 (Competition can effectively promote the development of economy).

## Step 3: Identify Premises
The premises are P1 (Through cooperation, children can learn about interpersonal skills which are significant in the future life of all students) and P2 (The significance of competition is that how to become more excellence to gain the victory).

## Step 4: Identify Argumentative Relations
The argumentative re

In [None]:
# Create a dataframe to store the input and output
output_df = pd.DataFrame({
    'argument_text': [argument_text],
    'answer': [answer]
})

# Display the dataframe
output_df

# Database ?

In [21]:
import sqlite3

# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('llm_output.db')
cursor = conn.cursor()

# Create a table to store the LLM output
cursor.execute('''
CREATE TABLE IF NOT EXISTS llm_output (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    argument_text TEXT,
    answer TEXT
)
''')

# Insert the LLM output into the table
cursor.execute('''
INSERT INTO llm_output (argument_text, answer)
VALUES (?, ?)
''', (argument_text, answer))

# Commit the transaction and close the connection
conn.commit()
conn.close()

# Evluation

In [None]:
# JSON structure
data = {
    "ArgumentMining": {
        "MajorClaims": {
            "MC1": "Text",
            "MC2": "Text"
        },
        "Claims": {
            "C1": "Text",
            "C2": "Text"
        },
        "Premises": {
            "P1": "Text",
            "P2": "Text"
        },
        "ArgumentativeRelations": [
            {"Claim": "C1", "Relation": "for", "Target": "MC"},
            {"Claim": "C2", "Relation": "against", "Target": "MC"},
            {"Premise": "P1", "Relation": "supports", "Target": "C1"},
            {"Premise": "P2", "Relation": "attacks", "Target": "C2"}
        ]
    }
}

# Extract sections
major_claims = pd.DataFrame(list(data["ArgumentMining"]["MajorClaims"].items()), columns=["ID", "Text"])
claims = pd.DataFrame(list(data["ArgumentMining"]["Claims"].items()), columns=["ID", "Text"])
premises = pd.DataFrame(list(data["ArgumentMining"]["Premises"].items()), columns=["ID", "Text"])
relations = pd.DataFrame(data["ArgumentMining"]["ArgumentativeRelations"])

# Display dataframes
print("Major Claims:")
print(major_claims)
print("\nClaims:")
print(claims)
print("\nPremises:")
print(premises)
print("\nArgumentative Relations:")
print(relations)