In [1]:
import pandas as pd
import os

# eigene Module
from src.dataimport import load_text, list_files_with_extension_directory
from src.llmlib import num_tokens_from_string
from src.anntransform import update_ann_file, transform_ann_to_json, count_origin_target_pairs, count_relation_types

# Dateien laden

In [2]:
# Directory holding the original brat annotation files (.ann)
FILES_PATH = "data/original/brat-project-final/"

In [3]:
# Collect the paths of all .ann files below FILES_PATH and preview the first five
ann_files = list_files_with_extension_directory(FILES_PATH, '.ann')
ann_files[:5]

['data/original/brat-project-final/essay001.ann',
 'data/original/brat-project-final/essay002.ann',
 'data/original/brat-project-final/essay003.ann',
 'data/original/brat-project-final/essay004.ann',
 'data/original/brat-project-final/essay005.ann']

# IDs ändern
Die Argumentationskomponenten wurden alle mit T als Präfix versehen. Die IDs der Argumentationskomponenten sollen nachfolgend geändert werden, sodass sie die folgenden Präfixe erhalten:
- MC für MajorClaim (Hauptaussage)
- C für Claim (Behauptung)
- P für Premise (Prämisse)

Die Änderung des Präfixes soll auch in den Argumentationsbeziehungen vorgenommen werden. Die jeweiligen Komponenten werden je Typ fortlaufend nummeriert (z. B. MC1, MC2, C1, P1).

In [4]:
# Example: run the ID re-labelling on the first annotation file and print the
# result (component IDs then carry the MC/C/P prefixes instead of T)
input_text = load_text(ann_files[0])

output_text = update_ann_file(input_text)
print(output_text)

MC1	MajorClaim 503 575	we should attach more importance to cooperation during primary education
MC2	MajorClaim 2154 2231	a more cooperative attitudes towards life is more profitable in one's success
C1	Claim 591 714	through cooperation, children can learn about interpersonal skills which are significant in the future life of all students
A1	Stance C1 For
P1	Premise 716 851	What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others
P2	Premise 853 1086	During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred
P3	Premise 1088 1191	All of these skills help them to get on well with other people and will benefit them for the whole life
R1	supports Arg1:P1 Arg2:C1	
R2	supports Arg1:P2 Arg2:C1	
R3	supports Arg1:P3 Arg2:C1	
C2	Claim 1332 1376	com

In [5]:
new_path = 'data/transformed/'
# Create the output directory up front: on a fresh checkout it may not exist,
# and open(..., 'w') would then fail with FileNotFoundError.
os.makedirs(new_path, exist_ok=True)
for file in ann_files:
    input_text = load_text(file)
    output_text = update_ann_file(input_text)
    # Store the re-labelled annotations under the original file name.
    # os.path.basename/os.path.join replace the manual split('/') and string
    # concatenation, so the path handling does not depend on the separator.
    # NOTE(review): no explicit encoding is passed here — confirm it matches
    # the encoding load_text uses when reading the files back.
    with open(os.path.join(new_path, os.path.basename(file)), 'w') as f:
        f.write(output_text)

# Transformation zu JSON

In [6]:
# List of the transformed .ann files (read back from the output directory); preview the first five
transformed_ann_files = list_files_with_extension_directory(new_path, '.ann')
transformed_ann_files[:5]

['data/transformed/essay001.ann',
 'data/transformed/essay002.ann',
 'data/transformed/essay003.ann',
 'data/transformed/essay004.ann',
 'data/transformed/essay005.ann']

In [7]:
# Sample file after the IDs have been re-labelled (second file in the list)
ann_transformed = load_text(transformed_ann_files[1])
print(ann_transformed)

MC1	MajorClaim 391 489	they are able to sustain their cultural identities and doing so help they keep their origin values
MC2	MajorClaim 1936 2077	sustaining the traditions will make the immigrated people keep their cultural identity and give them confident among the many culture society
P1	Premise 500 624	maintaining one’s cultural identity is a key important rule to help individuals emerge in the new multicultural environments
C1	Claim 1089 1156	sustaining the cultural values of immigrants is paramount essential
A2	Stance C1 For
P2	Premise 626 839	Take Australia for example, immigrants from varieties of nations have a day called multicultural day where people from each country prepare their food and traditional activities for displaying in the public venues
P3	Premise 841 946	Many Australians come this day to enjoy the shows, learn about the cultures and admire the diverse values
P4	Premise 948 1057	These feedbacks, in turn, help raise one’s pride of their cultures and help people un

In [8]:
# Convert every re-labelled .ann file to JSON and store it next to the .ann file
for file in transformed_ann_files:
    input_text = load_text(file)
    output_text = transform_ann_to_json(input_text)
    # Swap only the extension: str.replace('.ann', '.json') would also rewrite
    # a '.ann' that happens to occur elsewhere in the file name.
    json_name = os.path.splitext(os.path.basename(file))[0] + '.json'
    with open(os.path.join(new_path, json_name), 'w') as f:
        f.write(output_text)

In [9]:
# List of the generated .json files in the output directory; preview the first five
transformed_json_files = list_files_with_extension_directory(new_path, '.json')
transformed_json_files[:5]

['data/transformed/essay001.json',
 'data/transformed/essay002.json',
 'data/transformed/essay003.json',
 'data/transformed/essay004.json',
 'data/transformed/essay005.json']

In [10]:
# Sample file after the conversion to JSON (first file in the list)
json_example = load_text(transformed_json_files[0])
print(json_example)

{
  "MajorClaims": [
    {
      "ID": "MC1",
      "Text": "we should attach more importance to cooperation during primary education"
    },
    {
      "ID": "MC2",
      "Text": "a more cooperative attitudes towards life is more profitable in one's success"
    }
  ],
  "Claims": [
    {
      "ID": "C1",
      "Text": "through cooperation, children can learn about interpersonal skills which are significant in the future life of all students"
    },
    {
      "ID": "C2",
      "Text": "competition makes the society more effective"
    },
    {
      "ID": "C3",
      "Text": "without the cooperation, there would be no victory of competition"
    }
  ],
  "Premises": [
    {
      "ID": "P1",
      "Text": "What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others"
    },
    {
      "ID": "P2",
      "Text": "During the process of cooperation, children can learn about how to listen to opinions of others,

In [11]:
# Token count of every JSON file, one record per file
# (the tokenizer is selected via the model name 'gpt-4o-mini')
json_data = [
    {
        "File": os.path.basename(json_path),
        "NumTokens": num_tokens_from_string(load_text(json_path), 'gpt-4o-mini'),
    }
    for json_path in transformed_json_files
]

df = pd.DataFrame(json_data)
df.head()

Unnamed: 0,File,NumTokens
0,essay001.json,717
1,essay002.json,883
2,essay003.json,643
3,essay004.json,703
4,essay005.json,733


In [12]:
# Summary statistics of the per-file token counts held in `df`
df.describe()

Unnamed: 0,NumTokens
count,402.0
mean,915.445274
std,214.888461
min,451.0
25%,758.0
50%,886.0
75%,1057.75
max,1620.0


In [13]:
# Count, across all JSON files, how often each component type appears as the
# origin and as the target of a relation.
# The result gets its own name instead of `df`, so the token-count frame read
# by the `df.describe()` cell is not replaced by an unrelated table here.
relation_type_counts = count_relation_types(transformed_json_files)
relation_type_counts.T

Unnamed: 0,C,P,M
Origin,1506.0,3832.0,
Target,3108.0,724.0,1506.0


In [14]:
# Count how often each (origin, target) pair of component types occurs across all JSON files
# NOTE(review): this rebinds `df`, which earlier held the per-file token counts;
# re-running `df.describe()` after this cell summarises this table instead —
# consider a dedicated variable name.
df = count_origin_target_pairs(transformed_json_files)
df

Unnamed: 0,Origin-Target Pair,Count
1,"(P, C)",3108
0,"(C, M)",1506
2,"(P, P)",724
