In [1]:
import json
import os
import re

import pandas as pd

from src.dataimport import load_text, list_files_with_extension_directory

# Dateien laden

In [2]:
# Directory that is searched for the original '.ann' annotation files.
FILES_PATH = 'data/original/brat-project-final/'

In [3]:
# Collect the '.ann' file paths for FILES_PATH and preview the first five entries.
ann_files = list_files_with_extension_directory(FILES_PATH, '.ann')
ann_files[:5]

['data/original/brat-project-final/essay001.ann',
 'data/original/brat-project-final/essay002.ann',
 'data/original/brat-project-final/essay003.ann',
 'data/original/brat-project-final/essay004.ann',
 'data/original/brat-project-final/essay005.ann']

# IDs ändern
Die Argumentationskomponenten wurden alle mit T als Präfix versehen. Die IDs der Argumentationskomponenten sollen nachfolgend geändert werden. Je nach Typ sollen die Argumentationskomponenten folgende Präfixe erhalten:
- MC für MajorClaim
- C für Claim
- P für Premise

Die Änderung des Präfixes soll auch in den Argumentationsbeziehungen und den Stance-Annotationen vorgenommen werden. Die Komponenten werden je Typ fortlaufend nummeriert.

In [4]:
def update_ann_file(input_text):
    """Rename the component IDs of a brat ``.ann`` annotation text by component type.

    Every argumentative unit line ``T<n>`` receives a type-specific ID that is
    numbered consecutively per type in order of appearance: ``MC<n>`` for
    MajorClaim, ``C<n>`` for Claim and ``P<n>`` for Premise. The ``Arg1``/``Arg2``
    references of relation lines (``R<n>``) and the target of stance lines
    (``A<n>``) are rewritten with the same mapping.

    The mapping is collected in a first pass over all lines and applied in a
    second pass, so references are updated even when a relation or stance line
    appears before the definition of the unit it points to.

    Args:
        input_text (str): Content of one ``.ann`` file.

    Returns:
        str: The rewritten lines joined with newline characters. Lines that match
        none of the patterns are kept unchanged; references to IDs without a
        matching unit definition are left as they are.
    """
    # ^ start of line, T<digits> as ID, whitespace, then one of the three unit types
    pattern_unit = re.compile(r"^(T\d+)\s+(MajorClaim|Claim|Premise)")
    # ^ start of line, R<digits> as ID, whitespace, relation name, then both argument references
    pattern_relation = re.compile(r"^(R\d+)\s+\w+ Arg1:(T\d+) Arg2:(T\d+)")
    # ^ start of line, A<digits> as ID, whitespace, the literal "Stance", then the target reference
    pattern_stance = re.compile(r"^(A\d+)\s+Stance (T\d+)")

    prefixes = {"MajorClaim": "MC", "Claim": "C", "Premise": "P"}
    # Next free number for each unit type
    counters = {"MajorClaim": 1, "Claim": 1, "Premise": 1}

    lines = input_text.splitlines()

    # Pass 1: assign the new IDs.
    id_mapping = {}        # old ID -> new ID, used to rewrite references
    unit_ids_by_line = {}  # line index -> (old ID, new ID) of the unit defined on that line
    for index, line in enumerate(lines):
        match_unit = pattern_unit.match(line)
        if not match_unit:
            continue
        old_id, unit_type = match_unit.groups()
        new_id = f"{prefixes[unit_type]}{counters[unit_type]}"
        counters[unit_type] += 1
        id_mapping[old_id] = new_id
        unit_ids_by_line[index] = (old_id, new_id)

    # Pass 2: rewrite unit IDs and every reference to them.
    output_lines = []
    for index, line in enumerate(lines):
        if index in unit_ids_by_line:
            # The old ID is anchored at the start of the line, so swap exactly that prefix.
            old_id, new_id = unit_ids_by_line[index]
            line = new_id + line[len(old_id):]
        else:
            match_relation = pattern_relation.match(line)
            match_stance = pattern_stance.match(line)
            if match_relation:
                arg1, arg2 = match_relation.group(2), match_relation.group(3)
                # Rebuild the line around the two matched reference spans.
                line = (
                    line[:match_relation.start(2)]
                    + id_mapping.get(arg1, arg1)
                    + line[match_relation.end(2):match_relation.start(3)]
                    + id_mapping.get(arg2, arg2)
                    + line[match_relation.end(3):]
                )
            elif match_stance:
                target_id = match_stance.group(2)
                line = (
                    line[:match_stance.start(2)]
                    + id_mapping.get(target_id, target_id)
                    + line[match_stance.end(2):]
                )

        output_lines.append(line)

    return "\n".join(output_lines)

In [5]:
# Example usage: rewrite the IDs of the first annotation file and print the result
input_text = load_text(ann_files[0])

output_text = update_ann_file(input_text)
print(output_text)

MC1	MajorClaim 503 575	we should attach more importance to cooperation during primary education
MC2	MajorClaim 2154 2231	a more cooperative attitudes towards life is more profitable in one's success
C1	Claim 591 714	through cooperation, children can learn about interpersonal skills which are significant in the future life of all students
A1	Stance C1 For
P1	Premise 716 851	What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others
P2	Premise 853 1086	During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred
P3	Premise 1088 1191	All of these skills help them to get on well with other people and will benefit them for the whole life
R1	supports Arg1:P1 Arg2:C1	
R2	supports Arg1:P2 Arg2:C1	
R3	supports Arg1:P3 Arg2:C1	
C2	Claim 1332 1376	com

In [6]:
new_path = 'data/transformed/'
# Create the output directory up front so the first write does not fail when it is missing.
os.makedirs(new_path, exist_ok=True)
for file in ann_files:
    input_text = load_text(file)
    output_text = update_ann_file(input_text)
    # Save the updated file under its original file name in the new location.
    # os.path.basename / os.path.join replace the manual '/' handling so the
    # platform's path separators are respected.
    # NOTE(review): open() is called without an explicit encoding; presumably it
    # should match whatever load_text uses for reading — confirm.
    with open(os.path.join(new_path, os.path.basename(file)), 'w') as f:
        f.write(output_text)

# Transformation zu JSON

In [7]:
# Collect the '.ann' file paths for the output directory new_path and preview the first five entries.
transformed_ann_files = list_files_with_extension_directory(new_path, '.ann')
transformed_ann_files[:5]

['data/transformed/essay001.ann',
 'data/transformed/essay002.ann',
 'data/transformed/essay003.ann',
 'data/transformed/essay004.ann',
 'data/transformed/essay005.ann']

In [8]:
# Load the second transformed annotation file as sample input for the JSON conversion and print it.
ann_transformed = load_text(transformed_ann_files[1])
print(ann_transformed)

MC1	MajorClaim 391 489	they are able to sustain their cultural identities and doing so help they keep their origin values
MC2	MajorClaim 1936 2077	sustaining the traditions will make the immigrated people keep their cultural identity and give them confident among the many culture society
P1	Premise 500 624	maintaining one’s cultural identity is a key important rule to help individuals emerge in the new multicultural environments
C1	Claim 1089 1156	sustaining the cultural values of immigrants is paramount essential
A2	Stance C1 For
P2	Premise 626 839	Take Australia for example, immigrants from varieties of nations have a day called multicultural day where people from each country prepare their food and traditional activities for displaying in the public venues
P3	Premise 841 946	Many Australians come this day to enjoy the shows, learn about the cultures and admire the diverse values
P4	Premise 948 1057	These feedbacks, in turn, help raise one’s pride of their cultures and help people un

In [15]:
def transform_text_to_json(text):
    """Convert an annotation text with type-prefixed IDs into a JSON string.

    Each non-empty line is split on whitespace and dispatched on the prefix of
    its first token:

    - ``MC`` / ``C`` / ``P``: stored under "MajorClaims" / "Claims" / "Premises"
      as ``ID -> text``, where the text is every token from the fifth one on,
      joined with single spaces.
    - ``R``: appended to "ArgumentativeRelations" with the ``Arg1`` ID as "Claim",
      the relation name as "Relation" and the ``Arg2`` ID as "Target".
    - ``A``: appended to "ArgumentativeRelations" with the stance target as
      "Claim", the stance value as "Relation" and the fixed target "MC".

    Lines with any other prefix are ignored. Empty or whitespace-only lines are
    skipped, so empty input yields the empty structure instead of raising.

    Args:
        text (str): Annotation text, one annotation per line.

    Returns:
        str: JSON document (indented by two spaces) with the keys "MajorClaims",
        "Claims", "Premises" and "ArgumentativeRelations".
    """
    data = {
        "MajorClaims": {},
        "Claims": {},
        "Premises": {},
        "ArgumentativeRelations": []
    }
    # Split the input text into lines
    lines = text.strip().split('\n')
    # Process each line to extract relevant information
    for line in lines:
        parts = line.split()
        # Blank lines (and completely empty input) produce no tokens; skip them
        # instead of failing on parts[0].
        if not parts:
            continue
        # Check if the line defines a MajorClaim (must be tested before 'C')
        if parts[0].startswith('MC'):
            data["MajorClaims"][parts[0]] = ' '.join(parts[4:]) # parts[0] is the ID, parts[4:] is the text
        # Check if the line defines a Claim
        elif parts[0].startswith('C'):
            data["Claims"][parts[0]] = ' '.join(parts[4:])
        # Check if the line defines a Premise
        elif parts[0].startswith('P'):
            data["Premises"][parts[0]] = ' '.join(parts[4:])
        # Check if the line defines an Argumentative Relation
        elif parts[0].startswith('R'):
            data["ArgumentativeRelations"].append({
                "Claim": parts[2].split(':')[1],   # ID after "Arg1:"
                "Relation": parts[1],
                "Target": parts[3].split(':')[1]   # ID after "Arg2:"
            })
        # Check if the line defines a Stance
        elif parts[0].startswith('A'):
            data["ArgumentativeRelations"].append({
                "Claim": parts[2],
                "Relation": parts[3],
                "Target": "MC"
            })

    # json.dumps converts the Python object into a JSON string;
    # indent formats the output for better readability.
    return json.dumps(data, indent=2)


# Show the JSON produced for the sample annotation text held in ann_transformed.
print(transform_text_to_json(ann_transformed))

{
  "MajorClaims": {
    "MC1": "they are able to sustain their cultural identities and doing so help they keep their origin values",
    "MC2": "sustaining the traditions will make the immigrated people keep their cultural identity and give them confident among the many culture society"
  },
  "Claims": {
    "C1": "sustaining the cultural values of immigrants is paramount essential",
    "C2": "keeping the cultural traditions in the destination countries is tremendous important",
    "C3": "there are opposing ideas of neglecting one\u2019s cultural values to adapt in the new cultures"
  },
  "Premises": {
    "P1": "maintaining one\u2019s cultural identity is a key important rule to help individuals emerge in the new multicultural environments",
    "P2": "Take Australia for example, immigrants from varieties of nations have a day called multicultural day where people from each country prepare their food and traditional activities for displaying in the public venues",
    "P3": "Many

In [11]:
# Convert every transformed annotation file to JSON and store it next to the '.ann' files
for file in transformed_ann_files:
    input_text = load_text(file)
    output_text = transform_text_to_json(input_text)
    # Swap only the file extension: str.replace('.ann', '.json') would rewrite
    # every '.ann' occurring anywhere in the file name.
    json_name = os.path.splitext(os.path.basename(file))[0] + '.json'
    # Save the JSON document in the output directory.
    # NOTE(review): open() is called without an explicit encoding; presumably it
    # should match whatever load_text uses for reading — confirm.
    with open(os.path.join(new_path, json_name), 'w') as f:
        f.write(output_text)

In [12]:
# Collect the '.json' file paths for the output directory new_path and preview the first five entries
transformed_json_files = list_files_with_extension_directory(new_path, '.json')
transformed_json_files[:5]

['data/transformed/essay001.json',
 'data/transformed/essay002.json',
 'data/transformed/essay003.json',
 'data/transformed/essay004.json',
 'data/transformed/essay005.json']

In [13]:
# Print the content of the first JSON file as an example.
print(load_text(transformed_json_files[0]))

{
    "MajorClaims": {
        "MC1": "we should attach more importance to cooperation during primary education",
        "MC2": "a more cooperative attitudes towards life is more profitable in one's success"
    },
    "Claims": {
        "C1": "through cooperation, children can learn about interpersonal skills which are significant in the future life of all students",
        "C2": "competition makes the society more effective",
        "C3": "without the cooperation, there would be no victory of competition"
    },
    "Premises": {
        "P1": "What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others",
        "P2": "During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred",
        "P3": "All of these skills help them to get on

# JSON zu Pandas Dataframe 

In [25]:
# transform json data to dataframe

def json_to_dataframe(json_data):
    """Turn the JSON document of one essay into two DataFrames.

    Args:
        json_data (str): JSON string with the keys "MajorClaims", "Claims",
            "Premises" (each an ``ID -> text`` mapping) and
            "ArgumentativeRelations" (a list of records).

    Returns:
        tuple: ``(components, relations)``. ``components`` stacks all major
        claims, claims and premises with the columns "ID", "Type" and one text
        column per component type ("MajorClaim", "Claim", "Premise"); a row is
        only filled in the text column of its own type. ``relations`` is built
        directly from the "ArgumentativeRelations" records.
    """
    parsed = json.loads(json_data)

    # JSON section name -> component type (also used as the text column name)
    section_types = (
        ("MajorClaims", "MajorClaim"),
        ("Claims", "Claim"),
        ("Premises", "Premise"),
    )

    frames = []
    for section, unit_type in section_types:
        frame = pd.DataFrame(parsed[section].items(), columns=["ID", unit_type])
        frame["Type"] = unit_type
        frames.append(frame)

    # Stack the per-type tables; columns missing in a table are filled with NaN
    components = pd.concat(frames, ignore_index=True)
    relations = pd.DataFrame(parsed["ArgumentativeRelations"])
    return components, relations

# Example usage: convert the first JSON file and preview the first rows of the component table
json_data = load_text(transformed_json_files[0])
df, relations_df = json_to_dataframe(json_data)
df.head()

Unnamed: 0,ID,MajorClaim,Type,Claim,Premise
0,MC1,we should attach more importance to cooperatio...,MajorClaim,,
1,MC2,a more cooperative attitudes towards life is m...,MajorClaim,,
2,C1,,Claim,"through cooperation, children can learn about ...",
3,C2,,Claim,competition makes the society more effective,
4,C3,,Claim,"without the cooperation, there would be no vic...",


In [26]:
# Preview the first rows of the relations table returned by json_to_dataframe.
relations_df.head()

Unnamed: 0,start,Relation,target
0,C1,For,MC
1,P1,supports,C1
2,P2,supports,C1
3,P3,supports,C1
4,C2,Against,MC
