In [6]:
import os
import pandas as pd
from src.dataimport import list_files_with_extension_directory

In [7]:
RAW_FILES_PATH = 'data/original/brat-project-final/'

files_directory_ann = list_files_with_extension_directory(RAW_FILES_PATH, '.ann')

# ANN-File formatieren
ggf. nicht notwendig, da die Dateien bereits als JSON formatiert wurden und dann wiederum in JSON umgewandelt werden. Ggf. auslagen in separate Datei ablegen für alle Fälle

## Var1
Dataframes of entities. attributes and relations

In [8]:
def parse_brat_annotation(file_path):
    """
    Formatiert die Annotationen in einer .ann-Datei in ein Pandas DataFrame. Dazu gehören die Entitäten, Attribute und Beziehungen
    Args:
        file_path (str): Pfad zur .ann-Datei
    Returns:
        pandas.DataFrame: Ein DataFrame mit den Annotationen
    """
    entities = []
    attributes = []
    relations = []

    with open(file_path, 'r') as file:
        lines = file.readlines()

    for line in lines: # Iteriert über alle Zeilen in der Datei
        parts = line.strip().split('\t') # Teilt eine Zeile in durch Tabulatoren getrennte Teile 
        if parts[0].startswith('T'): # Wenn Zeile mit 'T' beginnt // Claims, MajorClaims, Premises
            entity_id = parts[0]
            entity_info = parts[1].split(' ')
            entity_type = entity_info[0]
            entity_start = int(entity_info[1])
            entity_end = int(entity_info[2])
            entity_text = parts[2]
            entities.append({

                'id': entity_id,
                'type': entity_type,
                'start': entity_start,
                'end': entity_end,
                'text': entity_text
            })
        elif parts[0].startswith('A'): # Wenn Zeile mit 'A' beginnt // Attribute
            attribute_id = parts[0]
            attribute_info = parts[1].split(' ')
            attribute_type = attribute_info[0]
            target_id = attribute_info[1]
            #attribute_value = parts[2] if len(parts) > 2 else None
            attribute_value = attribute_info[2]
            attributes.append({
                'id': attribute_id,
                'type': attribute_type,
                'target': target_id,
                'value': attribute_value
            })
        elif parts[0].startswith('R'): # Wenn Zeile mit 'R' beginnt
            relation_id = parts[0]
            relation_info = parts[1].split(' ') 
            relation_type = relation_info[0]
            arg1 = relation_info[1].split(':')[1]
            arg2 = relation_info[2].split(':')[1]
            relations.append({
                'id': relation_id,
                'arg1_from': arg1,
                'type': relation_type,
                'arg2_to': arg2
            })

    for entity in entities:
        # Mit nachfolgendem Code ist kein merge notwendig
        # Add stance to entities
        # Wenn 'target' der Attribute und 'ID' der Entities übereinstimmen, dann wird 'value' aus den Attributen zu Entities als Stance hinzugefügt
        # Mit next() wird das erste Element zurückgegeben, das den Bedingungen entspricht und None, wenn kein Element gefunden wird
        entity['stance'] = next((attr['value'] for attr in attributes if attr['target'] == entity['id']), None)
        #entity['stance'] = next((attr['value'] for attr in attributes if attr['target'] == entity['id'] and attr['type'] == 'Stance'), None)

        # Add relation to claims
        # Wenn 'arg1_from' der Relations und 'ID' der Entities übereinstimmen, dann wird:
        # 1: 'type' aus den Relations zu Entities als Relation hinzugefügt
        # 2: 'arg2_to' aus den Relations zu Entities als Relation_target hinzugefügt. Das ist der claim auf der sich die Relation bezieht
        # Mit next() wird das erste Element zurückgegeben, das den Bedingungen entspricht und None, wenn kein Element gefunden wird
        entity['relation'] = next((rel['type'] for rel in relations if rel['arg1_from'] == entity['id']), None)
        entity['relation_target'] = next((rel['arg2_to'] for rel in relations if rel['arg1_from'] == entity['id']), None)
           

    # turn data into a dataframe
    entities_df = pd.DataFrame(entities)
    attributes_df = pd.DataFrame(attributes)
    relations_df = pd.DataFrame(relations)

    return entities_df, attributes_df, relations_df

In [9]:
# Example usage
entities, attributes, relations = parse_brat_annotation(files_directory_ann[0])
entities.drop(columns=['start', 'end'], inplace=True)
entities

Unnamed: 0,id,type,text,stance,relation,relation_target
0,T1,MajorClaim,we should attach more importance to cooperatio...,,,
1,T2,MajorClaim,a more cooperative attitudes towards life is m...,,,
2,T3,Claim,"through cooperation, children can learn about ...",For,,
3,T4,Premise,What we acquired from team work is not only ho...,,supports,T3
4,T5,Premise,"During the process of cooperation, children ca...",,supports,T3
5,T6,Premise,All of these skills help them to get on well w...,,supports,T3
6,T7,Claim,competition makes the society more effective,Against,,
7,T8,Premise,the significance of competition is that how to...,,supports,T7
8,T9,Premise,when we consider about the question that how t...,,supports,T11
9,T10,Premise,Take Olympic games which is a form of competit...,,supports,T11


In [10]:
attributes

Unnamed: 0,id,type,target,value
0,A1,Stance,T3,For
1,A2,Stance,T7,Against
2,A3,Stance,T11,For


In [11]:
relations

Unnamed: 0,id,arg1_from,type,arg2_to
0,R1,T4,supports,T3
1,R2,T5,supports,T3
2,R3,T6,supports,T3
3,R4,T10,supports,T11
4,R5,T9,supports,T11
5,R6,T8,supports,T7


In [12]:
# def count_entity_types_per_file(files_directory_ann):
#     entity_counts = {}

#     for file in files_directory_ann:
#         entities_df, _, _ = parse_brat_annotation(file)
#         entity_type_counts = entities_df['type'].value_counts().to_dict()
#         entity_counts[file] = entity_type_counts

#     entity_counts_df = pd.DataFrame(entity_counts).T#.fillna(0)

#     return entity_counts_df

# # Example usage
# entity_counts_df = count_entity_types_per_file(files_directory_ann)
# entity_counts_df.sort_values(by=['MajorClaim'], ascending=False)

## Var 2
Lists and list of dicts

In [13]:
def parse_brat_annotation2(file_path):
    """
    Formatiert die Annotationen in einer .ann-Datei in Dictionarys. Dazu gehören die Entitäten, Attribute und Beziehungen
    """
    entities = []
    attributes = []
    relations = []

    file_path

    with open(file_path, 'r') as file:
        lines = file.readlines()

    for line in lines: # Iteriert über alle Zeilen in der Datei
        parts = line.strip().split('\t') # Teilt eine Zeile in durch Tabulatoren getrennte Teile 
        if parts[0].startswith('T'): # Wenn Zeile mit 'T' beginnt // Claims, MajorClaims, Premises
            entity_id = parts[0]
            entity_info = parts[1].split(' ')
            entity_type = entity_info[0]
            entity_start = int(entity_info[1])
            entity_end = int(entity_info[2])
            entity_text = parts[2]
            entities.append({

                'id': entity_id,
                'type': entity_type,
                #'start': entity_start,
                #'end': entity_end,
                'text': entity_text
            })
        elif parts[0].startswith('A'): # Wenn Zeile mit 'A' beginnt // Attribute
            attribute_id = parts[0]
            attribute_info = parts[1].split(' ')
            attribute_type = attribute_info[0]
            target_id = attribute_info[1]
            #attribute_value = parts[2] if len(parts) > 2 else None
            attribute_value = attribute_info[2]
            attributes.append({
                'id': attribute_id,
                'type': attribute_type,
                'target': target_id,
                'value': attribute_value
            })
        elif parts[0].startswith('R'): # Wenn Zeile mit 'R' beginnt
            relation_id = parts[0]
            relation_info = parts[1].split(' ') 
            relation_type = relation_info[0]
            arg1 = relation_info[1].split(':')[1]
            arg2 = relation_info[2].split(':')[1]
            relations.append({
                'id': relation_id,
                'arg1_from': arg1,
                'type': relation_type,
                'arg2_to': arg2
            })

    # turn data into a dataframe
    #entities_df = pd.DataFrame(entities)
    #attributes_df = pd.DataFrame(attributes)
    #relations_df = pd.DataFrame(relations)

    #return entities_df, attributes_df, relations_df
    return entities, attributes, relations

In [14]:
# Example usage
entities, attributes, relations = parse_brat_annotation2(files_directory_ann[0])
entities

[{'id': 'T1',
  'type': 'MajorClaim',
  'text': 'we should attach more importance to cooperation during primary education'},
 {'id': 'T2',
  'type': 'MajorClaim',
  'text': "a more cooperative attitudes towards life is more profitable in one's success"},
 {'id': 'T3',
  'type': 'Claim',
  'text': 'through cooperation, children can learn about interpersonal skills which are significant in the future life of all students'},
 {'id': 'T4',
  'type': 'Premise',
  'text': 'What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others'},
 {'id': 'T5',
  'type': 'Premise',
  'text': 'During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred'},
 {'id': 'T6',
  'type': 'Premise',
  'text': 'All of these skills help them to get on well with other peop

In [15]:
attributes

[{'id': 'A1', 'type': 'Stance', 'target': 'T3', 'value': 'For'},
 {'id': 'A2', 'type': 'Stance', 'target': 'T7', 'value': 'Against'},
 {'id': 'A3', 'type': 'Stance', 'target': 'T11', 'value': 'For'}]

In [16]:
relations

[{'id': 'R1', 'arg1_from': 'T4', 'type': 'supports', 'arg2_to': 'T3'},
 {'id': 'R2', 'arg1_from': 'T5', 'type': 'supports', 'arg2_to': 'T3'},
 {'id': 'R3', 'arg1_from': 'T6', 'type': 'supports', 'arg2_to': 'T3'},
 {'id': 'R4', 'arg1_from': 'T10', 'type': 'supports', 'arg2_to': 'T11'},
 {'id': 'R5', 'arg1_from': 'T9', 'type': 'supports', 'arg2_to': 'T11'},
 {'id': 'R6', 'arg1_from': 'T8', 'type': 'supports', 'arg2_to': 'T7'}]

In [17]:
df = pd.DataFrame(entities)
df

Unnamed: 0,id,type,text
0,T1,MajorClaim,we should attach more importance to cooperatio...
1,T2,MajorClaim,a more cooperative attitudes towards life is m...
2,T3,Claim,"through cooperation, children can learn about ..."
3,T4,Premise,What we acquired from team work is not only ho...
4,T5,Premise,"During the process of cooperation, children ca..."
5,T6,Premise,All of these skills help them to get on well w...
6,T7,Claim,competition makes the society more effective
7,T8,Premise,the significance of competition is that how to...
8,T9,Premise,when we consider about the question that how t...
9,T10,Premise,Take Olympic games which is a form of competit...


## Var3
Lists and list of dicts per entity in a DataFrame

In [18]:
import os
import re
import pandas as pd

def parse_ann_file(filepath):
    # Initialize dictionaries to hold the different argument units and their stances
    major_claims = []
    claims = []
    premises = []

    # Dictionary to map argument unit IDs to their stance if available
    stances = {}

    with open(filepath, 'r') as file:
        for line in file:
            # Match argument units (MajorClaim, Claim, Premise)
            if line.startswith("T"):
                parts = line.strip().split("\t")
                unit_id, unit_type, text = parts[0], parts[1].split()[0], parts[2]

                # Parse each argument unit by type
                if unit_type == "MajorClaim":
                    major_claims.append({"text": text, "stance": stances.get(unit_id, None)})
                elif unit_type == "Claim":
                    claims.append({"text": text, "stance": stances.get(unit_id, None)})
                elif unit_type == "Premise":
                    premises.append({"text": text, "stance": stances.get(unit_id, None)})

            # Match stance annotations
            elif line.startswith("A"):
                stance_parts = line.strip().split("\t")
                if len(stance_parts) >= 3:
                    _, stance_type, target_id = stance_parts[0], stance_parts[1].split()[0], stance_parts[1].split()[1]
                    stance_value = stance_parts[2]

                    # Only consider stance type if it is 'Stance'
                    if stance_type == "Stance":
                        stances[target_id] = stance_value

    # Return the extracted data as dictionaries
    return {"MajorClaims": major_claims, "Claims": claims, "Premises": premises}

def process_files_in_directory(directory):
    # Initialize a list to collect data for each file
    data = []

    # Process each .ann file in the specified directory
    for filename in os.listdir(directory):
        if filename.endswith(".ann"):
            filepath = os.path.join(directory, filename)
            file_data = parse_ann_file(filepath)
            file_data["Filename"] = filename  # Include the filename for reference
            data.append(file_data)

    # Convert the list of dictionaries into a pandas DataFrame
    df = pd.DataFrame(data)

    # Convert columns with argument lists into lists of texts and stances for easier reading
    df["MajorClaims"] = df["MajorClaims"].apply(lambda lst: [(item["text"], item["stance"]) for item in lst])
    df["Claims"] = df["Claims"].apply(lambda lst: [(item["text"], item["stance"]) for item in lst])
    df["Premises"] = df["Premises"].apply(lambda lst: [(item["text"], item["stance"]) for item in lst])

    return df

# Usage
PATH = 'data/original/brat-project-final/'
df = process_files_in_directory(PATH)

# Print or save the DataFrame as needed
df


Unnamed: 0,MajorClaims,Claims,Premises,Filename
0,[(we should attach more importance to cooperat...,"[(through cooperation, children can learn abou...",[(What we acquired from team work is not only ...,essay001.ann
1,[(they are able to sustain their cultural iden...,[(sustaining the cultural values of immigrants...,[(maintaining one’s cultural identity is a key...,essay002.ann
2,[(it has contributed to the economic developme...,[(tourism has clearly improved lives in the to...,[(international tourism promotes many aspects ...,essay003.ann
3,[(this industry has affected the cultural attr...,[(the tourism bring large profit for the desti...,[(tourists from different cultures will probab...,essay004.ann
4,[(one who studies overseas will gain many skil...,[(studying at an overseas university gives ind...,[(Compared to the peers studying in the home c...,essay005.ann
...,...,...,...,...
397,[(it is necessary for universities to respect ...,[(many different characters exist between male...,[(Most of male students tend to use their left...,essay398.ann
398,[(this is a worrying tread which has negative ...,[(This gives children the idea that it is not ...,[(The first impression of the celebrities seem...,essay399.ann
399,[(governments should devote a greater portion ...,[(society should be educated and became aware ...,[(decreasing the number of patients in the hea...,essay400.ann
400,[(fatherhood is a as vital part of a healty pa...,[(The first reason why the father's role shoul...,[(the role models within a family play a signi...,essay401.ann


In [19]:
df['MajorClaims'][0]

[('we should attach more importance to cooperation during primary education',
  None),
 ("a more cooperative attitudes towards life is more profitable in one's success",
  None)]

In [20]:
df['Claims'][0]

[('through cooperation, children can learn about interpersonal skills which are significant in the future life of all students',
  None),
 ('competition makes the society more effective', None),
 ('without the cooperation, there would be no victory of competition', None)]

In [21]:
df['Claims'][0]

[('through cooperation, children can learn about interpersonal skills which are significant in the future life of all students',
  None),
 ('competition makes the society more effective', None),
 ('without the cooperation, there would be no victory of competition', None)]

## Var4
Lists and list of dicts per entity in a DataFrame with IDs

In [22]:
import pandas as pd
import re
import os

def parse_ann_file(file_path):
    with open(file_path, 'r') as file:
        content = file.readlines()
    
    major_claims = []
    claims = []
    premises = []
    stances = {}
    relations = []

    for line in content:
        if line.startswith('T'):
            parts = line.strip().split('\t')
            t_number = parts[0]
            annotation = parts[1].split()
            annotation_type = annotation[0]
            sentence = parts[2]

            if annotation_type == 'MajorClaim':
                major_claims.append({'id': t_number, 'Sentence': sentence})
            elif annotation_type == 'Claim':
                claims.append({'id': t_number, 'Sentence': sentence})
            elif annotation_type == 'Premise':
                premises.append({'id': t_number, 'Sentence': sentence})
        
        elif line.startswith('A'):
            parts = line.strip().split('\t')
            annotation = parts[1].split()
            t_number = annotation[1]
            stance = annotation[2]
            stances[t_number] = stance
        
        elif line.startswith('R'):
            parts = line.strip().split('\t')
            annotation = parts[1].split()
            relation_type = annotation[0]
            arg1 = annotation[1].split(':')[1]
            arg2 = annotation[2].split(':')[1]
            relations.append({'Relation': relation_type, 'Arg1': arg1, 'Arg2': arg2})

    # Add stances to claims
    for claim in claims:
        t_number = claim['id']
        claim['Stance'] = stances.get(t_number, '')

    # Add relations to premises
    for premise in premises:
        t_number = premise['id']
        related_relations = [rel for rel in relations if rel['Arg1'] == t_number]
        premise['Relations'] = [{'Relation': rel['Relation'], 'Target': rel['Arg2']} for rel in related_relations]

    return {
        'MajorClaims': major_claims,
        'Claims': claims,
        'Premises': premises,
        'Stances': stances,
        'Relations': relations
    }

def create_dataframe_from_ann_files(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith('.ann'):
            file_path = os.path.join(directory, filename)
            parsed_data = parse_ann_file(file_path)
            data.append({
                'file-name': filename,
                'MajorClaims': parsed_data['MajorClaims'],
                'Claims': parsed_data['Claims'],
                'Premises': parsed_data['Premises'],
                'Stances': parsed_data['Stances'],
                'Relations': parsed_data['Relations']
            })
    
    df = pd.DataFrame(data, columns=['file-name', 'MajorClaims', 'Claims', 'Premises', 'Stances', 'Relations'])
    return df

# Example usage
print(PATH)
df = create_dataframe_from_ann_files(PATH)
df

data/original/brat-project-final/


Unnamed: 0,file-name,MajorClaims,Claims,Premises,Stances,Relations
0,essay001.ann,"[{'id': 'T1', 'Sentence': 'we should attach mo...","[{'id': 'T3', 'Sentence': 'through cooperation...","[{'id': 'T4', 'Sentence': 'What we acquired fr...","{'T3': 'For', 'T7': 'Against', 'T11': 'For'}","[{'Relation': 'supports', 'Arg1': 'T4', 'Arg2'..."
1,essay002.ann,"[{'id': 'T1', 'Sentence': 'they are able to su...","[{'id': 'T4', 'Sentence': 'sustaining the cult...","[{'id': 'T3', 'Sentence': 'maintaining one’s c...","{'T4': 'For', 'T10': 'For', 'T14': 'Against'}","[{'Relation': 'supports', 'Arg1': 'T9', 'Arg2'..."
2,essay003.ann,"[{'id': 'T1', 'Sentence': 'it has contributed ...","[{'id': 'T3', 'Sentence': 'tourism has clearly...","[{'id': 'T4', 'Sentence': 'international touri...","{'T3': 'For', 'T8': 'For'}","[{'Relation': 'supports', 'Arg1': 'T10', 'Arg2..."
3,essay004.ann,"[{'id': 'T1', 'Sentence': 'this industry has a...","[{'id': 'T3', 'Sentence': 'the tourism bring l...","[{'id': 'T6', 'Sentence': 'tourists from diffe...","{'T3': 'Against', 'T4': 'For', 'T5': 'For'}","[{'Relation': 'supports', 'Arg1': 'T8', 'Arg2'..."
4,essay005.ann,"[{'id': 'T1', 'Sentence': 'one who studies ove...","[{'id': 'T3', 'Sentence': 'studying at an over...","[{'id': 'T6', 'Sentence': 'Compared to the pee...","{'T3': 'For', 'T4': 'For', 'T5': 'For', 'T12':...","[{'Relation': 'supports', 'Arg1': 'T6', 'Arg2'..."
...,...,...,...,...,...,...
397,essay398.ann,"[{'id': 'T2', 'Sentence': 'it is necessary for...","[{'id': 'T1', 'Sentence': 'many different char...","[{'id': 'T3', 'Sentence': 'Most of male studen...","{'T1': 'For', 'T8': 'For', 'T10': 'For', 'T12'...","[{'Relation': 'supports', 'Arg1': 'T3', 'Arg2'..."
398,essay399.ann,"[{'id': 'T1', 'Sentence': 'this is a worrying ...","[{'id': 'T3', 'Sentence': 'This gives children...","[{'id': 'T4', 'Sentence': 'The first impressio...","{'T3': 'For', 'T7': 'For', 'T9': 'For'}","[{'Relation': 'supports', 'Arg1': 'T4', 'Arg2'..."
399,essay400.ann,"[{'id': 'T1', 'Sentence': 'governments should ...","[{'id': 'T3', 'Sentence': 'society should be e...","[{'id': 'T4', 'Sentence': 'decreasing the numb...","{'T3': 'For', 'T10': 'For'}","[{'Relation': 'supports', 'Arg1': 'T4', 'Arg2'..."
400,essay401.ann,"[{'id': 'T1', 'Sentence': 'fatherhood is a as ...","[{'id': 'T5', 'Sentence': 'The first reason wh...","[{'id': 'T3', 'Sentence': 'the role models wit...","{'T5': 'For', 'T8': 'For'}","[{'Relation': 'supports', 'Arg1': 'T3', 'Arg2'..."


In [23]:
df["MajorClaims"][0]

[{'id': 'T1',
  'Sentence': 'we should attach more importance to cooperation during primary education'},
 {'id': 'T2',
  'Sentence': "a more cooperative attitudes towards life is more profitable in one's success"}]

In [24]:
df["Claims"][0]

[{'id': 'T3',
  'Sentence': 'through cooperation, children can learn about interpersonal skills which are significant in the future life of all students',
  'Stance': 'For'},
 {'id': 'T7',
  'Sentence': 'competition makes the society more effective',
  'Stance': 'Against'},
 {'id': 'T11',
  'Sentence': 'without the cooperation, there would be no victory of competition',
  'Stance': 'For'}]

In [25]:
df["Premises"][0]

[{'id': 'T4',
  'Sentence': 'What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others',
  'Relations': [{'Relation': 'supports', 'Target': 'T3'}]},
 {'id': 'T5',
  'Sentence': 'During the process of cooperation, children can learn about how to listen to opinions of others, how to communicate with others, how to think comprehensively, and even how to compromise with other team members when conflicts occurred',
  'Relations': [{'Relation': 'supports', 'Target': 'T3'}]},
 {'id': 'T6',
  'Sentence': 'All of these skills help them to get on well with other people and will benefit them for the whole life',
  'Relations': [{'Relation': 'supports', 'Target': 'T3'}]},
 {'id': 'T8',
  'Sentence': 'the significance of competition is that how to become more excellence to gain the victory',
  'Relations': [{'Relation': 'supports', 'Target': 'T7'}]},
 {'id': 'T9',
  'Sentence': 'when we consider about the question that ho