<a href="https://colab.research.google.com/github/DatNguyen2084/DLDH-Metaphor-detection/blob/main/DLDH_BERT_DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and data

In [None]:
!pip install PyDrive
!pip install dkpro-cassis
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os
import os.path
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import numpy as np
from cassis import *
import seaborn as sns
import matplotlib.pyplot as plt
import argparse
import re, pdb


Collecting dkpro-cassis
  Downloading dkpro-cassis-0.7.0.tar.gz (73 kB)
[K     |████████████████████████████████| 73 kB 1.3 MB/s 
[?25hCollecting lxml==4.7.*
  Downloading lxml-4.7.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 9.3 MB/s 
[?25hCollecting attrs==21.2.*
  Downloading attrs-21.2.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.6 MB/s 
Collecting toposort==1.7
  Downloading toposort-1.7-py2.py3-none-any.whl (9.0 kB)
Collecting deprecation==2.1.*
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: dkpro-cassis
  Building wheel for dkpro-cassis (setup.py) ... [?25l[?25hdone
  Created wheel for dkpro-cassis: filename=dkpro_cassis-0.7.0-py3-none-any.whl size=74043 sha256=a895e10628158dc7adc4166f67e82bdc192ca7f80f2a3445280c67f6d2a8a934
  Stored in directory: /root/.cache/pip/wheels/a9/3c/80/81baf39265

In [None]:
# Mount Google Drive so the shared project folder is reachable from this Colab runtime.
# Required data: https://drive.google.com/drive/folders/159CN2MDaGLzuoiA7x--Qq5zEdPavFcpf?usp=sharing
# First add a shortcut to that folder in your own Drive
# ("Add shortcut to Drive" -> "My Drive"; German UI: "Drive-Verknüpfung hinzufügen" -> "Meine Ablage").
from google.colab import drive
drive.mount('/content/drive')

# Project directory layout: every other path is derived from ROOT_PATH.
ROOT_PATH = '/content/drive/MyDrive/DLDH'
DATA_PATH = f'{ROOT_PATH}/data'
MODEL_PATH = f'{ROOT_PATH}/model'
RESULTS_PATH = f'{ROOT_PATH}/results'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Extract the phase-2 ("Stufe 2") annotation archive into DATA_PATH, unless the
# target folder already exists from an earlier run.
ANNOTATION_PATH = DATA_PATH + '/Annotationen - Stufe 2'

if not os.path.exists(ANNOTATION_PATH):      # only unpack when the extracted folder is missing
  zip_filepath = DATA_PATH + '/Annotationen - Stufe 2.zip'

  # IPython shell escape: "$name" is replaced by the value of the Python variable
  # of that name; the quotes are needed because the paths contain spaces.
  !unzip "$zip_filepath" -d "$DATA_PATH"

Archive:  /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2.zip
   creating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/
   creating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/txt/
  inflating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/txt/Schalk_Metaphern_GruppeBPT.txt  
  inflating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/txt/Methner_Stufe2_pgg.txt  
  inflating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/txt/Methner_Metaphern_GruppeBPT.txt  
  inflating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/txt/Haecker_Stufe2_pgg.txt  
  inflating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/txt/Haecker_Metaphern_GruppeBPT.txt  
  inflating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/Häckel_Welträtsel_Stufe2_P.tsv  
  inflating: /content/drive/My Drive/DLDH/data/Annotationen - Stufe 2/Darwin_Kap1_Stufe2_B.tsv  
  inflating: /content/drive/My Drive/DLDH/data/Annotationen - 

# TSV Data

In [None]:
def concat_string_from_rows(df: pd.DataFrame):
    """
    Join the 'Stelle' values of all rows into a single string.

    Fragments are joined with a single space. When the text collected so far
    ends with '-', the fragment is treated as a word that was hyphenated at a
    line break: the hyphen is removed and the next fragment is attached without
    a space ('Ver-' + 'bindung' -> 'Verbindung').

    :param df: The given dataframe; must contain a 'Stelle' column with string values
    :return: The concatenated text with runs of spaces collapsed to one space.
             For a non-empty dataframe the result starts with a single space.
    """
    result = ''
    for s in df['Stelle'].values:
        if result.endswith('-'):
            # De-hyphenate. The `else` is essential: without it the fragment
            # would be appended a second time right after being merged.
            result = result[:-1] + s
        else:
            result = result + ' ' + s
    return re.sub(' +', ' ', result)

def concat_Fokus_Rahmen_from_rows(df: pd.DataFrame):
    """
    Collect the distinct values of the 'Fokus' and 'Rahmen' columns.

    Missing values and empty strings are skipped. Each remaining value is kept
    once, in order of first appearance, and the values are joined with ','.
    Values are compared as whole entries, so 'Haus' is kept even when
    'Haustier' was seen before (a substring test on the joined string would
    silently drop it).

    :param df: The given dataframe; must contain the columns 'Fokus' and 'Rahmen'
    :return fokus: The comma-separated distinct 'Fokus' values ('' if there are none)
    :return rahmen: The comma-separated distinct 'Rahmen' values ('' if there are none)
    """
    def _join_unique(values):
        # Order-preserving de-duplication on whole values.
        seen = []
        for value in values:
            if pd.isnull(value) or value == '':
                continue
            if value not in seen:
                seen.append(value)
        return ','.join(str(value) for value in seen)

    return _join_unique(df['Fokus']), _join_unique(df['Rahmen'])

def get_sentence_dataframe(df, i):
    """
    Collect the run of consecutive rows, starting at row i, whose 'Stelle' cell is filled.

    The run ends at the first row with a missing 'Stelle' value or at the end of
    the dataframe. Rows are addressed by index label via `df.at`, so the
    dataframe is expected to carry the default 0..n-1 index.

    :param df: The given dataframe; must contain a 'Stelle' column
    :param i: The index of the current row
    :return tmp: A dataframe with the collected rows (empty if row i has no 'Stelle'
                 value or i is past the last row)
    :return i: The index of the first row after the collected run
    """
    start = i
    n_rows = len(df)
    while i < n_rows and not pd.isnull(df.at[i, 'Stelle']):
        i = i + 1
    # Select the whole run at once instead of appending row by row;
    # DataFrame.append no longer exists in pandas >= 2.0.
    tmp = df.loc[list(range(start, i))]
    return tmp, i
    
def merge_rows_to_sentence(df: pd.DataFrame,filename):
  """
  Merge sentences that were split over several tsv rows into one row each.

  Rows with an empty 'Stelle' cell act as separators; every run of consecutive
  filled rows becomes one output row. The label depends on the file name:

  * names starting with 'Matzat' or 'Schallmeyer': the sentence is labelled
    'Metapher' when more than two of the annotator columns 'P', 'K', 'A', 'B'
    contain an 'x' in its rows;
  * names starting with 'Ruppin' and containing 'positive': the sentence is
    labelled 'Metapher' and takes the first 'Fokus' / 'Rahmen' value of its rows;
  * otherwise 'Metapher?', 'Fokus' and 'Rahmen' stay empty strings.

  The rows are collected in a list and turned into a dataframe once, because
  DataFrame.append no longer exists in pandas >= 2.0. The result therefore has
  a plain 0..n-1 index.

  :param df: The given dataframe; must contain a 'Stelle' column and a default 0..n-1 index
  :param filename: The filename of the tsv data
  :return result: The dataframe with merged rows
  """
  columns = ['Textstelle', 'Metapher?','Fokus','Rahmen','Annotator', 'Filename']
  records = []
  i = 0
  while i < len(df):
    if pd.isnull(df.at[i, 'Stelle']):
      # separator row: move on to the next row
      i = i + 1
      continue

    # consumes the whole run of filled rows and advances i past it
    tmp, i = get_sentence_dataframe(df, i)
    if len(tmp) == 0:
      continue

    s = concat_string_from_rows(tmp)
    metapher = ''
    fokus = ''
    rahmen = ''
    if re.match(r'^Matzat|Schallmeyer', filename):
      # number of annotators who marked at least one row of this sentence with 'x'
      agreement = 0
      for annotator in ['P','K','A','B']:
        if 'x' in tmp[annotator].unique():
          agreement += 1
      if agreement > 2:
        metapher = 'Metapher'
    elif re.match(r'^Ruppin', filename):
      if re.search('positive', filename):
        metapher = 'Metapher'
        fokus = tmp['Fokus'].unique()[0]
        rahmen = tmp['Rahmen'].unique()[0]

    records.append([s, metapher, fokus, rahmen, 'GoldStandard', filename])

  result = pd.DataFrame(records, columns=columns)
  return result

def blumenberg(tsv,filename):
  """
  Special file handling for the text by Blumenberg.

  Keeps the columns 'Textstelle', 'Gruppe' and 'Foki der Gruppe', renames the
  latter two to 'Metapher?' and 'Fokus', adds an empty 'Rahmen' column plus the
  'Annotator' ('GoldStandard') and 'Filename' columns, and replaces the marker
  'x' in 'Metapher?' with 'Metapher'.

  The work is done on a copy, so the input dataframe is left untouched and
  pandas does not emit a SettingWithCopyWarning.

  :param tsv: The tsv dataframe; must contain 'Textstelle', 'Gruppe' and 'Foki der Gruppe'
  :param filename: The filename of the tsv data
  :return tsv: A new dataframe with the columns 'Textstelle', 'Metapher?', 'Fokus',
               'Rahmen', 'Annotator', 'Filename'
  """
  gold = (
      tsv[['Textstelle', 'Gruppe', 'Foki der Gruppe']]
      .copy()
      .rename(columns={'Gruppe': 'Metapher?', 'Foki der Gruppe': 'Fokus'})
  )
  gold['Rahmen'] = ''
  gold['Annotator'] = 'GoldStandard'
  gold['Filename'] = filename
  gold.loc[gold['Metapher?'] == 'x', 'Metapher?'] = 'Metapher'
  return gold

In [None]:
# Some .tsv files must not be used because they duplicate data from other files.
# Every file name listed here is skipped by open_tsv() while gathering the data.
excluding = [
    "RTPK_1916_paarweise_Instruvtions.tsv",
    "RTPK_1916_paarweise_B.tsv",
    "RTPK_1916_paarweiser Vergleich_P.tsv",
    "RTPK_1916_paarweise_K.tsv",
    "Matzat_Stufe2_Vergleich.tsv",
    "Schallmeyer_Stufe2_Vergleich.tsv",
    "Blumenberg_Arbeit am Mythos_Gruppenergebnis und Vergleich mit pgg.tsv",
    "Ruppin_Stufe2_Vergleich.tsv",
    "Ruppin_Stufe2_gemeinsame Ergebnisse_positive Fälle_Einigkeit.tsv",
    "Ruppin_Stufe2_gemeinsame Ergebnisse_negative Fälle_Uneinigkeit.tsv",
    "Ruppin_Stufe2_Negativfälle_A.tsv",
    "Ruppin_Stufe2_Negativfälle_K.tsv",
    "Ruppin_Stufe2_Negativfälle_P.tsv",
    "Ruppin_Stufe2_Negativfälle_B.tsv",
]

def open_tsv(path):
    """
    Read all .tsv annotation files from a directory into one dataframe.

    Files listed in the module-level `excluding` list are skipped. For every
    remaining file the annotator is derived from the file name suffix
    ('_T.tsv', '_B.tsv', '_P.tsv', '_A.tsv', '_K.tsv', otherwise 'No Annotator'),
    files starting with 'Matzat', 'Ruppin' or 'Schallmeyer' are first merged to
    one row per sentence, and differing column spellings are unified.

    :param path: The path of the tsv files
    :return df: The dataframe containing the tsv data (an empty dataframe if no file qualifies)
    """
    annotator_by_suffix = {'_T.tsv': 'T', '_B.tsv': 'B', '_P.tsv': 'P', '_A.tsv': 'A', '_K.tsv': 'K'}
    # Variant column spellings -> canonical column name. rename() ignores keys
    # that are not present, so one mapping covers all files.
    column_renames = {
        "Stärkegrad (Á, B, C)": "Stärkegrad (A, B, C)",
        "Stärkegrad (0, 1, 2)": "Stärkegrad (A, B, C)",
        "Metapher? ": "Metapher?",
        'Metapher (ja=x; Nein = *leer*)': "Metapher?",
        'Stelle': 'Textstelle',
        'Sätze': 'Textstelle',
        'Abschnitt': 'Seite',
        'Fokus der Metapher': 'Fokus',
        'Theresa': 'Metapher?',
        'Begründung': 'Begründung/Kommentar',
    }
    grade_column = "Stärkegrad (A, B, C)"

    frames = []
    for filename in os.listdir(path):
        if not filename.endswith('.tsv') or filename in excluding:
            continue

        # set annotator of the file based on the file's ending
        annotator = 'No Annotator'
        for suffix, name in annotator_by_suffix.items():
            if filename.endswith(suffix):
                annotator = name
                break

        tsv = pd.read_csv(os.path.join(path, filename), sep='\t', header=0)
        if re.match(r'^Matzat|Ruppin|Schallmeyer', filename):
            print(f'merge rows for file {filename}')
            # merge_rows_to_sentence requires the file name as second argument;
            # calling it with the dataframe alone raises a TypeError.
            # NOTE(review): for 'Matzat'/'Schallmeyer' files that function reads the
            # columns 'P', 'K', 'A', 'B' - confirm the per-annotator files have them.
            tsv = merge_rows_to_sentence(tsv, filename)
            print(f'len: {len(tsv)}')

        # add one column with the previously determined annotator
        tsv['Annotator'] = annotator

        # Some corrections of column names
        had_raw_grade_column = 'Stärkegrad (Á, B, C)' in tsv or 'Stärkegrad (0, 1, 2)' in tsv
        tsv = tsv.rename(columns=column_renames)
        if had_raw_grade_column:
            # translate the numeric grade spelling to letters
            # NOTE(review): only the strings '0'/'1'/'2' are translated; if read_csv
            # parsed the column as numbers these comparisons never match - confirm.
            for raw_grade, grade in (('0', 'A'), ('1', 'B'), ('2', 'C')):
                tsv.loc[tsv[grade_column] == raw_grade, grade_column] = grade

        frames.append(tsv)

    if not frames:
        return pd.DataFrame()
    # a single concat at the end avoids re-copying the growing frame per file
    return pd.concat(frames, axis=0, ignore_index=True)


tsv = open_tsv('/content/Annotationen - Stufe 2/tsv/')
print("Column Names")
print(tsv.columns)

print("Metaphor? values before cleanup")
print(tsv['Metapher?'].unique())

# Normalise the free-text labels in 'Metapher?' to a small fixed vocabulary.
# A missing label means "no metaphor".
tsv.fillna(value={'Metapher?': 'Nein'}, inplace=True)

# (label as written by the annotators, normalised label)
label_fixes = [
    ('X', 'Metapher'),
    ('x', 'Metapher'),
    ('Metapher ', 'Metapher'),
    ('nein', 'Nein'),
    ('nien', 'Nein'),
    ('nein ', 'Nein'),
    ('?', 'Unklar'),
    ('unklar', 'Unklar'),
    ('Metapher/Grenzfall', 'Grenzfall'),
    ('ungeklärter Grenzfall', 'Grenzfall'),
]
for raw_label, clean_label in label_fixes:
    tsv.loc[tsv['Metapher?'] == raw_label, 'Metapher?'] = clean_label

# Rows labelled 'gleiches wie oben' ("same as above") are removed.
tsv.drop(tsv[tsv['Metapher?'] == 'gleiches wie oben'].index, inplace=True)

print("Metaphor? values after cleanup")
print(tsv['Metapher?'].unique())

print(len(tsv))

# XMI

In [None]:
def get_dataframe_from_xmi(path,typesystem):
  """
  Create a pandas dataframe from the xmi files in a directory.

  Only files whose name contains 'Stufe2' and ends with '.xmi' are read; other
  directory entries (the type system file, sub-directories, ...) are skipped
  without being opened. Every sentence yields one row per Fokus annotation it
  covers (labelled 'Metapher'), or a single 'Nein' row when it covers none.

  :param path: The directory containing the xmi files
  :param typesystem: The path of the type system xml file
  :return df: The dataframe containing the xmi data
  """
  with open(typesystem, 'rb') as f:
    loaded_typesystem = load_typesystem(f)

  column_names = ["Textstelle", "Metapher?", "Fokus", "Rahmen", "Stärkegrad (A, B, C)", "Annotator", "Filename"]

  # Rows are collected in a list and converted once at the end; concatenating
  # a one-row frame per annotation re-copies the whole frame every time.
  rows = []
  for filename in os.listdir(path):
    if not ("Stufe2" in filename and filename.endswith('.xmi')):
      continue

    # getting annotator
    annotator = get_annotator(os.path.splitext(filename)[0])

    with open(os.path.join(path, filename), 'rb') as f:
      cas = load_cas_from_xmi(f, typesystem=loaded_typesystem)
      # getting all sentences
      for sentence in cas.select("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"):
        sentence_text = sentence.get_covered_text()

        # getting all focuses
        fokus_list = cas.select_covered('webanno.custom.Fokus', sentence)
        if len(fokus_list) == 0:
          # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0
          rows.append([sentence_text, 'Nein', np.nan, np.nan, np.nan, annotator, filename])
          continue

        for fokus in fokus_list:
          fokus_text = fokus.get_covered_text()
          # getting the Rahmen for the Fokus via the FokusRahmenLink
          if len(fokus['Rahmen']['elements']) > 0:
            rahmen = fokus['Rahmen']['elements'][0]['target']
            rahmen_text = rahmen.get_covered_text()
          # apparently, there are Fokus annotations without a specified Rahmen, this would be the place to exclude them
          else:
            rahmen_text = ''

          # getting the Score_ABC
          score = fokus['Score_ABC']

          rows.append([sentence_text, 'Metapher', fokus_text, rahmen_text, score, annotator, filename])

  df = pd.DataFrame(rows, columns=column_names)
  return df
   
def get_annotator(filename):
  """
  Derive the annotator from a file name given without its extension.

  The name has to end with one of the annotator letters T, P, B, A or K,
  optionally followed by a part number '_1' to '_5'.

  :param filename: The name of the file
  :return annotator: The annotator letter, or 'No Annotator' if the name matches none
  """
  part_suffixes = ('', '_1', '_2', '_3', '_4', '_5')
  for letter in ('T', 'P', 'B', 'A', 'K'):
    endings = tuple(letter + part for part in part_suffixes)
    if filename.endswith(endings):
      return letter
  return 'No Annotator'

# Directory that is scanned for the .xmi annotation files.
file_path="/content/Annotationen - Stufe 2/xmi/"
# The type system definition is read from the same directory.
type_system=os.path.join(file_path,"TypeSystem.xml")

# Build the xmi dataframe, then show its row count and the first ten rows.
xmi = get_dataframe_from_xmi(file_path, type_system)
print(len(xmi))
xmi.head(10)

765


Unnamed: 0,Textstelle,Metapher?,Fokus,Rahmen,"Stärkegrad (A, B, C)",Annotator,Filename
0,"Das Denken wird ihm schwer, er strengt seinen ...",Metapher,überwuchern,daß seine Einbildungskraft seinen Verstand und...,C,P,Schalk_Stufe2_P_1.xmi
1,Unter allen natürlichen Produkten eines Landes...,Metapher,versteinerte,Sonnenstrahlen,C,P,Schalk_Stufe2_P_1.xmi
2,"Aus der ganzen Darstellung erkennt man, daß de...",Nein,,,,P,Schalk_Stufe2_P_1.xmi
3,Da die organische Natur aus der anorganischen ...,Nein,,,,P,Schalk_Stufe2_P_1.xmi
4,"Vielleicht würden alle etwas mehr wissen, wenn...",Nein,,,,P,Schalk_Stufe2_P_1.xmi
5,"So wird sich wiederholen, was uns die Geschich...",Nein,,,,P,Schalk_Stufe2_P_1.xmi
6,Die Materie schafft die Welten mit allen den b...,Metapher,die Verkehrsstraße zwischen den Welten,der Äther ist,B,P,Schalk_Stufe2_P_1.xmi
7,"Man kann kaum begreifen, wie irgend jemand sic...",Metapher,versteinerter,Sonnenstrahlen,C,P,Schalk_Stufe2_P_1.xmi
8,"Das Haus, infolge seiner größeren Benutzung, e...",Nein,,,,P,Schalk_Stufe2_P_1.xmi
9,"Auch bei Tieren, besonders bei gesellig lebend...",Nein,,,,P,Schalk_Stufe2_P_1.xmi


In [None]:
# combine xmi and tsv data into one frame with a fresh 0..n-1 index
df = pd.concat([tsv,xmi], axis=0, ignore_index=True)

# save the combined annotations as csv (this is not the gold standard, which is
# built and saved separately); the row index is written as the first column
df.to_csv(DATA_PATH + '/Annotationen-Stufe-2.csv')

# Gold Standard

In [None]:
# The following tsv files contain a gold standard that was discussed by the annotators.
# Only these files are read by open_tsv_gold_standard().
including = [
    "Blumenberg_Arbeit am Mythos_Gruppenergebnis und Vergleich mit pgg.tsv",
    "Matzat_Stufe2_Vergleich.tsv",
    "Schallmeyer_Stufe2_Vergleich.tsv",
    "Ruppin_Stufe2_gemeinsame Ergebnisse_negative Fälle_Uneinigkeit.tsv",
    "Ruppin_Stufe2_gemeinsame Ergebnisse_positive Fälle_Einigkeit.tsv",
]


def open_tsv_gold_standard(path):
    """
    Read the gold standard tsv files from a directory into one dataframe.

    Only files named in the module-level `including` list are read. Files
    starting with 'Matzat', 'Schallmeyer' or 'Ruppin' are merged to one row per
    sentence, the 'Blumenberg' file gets its own conversion. Every row is tagged
    with the annotator 'GoldStandard' and its file name, and differing column
    spellings are unified.

    :param path: The path of the tsv files
    :return df: The dataframe containing the tsv data
    """
    grade_column = "Stärkegrad (A, B, C)"
    # variant column spelling -> canonical column name
    column_fixes = {
        "Stärkegrad (Á, B, C)": grade_column,
        "Stärkegrad (0, 1, 2)": grade_column,
        "Metapher? ": "Metapher?",
        'Metapher (ja=x; Nein = *leer*)': "Metapher?",
        'Stelle': 'Textstelle',
        'Sätze': 'Textstelle',
        'Abschnitt': 'Seite',
        'Fokus der Metapher': 'Fokus',
        'Theresa': 'Metapher?',
        'Begründung': 'Begründung/Kommentar',
    }

    df = pd.DataFrame()
    for filename in os.listdir(path):
        if not (filename.endswith('.tsv') and filename in including):
            continue

        tsv = pd.read_csv(os.path.join(path, filename), sep='\t', header=0)

        if re.match(r'^Matzat|Schallmeyer|Ruppin', filename):
            print(f'merge rows for file {filename}')
            tsv = merge_rows_to_sentence(tsv,filename)
        elif re.match(r'^Blumenberg', filename):
            tsv = blumenberg(tsv,filename)

        print(f'len: {len(tsv)}')

        # every gold standard row carries the same annotator tag and its source file
        tsv['Annotator'] = 'GoldStandard'
        tsv['Filename'] = filename

        # Unify the column names. The grade values are translated only when the
        # column arrived under one of its variant spellings.
        has_variant_grade_column = 'Stärkegrad (Á, B, C)' in tsv or 'Stärkegrad (0, 1, 2)' in tsv
        tsv.rename(columns=column_fixes, inplace=True)
        if has_variant_grade_column:
            for digit, letter in zip('012', 'ABC'):
                tsv.loc[tsv[grade_column] == digit, grade_column] = letter

        df = pd.concat([df,tsv], axis=0, ignore_index=True)
    return df


tsv_gold_standard = open_tsv_gold_standard('/content/Annotationen - Stufe 2/tsv/')
# Report on the frame that was just loaded (not on `tsv`, which belongs to the
# TSV section and may hold unrelated or stale data).
print("Column Names")
print(tsv_gold_standard.columns)

print("Metaphor? values before cleanup")
print(tsv_gold_standard['Metapher?'].unique())

# some more cleanup
# Keep only rows that have a text. The explicit copy makes the filtered frame
# independent, so the assignments below do not raise a SettingWithCopyWarning.
tsv_gold_standard = tsv_gold_standard[tsv_gold_standard['Textstelle'].notnull()].copy()

# In the gold standard an unlabelled sentence is a metaphor candidate.
tsv_gold_standard['Metapher?'] = tsv_gold_standard['Metapher?'].fillna('Metaphernkandidat')

# (label as written in the files, normalised label)
for raw_label, clean_label in [
    ('X', 'Metapher'),
    ('x', 'Metapher'),
    ('Metapher ', 'Metapher'),
    ('nein', 'Metaphernkandidat'),
    ('nien', 'Metaphernkandidat'),
    ('nein ', 'Metaphernkandidat'),
    ('', 'Metaphernkandidat'),
    ('?', 'Unklar'),
    ('unklar', 'Unklar'),
    ('Metapher/Grenzfall', 'Grenzfall'),
    ('ungeklärter Grenzfall', 'Grenzfall'),
]:
    tsv_gold_standard.loc[tsv_gold_standard['Metapher?'] == raw_label, 'Metapher?'] = clean_label

# Rows labelled 'gleiches wie oben' ("same as above") are removed.
tsv_gold_standard = tsv_gold_standard.drop(
    tsv_gold_standard[tsv_gold_standard['Metapher?'] == 'gleiches wie oben'].index
)

print("Metaphor? values after cleanup")
print(tsv_gold_standard['Metapher?'].unique())

merge rows for file Matzat_Stufe2_Vergleich.tsv
len: 101
merge rows for file Ruppin_Stufe2_gemeinsame Ergebnisse_positive Fälle_Einigkeit.tsv
len: 35
merge rows for file Schallmeyer_Stufe2_Vergleich.tsv
len: 145
merge rows for file Ruppin_Stufe2_gemeinsame Ergebnisse_negative Fälle_Uneinigkeit.tsv
len: 30
len: 112
Column Names
Index(['Textstelle', 'Metapher?', 'Fokus', 'Rahmen', 'Annotator', 'Filename'], dtype='object')
Metaphor? values before cleanup
['Metapher' '' nan]
Metaphor? values after cleanup
['Metapher' 'Metaphernkandidat']
423


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  err

In [None]:
def xmi_gold_standard(xmi):
  """
  Generate a gold standard from given xmi data by majority vote.

  The input holds one row per Fokus annotation (or one 'Nein' row) per sentence
  and file, so a single annotation file can contribute several rows for the
  same sentence. The agreement is therefore counted per file, not per row:
  a sentence is accepted as 'Metapher' when more than half of the files that
  contain it have at least one 'Metapher' row for it. Otherwise it is a
  'Metaphernkandidat'.

  Sentences found in fewer than two files are printed as a hint that no real
  agreement could be computed for them. Rows without a 'Textstelle' are ignored.

  :param xmi: The xmi dataframe with the columns 'Textstelle', 'Metapher?', 'Fokus',
              'Rahmen' and 'Filename'
  :return result: The dataframe containing the gold standard, one row per distinct
                  sentence in order of first appearance. For metaphors 'Fokus' and
                  'Rahmen' are lists of all annotated values, otherwise empty strings;
                  'Filename' is the list of file names of the sentence's rows.
  """
  columns = ['Textstelle', 'Metapher?','Fokus','Rahmen','Annotator', 'Filename']
  # Collected as plain records and converted once; DataFrame.append no longer
  # exists in pandas >= 2.0.
  records = []
  # sort=False keeps the order of first appearance of each sentence
  for stelle, tmp in xmi.groupby('Textstelle', sort=False):
    metapher = 'Metaphernkandidat'
    fokus = ''
    rahmen = ''

    # calculating the annotator agreement: one vote per annotation file
    voters = tmp['Filename'].nunique(dropna=False)
    metaphor_voters = tmp.loc[tmp['Metapher?'] == 'Metapher', 'Filename'].nunique(dropna=False)
    if voters < 2:
      print(stelle)

    # accepting as metaphor if agreement is higher than 0.5
    if metaphor_voters / voters > 0.5:
      metapher = 'Metapher'
      fokus = tmp['Fokus'].dropna().tolist()
      rahmen = tmp['Rahmen'].dropna().tolist()
    filename = tmp['Filename'].tolist()

    records.append({
        'Textstelle': stelle,
        'Metapher?': metapher,
        'Fokus': fokus,
        'Rahmen': rahmen,
        'Annotator': 'GoldStandard',
        'Filename': filename,
    })

  result = pd.DataFrame(records, columns=columns)
  return result

# NOTE(review): this assignment rebinds the name `xmi_gold_standard` from the
# function to the dataframe it returns. Running this cell a second time without
# re-running the function definition fails, because a DataFrame is not callable.
# A fix needs a different name for either the function or the result.
xmi_gold_standard = xmi_gold_standard(xmi)

# show ten randomly chosen rows as a sanity check
xmi_gold_standard.sample(10)

Unnamed: 0,Textstelle,Metapher?,Fokus,Rahmen,Annotator,Filename
0,Zunächst natürlich: Kenntnisse über die Techni...,Metaphernkandidat,,,GoldStandard,"[Weber_Stufe2_K.xmi, Weber_Stufe2_P.xmi, Weber..."
0,"Denn er erhascht von dem, was das Leben des Ge...",Metaphernkandidat,,,GoldStandard,"[Weber_Stufe2_K.xmi, Weber_Stufe2_P.xmi, Weber..."
0,Das aber bedeutet: die Entzauberung der Welt.,Metaphernkandidat,,,GoldStandard,"[Weber_Stufe2_K.xmi, Weber_Stufe2_P.xmi, Weber..."
0,Der Schimmer einer hohen sittlichen Idee aber ...,Metaphernkandidat,,,GoldStandard,"[Methner_Stufe2_T_2.xmi, Methner_Stufe2_B_2.xm..."
0,22\tWie ist die kunstreich zusammengesetzte Ma...,Metaphernkandidat,,,GoldStandard,"[Haeckel_Lebenswunder_Stufe2_B.xmi, Haeckel_Le..."
0,wenn wir von dem schlechten Worte »Rechtsphilo...,Metapher,"[schwindsüchtig, schwindsüchtig ist]","[eine Philosophie, daß eine Philosophie]",GoldStandard,"[Eleutheropulos_Stufe2_K.xmi, Eleutheropulos_S..."
0,[8]: So band er das Geschlecht von heute an da...,Metapher,"[der Zukunft, der Vergangenheit2, starrt in da...","[in die Morgenröte, starrt in das Dunkel, der ...",GoldStandard,"[Michaelis_Stufe2_P.xmi, Michaelis_Stufe2_P.xm..."
0,"Wenn man gewöhnlich mit Aristoteles, Pol. I, 6...",Metaphernkandidat,,,GoldStandard,"[Eleutheropulos_Stufe2_K.xmi, Eleutheropulos_S..."
0,Da die organische Natur aus der anorganischen ...,Metaphernkandidat,,,GoldStandard,"[Schalk_Stufe2_P_1.xmi, Schalk_Stufe2_B_1.xmi,..."
0,"Unter solchen Umständen ist es natürlich, daß ...",Metaphernkandidat,,,GoldStandard,"[Schalk_Stufe2_P_4.xmi, Schalk_Stufe2_T_4.xmi,..."


In [None]:
# combine the tsv and xmi gold standards into one frame with a fresh 0..n-1 index
df_gold_standard = pd.concat([tsv_gold_standard,xmi_gold_standard], axis=0, ignore_index=True)

# save gold standard as csv; the row index is written as the first column
df_gold_standard.to_csv(DATA_PATH + '/Annotationen-Stufe-2-GoldStandard.csv')

658