<a href="https://colab.research.google.com/github/DatNguyen2084/DLDH-Metaphor-detection/blob/main/DLDH_BERT_NonMetaphor_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install PyDrive
!pip install dkpro-cassis
!pip install fuzzywuzzy

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os
import os.path
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import numpy as np
from cassis import *
import seaborn as sns
import matplotlib.pyplot as plt
import argparse
from fuzzywuzzy import fuzz
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import re

Collecting dkpro-cassis
  Downloading dkpro-cassis-0.7.0.tar.gz (73 kB)
[K     |████████████████████████████████| 73 kB 1.9 MB/s 
[?25hCollecting lxml==4.7.*
  Downloading lxml-4.7.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 16.1 MB/s 
[?25hCollecting attrs==21.2.*
  Downloading attrs-21.2.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.0 MB/s 
Collecting toposort==1.7
  Downloading toposort-1.7-py2.py3-none-any.whl (9.0 kB)
Collecting deprecation==2.1.*
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: dkpro-cassis
  Building wheel for dkpro-cassis (setup.py) ... [?25l[?25hdone
  Created wheel for dkpro-cassis: filename=dkpro_cassis-0.7.0-py3-none-any.whl size=74043 sha256=595bd622c8a69feca2257cdb958ebf0dcf0cde419a95dbd50af6e1676bf5615a
  Stored in directory: /root/.cache/pip/wheels/a9/3c/80/81baf3926



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /content/drive


# Data

In [None]:
# Mount Google Drive so that the project data is reachable below /content/drive
# The following data is needed: https://drive.google.com/drive/folders/159CN2MDaGLzuoiA7x--Qq5zEdPavFcpf?usp=sharing
# Create a shortcut to that folder in your own Drive ("Add shortcut to Drive" -> "My Drive";
# in the German UI: "Drive-Verknüpfung hinzufügen" -> "Meine Ablage")
from google.colab import drive
drive.mount('/content/drive')

# Path fragments. They are combined by plain string concatenation further down,
# which is why DATA_PATH and TEXT_PATH start with a slash.
ROOT_PATH = '/content/drive/MyDrive/DLDH'
DATA_PATH = '/data'
TEXT_PATH = '/original_texts'

In [None]:
# Read the annotated gold standard; newly generated samples are compared against it
gold_standard_path = ROOT_PATH + DATA_PATH + '/Annotationen-Stufe-2-GoldStandard.csv'
gold_standard_df = pd.read_csv(gold_standard_path)

In [None]:
# Start from an empty dataframe; the generated samples are added to it
df = pd.DataFrame()

# Names of the text files the samples are taken from
including = [
    "nus1_2_matzat_bereinigt.txt",
    "nus2_2_ruppin_bereinigt.txt",
    "nus3_2_schallmeyer_bereinigt.txt",
    "nus6_2_Eleutheropulos_bereinigt.txt",
    "nus9_2_Haecker_bereinigt.txt",
    "nus8_2_Methner_bereinigt.txt",
    "nus5_2_Michaelis_bereinigt.txt",
    "nus7_2_Schalk_bereinigt.txt",
]

# Number of new datapoints to generate per text
individual_amount = 500

def generate_samples_txt(path, df, individual_amount, including,
                         min_length=100, max_similarity=60):
    """
    Generates new non-metaphor samples from the given texts, adds them to the given
    dataframe and saves the intermediate result as CSV after every processed text.

    A sentence becomes a sample only if it has more than ``min_length`` characters,
    is not already contained in the dataframe, and no gold standard passage reaches a
    fuzzy partial-ratio score of ``max_similarity`` or more against it.

    The CSV is written to ROOT_PATH + DATA_PATH + '/NoMetaphor.csv' (module-level
    constants); the gold standard is read from the module-level ``gold_standard_df``.

    :param path: The directory that contains the text files
    :param df: The dataframe to add to (may be empty); it is not modified in place
    :param individual_amount: The maximum amount of new datapoints taken from each text
    :param including: The file names of the texts included in the generation process
    :param min_length: A sentence needs more characters than this to be considered
    :param max_similarity: A sentence is rejected as soon as one gold standard passage
        scores this value or higher
    :return df: The dataframe with the new non-metaphor samples appended
    """
    # The gold standard passages do not change while generating, so extract them once.
    gold_texts = gold_standard_df['Textstelle'].tolist()

    # Passages that are already samples. A set gives fast membership tests and is
    # extended while generating, so a sentence repeated inside one text is taken once.
    if len(df) > 0 and 'Textstelle' in df.columns:
        known_sentences = set(df['Textstelle'].tolist())
    else:
        known_sentences = set()

    for filename in including:
        print(filename)

        # load text; the context manager closes the file even if reading fails
        with open(os.path.join(path, filename), 'rt', encoding='utf-8') as file:
            text = file.read()
        text = re.sub(r'\n+', ' ', text).strip()

        # Rows are collected in a list and added to the dataframe once per text,
        # because DataFrame.append no longer exists in current pandas versions.
        new_rows = []
        size_before = len(df)
        for sentence in sent_tokenize(text):
            # end the generation for this file once its own amount is reached;
            # the limit is counted per text, independent of earlier texts
            if len(new_rows) >= individual_amount:
                break

            if len(sentence) <= min_length or sentence in known_sentences:
                continue

            # check similarity to the gold standard samples via fuzzy string matching;
            # any() stops at the first gold standard passage that is too similar
            if any(fuzz.partial_ratio(gold, sentence) >= max_similarity for gold in gold_texts):
                continue

            new_rows.append({'Textstelle': sentence, 'Metapher?': 'Nein', 'Fokus': '',
                             'Rahmen': '', 'Annotator': 'X', 'Filename': filename})
            known_sentences.add(sentence)

            current_size = size_before + len(new_rows)
            if current_size % 100 == 0:
                print("Current dataframe size:", current_size)

        if new_rows:
            new_df = pd.DataFrame(new_rows)
            # A frame without rows and columns contributes nothing, so skip the concat
            if df.shape == (0, 0):
                df = new_df
            else:
                df = pd.concat([df, new_df], ignore_index=True)

        # save the df after every text so that finished work survives an interruption
        df.to_csv(ROOT_PATH+DATA_PATH+'/NoMetaphor.csv')
        print("Saved Dataframe as csv.")
    return df

# generate new non-metaphor examples from given texts and save them as CSV
text_df = generate_samples_txt(ROOT_PATH+DATA_PATH+TEXT_PATH, df, individual_amount, including)

# show a random preview; the sample size is capped at the number of rows because
# DataFrame.sample raises a ValueError when asked for more rows than exist
display(text_df.sample(min(10, len(text_df))))

nus1_2_matzat_bereinigt.txt
Current dataframe size: 100
Current dataframe size: 200
Current dataframe size: 300
Current dataframe size: 400
Current dataframe size: 500
len 500
Saved Dataframe as csv.
nus2_2_ruppin_bereinigt.txt
Current dataframe size: 600
Current dataframe size: 700
Current dataframe size: 800
Current dataframe size: 900
Current dataframe size: 1000
len 1000
Saved Dataframe as csv.
nus3_2_schallmeyer_bereinigt.txt
Current dataframe size: 1100
Current dataframe size: 1200
Current dataframe size: 1300
Current dataframe size: 1400
Current dataframe size: 1500
len 1500
Saved Dataframe as csv.
nus6_2_Eleutheropulos_bereinigt.txt
Current dataframe size: 1600
Current dataframe size: 1700
Current dataframe size: 1800
Current dataframe size: 1900
Current dataframe size: 2000
len 2000
Saved Dataframe as csv.
nus9_2_Haecker_bereinigt.txt
Current dataframe size: 2100
Current dataframe size: 2200
Current dataframe size: 2300
Current dataframe size: 2400
Current dataframe size: 2500

Unnamed: 0,Annotator,Filename,Fokus,Metapher?,Rahmen,Textstelle
1669,X,nus6_2_Eleutheropulos_bereinigt.txt,,Nein,,Mit der auf breitere Basis gestellten Induktio...
3635,X,nus7_2_Schalk_bereinigt.txt,,Nein,,"Alle diese Leute, welche von Generation zu Gen..."
434,X,nus1_2_matzat_bereinigt.txt,,Nein,,Wenn irgend ein Verhältnis objektiver Förderun...
1050,X,nus3_2_schallmeyer_bereinigt.txt,,Nein,,"Denn je höher die Zivilisation steigt, desto m..."
626,X,nus2_2_ruppin_bereinigt.txt,,Nein,,"Mit Recht bemerkt aber Tylor1), 1) Introductio..."
3828,X,nus7_2_Schalk_bereinigt.txt,,Nein,,Sehr bald wurde sie käuf- lich durch Versprech...
1240,X,nus3_2_schallmeyer_bereinigt.txt,,Nein,,Dafür kann die Eizelle auf Beweglichkeit verzi...
1763,X,nus6_2_Eleutheropulos_bereinigt.txt,,Nein,,Prinzipiell tritt er uns in zwei Formen entgeg...
1948,X,nus6_2_Eleutheropulos_bereinigt.txt,,Nein,,Erst jetzt sind uns aber auch die Erscheinunge...
1793,X,nus6_2_Eleutheropulos_bereinigt.txt,,Nein,,Denn bei der Annahme eines Rechtsgefühls wird ...
