In [1]:
#!python -m spacy download de_dep_news_trf

In [2]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('de_dep_news_trf')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

#pd.set_option('display.max_colwidth', 200)
#%matplotlib inline



In [3]:
with open("KE1.txt", "r") as f:
    txt = f.read()

doc = nlp(txt)

In [4]:
for tok in nlp("Peter mag Ute."):
    print(tok, tok.dep_, tok.pos_, list(tok.children)[0] if len(list(tok.children)) > 0 else "")

Peter sb PROPN 
mag ROOT VERB Peter
Ute oa PROPN 
. punct PUNCT 


In [5]:
l = ["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd", "cj", "cm", "cp", "cvc", "da", "dep", "dm", "ep", "ju", "mnr", "mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par", "pd", "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs", "sb", "sbp", "svp", "uc", "vo"]

In [6]:
for i in l:
    print(i, spacy.explain(i))

ROOT None
ac adpositional case marker
adc adjective component
ag genitive attribute
ams measure argument of adjective
app apposition
avc adverbial phrase component
cc coordinating conjunction
cd coordinating conjunction
cj conjunct
cm comparative conjunction
cp complementizer
cvc collocational verb construction
da dative
dep unclassified dependent
dm discourse marker
ep expletive es
ju junctor
mnr postnominal modifier
mo modifier
ng negation
nk noun kernel element
nmc numerical component
oa accusative object
oc clausal object
og genitive object
op prepositional object
par parenthetical element
pd predicate
pg phrasal genitive
ph placeholder
pm morphological particle
pnc proper noun component
punct punctuation
rc relative clause
re repeated element
rs reported speech
sb subject
sbp passivized subject (PP)
svp separable verb prefix
uc unit component
vo vocative


# 1. Sentence segmentation

# 2. Entity extraction

In [7]:
def get_entities(sent):
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    #print(tok.dep_)
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      '''# check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text'''
      
      ## chunk 3
      if tok.dep_ == "sb":#if tok.dep_.find("subj") == True:
        #print(tok, tok.dep_)
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_ == "oa":##if tok.dep_.find("obj") == True:
        #print(tok, tok.dep_)
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [8]:
entity_pairs = []

for i in tqdm(doc.sents):
  entity_pairs.append(get_entities(i.text))

1129it [00:40, 27.71it/s]


# 3. Relations Extraction

In [9]:
def get_relation(sent):

  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", [pattern]) 

  matches = matcher(doc)
  k = len(matches) - 1

  if k == -1:
    return

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [10]:
relations = [get_relation(i.text) for i in tqdm(doc.sents)]

1129it [00:42, 26.87it/s]


# Build KG

In [11]:
# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

In [14]:
G=nx.from_pandas_edgelist(kg_df[kg_df['edge']=="ist"], "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5) # k regulates the distance between nodes
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1, edge_cmap=plt.cm.Blues, pos = pos)
#plt.show()



: 

: 

# Flair

In [14]:
!pip install flair

Collecting flair
  Using cached flair-0.11.3-py3-none-any.whl (401 kB)
Collecting segtok>=1.5.7
  Using cached segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting langdetect
  Using cached langdetect-1.0.9-py3-none-any.whl
Collecting mpld3==0.3
  Using cached mpld3-0.3-py3-none-any.whl
Collecting wikipedia-api
  Using cached Wikipedia_API-0.5.8-py3-none-any.whl (13 kB)


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
markdown 3.3.6 requires importlib-metadata>=4.4; python_version < "3.10", but you have importlib-metadata 3.10.1 which is incompatible.


Collecting konoha<5.0.0,>=4.0.0
  Using cached konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting bpemb>=0.3.2
  Using cached bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting deprecated>=1.2.4
  Using cached Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting hyperopt>=0.2.7
  Using cached hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
Collecting sqlitedict>=1.6.0
  Using cached sqlitedict-2.1.0-py3-none-any.whl
Collecting janome
  Using cached Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
Collecting gdown==4.4.0
  Using cached gdown-4.4.0-py3-none-any.whl
Collecting gensim>=3.4.0
  Using cached gensim-4.3.0-cp39-cp39-win_amd64.whl (24.0 MB)
Collecting conllu>=4.0
  Using cached conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Collecting ftfy
  Using cached ftfy-6.1.1-py3-none-any.whl (53 kB)
Collecting pptree
  Using cached pptree-3.1-py3-none-any.whl
Collecting py4j
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Collecting overrides<4.0.0,>=3.0.0
  Using cached overrides-3.1.0-p

In [23]:
from flair.data import Sentence
from flair.models import RelationExtractor, SequenceTagger

# 1. make example sentence
sentence = Sentence("George wurde in Washington geboren")

# 2. load entity tagger and predict entities
tagger = SequenceTagger.load('de-pos')
tagger.predict(sentence)
print(sentence)
# check which entities have been found in the sentence
entities = sentence.get_labels('ner')
for entity in entities:
    print(entity)

# 3. load relation extractor
extractor: RelationExtractor = RelationExtractor.load('relations')

# predict relations
extractor.predict(sentence)

# check which relations have been found
relations = sentence.get_labels('relation')
for relation in relations:
    print(relation)

2023-02-08 11:30:08,529 loading file C:\Users\menze\.flair\models\de-pos-ud-hdt-v0.5.pt
2023-02-08 11:30:08,700 SequenceTagger predicts: Dictionary with 58 tags: <unk>, O, APPR, ART, ADJA, NN, VVFIN, PIS, NE, FM, $,, KON, $., CARD, APPRART, $(, PROAV, KOUS, PPER, ADV, VVINF, VAFIN, VMFIN, ADJD, PTKVZ, PTKNEG, KOKOM, PIDAT, PIAT, VVPP, PRF, PTKA, TRUNC, PPOSAT, VVIZU, PTKZU, VAINF, VMINF, PWAV, PDAT, PRELS, KOUI, APPO, VAPP, PWAT, PWS, VVIMP, APZR, PDS, PRELAT
Sentence: "George wurde in Washington geboren" → ["George"/NE, "wurde"/VAFIN, "in"/APPR, "Washington"/NE, "geboren"/VVPP]
2023-02-08 11:30:09,168 loading file C:\Users\menze\.flair\models\relations-v11.pt
