<a href="https://colab.research.google.com/github/ElemelonWind/knowledge-graph/blob/main/KG_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1: Generating KG

## Import Dependencies + Datasets

In [66]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [61]:
candidate_sentences = pd.read_csv('https://raw.githubusercontent.com/phgunawan/Latihan-ML/master/wiki_sentences_v2.csv')
candidate_sentences.shape

(4318, 1)

In [62]:
candidate_sentences['sentence'].sample(5)


3415                     the battle royale phenomenon has become especially popular in the 2010s.
725     the swedish film industry was smaller and slower to get started than the danish industry.
607           zombie's second studio album, the sinister urge, was released on november 13, 2001.
3690                                                                   a style means an attitude.
2421                                                 the theatre was shut down after a few years.
Name: sentence, dtype: object

## Sentence Segmentation & Extraction

In [63]:
doc = nlp(candidate_sentences["sentence"][2])

for tok in doc:
  print(tok.text, "...", tok.dep_)

christian ... nsubjpass
is ... auxpass
then ... advmod
paralyzed ... ROOT
by ... agent
an ... det
elder ... pobj
. ... punct


In [19]:
def get_entities(sent):
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""
  prv_tok_text = "" 

  prefix = ""
  modifier = ""

  for tok in nlp(sent):
    if tok.dep_ != "punct":
      
      # text associated with subject/object
      if tok.dep_ == "compound":
        prefix = tok.text
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " " + tok.text
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " " + tok.text
      
      # subject = first entity
      if tok.dep_.find("subj") == True:
        ent1 = modifier + " " + prefix + " " + tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""
      # object = second entity
      if tok.dep_.find("obj") == True:
        ent2 = modifier + " " + prefix + " " + tok.text

      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text 
    
  return [ent1.strip(), ent2.strip()]

In [64]:
get_entities(candidate_sentences["sentence"][2])

['christian', 'then  elder']

In [65]:
entity_pairs = [] 

for i in tqdm(candidate_sentences["sentence"]):
  entity_pairs.append(get_entities(i))

100%|██████████| 4318/4318 [00:38<00:00, 111.69it/s]


In [67]:
entity_pairs[10:20]

[['we', 'tests'],
 ['m', 'international sales rights'],
 ['musician robbie robertson', 'soundtrack'],
 ['it', 'original music tracks'],
 ['it', 'reviewed  franchise'],
 ['she', 'accidentally  mystique'],
 ['military  forces', 'arrest'],
 ['train', 'vuk'],
 ['kota eberhardt', 'telepath selene gallio'],
 ['singer', '-']]

In [87]:
# spaCy's rule-based matching
def get_relation(sent):
  doc = nlp(sent)
  matcher = Matcher(nlp.vocab)
  pattern = [{'DEP': 'ROOT'},
             {'DEP': 'prep', 'OP': '?'},
             {'DEP': 'agent', 'OP': '?'},
             {'POS': 'ADJ', 'OP': '?'}]

  matcher.add("matching_1", [pattern], on_match = None)
  matches = matcher(doc)
  k = len(matches) - 1
  try:
    span = doc[matches[k][1]:matches[k][2]]
  except:
    print(sent)
    return "n/a"
  return span.text

In [81]:
get_relation(candidate_sentences["sentence"][2])

'paralyzed by'

In [88]:
relations = []
for i in tqdm(candidate_sentences['sentence']):
  relations.append(get_relation(i))





  0%|          | 0/4318 [00:00<?, ?it/s][A[A[A[A



  0%|          | 11/4318 [00:00<00:40, 105.36it/s][A[A[A[A



  1%|          | 23/4318 [00:00<00:38, 110.21it/s][A[A[A[A



  1%|          | 35/4318 [00:00<00:40, 106.04it/s][A[A[A[A



  1%|          | 47/4318 [00:00<00:38, 109.64it/s][A[A[A[A



  1%|▏         | 58/4318 [00:00<00:39, 107.27it/s][A[A[A[A



  2%|▏         | 69/4318 [00:00<00:39, 106.31it/s][A[A[A[A



  2%|▏         | 80/4318 [00:00<00:41, 103.17it/s][A[A[A[A



  2%|▏         | 91/4318 [00:00<00:40, 103.73it/s][A[A[A[A



  2%|▏         | 103/4318 [00:00<00:38, 108.48it/s][A[A[A[A



  3%|▎         | 115/4318 [00:01<00:38, 110.03it/s][A[A[A[A



  3%|▎         | 127/4318 [00:01<00:38, 109.25it/s][A[A[A[A



  3%|▎         | 138/4318 [00:01<00:38, 109.28it/s][A[A[A[A



  3%|▎         | 149/4318 [00:01<00:39, 106.50it/s][A[A[A[A



  4%|▎         | 161/4318 [00:01<00:37, 110.32it/s][A[A[A[A



  4%|▍    

prometheus  and alien: covenant  address extraterrestrial themes.






  7%|▋         | 284/4318 [00:02<00:41, 97.50it/s] [A[A[A[A



  7%|▋         | 294/4318 [00:02<00:43, 92.52it/s][A[A[A[A



  7%|▋         | 304/4318 [00:02<00:42, 94.11it/s][A[A[A[A



  7%|▋         | 315/4318 [00:03<00:41, 97.29it/s][A[A[A[A



  8%|▊         | 326/4318 [00:03<00:39, 100.61it/s][A[A[A[A



  8%|▊         | 338/4318 [00:03<00:38, 103.62it/s][A[A[A[A



  8%|▊         | 349/4318 [00:03<00:39, 100.40it/s][A[A[A[A



  8%|▊         | 360/4318 [00:03<00:39, 100.17it/s][A[A[A[A



  9%|▊         | 371/4318 [00:03<00:40, 98.42it/s] [A[A[A[A



  9%|▉         | 381/4318 [00:03<00:40, 97.03it/s][A[A[A[A



  9%|▉         | 391/4318 [00:03<00:40, 96.21it/s][A[A[A[A



  9%|▉         | 402/4318 [00:03<00:39, 98.17it/s][A[A[A[A



 10%|▉         | 413/4318 [00:04<00:38, 101.03it/s][A[A[A[A



 10%|▉         | 424/4318 [00:04<00:38, 102.14it/s][A[A[A[A



 10%|█         | 435/4318 [00:04<00:38, 100.20it/s][A[A[A[A


martin scorsese's after hours  






 47%|████▋     | 2009/4318 [00:20<00:23, 96.35it/s] [A[A[A[A



 47%|████▋     | 2021/4318 [00:20<00:22, 101.22it/s][A[A[A[A



 47%|████▋     | 2032/4318 [00:20<00:23, 97.96it/s] [A[A[A[A



 47%|████▋     | 2042/4318 [00:20<00:23, 98.15it/s][A[A[A[A



 48%|████▊     | 2053/4318 [00:20<00:22, 100.49it/s][A[A[A[A



 48%|████▊     | 2064/4318 [00:20<00:22, 100.61it/s][A[A[A[A



 48%|████▊     | 2075/4318 [00:20<00:22, 100.83it/s][A[A[A[A



 48%|████▊     | 2086/4318 [00:20<00:21, 102.19it/s][A[A[A[A



 49%|████▊     | 2097/4318 [00:21<00:22, 100.88it/s][A[A[A[A



 49%|████▉     | 2108/4318 [00:21<00:22, 97.23it/s] [A[A[A[A



 49%|████▉     | 2118/4318 [00:21<00:22, 96.94it/s][A[A[A[A



 49%|████▉     | 2129/4318 [00:21<00:21, 100.21it/s][A[A[A[A



 50%|████▉     | 2140/4318 [00:21<00:21, 102.52it/s][A[A[A[A



 50%|████▉     | 2151/4318 [00:21<00:20, 103.32it/s][A[A[A[A



 50%|█████     | 2162/4318 [00:21<00:21, 100.1

leftist filmmakers joris ivens and henri storck directed borinage  about the belgian coal mining region.






 58%|█████▊    | 2487/4318 [00:24<00:18, 99.00it/s] [A[A[A[A



 58%|█████▊    | 2498/4318 [00:25<00:18, 99.74it/s][A[A[A[A



 58%|█████▊    | 2508/4318 [00:25<00:18, 96.36it/s][A[A[A[A



 58%|█████▊    | 2519/4318 [00:25<00:18, 98.38it/s][A[A[A[A



 59%|█████▊    | 2529/4318 [00:25<00:18, 97.73it/s][A[A[A[A



 59%|█████▉    | 2539/4318 [00:25<00:18, 96.20it/s][A[A[A[A



 59%|█████▉    | 2549/4318 [00:25<00:18, 96.42it/s][A[A[A[A



 59%|█████▉    | 2560/4318 [00:25<00:18, 96.19it/s][A[A[A[A



 60%|█████▉    | 2571/4318 [00:25<00:17, 98.18it/s][A[A[A[A



 60%|█████▉    | 2582/4318 [00:25<00:17, 100.30it/s][A[A[A[A



 60%|██████    | 2593/4318 [00:26<00:16, 102.50it/s][A[A[A[A



 60%|██████    | 2605/4318 [00:26<00:16, 105.61it/s][A[A[A[A



 61%|██████    | 2616/4318 [00:26<00:16, 106.29it/s][A[A[A[A



 61%|██████    | 2627/4318 [00:26<00:15, 106.31it/s][A[A[A[A



 61%|██████    | 2638/4318 [00:26<00:15, 105.79it/s]

after mcquarrie's drawings for lucas's colleagues hal barwood and matthew robbins  






 71%|███████   | 3047/4318 [00:30<00:12, 99.96it/s] [A[A[A[A



 71%|███████   | 3058/4318 [00:30<00:12, 100.44it/s][A[A[A[A



 71%|███████   | 3070/4318 [00:30<00:11, 104.66it/s][A[A[A[A



 71%|███████▏  | 3081/4318 [00:30<00:11, 104.59it/s][A[A[A[A



 72%|███████▏  | 3092/4318 [00:30<00:11, 103.34it/s][A[A[A[A



 72%|███████▏  | 3103/4318 [00:30<00:11, 101.59it/s][A[A[A[A



 72%|███████▏  | 3114/4318 [00:31<00:11, 102.57it/s][A[A[A[A



 72%|███████▏  | 3126/4318 [00:31<00:11, 105.78it/s][A[A[A[A



 73%|███████▎  | 3137/4318 [00:31<00:11, 104.47it/s][A[A[A[A



 73%|███████▎  | 3149/4318 [00:31<00:10, 106.92it/s][A[A[A[A



 73%|███████▎  | 3160/4318 [00:31<00:11, 104.15it/s][A[A[A[A



 73%|███████▎  | 3171/4318 [00:31<00:11, 104.00it/s][A[A[A[A



 74%|███████▎  | 3182/4318 [00:31<00:10, 103.85it/s][A[A[A[A



 74%|███████▍  | 3193/4318 [00:31<00:11, 101.86it/s][A[A[A[A



 74%|███████▍  | 3204/4318 [00:31<00:11, 99.

In [89]:
pd.Series(relations).value_counts()[:50]

is               365
was              299
released on       88
are               78
include           72
were              68
released          41
composed by       33
's                32
have              31
became            30
has               30
become            28
released in       27
included          22
produced          21
been              20
made              20
had               19
called            19
considered        18
used              18
be                17
received          15
written by        15
scheduled         15
stars             14
hired             14
produced by       13
directed by       13
introduced in     13
began             13
went              13
wanted            12
wrote             11
began in          11
won               11
set               10
gave              10
includes          10
sold              10
features           9
cast as            9
used in            9
opened             9
gives              9
shot in            9
produced in  

## Build KG