<a href="https://colab.research.google.com/github/ElemelonWind/knowledge-graph/blob/main/KG_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1: Generating KG

## Import Dependencies + Datasets

In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [2]:
candidate_sentences = pd.read_csv('https://raw.githubusercontent.com/phgunawan/Latihan-ML/master/wiki_sentences_v2.csv')
candidate_sentences.shape

(4318, 1)

In [3]:
candidate_sentences['sentence'].sample(5)


2827                            jagannadh meanwhile directed and produced 143 .
4102            this film is also the only rugrats film to receive a pg rating.
1470    at least forty-four major gujarati directors worked during this period.
2801                   but that didn't materialize because of time constraints.
2676                                 elliot goldenthal composed the soundtrack.
Name: sentence, dtype: object

## Sentence Segmentation & Extraction

In [4]:
doc = nlp(candidate_sentences["sentence"][2])

for tok in doc:
  print(tok.text, "...", tok.dep_)

christian ... nsubjpass
is ... auxpass
then ... advmod
paralyzed ... ROOT
by ... agent
an ... det
elder ... pobj
. ... punct


In [5]:
def get_entities(sent):
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""
  prv_tok_text = "" 

  prefix = ""
  modifier = ""

  for tok in nlp(sent):
    if tok.dep_ != "punct":
      
      # text associated with subject/object
      if tok.dep_ == "compound":
        prefix = tok.text
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " " + tok.text
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " " + tok.text
      
      # subject = first entity
      if tok.dep_.find("subj") == True:
        ent1 = modifier + " " + prefix + " " + tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""
      # object = second entity
      if tok.dep_.find("obj") == True:
        ent2 = modifier + " " + prefix + " " + tok.text

      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text 
    
  return [ent1.strip(), ent2.strip()]

In [6]:
get_entities(candidate_sentences["sentence"][2])

['christian', 'then  elder']

In [7]:
entity_pairs = [] 

for i in tqdm(candidate_sentences["sentence"]):
  entity_pairs.append(get_entities(i))

100%|██████████| 4318/4318 [00:37<00:00, 114.37it/s]


In [8]:
entity_pairs[10:20]

[['we', 'tests'],
 ['m', 'international sales rights'],
 ['musician robbie robertson', 'soundtrack'],
 ['it', 'original music tracks'],
 ['it', 'reviewed  franchise'],
 ['she', 'accidentally  mystique'],
 ['military  forces', 'arrest'],
 ['train', 'vuk'],
 ['kota eberhardt', 'telepath selene gallio'],
 ['singer', '-']]

In [9]:
# spaCy's rule-based matching
def get_relation(sent):
  doc = nlp(sent)
  matcher = Matcher(nlp.vocab)
  pattern = [{'DEP': 'ROOT'},
             {'DEP': 'prep', 'OP': '?'},
             {'DEP': 'agent', 'OP': '?'},
             {'POS': 'ADJ', 'OP': '?'}]

  matcher.add("matching_1", [pattern], on_match = None)
  matches = matcher(doc)
  k = len(matches) - 1
  try:
    span = doc[matches[k][1]:matches[k][2]]
  except:
    return "n/a"
  return span.text

In [10]:
get_relation(candidate_sentences["sentence"][2])

'paralyzed by'

In [11]:
relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]

100%|██████████| 4318/4318 [00:42<00:00, 101.51it/s]


In [12]:
pd.Series(relations).value_counts()[:50]

is               365
was              299
released on       88
are               78
include           72
were              68
released          41
composed by       33
's                32
have              31
became            30
has               30
become            28
released in       27
included          22
produced          21
been              20
made              20
had               19
called            19
considered        18
used              18
be                17
received          15
written by        15
scheduled         15
stars             14
hired             14
produced by       13
directed by       13
introduced in     13
began             13
went              13
wanted            12
wrote             11
began in          11
won               11
set               10
gave              10
includes          10
sold              10
features           9
cast as            9
used in            9
opened             9
gives              9
shot in            9
produced in  

## Build KG