# Create a Knowledge Graph from Text

## Task 1: Import Libraries

In [2]:
import wikipedia as wp
import re
import requests
import spacy
import spacy_transformers
from spacy import displacy
from spacy.matcher import Matcher
import networkx as nx
from pyvis.network import Network


## Task 2: Load the Data

In [3]:
# Set the language of the response
wp.set_lang("en")

# Obtain and store the data
title = "New York"
data = wp.page(title).content

# View the data for testing
print(data)


New York, often called New York City (NYC), is the most populous city in the United States. It is located at the southern tip of New York State on one of the world's largest natural harbors. The city comprises five boroughs, each coextensive with its respective county. The city is the geographical and demographic center of both the Northeast megalopolis and the New York metropolitan area, the largest metropolitan area in the United States by both population and urban area. New York is a global center of finance and commerce, culture, technology, entertainment and media, academics and scientific output, the arts and fashion, and, as home to the headquarters of the United Nations, international diplomacy.
With an estimated population in July 2024 of 8,478,072, distributed over 300.46 square miles (778.2 km2), the city is the most densely populated major city in the United States. New York City has more than double the population of Los Angeles, the nation's second-most populous city. Ove

## Task 3: Preprocess the Data

In [4]:
# Convert the data to lowercase and replace new lines
data = data.lower().replace('\n', "")

# Remove the last part of the text, certain punctuation marks,
# headings, as well as any text within the parentheses
# -----------------------------------------------------------------------------
# Strip out various unwanted bits from `data` in one pass:
#
# 1. == see also ==…         — remove any "== see also ==" heading and everything that follows on that line
# 2. [@#:&"]                 — remove any single character that is @, #, :, & or "
# 3. ===…===                 — remove level-3 headings of the form ===some text===
# 4. ==…==                   — remove level-2 headings of the form ==some text==
# 5. (… )                    — remove any text in parentheses
# -----------------------------------------------------------------------------
data = re.sub(
    r'==\s*see\ also\s*==.*'  # 1
    r'|[@#:&"]'               # 2
    r'|===.*?==='             # 3
    r'|==.*?=='               # 4
    r'|\(.*?\)'               # 5
    , '', data
)

# View the processed data
print(data)


new york, often called new york city , is the most populous city in the united states. it is located at the southern tip of new york state on one of the world's largest natural harbors. the city comprises five boroughs, each coextensive with its respective county. the city is the geographical and demographic center of both the northeast megalopolis and the new york metropolitan area, the largest metropolitan area in the united states by both population and urban area. new york is a global center of finance and commerce, culture, technology, entertainment and media, academics and scientific output, the arts and fashion, and, as home to the headquarters of the united nations, international diplomacy.with an estimated population in july 2024 of 8,478,072, distributed over 300.46 square miles , the city is the most densely populated major city in the united states. new york city has more than double the population of los angeles, the nation's second-most populous city. over 20.1 million pe

## Task 4: Recognize Named Entities

In [5]:
# Load a language model
nlp = spacy.load('en_core_web_lg')
doc = nlp(data)

# Display the entities in the doc
displacy.render(doc, style = "ent", jupyter = True)


## Task 5: Compute Coreference Clusters

In [6]:
# Add the coreference resolution component in the pipeline
nlp.add_pipe('coreferee')

# Pass the data to the language model
doc = nlp(data)

# Print resolved coreferences, if any
doc._.coref_chains.print()

0: york(1), city(13), it(19), city(41), its(49), city(54), york(69), york(89), city(147), city(154)
1: states(80), states(158)
2: city(162), city(190), its(199), city(215), city(227), its(229)
3: world(220), world(265)
4: states(242), city(262), city(271)
5: us(284), city(297)
6: population(292), its(299)
7: amsterdam(303), amsterdam(323)
8: city(331), city(336)
9: ii(352), his(357)
10: britain(378), city(381)
11: states(389), city(397), its(405)
12: manhattan(408), manhattan(428)
13: city(432), city(450)
14: world(438), world(453), world(471), world(511)
15: york(461), york(485), york(507), city(519)
16: countries(496), their(534)
17: world(524), world(571), world(601), world(631)
18: city(547), city(563), city(568), city(605)
19: york(639), york(648), city(672)
20: james(650), him(660)
21: kingdom(679), it(683)
22: algonquians(705), their(711)
23: harbor(751), harbor(797), harbor(828)
24: verrazzano(762), he(764)
25: area(767), it(772)
26: captain(784), he(811), he(845)
27: gomes(786

## Task 6: Resolve Coreferences

In [7]:
resolved_data = ""
for token in doc:
    resolved_coref = doc._.coref_chains.resolve(token)
    if resolved_coref:
        resolved_data += " " + " and ".join(r.text for r in resolved_coref)
    elif token.dep_ == "punct":
        resolved_data += token.text
    else:
        resolved_data += " " + token.text
print(resolved_data)

 new york, often called new york city, is the most populous york in the united states. york is located at the southern tip of new york state on one of the world 's largest natural harbors. the york comprises five boroughs, each coextensive with york respective county. the york is the geographical and demographic center of both the northeast megalopolis and the new york metropolitan area, the largest metropolitan area in the united states by both population and urban area. new york is a global center of finance and commerce, culture, technology, entertainment and media, academics and scientific output, the arts and fashion, and, as home to the headquarters of the united nations, international diplomacy.with an estimated population in july 2024 of 8,478,072, distributed over 300.46 square miles, the york is the most densely populated major york in the united states. new york city has more than double the population of los angeles, the nation 's second- most populous city. over 20.1 milli

## Task 7: Extract Relationships

In [8]:
def extract_relationship(sentence):
    doc = nlp(sentence)

    first, last = None, None

    for chunk in doc.noun_chunks:
        if not first:
            first = chunk
        else:
            last = chunk
    
    if first and last:
        return (first.text.strip(), last.text.strip(), str(doc[first.end:last.start]).strip())
    
    return (None, None, None)

## Task 8: Create a Graph

In [10]:
# A helper function that prints 5 words per row. Can be used for better readability of a given text.
print_five_words = lambda sentence: '\n'.join(' '.join(sentence.split()[i:i+5]) for i in range(0, len(sentence.split()), 5))

# Create a Network object
graph_doc = nlp(resolved_data)

# Create an empty graph
nx_graph = nx.DiGraph()

for sent in enumerate(graph_doc.sents) :
    if len(sent[1]) > 3:
        (a, b, c) = extract_relationship(str(sent[1]))

        # Add nodes and edges to graph
        if a and b:
            nx_graph.add_node(a, size = 5)
            nx_graph.add_node(b, size = 5)
            nx_graph.add_edge(a, b, weight=1, title=print_five_words(c), arrows="to")

g = Network(notebook = True, cdn_resources = 'in_line')
g.from_nx(nx_graph)
g.save_graph("/usercode/example.html")

In [11]:
# Run this cell to view the resulting graph i.e. the /usercode/example.html file
from IPython.display import HTML, display
import base64

# Read and encode the HTML as base64
with open("/usercode/example.html", "r", encoding="utf-8") as f:
    html_str = f.read()
    b64_html = base64.b64encode(html_str.encode("utf-8")).decode("utf-8")

# Create an iframe using a data URI
iframe = f"""
<iframe src="data:text/html;base64,{b64_html}" width="100%" height="600" style="border:none;"></iframe>
"""

display(HTML(iframe))


## Task 9: List the Related Entities

In [12]:
print(nx_graph.edges(['manhattan']))

[('manhattan', 'important universities'), ('manhattan', "the nation 's 360 largest counties"), ('manhattan', 'the second department')]
