<a href="https://colab.research.google.com/github/GenericP3rson/WikiGraph/blob/main/WikiGraph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WikiGraph

## Schema Creation

In [1]:
!pip install pyTigerGraph[gds]

import pyTigerGraph as tg

conn = tg.TigerGraphConnection(host = "https://bleve.i.tgcloud.io/")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyTigerGraph[gds]
  Downloading pyTigerGraph-0.9-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 4.1 MB/s 
Collecting validators
  Downloading validators-0.20.0.tar.gz (30 kB)
Collecting pyTigerDriver
  Downloading pyTigerDriver-1.0.14-py3-none-any.whl (8.7 kB)
Collecting kafka-python
  Downloading kafka_python-2.0.2-py2.py3-none-any.whl (246 kB)
[K     |████████████████████████████████| 246 kB 14.7 MB/s 
Building wheels for collected packages: validators
  Building wheel for validators (setup.py) ... [?25l[?25hdone
  Created wheel for validators: filename=validators-0.20.0-py3-none-any.whl size=19582 sha256=f142e137966280918d2df52cd3b1568dbddf7798ad3735673f23c091f808c6d3
  Stored in directory: /root/.cache/pip/wheels/5f/55/ab/36a76989f7f88d9ca7b1f68da6d94252bb6a8d6ad4f18e04e9
Successfully built validators
Installing collected packages: validators

In [None]:
conn.gsql('''
USE GLOBAL

CREATE VERTEX Doc(PRIMARY_ID id STRING, title STRING, content STRING) WITH PRIMARY_ID_AS_ATTRIBUTE = "true"
CREATE VERTEX Entity(PRIMARY_ID entity STRING) WITH PRIMARY_ID_AS_ATTRIBUTE = "true"

CREATE DIRECTED EDGE LINKS_TO(FROM Doc, TO Doc, weight DOUBLE)
CREATE UNDIRECTED EDGE DOC_ENTITY(FROM Doc, TO Entity, weight DOUBLE)

CREATE GRAPH WikiGraph(Doc, Entity, LINKS_TO, DOC_ENTITY)
''')

'Successfully created edge types: [DOC_ENTITY].\nThe graph WikiGraph is created.'

In [2]:
conn.graphname = "WikiGraph"
conn.apiToken = conn.getToken(conn.createSecret())

## Load Data

In [None]:
!pip install yake

In [8]:
import requests
from bs4 import BeautifulSoup
import yake
from collections import Counter

# Text cleaning, entity extraction, sentiment analysis, summarisation
# Graph structures

seen = []

def scrape(input_link):

  print(input_link)

  response = requests.get(
    url = input_link,
  )
  soup = BeautifulSoup(response.content, 'html.parser')

  first_paragraph = soup.find('p')

  if first_paragraph == None:
    return 

  while first_paragraph.get_text().strip() == "":
    first_paragraph = first_paragraph.find_next_sibling('p')

  conn.upsertVertex("Doc", input_link, attributes={"id": input_link, "title": soup.find("h1").get_text(), "content": first_paragraph.get_text()})

  text = ("\n".join([''.join(i.get_text().split('\n')) for i in soup.find_all("p")]))

  kw_extractor = yake.KeywordExtractor()
  keywords = kw_extractor.extract_keywords(text)
  
  for word, score in keywords:
    conn.upsertVertex("Entity", word, attributes={"entity": word})
    conn.upsertEdge("Doc", input_link, "DOC_ENTITY", "Entity", word, {"weight": float(1-score)})

  seen.append(input_link)

  links = [f"https://en.wikipedia.org{link['href'].split('#')[0]}" for link in soup.find_all("a") if link.has_attr("href") and link["href"][:6] == "/wiki/" and not ":" in link["href"]]
  tot_links = len(links)

  count = Counter(links)
  for link in links:
    conn.upsertEdge("Doc", input_link, "LINKS_TO", "Doc", link, {"weight": float(count[link]/tot_links)})
    if link not in seen:
      scrape(link)

scrape("https://en.wikipedia.org/wiki/Graph_database")

https://en.wikipedia.org/wiki/Graph_database
https://en.wikipedia.org/wiki/Computing
https://en.wikipedia.org/wiki/Computer
https://en.wikipedia.org/wiki/Computer_(disambiguation)
https://en.wikipedia.org/wiki/Computer_(Courage_the_Cowardly_Dog)
https://en.wikipedia.org/wiki/Cartoon_Network
https://en.wikipedia.org/wiki/Cartoon_Network_(disambiguation)
https://en.wikipedia.org/wiki/Cartoon_Network_Arabic
https://en.wikipedia.org/wiki/United_Arab_Emirates
https://en.wikipedia.org/wiki/UAE_(disambiguation)
https://en.wikipedia.org/wiki/UAE_(emulator)
https://en.wikipedia.org/wiki/Programmer
https://en.wikipedia.org/wiki/Programmer_(disambiguation)
https://en.wikipedia.org/wiki/Program_management
https://en.wikipedia.org/wiki/Program_Manager
https://en.wikipedia.org/wiki/Windows_3.1x
https://en.wikipedia.org/wiki/Windows_NT_3.1
https://en.wikipedia.org/wiki/Windows_NT
https://en.wikipedia.org/wiki/Microsoft
https://en.wikipedia.org/wiki/Microsoft_Redmond_campus
https://en.wikipedia.org/wi

AttributeError: ignored

## Queries

In [None]:
# Coming soon!