https://neptune.ai/blog/web-scraping-and-knowledge-graphs-machine-learning

In [4]:

import wikipediaapi  
import pandas as pd
import concurrent.futures
from tqdm import tqdm


In [6]:
def scrape_wikipedia(name_topic, verbose=True):
    """Scrape a Wikipedia article and the pages it links to into a DataFrame.

    The root article is fetched first; every title listed in its ``links`` is
    then fetched concurrently (5 worker threads). Pages whose text is 20
    characters or shorter, and pages that live in one of the listed non-article
    namespaces (or their talk namespaces), are dropped.

    Parameters
    ----------
    name_topic : str
        Title of the root article, e.g. ``'Artificial_intelligence'``.
    verbose : bool, default True
        When true, show a tqdm progress bar while the linked pages are fetched.

    Returns
    -------
    pandas.DataFrame or None
        One row per retained page with the columns ``page``, ``text``,
        ``link``, ``categories`` (list of category names with their first
        9 characters, i.e. the length of ``'Category:'``, removed) and
        ``topic`` (always ``name_topic``). ``None`` is returned, after
        printing a message, when the root article does not exist.
    """
    api_wikipedia = wikipediaapi.Wikipedia(
        language='en',
        user_agent="Mozilla/5.0 (X11; Linux i686; rv:109.0) Gecko/20100101 Firefox/121.0",
        extract_format=wikipediaapi.ExtractFormat.WIKI,
    )

    def link_to_wikipedia(link):
        """Fetch one linked page; return its record, or None if missing/failed."""
        try:
            page = api_wikipedia.page(link)
            if page.exists():
                return {'page': link, 'text': page.text, 'link': page.fullurl,
                        'categories': list(page.categories.keys())}
        except Exception:
            # Best effort: one link that fails to load must not abort the whole
            # scrape. Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass
        return None

    name_of_page = api_wikipedia.page(name_topic)
    if not name_of_page.exists():
        print('Page {} is not present'.format(name_of_page))
        return None

    links_to_page = list(name_of_page.links.keys())
    origin = [{'page': name_topic, 'text': name_of_page.text, 'link': name_of_page.fullurl,
               'categories': list(name_of_page.categories.keys())}]

    procceed = tqdm(desc='Scraped links', unit='', total=len(links_to_page)) if verbose else None
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            links_future = [executor.submit(link_to_wikipedia, link) for link in links_to_page]
            for future in concurrent.futures.as_completed(links_future):
                info = future.result()
                if info:
                    origin.append(info)
                if procceed is not None:
                    procceed.update(1)
    finally:
        # Close the bar even when the scrape is interrupted part-way.
        if procceed is not None:
            procceed.close()

    namespaces = ('Wikipedia', 'Special', 'Talk', 'LyricWiki', 'File', 'MediaWiki',
                  'Template', 'Help', 'User', 'Category talk', 'Portal talk')
    # A namespaced title looks like "Name:..." (or "Name talk:..."). Matching the
    # bare name would also discard ordinary articles such as "User interface"
    # or "File system", so the separator is part of the prefix.
    namespace_prefixes = tuple(name + suffix for name in namespaces for suffix in (':', ' talk:'))

    origin = pd.DataFrame(origin)
    # Per-row text length: len(origin['text']) would be the number of rows,
    # which turns the filter into "keep all" or "drop all".
    has_enough_text = origin['text'].str.len() > 20
    is_namespaced = origin['page'].str.startswith(namespace_prefixes, na=True)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    origin = origin[has_enough_text & ~is_namespaced].copy()
    origin['categories'] = origin['categories'].apply(lambda names: [name[9:] for name in names])

    origin['topic'] = name_topic
    print('Scraped pages', len(origin))

    return origin


In [7]:
# Scrape the 'Artificial_intelligence' article and the pages it links to.
# This issues one Wikipedia API request per linked page.
data_wikipedia = scrape_wikipedia('Artificial_intelligence')

Scraped links: 100%|██████████| 1593/1593 [03:11<00:00,  8.32/s]

Scraped pages 1549





In [8]:
# Persist the scraped frame so later cells can reload it without re-scraping.
# NOTE(review): to_csv writes the DataFrame index as the first column by
# default, so reading this file back yields an extra unnamed column — confirm
# whether downstream cells expect that before switching to index=False.
data_wikipedia.to_csv('scraped_data.csv')

Now that the scraped data is stored as a CSV file, we can build the knowledge graph from it.

In [1]:
# Download spaCy's small English pipeline (en_core_web_sm); a later cell loads it with spacy.load.
# NOTE(review): `!python` runs whichever interpreter is first on PATH — confirm it is the kernel's environment.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import spacy
import pandas as pd
import requests
from spacy import displacy
# import en_core_web_sm
 
# Load the small English spaCy pipeline once; it must already be installed
# (the `spacy download en_core_web_sm` cell does that).
nlp = spacy.load('en_core_web_sm')
 
from spacy.tokens import Span
from spacy.matcher import Matcher
 
import matplotlib.pyplot as plot
from tqdm import tqdm
import networkx as ntx
import neptune
 
%matplotlib inline


In [4]:
import os
from dotenv import load_dotenv

# Load environment variables through python-dotenv (typically from a local
# .env file) so the Neptune credential is never written into the notebook.
load_dotenv()

# None when NEPTUNE_API_TOKEN is not set in the environment.
API_TOKEN = os.environ.get('NEPTUNE_API_TOKEN')

# Start the Neptune run that later cells log to.
run = neptune.init_run(project="7frank/scaping-knowledge-graph", api_token=API_TOKEN)

[REDACTED: a Neptune API token was printed here. Secrets must not be kept in saved notebook output; rotate this token.]


NeptuneInvalidApiTokenException: 
[95m
----NeptuneInvalidApiTokenException------------------------------------------------
[0m
The provided API token is invalid.
Make sure you copied and provided your API token correctly.

You can get it or check if it is correct here:
    - https://app.neptune.ai/get_my_api_token

There are two options to add it:
    - specify it in your code
    - set it as an environment variable in your operating system.

[94mCODE[0m
Pass the token to the [1minit_run()[0m function via the [1mapi_token[0m argument:
    [96mneptune.init_run(project='WORKSPACE_NAME/PROJECT_NAME', api_token='YOUR_API_TOKEN')[0m

[94mENVIRONMENT VARIABLE[0m [92m(Recommended option)[0m
or export or set an environment variable depending on your operating system:

    [92mLinux/Unix[0m
    In your terminal run:
        [95mexport NEPTUNE_API_TOKEN="YOUR_API_TOKEN"[0m

    [92mWindows[0m
    In your CMD run:
        [95mset NEPTUNE_API_TOKEN="YOUR_API_TOKEN"[0m

and skip the [1mapi_token[0m argument of the [1minit_run()[0m function:
    [96mneptune.init_run(project='WORKSPACE_NAME/PROJECT_NAME')[0m

You may also want to check the following docs page:
    - https://docs.neptune.ai/setup/setting_api_token/

[92mNeed help?[0m-> https://docs.neptune.ai/getting_help


In [None]:
# Upload the scraped CSV file to the Neptune run, stored under the "data" field.
# Requires `run` from the neptune.init_run cell to have been created successfully.
run["data"].upload("scraped_data.csv")

In [None]:

# Reload the scraped pages from disk as the input for the graph-building steps.
# NOTE(review): the file was written by to_csv with its index included, so this
# frame presumably carries an extra unnamed first column — confirm.
data = pd.read_csv('scraped_data.csv')