In [1]:
#imports
import requests
from bs4 import BeautifulSoup
import random
import nltk
import urllib
import bs4 as bs
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import tensorflow_hub as hub
from scipy.spatial import distance



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# TF-Hub handle of the sentence encoder that `embed` wraps; kept in a constant so
# the model choice is visible in one place.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(USE_MODEL_URL)

In [4]:
#helper functions
def getSoup(url, timeout=10):
    """Download `url` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely on a stalled
            connection.

    Returns:
        A BeautifulSoup tree built from the response content with the
        'html.parser' backend.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url=url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as if it
    # were the requested article.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def _isWikipediaHeadingWrapper(elem):
    """Return True if `elem` is a <div> carrying the `mw-heading` class."""
    if elem.name != 'div':
        return False
    classes = elem.get("class")
    return bool(classes) and "mw-heading" in classes


def getTitleAndSummaryFromWikipediaPage(soup):
    """Extract the page title and the text of its "Plot"/"Summary" section.

    Args:
        soup: Parsed Wikipedia article (BeautifulSoup tree).

    Returns:
        A `(title, summary)` tuple, where `summary` is every paragraph of the
        section concatenated, each followed by a single space. Returns None
        when the page has no `#firstHeading` element, no h1/h2/h3 header whose
        first word is "plot" or "summary", or no paragraph text under that
        header.
    """
    titleElements = soup.select("#firstHeading")
    if not titleElements:
        return None
    title = titleElements[0].text

    # When several headers qualify, the last one on the page wins.
    relevantHeader = None
    for header in soup.find_all(['h1', 'h2', 'h3']):
        # Only the leading run of letters is compared, so trailing text such
        # as "[edit]" or " summary" in the header does not prevent a match.
        firstString = re.split('[^a-zA-Z]', header.getText())[0].lower()
        if firstString in ("plot", "summary"):
            relevantHeader = header
    if relevantHeader is None:
        return None

    # Newer MediaWiki output wraps each heading in <div class="mw-heading ...">;
    # the section paragraphs are then siblings of that wrapper, not of the
    # heading tag itself.
    sectionStart = relevantHeader
    parent = relevantHeader.parent
    if parent is not None and _isWikipediaHeadingWrapper(parent):
        sectionStart = parent

    summary = ""
    for elem in sectionStart.next_siblings:
        if elem.name and (re.fullmatch('h[1-6]', elem.name) or _isWikipediaHeadingWrapper(elem)):
            # stop at next header (bare tag or wrapped); unlike a plain
            # startswith('h') test this does not stop at <hr> or <header>
            break
        if elem.name == 'p':
            summary += elem.get_text() + " "

    # A header without any paragraph text gives nothing useful to embed.
    if not summary.strip():
        return None
    return title, summary

def removeStopWords(string) -> str:
    """Return `string` with its stop words removed.

    The text is split with `word_tokenize`; every token whose lower-cased form
    appears in the module-level `stop_words` set is dropped, and the surviving
    tokens are joined back together with single spaces.
    """
    keptTokens = []
    for token in word_tokenize(string):
        if token.lower() in stop_words:
            continue
        keptTokens.append(token)
    return " ".join(keptTokens)

def getAllLinksFromPage(soup):
    """Collect the anchors inside `#bodyContent` whose href contains "/wiki/".

    Args:
        soup: Parsed page (BeautifulSoup tree).

    Returns:
        List of the matching `<a>` elements in document order. The list is
        empty when the page has no element with id "bodyContent".
    """
    bodyContent = soup.find(id="bodyContent")
    if bodyContent is None:
        # Without this guard a page lacking the container raised
        # AttributeError on `None.find_all`.
        return []
    return [
        ele
        for ele in bodyContent.find_all("a")
        if ele.has_attr("href") and "/wiki/" in ele['href']
    ]




In [5]:
#get wikipedia url from a wikidata id
def get_wikipedia_url_from_wikidata_id(wikidata_id, lang='en', debug=False, timeout=10):
    """Look up the Wikipedia URL(s) linked to a Wikidata entity.

    Adapted from:
    https://stackoverflow.com/questions/37079989/how-to-get-wikipedia-page-from-wikidata-id

    Args:
        wikidata_id: Entity id as used as key in the API response (e.g. "Q42").
        lang: Language prefix of the wanted wiki; the sitelink looked up is
            f"{lang}wiki". Pass a falsy value to get every sitelink.
        debug: When true, print the id, the request URL and the raw JSON.
        timeout: Seconds to wait for the Wikidata API before giving up.

    Returns:
        With `lang` set: the unquoted URL string, or None if the entity has
        no sitelink for that language. With `lang` falsy: a dict mapping
        sitelink key to unquoted URL. None whenever the response has no
        entities, no entry for `wikidata_id`, or no sitelinks.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Passing the query as `params` lets requests URL-encode the id instead of
    # splicing it into the query string verbatim.
    response = requests.get(
        'https://www.wikidata.org/w/api.php',
        params={
            'action': 'wbgetentities',
            'props': 'sitelinks/urls',
            'ids': wikidata_id,
            'format': 'json',
        },
        timeout=timeout,
    )
    response.raise_for_status()
    json_response = response.json()
    if debug:
        print(wikidata_id, response.url, json_response)

    entities = json_response.get('entities')
    entity = entities.get(wikidata_id) if entities else None
    sitelinks = entity.get('sitelinks') if entity else None
    if not sitelinks:
        return None

    if lang:
        # filter only the specified language
        sitelink = sitelinks.get(f'{lang}wiki')
        wiki_url = sitelink.get('url') if sitelink else None
        return requests.utils.unquote(wiki_url) if wiki_url else None

    # return all of the urls
    wiki_urls = {}
    for key, sitelink in sitelinks.items():
        wiki_url = sitelink.get('url')
        if wiki_url:
            wiki_urls[key] = requests.utils.unquote(wiki_url)
    return wiki_urls


In [33]:
# Load the CSV of Wikidata work ids and keep a NumPy view of its rows.
WIKIDATA_IDS_CSV = "wikidataLitteratureWorkIds.csv"
allIds = pd.read_csv(WIKIDATA_IDS_CSV)
allIdsNp = allIds.to_numpy()

In [13]:
#preprocess data:

def preProcessData(data : list, numberOfElements):
    """Reduce rows of entity URLs to a list of bare Wikidata ids.

    Each row's first field is treated as a URL; the id is whatever follows its
    last "/". Only the first `numberOfElements` rows are used, so the returned
    list has at most `numberOfElements` entries (fewer when `data` is shorter).
    """
    wikidataIds = []
    for position, row in enumerate(data):
        if position >= numberOfElements:
            break
        entityUrl = row[0]
        wikidataIds.append(entityUrl.rsplit("/", 1)[-1])
    return wikidataIds

In [14]:
# Upper bound on how many Wikidata ids are extracted as scraping candidates.
NUMBER_OF_IDS_TO_KEEP = 30
ids = preProcessData(allIdsNp, NUMBER_OF_IDS_TO_KEEP)

In [27]:
def scrape(listOfIdsToScrape, numberOfElementsToScrape, filename):
    """Scrape, embed and save plot summaries for a list of Wikidata ids.

    Ids are processed in order until `numberOfElementsToScrape` distinct
    titles have been stored or the list is exhausted. An id is skipped when it
    has no Wikipedia URL, when its page yields no title/summary, or when any
    step raises; in the last case a one-line message is printed and scraping
    continues with the next id.

    Args:
        listOfIdsToScrape: Iterable of Wikidata ids.
        numberOfElementsToScrape: Maximum number of titles to store.
        filename: Path of the JSON file to write; maps title -> embedding
            (the `.numpy().tolist()` form of what `embed` returns).
    """
    titleAndEncodedSummaries = dict()

    for idToScrape in listOfIdsToScrape:
        # Count what was actually stored, so failed embeddings and duplicate
        # titles do not use up the quota.
        if len(titleAndEncodedSummaries) >= numberOfElementsToScrape:
            break
        try:
            wikipediaUrl = get_wikipedia_url_from_wikidata_id(idToScrape)
            if not wikipediaUrl:
                # no Wikipedia article linked to this entity
                continue
            soup = getSoup(wikipediaUrl)
            res = getTitleAndSummaryFromWikipediaPage(soup)
            if res is None:
                continue
            title, summary = res
            nonStopWordSummary = removeStopWords(summary)
            # make embedding, convert it to plain nested lists so json can store it
            embedding = embed([nonStopWordSummary]).numpy().tolist()
            titleAndEncodedSummaries[title] = embedding
        except Exception as error:
            # Best effort: one bad page must not abort the run, but the failure
            # is reported, and KeyboardInterrupt is no longer swallowed.
            print(f"Skipping {idToScrape}: {error!r}")

    # end by writing the data to file (the with-block closes it)
    with open(filename, "w") as f:
        json.dump(titleAndEncodedSummaries, f)
    print(f"Added {len(titleAndEncodedSummaries)} books")


In [29]:
# Run the scraper over the candidate ids, stopping after three stored summaries.
scrape(
    listOfIdsToScrape=ids,
    numberOfElementsToScrape=3,
    filename="bookTitleAndSummaries.json",
)

Added 3 books


In [30]:
# Read the scraped data back from disk. The context manager closes the file
# handle once the JSON is parsed; a bare open() would leave it open for the
# lifetime of the kernel.
with open('bookTitleAndSummaries.json') as f:
    obj = json.load(f)

In [31]:
# Display the top-level keys of the JSON object loaded from the file.
obj.keys()

dict_keys(['Lancelot, the Knight of the Cart', 'The Walrus and the Carpenter', '150 000 000'])