# Entity linking

-----
In this notebook, the entities found in documents (using spacy's entity linking) will be linked using a three step process.

##### Table of contents
1. Setup
2. Candidate generation
3. Candidate ranking and selection
4. Unlinkable mention prediciton
-----

### 1. Setup

##### Imports, settings constants

In [1]:
import pandas as pd
import requests
import pageviewapi

from bs4 import BeautifulSoup

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

### 2. Candidate entity generation

In [8]:
entities = pd.read_csv('ner_sample.csv', index_col=0)
entities = entities['label'].to_list()

In [130]:
import requests

def get_wikipedia_articles(named_entity):
  # Set the endpoint for the Wikipedia API
  endpoint = "https://en.wikipedia.org/w/api.php"

  # Set the parameters for the API call
  params = {
    "action": "query",
    "format": "json",
    "list": "search",
    "srsearch": named_entity,
    "utf8": 1,
    "formatversion": 2,
    "formatversion": 2
  }

  # Make the API call
  r = requests.get(endpoint, params=params)

  # Extract the list of articles from the response
  articles = r.json()["query"]["search"]

  # Create a list of links to the articles
  links = []
  for article in articles:
    title = article["title"]
    links.append(title)
    # title = title.replace(" ", "_")
    # link = f"https://en.wikipedia.org/wiki/{title}"
    # links.append(link)

  # Return the list of links
  return links

In [77]:
import requests

def get_wikipedia_article(named_entity):
  # Replace spaces in the named entity with underscores
  named_entity = named_entity.replace(' ', '_')

  # Make a request to the Wikipedia API to search for the named entity
  api_url = f'https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&utf8=1&formatversion=2&srsearch={named_entity}&format=json'
  response = requests.get(api_url)
  data = response.json()

  # Extract the first search result
  search_results = data['query']['search']
  if len(search_results) > 0:
    first_result = search_results[0]

    # Extract the curid link from the search result
    curid = first_result['pageid']
    return f'https://en.wikipedia.org/?curid={curid}'
  else:
    return None


In [131]:
all_articles = []
for entity in entities:
  print(entity)
  all_articles.append(get_wikipedia_articles(entity))
  # print(articles)

Donald Trump's
Tuesday
Manhattan
U.S.
two
the Trump Organization
17
the second day
New York
three years
Trump
up to $1.6 million
US
Trump
Democrats
Trump Organization
Alan Futerfas
Washington
Florida
Mar-a-Lago
2020
Justice Department
Fulton County
Georgia
the White House
last month
Holocaust
Kanye West
Constitution
the Trump Organization's
Allen Weisselberg
Weisselberg
five-month
Jeffrey McConney
month-long
Greatest Political Witch Hunt
New York City
RELIANCE
$1.7 million
McConney
W-2
Joshua Steinglass
Steinglass
Alvin Bragg
Democrat
January
Bragg
District
Cyrus Vance Jr.
Letitia James'
Republican
years ago
last year
Vance
Bragg
Monday
CBC News
first
CBC/Radio-Canada's
CBC
our Submission Guidelines
Audience Relations
CBC P.O. Box 500 Station A
Toronto
ON  Canada
1E6
Canada
1-866
Canadians
CBC Gem


### 3. Candidate entity ranking

In [80]:
# Set wikipedia api endpoint
endpoint = "https://en.wikipedia.org/w/api.php"
# Set the parameters for the API call
params = {
    "action": "query",
    "format": "json",
    "prop": "extracts",
    "exintro": 1,
    "explaintext": 1,
    "titles": "Apple",
    "redirects": 1,
    "formatversion": 2
}
# Set the headers for the API call
headers = {
    "User-Agent": "f.f.sinke@student.vu.nl"
}
# Make the API call
r = requests.get(endpoint, headers=headers)

In [98]:
def get_page_views(wikipedia_link, year=2022, month=None):
    # Extract the page name from the link
    page_name = wikipedia_link.split("/")[-1]

    # Set the time range for the request
    if month is not None:
        start_date = f"{year}-{month}-01"
        end_date = f"{year}-{month}-31"
    else:
        start_date = f"{year}-01-01"
        end_date = f"{year}-11-31"

    print("Check 1")
    # Send the GET request to the Pageviews API
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/{page_name}/daily/{start_date}/{end_date}"
    response = requests.get(url, headers={"User-Agent":"f.f.sinke@student.vu.nl"})
    data = response.json()
    return data
    print("Check 3")
    # Extract the page views data
    page_views = 0
    for item in data['items']:
        page_views += item['views']
    return page_views

wikipedia_link = "https://en.wikipedia.org/?curid=12345"
page_views = get_page_views(wikipedia_link, year=2022)
print(f"Number of page views in 2022: {page_views}")

Check 1
Number of page views in 2022: {'items': []}


In [99]:
get_page_views(all_articles[0][0], year=2022)

Check 1


{'items': []}

##### pageviewAPI

In [110]:
def pageviews(link):
  # Set the endpoint for the Wikipedia API
  endpoint = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/"

  # Set the parameters for the API call
  params = {
    "article": link,
    "granularity": "monthly",
    "start": "2021100100",
    "end": "2021103100"
  }

  # Make the API call
  r = requests.get(endpoint, params=params, headers={"User-Agent": "f.f.sinke@student.vu.nl"})

  # Extract the list of articles from the response
  pageviews = r.json()

  # Return the list of links
  return pageviews

In [113]:
all_articles[0][0]

'https://en.wikipedia.org/?curid=4848272'

In [133]:
all_articles[0]

['Donald Trump',
 'Family of Donald Trump',
 'Social media use by Donald Trump',
 'Cabinet of Donald Trump',
 'Veracity of statements by Donald Trump',
 'Presidency of Donald Trump',
 'Inauguration of Donald Trump',
 'Donald Trump Jr.',
 'Donald Trump 2016 presidential campaign',
 'Donald Trump filmography']

In [123]:
import requests

# Replace PAGE_ID with the ID of the page you want to get the views for
PAGE_ID = 4848272

# Make a request to the Wikipedia API to get the page views for the page
url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/{PAGE_ID}/monthly/2021100100/2021103100"
response = requests.get(url, headers={"User-Agent": 'f.f.sinke@student.vu.nl'})

# Get the page views data from the response
page_views = response.json()

# Print the page views data
print(page_views)

{'type': 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found', 'title': 'Not found.', 'method': 'get', 'detail': 'The date(s) you used are valid, but we either do not have data for those date(s), or the project you asked for is not loaded yet.  Please check https://wikimedia.org/api/rest_v1/?doc for more information.', 'uri': '/analytics.wikimedia.org/v1/pageviews/per-article/en.wikipedia/all-access/user/4848272/monthly/2021100100/2021103100'}


In [None]:
pageviews = pageviewapi.per_article('en.wikipedia', article_name, '2022100100', '2022103100', access='all-access',
                      agent='user', granularity='monthly')
pageviews['items'][0]['views']

print(f"Wikipage {article_name} has {pageviews['items'][0]['views']} views from 2022-10-01 to 2022-10-31")

In [129]:
all_articles[0]

['https://en.wikipedia.org/wiki/Donald_Trump',
 'https://en.wikipedia.org/wiki/Family_of_Donald_Trump',
 'https://en.wikipedia.org/wiki/Social_media_use_by_Donald_Trump',
 'https://en.wikipedia.org/wiki/Cabinet_of_Donald_Trump',
 'https://en.wikipedia.org/wiki/Veracity_of_statements_by_Donald_Trump',
 'https://en.wikipedia.org/wiki/Presidency_of_Donald_Trump',
 'https://en.wikipedia.org/wiki/Inauguration_of_Donald_Trump',
 'https://en.wikipedia.org/wiki/Tax_returns_of_Donald_Trump',
 'https://en.wikipedia.org/wiki/Donald_Trump_2016_presidential_campaign',
 'https://en.wikipedia.org/wiki/Donald_Trump_Jr.']

##### mwviewsAPI

In [102]:
from mwviews.api import PageviewsClient

In [103]:
p = PageviewsClient('https://www.mediawiki.org/wiki/REST_API')

In [137]:
views = p.article_views('en.wikipedia', all_articles[0], granularity='monthly', start='2022100100', end='2022103100')

In [147]:
# get values of views defaultdict object and turn into list



In [160]:
keys = list(views[0].keys())

In [161]:
views[0][keys[0]]

9214328

In [238]:
articles = get_wikipedia_articles("harry")
views = p.article_views('en.wikipedia', articles, granularity='monthly', start='2022100100', end='2022103100')
print(views)

defaultdict(<class 'dict'>, {datetime.datetime(2022, 10, 1, 0, 0): {'Harry': 1795, 'Harry_Potter': 505954, 'Prince_Harry,_Duke_of_Sussex': 444871, 'Harry_Potter_(film_series)': 497169, 'Harry_Kane': 267928, 'Harry_S._Truman': 166239, 'Debbie_Harry': 149455, 'Harry_Styles': 795983, 'Harry_&_Meghan': None, "Harry_Potter_and_the_Philosopher's_Stone": 135585}})


In [245]:
dct = list(views.values())[0]
keys = list(dct.keys())
for key in keys:
    if dct[key] > max:
        max = dct[key]
        max_key = key
return max_key

TypeError: '>' not supported between instances of 'int' and 'builtin_function_or_method'

In [None]:
def select_most_popular(views):
    max_views = 0
    max_key = ''
    dct = list(views.values())[0]
    keys = list(dct.keys())
    # Get key with max value
    for key in keys:
        if dct[key] > max_views:
            max_views = dct[key]
            max_key = key
    return max_key

In [213]:
select_most_popular(views)

'MacOS'

In [208]:
articles = get_wikipedia_articles("Apple")
articles

['Apple',
 'Apple Inc.',
 'Apple (disambiguation)',
 'Apples to Apples',
 'MacOS',
 'IOS',
 'Apple silicon',
 'IPhone',
 'Apple Watch',
 'Apple TV']

In [232]:
def link_entity(entity):
    articles = get_wikipedia_articles(entity)
    views = p.article_views('en.wikipedia', articles, granularity='monthly', start='2022100100', end='2022103100')
    if len(views) == 0:
        print("No entity found")
        return None
    print(views)
    top_ambiguity = select_most_popular(views)
    return f"https://en.wikipedia.org/wiki/{top_ambiguity}"

In [233]:
link_entity("Harry")

defaultdict(<class 'dict'>, {datetime.datetime(2022, 10, 1, 0, 0): {'Harry': 1795, 'Harry_Potter': 505954, 'Prince_Harry,_Duke_of_Sussex': 444871, 'Harry_Potter_(film_series)': 497169, 'Harry_Kane': 267928, 'Harry_S._Truman': 166239, 'Debbie_Harry': 149455, 'Harry_Styles': 795983, 'Harry_&_Meghan': None, "Harry_Potter_and_the_Philosopher's_Stone": 135585}})


TypeError: '>' not supported between instances of 'NoneType' and 'int'