In [1]:
# import all the dependencies
import sqlite3
from sqlite3 import Error
import re
from joblib import Parallel, delayed
import math
from collections import defaultdict
import nltk
from nltk import sent_tokenize, word_tokenize, ngrams
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from IPython.core.display import display, HTML
import json

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# connect to sqlite db
conn = None

try:
    conn = sqlite3.connect("wikiarticles.db")
except Error as e:
    print(e)

In [4]:
# create a table that will store the title and content of wikipedia articles
create_command = """
                    CREATE TABLE IF NOT EXISTS articles(
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        title TEXT,
                        content TEXT
                    ); 
                 """

c = conn.cursor()
c.execute(create_command)

<sqlite3.Cursor at 0x7fe2dc2da9d0>

In [None]:
insert_query = """
                    INSERT INTO articles(title, content)
                    VALUES (?, ?)
               """
counter = 0
added_to_db = 0

# read dump file line by line
with open("enwiki-20061130-pages-articles.xml", "rt") as inf:
    line = ''
    while (line := inf.readline()):
        title = ""
        content = ""
        if re.search('<page>', line):
            
            # if an opening page tag is found
            while not re.search('</page>', line):
                
                # search for title of page
                if re.search('<title>', line):
                    title = re.sub('<title>', "", line)
                    title = re.sub('</title>', "", title).strip()
                    
                # and content of the page
                if re.search('<text .+>', line):
                    while not re.search('</text>', line):
                        content += line
                        line = inf.readline()
                    content += line
                    content = re.sub('<text .+>', "", content)
                    content = re.sub('</text>', "", content).strip()
                    
                line = inf.readline()
        
        # print progress of processing
        if not counter % 10000:
            print("Pages processed: {}. Pages added to db: {}".format(counter, added_to_db))
            
        # commit articles to db after every 10000 articles added
        if added_to_db > 0 and not added_to_db % 10000:
            conn.commit()
            
        counter += 1
         
        # insert article into db
        if len(title) > 0 and len(content) > 1000:
            c.execute(insert_query, (title, content))
            added_to_db += 1

In [None]:
conn.commit()
conn.close()

In [5]:
# get all the internal links from an article
def get_alternate(content, i):
    return [x for x in re.findall("\[\[[^]]+\|[^]]+\]\]", content)]

In [10]:
# process wikipedia articles in parallel

get_contents = """
                    select content from articles
               """
contents = c.execute(get_contents)
alts = []
rnd = 0

while True:
    counter = 0
    cc = []
    for cont in contents:
        cc.append(cont)
        counter += 1
        if counter > 10000:
            break
    print("Round {} starts".format(str(rnd)))
    
    alt = Parallel(n_jobs=7)(delayed(get_alternate)(x[0], i) for i, x in enumerate(cc))
    alts.append(alt)
    print("Round {} ends".format(str(rnd)))
    rnd += 1
    if counter < 10000:
        break


Round 0 starts
Round 0 ends
Round 1 starts
Round 1 ends
Round 2 starts
Round 2 ends
Round 3 starts
Round 3 ends
Round 4 starts
Round 4 ends
Round 5 starts
Round 5 ends
Round 6 starts
Round 6 ends
Round 7 starts
Round 7 ends
Round 8 starts
Round 8 ends
Round 9 starts
Round 9 ends
Round 10 starts
Round 10 ends
Round 11 starts
Round 11 ends
Round 12 starts
Round 12 ends
Round 13 starts
Round 13 ends
Round 14 starts
Round 14 ends
Round 15 starts
Round 15 ends
Round 16 starts
Round 16 ends
Round 17 starts
Round 17 ends
Round 18 starts
Round 18 ends
Round 19 starts
Round 19 ends
Round 20 starts
Round 20 ends
Round 21 starts
Round 21 ends
Round 22 starts
Round 22 ends
Round 23 starts
Round 23 ends
Round 24 starts
Round 24 ends
Round 25 starts
Round 25 ends
Round 26 starts
Round 26 ends
Round 27 starts
Round 27 ends
Round 28 starts
Round 28 ends
Round 29 starts
Round 29 ends
Round 30 starts
Round 30 ends
Round 31 starts
Round 31 ends
Round 32 starts
Round 32 ends
Round 33 starts
Round 33 ends


In [11]:
# process links to get alternative phrases that might
# point to a specific article

alternate_phrases = defaultdict(set)
occurence = defaultdict(int)

while len(alts) > 0:
    alt = alts.pop()
    for arr in alt:
        for string in arr:
            spl = string.split("|")
            if len(spl) > 2:
                continue
            key = spl[1][:-2].lower()
            value = spl[0][2:]
            occurence[key] += 1
            
            alternate_phrases[value.lower()].add(value)
            if occurence[key] > 4:
                alternate_phrases[key].add(value)

In [12]:
len(alternate_phrases.keys())

2697897

In [13]:
# add the titles of the articles to it
get_titles = """
                select title from articles
             """

for title in c.execute(get_titles):
    alternate_phrases[title[0].lower()].add(title[0])

In [14]:
len(alternate_phrases.keys())

3331484

In [15]:
# save these phrases to a json file

for key in alternate_phrases.keys():
    alternate_phrases[key] = list(alternate_phrases[key])

with open("wordset.json", "wt") as outf:
    json.dump(alternate_phrases, outf)

In [41]:
# create a set from the keys of this dictionary
wordset = set(alternate_phrases.keys())

In [43]:
# size of the "controlled set"
len(wordset)

3553261

# TF-IDF

In [2]:
# load dictionary of alternative keyphrases
with open("wordset.json", "rt") as inf:
    useful_terms = json.load(inf)

In [3]:
# make a controlled vocabulary set from this dictionary
wordset = set(useful_terms.keys())

In [19]:
# get the number of the articles present in the db
get_doc_num = """
                select count(*) from articles
              """
res = c.execute(get_doc_num)
res

<sqlite3.Cursor at 0x7fe2dc2da9d0>

In [20]:
num_of_documents = 0
for r in res:
    num_of_documents = r[0]
    break

num_of_documents

1251183

In [4]:
# read the test article
text = ""
with open("test.txt", "rt") as inf:
    while line := inf.readline():
        text += line.replace("\n", " ")

In [5]:
# get possible ngram candidates
good_grams = set()
forbidden_set = set(stopwords.words('english') + [',', '.', ';', ':', '"', '``', "''", '`', '?', '(', ')', '%', '-'])
for sentence in sent_tokenize(text):
    for i in range(1, 7):
        grams = ngrams(word_tokenize(sentence), i)
        for gram in grams:
            g = " ".join(gram)
            g = g.lower()
            if g in forbidden_set:
                continue
            if g in wordset:
                good_grams.add(g)
                
good_grams

{'...',
 '1',
 '1 july',
 '10',
 '13',
 '20',
 '2020',
 '2021',
 '24',
 '24 hours',
 '73',
 '8',
 '90',
 '92',
 'a number',
 'afp',
 'agency',
 'almost',
 'already',
 'also',
 'amazonas',
 'amazonas state',
 'america',
 'american',
 'areas',
 'around',
 'associated',
 'associated press',
 'authorities',
 'average',
 'bbc',
 'beds',
 'borders',
 'brazil',
 'brazilian',
 'brewing',
 'brink',
 'business',
 'calamity',
 'capital',
 'care',
 'caribbean',
 'cases',
 'cause',
 'chart',
 'cities',
 'collapse',
 'comment',
 'community',
 'complete',
 'concern',
 'contagious',
 'continues',
 'control',
 'coronavirus',
 'countries',
 'country',
 'courts',
 'crisis',
 'critical',
 'criticism',
 'critics',
 'cross',
 'curb',
 'damage',
 'data',
 'date',
 'days',
 'death',
 'death toll',
 'deaths',
 'depression',
 'died',
 'director',
 'dose',
 'double',
 'dr',
 'drugs',
 'dying',
 'economy',
 'effective',
 'effects',
 'effort',
 'entire',
 'epidemiologist',
 'estimate',
 'ethel',
 'every',
 'eviden

In [23]:
# check if a candidate is present in an article
def search_terms(terms, txt):
    lwr_txt = txt.lower()
    return [term for term in terms if term in lwr_txt]

In [24]:
get_content = """
                select content from articles
              """
contents = c.execute(get_content)
term_app = []
rnd = 0

while True:
    cc = []
    for cont in contents:
        cc.append(cont[0])
        if len(cc) == 10000:
            break
    
    print("Round {} starts".format(str(rnd)))
    
    app = Parallel(n_jobs=7)(delayed(search_terms)(good_grams, txt) for txt in cc)
    term_app.append(app)
    
    print("Round {} ends".format(str(rnd)))
    rnd += 1
    if len(cc) < 10000:
        break

Round 0 starts
Round 0 ends
Round 1 starts
Round 1 ends
Round 2 starts
Round 2 ends
Round 3 starts
Round 3 ends
Round 4 starts
Round 4 ends
Round 5 starts
Round 5 ends
Round 6 starts
Round 6 ends
Round 7 starts
Round 7 ends
Round 8 starts
Round 8 ends
Round 9 starts
Round 9 ends
Round 10 starts
Round 10 ends
Round 11 starts
Round 11 ends
Round 12 starts
Round 12 ends
Round 13 starts
Round 13 ends
Round 14 starts
Round 14 ends
Round 15 starts
Round 15 ends
Round 16 starts
Round 16 ends
Round 17 starts
Round 17 ends
Round 18 starts
Round 18 ends
Round 19 starts
Round 19 ends
Round 20 starts
Round 20 ends
Round 21 starts
Round 21 ends
Round 22 starts
Round 22 ends
Round 23 starts
Round 23 ends
Round 24 starts
Round 24 ends
Round 25 starts
Round 25 ends
Round 26 starts
Round 26 ends
Round 27 starts
Round 27 ends
Round 28 starts
Round 28 ends
Round 29 starts
Round 29 ends
Round 30 starts
Round 30 ends
Round 31 starts
Round 31 ends
Round 32 starts
Round 32 ends
Round 33 starts
Round 33 ends


In [25]:
# calculate idf for every candidate 
idf = defaultdict(int)

for rnd in term_app:
    for results in rnd:
        for term in results:
            idf[term] += 1
            
for k in idf.keys():
    idf[k] = num_of_documents / idf[k]

In [26]:
# calculate term frequency for the candidates
tfreq = defaultdict(int)
tnum = 0

for sentence in sent_tokenize(text):
    for i in range(1, 7):
        grams = ngrams(word_tokenize(sentence.lower()), i)
        for gram in grams:
            tfreq[" ".join(gram)] += 1
            tnum += 1
            
for f in tfreq.keys():
    tfreq[f] /= tnum
    
tfreq

defaultdict(int,
            {'brazil': 0.003278688524590164,
             'has': 0.0028415300546448087,
             'recorded': 0.0006557377049180328,
             'more': 0.0013114754098360656,
             'than': 0.001092896174863388,
             '4,000': 0.0002185792349726776,
             'covid-related': 0.0002185792349726776,
             'deaths': 0.001092896174863388,
             'in': 0.00546448087431694,
             '24': 0.0004371584699453552,
             'hours': 0.0004371584699453552,
             'for': 0.0015300546448087432,
             'the': 0.01377049180327869,
             'first': 0.0002185792349726776,
             'time': 0.0002185792349726776,
             ',': 0.008961748633879782,
             'as': 0.0006557377049180328,
             'a': 0.0017486338797814208,
             'contagious': 0.0004371584699453552,
             'variant': 0.0013114754098360656,
             'fuels': 0.0002185792349726776,
             'surge': 0.0004371584699453552,
       

In [27]:
# calculate tfidf
tfidf = dict()

for term in good_grams:
    tfidf[term] = tfreq[term] * math.log(idf[term])
    
tfidf

{'politically': 0.0010436038150603707,
 'first time': 0.0007949406605013562,
 'intensive care unit': 0.0018458773075677637,
 'infections': 0.0013990030360798219,
 'quarantine': 0.0015206669928675537,
 'deaths': 0.0025478206016206783,
 'contagious': 0.0032960931239643734,
 'monthly': 0.0010065392039034938,
 'cause': 0.0003512615352763219,
 'the last': 0.0005593242784891479,
 'the movement': 0.001072451313804841,
 'brazilian': 0.0010634538712776921,
 'sabotage': 0.0013017149629326865,
 'around': 0.0004328031860507674,
 'the damage': 0.0012149161362442268,
 'lago': 0.0010295397177831722,
 'virus': 0.0033542360192493096,
 '%': 0.0014883052983891351,
 'fear': 0.0007663588009285301,
 'damage': 0.0007574622710802155,
 'reverse': 0.0009279496164208044,
 'estimate': 0.00157521755305743,
 'intensive care': 0.0016564222700263696,
 'government': 0.0004937440339038131,
 'limiting': 0.0012312819655284244,
 'data': 0.0004025950025638795,
 'system': 0.0004300381617473661,
 'continues': 0.0007771563578

In [28]:
# sort candidates by tfidf and get top 6%
num_kw = len(word_tokenize(text)) * 6 // 100

words = sorted(tfidf.items(), key= lambda kv: kv[1], reverse=True)[:num_kw]
words

[('brazil', 0.012816187202493935),
 ('variant', 0.005303652224245669),
 ('vaccines', 0.005017074528641789),
 ('coronavirus', 0.004410210423231883),
 ('the virus', 0.004397755440877296),
 ('cases', 0.004271073117495056),
 ('the country', 0.0034841673014201343),
 ('lockdown', 0.003479635788690172),
 ('health', 0.003424266460146038),
 ('some cities', 0.003419888070224915),
 ('jair', 0.0033673330406898),
 ('virus', 0.0033542360192493096),
 ('contagious', 0.0032960931239643734),
 ('pandemic', 0.00323200417349798),
 ('measures', 0.002885595992174072),
 ('2021', 0.0028811733154885935),
 ('said', 0.002701480903814233),
 ('deaths', 0.0025478206016206783),
 ('miguel nicolelis', 0.0025081203725164634),
 ('the bbc', 0.002307549708352355),
 ('24 hours', 0.0023029835093061834),
 ('vaccinating', 0.002273664240873874),
 ('country', 0.0022571880587231516),
 ('miguel', 0.0022501698457208065),
 ('situation', 0.0021869200468418947),
 ('president', 0.002184485340920032),
 ('variants', 0.00210710499168765),

In [29]:
# prepare article title to be used as a link
def wiki_prepare(term):
    return re.sub(" ", "_", term)

In [30]:
# Word Sense Disambiguation
# if more articles can be linked to a candidate then 
# find the one that's closest to the test text
# using cosine distance
def wsd(target, corp):
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    tf_vect = vectorizer.fit_transform([target] + corp)
    
    arr = tf_vect.toarray()
    mini = 1
    idx = 0
    for i in range(1, len(corp) + 1):
        if cosine(arr[0], arr[i]) < mini:
            idx = i
            mini = cosine(arr[0], arr[i])
    return idx - 1 if idx > 0 else 0

In [31]:
# create wikipedia links for the chosen candidates
# and apply wsd where needed

links = dict()
get_content_for_title = """
                            select title, content from articles
                            where title in 
                        """

for w, _ in words:
    if len(useful_terms[w]) == 1:
#         print(w, useful_terms[w])
        if type(useful_terms[w][0]) == list:
            links[w] = "https://en.wikipedia.org/wiki/" + wiki_prepare(useful_terms[w][0][0])
        else:
            links[w] = "https://en.wikipedia.org/wiki/" + wiki_prepare(useful_terms[w][0])
    else:
        titles = []
        for t in useful_terms[w]:
            if type(t) == list:
                titles.append(t[0])
            else:
                titles.append(t)
        get_content_for_title = "select title, content from articles where title in ({})".format(",".join("?" * len(titles)))
        results = [txt for txt in c.execute(get_content_for_title, titles)]
        if len(results) == 0:
            continue
        
        contents = [x[1] for x in results]
        idx = wsd(text, contents)
        links[w] = "https://en.wikipedia.org/wiki/" + wiki_prepare(results[idx][0])


In [32]:
# print links
links

{'brazil': 'https://en.wikipedia.org/wiki/Brazil',
 'variant': 'https://en.wikipedia.org/wiki/Shogi_variant',
 'vaccines': 'https://en.wikipedia.org/wiki/Vaccination',
 'coronavirus': 'https://en.wikipedia.org/wiki/Coronavirus',
 'the virus': 'https://en.wikipedia.org/wiki/The_Virus',
 'cases': 'https://en.wikipedia.org/wiki/Computer_case',
 'the country': 'https://en.wikipedia.org/wiki/Colombia',
 'lockdown': 'https://en.wikipedia.org/wiki/Lockdown',
 'health': 'https://en.wikipedia.org/wiki/Public_health',
 'some cities': 'https://en.wikipedia.org/wiki/Some_Cities',
 'jair': 'https://en.wikipedia.org/wiki/Jair_da_Rosa_Pinto',
 'virus': 'https://en.wikipedia.org/wiki/Virus_(1980_film)',
 'contagious': 'https://en.wikipedia.org/wiki/Infectious_disease',
 'pandemic': 'https://en.wikipedia.org/wiki/Pandemic',
 'measures': 'https://en.wikipedia.org/wiki/Bill_(proposed_law)',
 '2021': 'https://en.wikipedia.org/wiki/2021',
 'said': 'https://en.wikipedia.org/wiki/Edward_Said',
 'deaths': 'ht

In [33]:
text

'Brazil has recorded more than 4,000 Covid-related deaths in 24 hours for the first time, as a more contagious variant fuels a surge in cases.  Hospitals are overcrowded, with people dying as they wait for treatment in some cities, and the health system is on the brink of collapse in many areas.  The country\'s total death toll is now almost 337,000, second only to the US.  But President Jair Bolsonaro continues to oppose any lockdown measures to curb the outbreak.  He argues that the damage to the economy would be worse than the effects of the virus itself, and has tried to reverse some of the restrictions imposed by local authorities in the courts.      Covid vaccines: How fast is worldwide progress?     What is the Brazil variant and do vaccines work against it?     Political crisis and Covid surge rock Brazil  Speaking to supporters outside the presidential residence on Tuesday, he criticised quarantine measures and suggested without evidence that they were linked to obesity and de

In [34]:
# substitute candidates with the coresponding link
# and display in html format
text_html = "<p>" + text + "</p>"
for term in links.keys():
    t = re.search(term, text_html, re.IGNORECASE)
    if not t is None:
        text_html = re.sub(t[0], "<a href=\"" + links[term] + "\">" + t[0] + "</a>", text_html, 1)
    
display(HTML(text_html))

Number of misclassified candidates: 6 / 49.

Number of candidates with wrong links: 8 / 49.


# Keyphraseness

In [4]:
# read the test article
text = ""
with open("test.txt", "rt") as inf:
    while line := inf.readline():
        text += line.replace("\n", " ")

In [5]:
# get possible ngram candidates
good_grams = set()
forbidden_set = set(stopwords.words('english') + [',', '.', ';', ':', '"', '``', "''", '`', '?', '(', ')', '%', '-'])
for sentence in sent_tokenize(text):
    for i in range(1, 7):
        grams = ngrams(word_tokenize(sentence), i)
        for gram in grams:
            g = " ".join(gram)
            g = g.lower()
            if g in forbidden_set:
                continue
            if g in wordset:
                good_grams.add(g)
                
good_grams

{'...',
 '1',
 '1 july',
 '10',
 '13',
 '20',
 '2020',
 '2021',
 '24',
 '24 hours',
 '73',
 '8',
 '90',
 '92',
 'a number',
 'afp',
 'agency',
 'almost',
 'already',
 'also',
 'amazonas',
 'amazonas state',
 'america',
 'american',
 'areas',
 'around',
 'associated',
 'associated press',
 'authorities',
 'average',
 'bbc',
 'beds',
 'borders',
 'brazil',
 'brazilian',
 'brewing',
 'brink',
 'business',
 'calamity',
 'capital',
 'care',
 'caribbean',
 'cases',
 'cause',
 'chart',
 'cities',
 'collapse',
 'comment',
 'community',
 'complete',
 'concern',
 'contagious',
 'continues',
 'control',
 'coronavirus',
 'countries',
 'country',
 'courts',
 'crisis',
 'critical',
 'criticism',
 'critics',
 'cross',
 'curb',
 'damage',
 'data',
 'date',
 'days',
 'death',
 'death toll',
 'deaths',
 'depression',
 'died',
 'director',
 'dose',
 'double',
 'dr',
 'drugs',
 'dying',
 'economy',
 'effective',
 'effects',
 'effort',
 'entire',
 'epidemiologist',
 'estimate',
 'ethel',
 'every',
 'eviden

In [35]:
# search for a candidate ngram in an article
# and determine if it is part of a link or not
def search_keyphrase(text, possible_phrases):
    links = [x for x in re.findall("\[\[[^]]+\]\]", text)]
    result = []
    for phrase in possible_phrases:
        if phrase in text:
            link = False
            for l in links:
                d = l[2:-2]
                dd = d.split('|')
                if phrase == dd[0] or phrase == dd[-1]:
                    link = True
                    break
            result.append((phrase, 1) if link else (phrase, 0))
    return result

In [36]:
get_content = """
                select content from articles
              """
contents = c.execute(get_content)
term_app = []
rnd = 0

while True:
    cc = []
    for cont in contents:
        cc.append(cont[0])
        if len(cc) == 10000:
            break
    
    print("Round {} starts".format(str(rnd)))
    
    app = Parallel(n_jobs=7)(delayed(search_keyphrase)(txt, good_grams) for txt in cc)
    term_app.append(app)
    
    print("Round {} ends".format(str(rnd)))
    rnd += 1
    if len(cc) < 10000:
        break

Round 0 starts
Round 0 ends
Round 1 starts
Round 1 ends
Round 2 starts
Round 2 ends
Round 3 starts
Round 3 ends
Round 4 starts
Round 4 ends
Round 5 starts
Round 5 ends
Round 6 starts
Round 6 ends
Round 7 starts
Round 7 ends
Round 8 starts
Round 8 ends
Round 9 starts
Round 9 ends
Round 10 starts
Round 10 ends
Round 11 starts
Round 11 ends
Round 12 starts
Round 12 ends
Round 13 starts
Round 13 ends
Round 14 starts
Round 14 ends
Round 15 starts
Round 15 ends
Round 16 starts
Round 16 ends
Round 17 starts
Round 17 ends
Round 18 starts
Round 18 ends
Round 19 starts
Round 19 ends
Round 20 starts
Round 20 ends
Round 21 starts
Round 21 ends
Round 22 starts
Round 22 ends
Round 23 starts
Round 23 ends
Round 24 starts
Round 24 ends
Round 25 starts
Round 25 ends
Round 26 starts
Round 26 ends
Round 27 starts
Round 27 ends
Round 28 starts
Round 28 ends
Round 29 starts
Round 29 ends
Round 30 starts
Round 30 ends
Round 31 starts
Round 31 ends
Round 32 starts
Round 32 ends
Round 33 starts
Round 33 ends


In [37]:
# count the times a candidate appeared as part
# of a link and when it didn't
keyphraseness = dict()

for batch in term_app:
    for article in batch:
        for (w, x) in article:
            if not w in keyphraseness:
                keyphraseness[w] = dict()
                keyphraseness[w]["kp"], keyphraseness[w]["nkp"] = 0, 0
            if x:
                keyphraseness[w]["kp"] += 1
            else:
                keyphraseness[w]["nkp"] += 1

In [38]:
# calculate keyphraseness values
kp_vals = dict()
for key in keyphraseness.keys():
    kp_vals[key] = keyphraseness[key]["kp"] / (keyphraseness[key]["kp"] + keyphraseness[key]["nkp"])
kp_vals

{'areas': 0.00012641514734387728,
 '73': 0.0018864958339883666,
 'intensive': 0.004318100035984167,
 'we are': 7.857928650007858e-05,
 'rock': 0.05407513729405891,
 'comment': 0.000757193336698637,
 'system': 0.003425697350530026,
 'wait': 0.00025516713447307985,
 'health': 0.016642264519277698,
 'press': 0.001371022161287904,
 'much': 1.769692250517635e-05,
 'officials': 0.0018633871053612308,
 'date': 0.000587571218252999,
 'country': 0.013252553128795556,
 'chart': 0.0035500930369209674,
 'first': 0.0007532981357508745,
 'figures': 0.001833115944583495,
 'social': 0.010653032608881518,
 'crisis': 0.006443401479595896,
 'complete': 0.0011434707125077567,
 'latest': 0.0004706103816650195,
 'the state': 0.00033505459418975913,
 'damage': 0.0015010990189245697,
 'critical': 0.0019305019305019305,
 'last': 0.0001372807059068574,
 'the us': 0.0,
 'may': 4.021339910458165e-05,
 '-': 6.590567579679962e-05,
 'unit': 0.0013431299530532147,
 'local': 0.0008534477099153118,
 'political': 0.0149

In [39]:
# choose the top 6% of the candidates
num_kw = len(word_tokenize(text)) * 6 // 100

words = sorted(kp_vals.items(), key= lambda kv: kv[1], reverse=True)[:num_kw]
words

[('obesity', 0.4950980392156863),
 ('pandemic', 0.4394184168012924),
 ('oxygen', 0.3464774789638673),
 ('epidemiologist', 0.3388888888888889),
 ('intensive care unit', 0.27956989247311825),
 ('far-right', 0.2669683257918552),
 ('vaccinating', 0.2647058823529412),
 ('quarantine', 0.2583518930957684),
 ('coronavirus', 0.2558139534883721),
 ('public health', 0.24614594039054472),
 ('brewing', 0.17156568686262746),
 ('depression', 0.1703220433092726),
 ('intensive care', 0.1702127659574468),
 ('virus', 0.16596570689097379),
 ('news agency', 0.15110356536502548),
 ('1 july', 0.13333333333333333),
 ('sabotage', 0.12991858887381275),
 ('vaccines', 0.11940298507462686),
 ('university', 0.09887842380270909),
 ('associated press', 0.09090909090909091),
 ('tragedy', 0.07924376508447305),
 ('drugs', 0.07281148679944419),
 ('health policy', 0.07086614173228346),
 ('tracker', 0.06414219474497682),
 ('brazilian', 0.0641025641025641),
 ('2021', 0.06344586728754366),
 ('latin america', 0.06122448979591

In [40]:
# prepare article title to be a link
def wiki_prepare(term):
    return re.sub(" ", "_", term)

In [41]:
def wsd(target, corp):
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    tf_vect = vectorizer.fit_transform([target] + corp)
    
    arr = tf_vect.toarray()
    mini = 1
    idx = 0
    for i in range(1, len(corp) + 1):
        if cosine(arr[0], arr[i]) < mini:
            idx = i
            mini = cosine(arr[0], arr[i])
    return idx - 1 if idx > 0 else 0

In [42]:
links = dict()
get_content_for_title = """
                            select title, content from articles
                            where title in 
                        """

for w, _ in words:
    if len(useful_terms[w]) == 1:
#         print(w, useful_terms[w])
        if type(useful_terms[w][0]) == list:
            links[w] = "https://en.wikipedia.org/wiki/" + wiki_prepare(useful_terms[w][0][0])
        else:
            links[w] = "https://en.wikipedia.org/wiki/" + wiki_prepare(useful_terms[w][0])
    else:
        titles = []
        for t in useful_terms[w]:
            if type(t) == list:
                titles.append(t[0])
            else:
                titles.append(t)
        get_content_for_title = "select title, content from articles where title in ({})".format(",".join("?" * len(titles)))
        results = [txt for txt in c.execute(get_content_for_title, titles)]
        if len(results) == 0:
            continue
        
        contents = [x[1] for x in results]
        idx = wsd(text, contents)
        links[w] = "https://en.wikipedia.org/wiki/" + wiki_prepare(results[idx][0])


In [43]:
text_html = "<p>" + text + "</p>"
for term in links.keys():
    t = re.search(term, text_html, re.IGNORECASE)
    if not t is None:
        link = links[term].replace("\"", "\\\"")
        text_html = re.sub(t[0], "<a href=\"" + link + "\">" + t[0] + "</a>", text_html, 1)
    
display(HTML(text_html))

Number of misclassified candidates: 7 / 49.

Number of candidates with wrong links: 4 / 49.