In [1]:
from ContextualForest import *

In [2]:
# Ask the helper for the candidate titles that match an ambiguous term.
disambiguation('Bohemian Rhapsody')

['Bohemian Rhapsody',
 "Bohemian Rhapsody (That '70s Show)",
 'Bohemian Rhapsody (The Muppets)',
 'Bohemian Rhapsody (film)',
 'The Story of Bohemian Rhapsody']

In [2]:
# Fetch the article text. `wiki` is not defined in the visible cells, so it
# presumably arrives through the ContextualForest star import (verify).
text = wiki.page('bohemian rhapsody').text

In [3]:
# Stem the article text and unpack the result as positional arguments to
# set_relevance. NOTE(review): neither helper's return type is visible here.
d = set_relevance(*stem_text(text))

In [6]:
# Build the forest for a sample sentence; the next cell reads the result
# through its `dic` mapping, indexed by lower-cased terms.
fr = contextual_forest("the FDA has approved the first shot for COVID-19")

In [9]:
# Preview the first 100 characters of the page text stored for two terms.
print(*(fr.dic[term].page.text[:100] for term in ("fda", "covid-19")), sep="\n")

The United States Food and Drug Administration (FDA or USFDA) is a federal agency of the Department 
Coronavirus disease 2019 (COVID-19) is a contagious disease caused by severe acute respiratory syndr


In [2]:
# Build a two-keyword forest, run the disambiguation, then recover the nodes
# (per the class docstring, recover_words stores them in the `dic` attribute).
f1 = Forest(["Cat food","Dog"])
# Debug helper, kept disabled: draining the queue here would leave
# disambiguate() with nothing to process.
# while not f1.Q.empty():
#     sim,u,v,t1,t2 = f1.Q.get()
#     print("sim:{},node:{},node:{}".format(sim,u.page.title,v.page.title))
f1.disambiguate()
f1.recover_words()

In [17]:
# Raw link titles of the disambiguation page; the result includes non-article
# entries such as 'Talk:Shot' and 'Help:Disambiguation'.
wiki.page('shot (disambiguation)').links.keys()

dict_keys(['Armor-piercing shot and shell', 'Buckshot', 'Cricket shots', 'Fathom', 'Furlong', 'Gruntruck', 'Gunshot', 'Home run', 'Moonshot (disambiguation)', 'Round shot', 'SHOT Show', 'Serious Hazards of Transfusion', "Sho't", 'Shoot (disambiguation)', 'Shooter (disambiguation)', 'Shooter (drink)', 'Shooting', 'Shooting (association football)', 'Shot (2017 film)', 'Shot (album)', 'Shot (filmmaking)', 'Shot (ice hockey)', 'Shot (medicine)', 'Shot (pellet)', 'Shot (song)', 'Shot Rev 2.0', 'Shot glass', 'Shot put', 'Shot silk', 'Shots (disambiguation)', 'Showt', 'Society for the History of Technology', 'The Shot (disambiguation)', 'Talk:Shot', 'Help:Disambiguation'])

In [3]:
# Candidate titles for 'shot'. Compare with the raw links of
# 'shot (disambiguation)', which contain entries that are absent from this list.
disambiguation('shot')

['Armor-piercing shot and shell',
 'Buckshot',
 'Cricket shots',
 'Gunshot',
 'Round shot',
 'SHOT Show',
 'Shot (2017 film)',
 'Shot (album)',
 'Shot (filmmaking)',
 'Shot (ice hockey)',
 'Shot (medicine)',
 'Shot (pellet)',
 'Shot (song)',
 'Shot Rev 2.0',
 'Shot glass',
 'Shot put',
 'Shot silk']

In [16]:
# Build a ten-million-entry dict for the lookup timings in the following cells.
# range(N) yields 0..N-1, so N itself is never a key of d.
N = 10000000
d = {a : None for a in range(N)}


In [17]:
%time
N in d

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.2 µs


False

In [18]:
%time
d[N]

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.96 µs


KeyError: 10000000

In [None]:
class Forest():
    """ Implementation of the contextual forest main data structure for
        context-based semantic disambiguation.

        One tree is created per keyword. Pairs of nodes taken from two
        different trees wait in a priority queue ordered by similarity; the
        best pair is expanded repeatedly until the `to_expand` collections of
        every pair of trees intersect, or until the queue runs dry.

        Attributes
        ----------
        words : list of str
            The keywords to disambiguate.
        trees : dict
            Mapping between ContextualForest.Tree objects and the root node
            selected for each tree (None while the tree is not connected).
        dic : dict
            Mapping between the words provided to disambiguate and the nodes
            recovered by `recover_words` (None until a node is recovered).
        tree_combs : list of tuple
            Every unordered pair of trees.
        connections : dict
            Where keys are the pairs in `tree_combs` and values are booleans
            indicating whether or not the pair of trees is connected.
        Q : queue.PriorityQueue
            The priority queue that manages connection and expansion order.
            Entries are (negated similarity, node_1, node_2, tree_1, tree_2).

        Methods
        -------
        disambiguate()
            Performs the disambiguation process (forward).
        recover_words()
            Recovers the node of every word once disambiguation has completed.
    """
    def __init__(self, words):
        """ Creates one tree per word and seeds the queue with the node pairs
            of every pair of trees.

            Parameters
            ----------
            words : iterable of str
                The keywords to disambiguate.
        """
        # Local imports keep the class usable even when the surrounding
        # notebook namespace does not provide these names.
        import itertools
        from queue import PriorityQueue

        self.words = list(words)  # was left as None, contradicting the docstring
        self.trees = {}
        self.dic = {}
        for word in self.words:
            self.dic[word] = None
            self.trees[Tree(word)] = None
        # combinations() returns a one-shot iterator: it must be materialised,
        # otherwise building `connections` exhausts it and the queue is never
        # seeded.
        self.tree_combs = list(itertools.combinations(self.trees, 2))
        self.connections = {pair: False for pair in self.tree_combs}
        self.Q = PriorityQueue()
        for tree1, tree2 in self.tree_combs:
            self._enqueue_pairs(tree1.to_expand, tree2.to_expand, tree1, tree2)

    def _enqueue_pairs(self, nodes_1, nodes_2, tree_1, tree_2):
        """ Queues every pair (u, v) of the two node collections whose
            similarity is not zero.
        """
        # NOTE(review): entries with equal similarity fall back to comparing
        # the nodes (and then the trees); confirm those types are orderable,
        # otherwise the queue raises TypeError on a tie.
        for u in nodes_1:
            for v in nodes_2:
                sim = -u.similarity(v)  # negative because pq orders naturally
                if sim == 0:
                    continue
                self.Q.put((sim, u, v, tree_1, tree_2))

    def disambiguate(self):
        """ Performs the forward disambiguation process expanding the nodes till
            all trees are connected by a path or no candidate pair is left.
        """
        # while there are connections to check and candidate pairs to try
        while not all(self.connections.values()) and not self.Q.empty():
            _, u, v, t1, t2 = self.Q.get()
            key = (t1, t2) if (t1, t2) in self.connections else (t2, t1)  # depends on itertools
            if self.connections[key]:
                # The pair of trees is already connected: drop the entry and
                # let the loop condition re-check the queue, instead of
                # calling get() on a possibly empty queue (which blocks).
                continue
            # expand both nodes
            news_t1 = t1.expand_node(u)
            news_t2 = t2.expand_node(v)
            # check for intersection
            if any(n in t2.to_expand for n in t1.to_expand):
                self.connections[key] = True
                if self.trees[t1] is None:
                    self.trees[t1] = u.root
                if self.trees[t2] is None:
                    self.trees[t2] = v.root
            else:
                # no connection yet: queue the pairs formed by the new nodes
                self._enqueue_pairs(news_t1, news_t2, t1, t2)

    def recover_words(self):
        """ Recovers the Nodes associated with the disambiguation of every word provided
            in the instantiation of the class and stores them in the `dic` attribute.

            Trees that were never connected have no root node and keep None
            in `dic`.
        """
        for tree, node in self.trees.items():
            if node is None:
                # disambiguate() never connected this tree
                continue
            found = False
            wanted = tree.word.lower()
            for link, page in node.page.links.items():
                if wanted == link.lower():
                    self.dic[tree.word] = Node(page, 1)
                    found = True
            if not found:
                # no link carries the word itself: climb to the depth-1 ancestor
                connection = node
                while connection.depth != 1:
                    connection = connection.parent
                self.dic[tree.word] = connection