# Contextual Forests V2

### 1. BabelNet Iinterface

In [105]:
import requests
import json
import wikipediaapi
from socket import timeout
from requests.exceptions import ConnectionError
#GLOBAL VARIABLES
wiki = wikipediaapi.Wikipedia('en')
KEY  = 'b361cac5-2b53-40a8-8df8-6a2cfb140491' #Register in BabelNet to get one
BASE = 'https://babelnet.io/v5'
API = {
    "get_info": "getSynset",
    "get_synsets":"getSynsetIds",
    "get_relations":"getOutgoingEdges"
}

def __format_arguments(args):
    """
    Desc: Returns a formatted string for making an HTML request
    Params: - args (dict): dictionary for params values, format is {paramName:paramValue ...}
    Returns: (String) formatted string
    """
    return "&".join(["{}={}".format(key,value) for key,value in args.items()])

def __make_request(args,op,verbose=False):
    """
    Desc: Makes arequest to BabelNet API given a parametters dictionary
    Params: - args (dict): dictionary for params values, format is {paramName:paramValue ...}
            - op (string): operation, choose between {get_info,get_synsets,get_relations}
            - verbose (bool): show messages 
    Returns: (Dict) BabelNet's API response
    """
    args["key"] = KEY
    args["targetLang"] = "EN"
    r = requests.get("{}/{}?{}".format(BASE,API[op],__format_arguments(args)))
    if verbose:
        print("Status code: {}".format(r.status_code))
    return r.json() if r.ok else None

def synset_info(synset_id,verbose=False):
    """
    Desc: Returns the information associated with a BabelNet synset
    Params: - synset_id (String): BabelNet synset id
            - verbose (bool): show messages 
    Returns: (Dict) BabelNet's sysnset information
    """
    return __make_request({"id":synset_id},"get_info",verbose)

def synset_relations(synset_id,verbose=False):
    """
    Desc: Returns the outgoing semantic relations of a synset
    Params: - synset_id (String): BabelNet synset id
            - verbose (bool): show messages 
    Returns: (Dict) BabelNet's sysnset outgoing conexions
    """
    try:
        return __make_request({"id":synset_id},"get_relations",verbose)
    except (ConnectionError,timeout):
        return []

def general_categories(synset_id):
    """  
    Desc: Returns the ids of the general categories a synset belongs to
    Params: - synset_id (String): BabelNet synset id
    Returns: (List)[String] ids 
    """
    try:
        categories = [relation["target"] for relation in synset_relations(synset_id) 
                  if relation["pointer"]["shortName"] in ["is-a","part_of","subclass_of"]]
        res = []
        for cat in categories:
            try:
                #res.append(synset_info(cat)["senses"][0]["properties"]["fullLemma"])
                res.append(synset_info(cat)["senses"][0]["properties"]["synsetID"]["id"] )
            except IndexError:
                continue
        return set(res)
    except (ConnectionError,timeout):
        return []

def wiki_page(synset_id):
    """  
    Desc: Return the wikipedia page associated with a synset id
    Params: - synset_id (String): BabelNet synset id
    Returns: (Wikipediaapi.Page) wikipedia page object
    """
    try:
        synset = synset_info(synset_id)
        for sense in synset["senses"]:
            if sense['properties']["source"] == "WIKI":
                return wiki.page(sense['properties']["fullLemma"].replace("_"," "))
            else:
                continue
        return None
    except (ConnectionError,timeout):
        return None

def search_synsets(lemma,pos,searchLang="EN",verbose=False):
    """  
    Desc: Returns the ids of possible synsets associated to the provided word
    Params: - lemma (String): word for searching
            - pos (String): position (NOUN,VERB ...)
            - searchLang (String): language for search and results
            - verbose (bool): show messages 
    Returns: (List)[String]: ids 
    """
    args = {
        "lemma":lemma,
        "pos":pos,
        "searchLang":searchLang,
        "key":KEY
    }
    return [ele["id"] for ele in __make_request(args,"get_synsets",verbose)]


#search_synsets("apple","NOUN")   
#general_categories("bn:17306106n")
#wiki_page("bn:00289737n").text
#synset_info("bn:03094945n")["domains"]
        
    
                   

### Wikipedia Interface

In [1176]:
import spacy
import en_core_web_lg
from nltk.stem.porter import *
from more_itertools import locate
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
import string
import numpy as np
from scipy.optimize import curve_fit
from nltk.stem import WordNetLemmatizer 

stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer() 
nlp = en_core_web_lg.load()
STOPWORDS = "stopwords.txt" #stopwords_file
with open(STOPWORDS ) as file:
        stop_words = [x.strip() for x in file.readlines()]

In [1236]:

def average_sentence_size(text):
    """Returns the average sentence size"""
    return np.mean([len(sentence.split()) for sentence in text.split(".")])

def clean_text(text):
    """Cleans the text, punctuation marks (. , \n) will be replaced by blank spaces 
    and possessive forms ('es , 's) will be removed. 
    Any other punctuation symbol will also be removed.
    @Params:
        -text: to be cleaned
    @Returns:
        Cleaned text
    """
    text = text.replace("\n"," ").replace(","," ").replace("."," ")
    text = text.replace("'s","").replace("'es","")
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def stem_text(text,title=""):
    #pendiente hacerlo mas elegante
    """
    Uses snowball stemmer to produce a dictionary with entries:
        dic = { ...,key:(words,occurr,relevance),... }
    Where words -> list of ocurrences of key in text (page's text)
          occur -> indexes of (key) occurrences in the text
          relevance -> set to 0 (see set_relevance())
    Note that every key = (STEMMED WORD,WORD_POS (Part Of Speech))
    @Params:
        - text: text to be processed
    @Returns:
        (stem_dictionary,cleaned text length)
    """
    dic = {}
    content = clean_text(text).split() #split spaces
    count = 0
    for i,x in enumerate(nlp(clean_text(text))):
        if x.pos_ not in ["NOUN","PROPN","VERB","ADJ"]:
            continue
        try:
            int(x)
            continue
        except:
            if x.text.lower() not in stop_words and "wikipedia" not in x.text.lower() and title not in x.text.lower() :
                stemmed = stemmer.stem(x.text.lower())
                key = (stemmed,x.pos_)
                if key not in dic:
                    dic[key] = {"words":set([x.text.lower()]),"occurr":[i],"relevance":.0,"pos":0}
                else:
                    dic[key]["words"].add(x.text.lower())
                    dic[key]["occurr"].append(i)
        count += 1
    return dic,len(content),text




def relevance(word,text_len,chunk_size=100): #AQUI TENEMOS EL CUELLO DE BOTELLA
    """
    Returns word relevance (probability of being in a randomly 
    positioned block of a fixed size).
    @Params:
        - word:  entry of stem_dict
        - text_len: text size (returned by stem_text())
        - chunk_size: block size, fixed
    """
    L = []
    for i in range(chunk_size,text_len,1):
        added = False
        for occ in word["occurr"]:
            if occ > i - chunk_size and occ <= i:
                L.append(1)
                added = True
                break
        if not added:
                L.append(0)
    #print("SUMA",np.sum(L),"LOng",len(L))
    return np.mean(L)    
    
def set_relevance(stem_dict,text_len,text,sort=False):
    relevances = []
    if text_len < 100: #no aceptamos artículos de menos de 300 palabras
        return None
    else:
        t1 = time()
        n_chunks = int(text_len/average_sentence_size(text)) #por ejemplo, mirar otras posibilidades dinamicas etc...
        for word in stem_dict:
            res = relevance(stem_dict[word],text_len,n_chunks)
            stem_dict[word]["relevance"] = res
        t2 = time()
    if sort:
        t3 = time()
        keys = sorted(stem_dict.keys(),key=lambda k:stem_dict[k]["relevance"], reverse=True)
        for i,key in enumerate(keys):
            stem_dict[key]["pos"] = i
        t4 = time()
   # print("1"+"#"*50,t2-t1)
   # print("2"+"#"*50,t4-t3)
    return stem_dict


def fit_model(stem_dict):
    y_data = []
    if stem_dict == None: #articulo demasiado corto
        return None,None
    keys = sorted(stem_dict.keys(),key=lambda k:stem_dict[k]["relevance"], reverse=True)
    for i,key in enumerate(keys):
        stem_dict[key]["pos"] = i
        y_data.append(stem_dict[key]["relevance"])
    y_data = np.array(y_data)[y_data != 0][0] 
    x_data = np.linspace(1,len(y_data),len(y_data))
    def zipf(x,alpha): #distribucion de Zipf
        return stem_dict[keys[0]]["relevance"]/x**alpha

    popt, pcov = curve_fit(zipf,x_data,y_data)
    return lambda x: zipf(x,popt),stem_dict

def link_relevance(link,stem_dict,model):
        words = nlp(link)
        rel = []
        if ":" in link:
            return 0
        for word in words:
            try:
                pos =  stem_dict[(stemmer.stem(word.text.lower()),word.pos_)]["pos"]
                rel.append(pos)
            except KeyError:
                rel.append(len(stem_dict))
        return model(np.mean(rel))[0] if  len(rel) * sum(rel)  != 0  else 0      

def relevant_links(page,top):
    
    stem_dict = set_relevance(*stem_text(page.text,title=page.title))
    model,stem_dict = fit_model(stem_dict)
    if model is None:
        return []
    return sorted(page.links.items(),key=lambda it: link_relevance(it[0],stem_dict,model),reverse=True)[:top]


#relevant_links(wiki.page("Apple Inc."),10)

### From terms to targetted texts

In [1430]:
def generalization(synset_id,top,l=[],level=1):
    if level > top:
        return 
    else:
        try:
            for child in general_categories(synset_id):
                l.append((child,level))
                generalization(child,top,l,level+1)
            return l
        except:
            return 
def concretization(page,top,max_expand,l=[],level=1):
    if level > top:
        return 
    else:
        try:
            for _,child in relevant_links(page,max_expand):
                l.append((child,level))
                concretization(child,top,max_expand,l,level+1)
            return l
        except:
            return []
    
class Concept():
    """Contains all the texts associated with a concept,
        each with an associated level of specificity"""
    def __init__(self,synset_id,pos,depth_up,depth_down):
        self.synset_id = synset_id
        self.depth_up,self.depth_down = depth_up,depth_down
        self.page = wiki_page(synset_id)
        self.pages = []
        
        if pos is "NOUN": #standard text mining
            #generalization
            for s_id,level in generalization(synset_id,depth_up):
                synset_page = wiki_page(s_id)
                if synset_page != None and synset_page.exists():
                    self.pages.append((synset_page,level))
            
            #concretization
            max_expand = int(np.ceil(np.power(len(self.pages),1/depth_down)))
            #for page,level in concretization(self.page,depth_down,max_expand):
            self.pages += [(link[1],-1) for link in relevant_links(self.page,len(self.pages))]
                
                
c = Concept( "bn:00005054n ","NOUN",1,1) #apple inc.         

print(c.pages)

[(Candy apple (id: 1127111, ns: 0), 1), (Pome (id: 311906, ns: 0), 1), (Apple juice (id: 454875, ns: 0), 1), (Stack cake (id: 2811212, ns: 0), 1), (Apple crisp (id: 1510907, ns: 0), 1), (Tarte Tatin (id: 523464, ns: 0), 1), (Apple strudel (id: 1191678, ns: 0), 1), (Apple soup (id: 25980154, ns: 0), 1), (Jewish apple cake (id: 30083318, ns: 0), 1), (Apfelküchle (id: 50347020, ns: 0), 1), (Cholera (food) (id: 39974802, ns: 0), 1), (Apple butter (id: 1595175, ns: 0), 1), (Žemlovka (id: 37552583, ns: 0), 1), (Vitréais (id: 46197603, ns: 0), 1), (Fruit (id: ??, ns: 0), -1), (Fruit tree (id: ??, ns: 0), -1), (Cultivar (id: ??, ns: 0), -1), (Asia (id: ??, ns: 0), -1), (Europe (id: ??, ns: 0), -1), (Golden apple (id: ??, ns: 0), -1), (Plant (id: ??, ns: 0), -1), (Apple juice (id: ??, ns: 0), -1), (Apple picking (id: ??, ns: 0), -1), (Fruit picking (id: ??, ns: 0), -1), (China (id: ??, ns: 0), -1), (United States (id: ??, ns: 0), -1), (Apple cider (id: ??, ns: 0), -1), (Cider apple (id: ??, ns:

### Article-Article distance

In [821]:
import sys
%matplotlib notebook
from matplotlib import pyplot as plt
page1 = wiki.page("")
m1,d1 = fit_model(set_relevance(*stem_text(page1.text,title=page1.title.lower())))
page2 = wiki.page("")
m2,d2 = fit_model(set_relevance(*stem_text(page2.text,title=page2.title.lower())))
("appl","NOUN") in d2


banana
apple


False

In [822]:
s1 = [key for key in d1 if key in d2]
print(len(s1))
s1 = sorted(s1,key=lambda k: (d1[k]["relevance"]*d2[k]["relevance"])/(1+abs(d1[k]["relevance"]-d2[k]["relevance"]
)),reverse=True)
vals = np.array([(d1[k]["relevance"]*d2[k]["relevance"])/(1+abs(d1[k]["relevance"]-d2[k]["relevance"])) for k in s1])
vals = vals[vals>0]
diff = np.abs(np.diff(vals))
#top = diff / np.array(vals[:-1])
x = np.argmax(diff<0.001*diff[0])
#x=40
X = np.array([d1[x]["pos"] for x in s1[:x+1]]) #/len(d1)
Y = np.array([d2[x]["pos"] for x in s1[:x+1]]) #/len(d2)
#print(max(c1,c2))

for l in s1[:x]:
    print(l,d1[l]["pos"],d2[l]["pos"])
v1 = np.zeros(2) 
v = np.array([np.mean(X),np.mean(Y)])
print(np.linalg.norm(v1-v))
print(x,len(vals))
#plt.scatter(X,Y)
#plt.scatter(np.mean(X),np.mean(Y),color='r')
#plt.show
plt.plot(vals)
plt.show


436
('fruit', 'NOUN') 0 0
('cultivar', 'NOUN') 1 2
('includ', 'VERB') 6 3
('produc', 'VERB') 2 13
('grow', 'VERB') 11 6
('plant', 'NOUN') 3 23
10.490034143249039
6 384


<IPython.core.display.Javascript object>

<function matplotlib.pyplot.show(*args, **kw)>

In [1432]:
from time import time
def aa_distance(d1,d2):
    t2 = time()
    if d1 is None or d2 is None:
        return np.inf
    s1 = [(k,(d1[k]["relevance"]*d2[k]["relevance"])/(1+abs(d1[k]["relevance"]-d2[k]["relevance"]))) for k in d1 if k in d2]
    if len(s1) == 0:
        return np.inf
    if len(s1) == 1:
        return np.linalg.norm(np.array([d1[s1[0]]["pos"],d2[s1[0]]["pos"]]))
    s1,vals = zip(*sorted(s1,key=lambda it: it[1],reverse=True))
    #print(s1)
    t3 = time()
    vals = np.array([(d1[k]["relevance"]*d2[k]["relevance"])/(1+abs(d1[k]["relevance"]-d2[k]["relevance"])) for k in s1])
    vals = vals[vals>0]
    diff = np.abs(np.diff(vals))
    if len(diff) == 0:
        return np.inf
    x = np.argmax(diff<0.001*diff[0])
    X,Y = zip(*[(d1[y]["pos"],d2[y]["pos"]) for y in s1[:x+1]]) #/len(d1)
    n = len(X)
    #print(X,Y)
    #v = np.array([(np.sum(X)-n*(n-1))/n,(np.sum(Y)-n*(n-1))/n])
    #print(t3-t2)
    v = np.array(np.mean(np.array(X)/len(d1)),np.mean(np.array(Y)/len(d2)))
    return np.linalg.norm(v)


    
page = wiki.page("Steve Jobs") 
concept_d = set_relevance(*stem_text(c.page.text,title=c.page.title.lower()),sort=True)
concept_page = set_relevance(*stem_text(page.text,title=page.title.lower()),sort=True)
distances = {}
for p,level in c.pages:
    t1 = time()
    d2 = set_relevance(*stem_text(p.text,title=p.title.lower()),sort=True)
    distances[p] = (aa_distance(concept_d,d2),d2)
    print(time()-t1)
m_distances = sorted(distances.keys(),key=lambda k: distances[k][0])
min(np.array([aa_distance(distances[p][1],concept_page)*((m_distances.index(p)+1)/len(distances)) for p,level in c.pages]))

0.07688283920288086
0.05190610885620117
0.10362887382507324
0.04845380783081055
0.09060192108154297
0.09475398063659668
0.08419489860534668
0.022555828094482422
0.02152705192565918
0.08040976524353027
0.02557682991027832
0.41231584548950195
0.026500701904296875
0.014210224151611328
0.4018211364746094
0.04622220993041992
0.40616703033447266
1.0262870788574219
1.7621493339538574
0.24219417572021484
0.891624927520752
0.10608315467834473
0.05880379676818848
0.058653831481933594
1.8964848518371582
1.7791810035705566
0.18165302276611328
0.3946380615234375


0.0006495158154830034

# playground

In [1362]:
page1 = wiki.page("Banana")
page2 = wiki.page("Apple")
d1 = set_relevance(*stem_text(page1.text,title=page1.title.lower()),sort=True)
d2 = set_relevance(*stem_text(page2.text,title=page2.title.lower()),sort=True)
aa_distance(d1,d2)
#d1

(('fruit', 'NOUN'), ('cultivar', 'NOUN'), ('includ', 'VERB'), ('produc', 'VERB'), ('grow', 'VERB'), ('plant', 'NOUN'), ('product', 'NOUN'), ('asia', 'PROPN'), ('cultiv', 'VERB'), ('cultiv', 'NOUN'), ('time', 'NOUN'), ('europ', 'PROPN'), ('diseas', 'NOUN'), ('wild', 'ADJ'), ('high', 'ADJ'), ('call', 'VERB'), ('dessert', 'NOUN'), ('varieti', 'NOUN'), ('skin', 'NOUN'), ('food', 'NOUN'), ('develop', 'VERB'), ('speci', 'NOUN'), ('caus', 'VERB'), ('grown', 'VERB'), ('centuri', 'NOUN'), ('east', 'PROPN'), ('cook', 'VERB'), ('seed', 'NOUN'), ('larg', 'ADJ'), ('commerci', 'ADJ'), ('year', 'NOUN'), ('leav', 'NOUN'), ('color', 'NOUN'), ('common', 'ADJ'), ('local', 'ADJ'), ('occur', 'VERB'), ('allow', 'VERB'), ('group', 'NOUN'), ('america', 'PROPN'), ('unit', 'PROPN'), ('state', 'PROPN'), ('modern', 'ADJ'), ('raw', 'ADJ'), ('north', 'PROPN'), ('genet', 'ADJ'), ('yellow', 'ADJ'), ('low', 'ADJ'), ('consid', 'VERB'), ('control', 'VERB'), ('market', 'NOUN'), ('rang', 'NOUN'), ('small', 'ADJ'), ('fresh

4.795368821079095

In [1336]:
def relevance(word,text_len,chunk_size=100): #AQUI TENEMOS EL CUELLO DE BOTELLA
    """
    Returns word relevance (probability of being in a randomly 
    positioned block of a fixed size).
    @Params:
        - word:  entry of stem_dict
        - text_len: text size (returned by stem_text())
        - chunk_size: block size, fixed
    """
    L = 0
    v = np.array(word["occurr"])
    if len(v) < 2:
        return 1/(text_len-chunk_size)
    #occ = v[(v>chunk_size)&(v<text_len-chunk_size)]
    v = v[v<text_len]
    occ = np.diff(v)
    for i,o in enumerate(occ):
        if o > chunk_size:
            if v[i] < chunk_size:
                L += v[i]
            elif text_len - chunk_size < v[i]:
                L += text_len - v[i]
            else:
                L+= chunk_size
        else:
            L += o
    #last one
    if len(v) > 0:
        if v[-1] < chunk_size:
            L += v[-1]
        elif text_len - chunk_size < v[-1]:
            L += text_len - v[-1]
        else:
            L += chunk_size
        #print(L)
        return (L)/(text_len-chunk_size)
    else:
        return 0
    
def relevance2(word,text_len,chunk_size=100): #AQUI TENEMOS EL CUELLO DE BOTELLA
    """
    Returns word relevance (probability of being in a randomly 
    positioned block of a fixed size).
    @Params:
        - word:  entry of stem_dict
        - text_len: text size (returned by stem_text())
        - chunk_size: block size, fixed
    """
    L = []
    for i in range(chunk_size,text_len,1):
        added = False
        for occ in word["occurr"]:
            if occ > i - chunk_size and occ <= i:
                L.append(1)
                added = True
                break
        if not added:
                L.append(0)
    #print("SUMA",np.sum(L),"LOng",len(L))
    return np.mean(L)    
    
    
page1 = wiki.page("Computer")
d,text_len,text = stem_text(page1.text,title=page1.title.lower())
n_chunks = int(text_len/average_sentence_size(text))
d["caca"] = {'occurr':[1,45]}
word = d["caca"]
print(relevance2(word,text_len,n_chunks))
relevance(word,text_len,n_chunks)

0.027131782945736434


0.027131782945736434

In [1342]:
page1 = wiki.page("computer")
t1 = time()
v = stem_text(page1.text,title=page1.title.lower())
t2 = time()
d1 = set_relevance(*v,sort=True)
t3 = time()
print(t2-t1,t3-t2)

1.6718740463256836 0.013387918472290039


In [1344]:
sorted(d1.keys(),key=lambda k: d1[k]["pos"])

[('devic', 'NOUN'),
 ('memori', 'NOUN'),
 ('machin', 'NOUN'),
 ('modern', 'ADJ'),
 ('number', 'NOUN'),
 ('time', 'NOUN'),
 ('data', 'NOUN'),
 ('earli', 'ADJ'),
 ('program', 'NOUN'),
 ('instruct', 'NOUN'),
 ('store', 'VERB'),
 ('call', 'VERB'),
 ('process', 'NOUN'),
 ('thousand', 'NOUN'),
 ('system', 'NOUN'),
 ('perform', 'VERB'),
 ('read', 'VERB'),
 ('develop', 'VERB'),
 ('design', 'VERB'),
 ('design', 'NOUN'),
 ('calcul', 'NOUN'),
 ('built', 'VERB'),
 ('oper', 'NOUN'),
 ('includ', 'VERB'),
 ('special', 'ADJ'),
 ('control', 'NOUN'),
 ('electron', 'ADJ'),
 ('circuit', 'NOUN'),
 ('exampl', 'NOUN'),
 ('digit', 'ADJ'),
 ('unit', 'NOUN'),
 ('allow', 'VERB'),
 ('work', 'VERB'),
 ('arithmet', 'ADJ'),
 ('applic', 'NOUN'),
 ('oper', 'VERB'),
 ('input', 'NOUN'),
 ('cpu', 'NOUN'),
 ('comput', 'NOUN'),
 ('provid', 'VERB'),
 ('type', 'NOUN'),
 ('form', 'NOUN'),
 ('programm', 'ADJ'),
 ('power', 'NOUN'),
 ('function', 'NOUN'),
 ('output', 'NOUN'),
 ('concept', 'NOUN'),
 ('requir', 'VERB'),
 ('complex

In [1031]:
l = np.array([1,2])
np.argmax(l[l!=0])

1