# Búsqueda de texto (*information retrieval*)
Vamos a usar el algoritmo LSI para realizar una búsqueda indexada de textos similares. Versión modificada para usar la matriz TF-IDF.
### Cargamos librerías

In [1]:
import os
import re
import numpy as np
import pandas as pd
import warnings

# Gensim
import gensim
import gensim.corpora as corpora

from gensim.models import LsiModel
warnings.filterwarnings('ignore')

# spacy para lematizar
import spacy

Utilizamos un generador para obtener los documentos del Corpus línea a línea desde el archivo del conjunto de ejemplo y convertirlos en un listado de tokens.

In [2]:
# Load the medium English spaCy pipeline with the parser and NER components
# disabled; this notebook only reads POS tags and lemmas from it.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
# Text of every vocabulary entry flagged as a stop word.
# NOTE(review): this only sees lexemes present in nlp.vocab at this point --
# confirm it yields the complete stop-word list.
stop_words = [lexeme.text for lexeme in nlp.vocab if lexeme.is_stop]

def lemmatize_doc(text, allowed_postags=('NOUN', 'PROPN', 'ADJ', 'VERB', 'ADV')):
    """Return the lowercased lemmas of the selected words in *text*.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    A token is kept only when its POS tag (``token.pos_``) is in
    *allowed_postags* and its lemma is longer than three characters.

    Args:
        text: Raw string to tokenize and lemmatize.
        allowed_postags: Collection of POS tags to keep. The default is an
            immutable tuple so it cannot be altered between calls; any
            container supporting ``in`` (list, tuple, set) is accepted.

    Returns:
        A list with the lowercased lemma of every kept token, in the order
        the tokens appear in the text.
    """
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if token.pos_ in allowed_postags and len(token.lemma_) > 3
    ]
            
def build_texts(fname, encoding='utf-8'):
    """Yield the lemmatized tokens of each line of a text file.

    The file is read lazily, one line at a time, and every line is passed
    through ``lemmatize_doc``; the whole file is never loaded at once.

    Args:
        fname: Path of the text file to read (each line is treated as one
            document).
        encoding: Text encoding used to decode the file. It is set
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Yields:
        The list of lemmas returned by ``lemmatize_doc`` for each line.
    """
    with open(fname, encoding=encoding) as f:
        for line in f:
            yield lemmatize_doc(line)

In [3]:
# Directory with the test data located inside the installed gensim package.
data_dir = os.sep.join((gensim.__path__[0], 'test', 'test_data'))
# Full path of the Lee background corpus file used throughout the notebook.
lee_data_file = os.sep.join((data_dir, 'lee_background.cor'))

In [4]:
# Lazy generator of lemmatized token lists; nothing is read until it is iterated.
texto = build_texts(lee_data_file)

### Creamos el diccionario y el corpus para Topic Modeling
Las dos entradas para el modelo LSI son un diccionario (id2word) y un corpus de `gensim` (en este caso, el corpus transformado con TF-IDF).  

In [18]:
class TFIDF_Corpus(object):
    """Streamed TF-IDF corpus built from a text file with one document per line.

    Iterating over an instance yields, for each line of the file, the sparse
    TF-IDF vector of that document. Every pass re-reads the file through
    ``build_texts`` and processes one document at a time, so the whole
    corpus is never held in RAM.

    Attributes:
        filename: Path of the corpus file.
        bigram / trigram: ``Phrases`` models trained on the corpus.
        bigram_mod / trigram_mod: ``Phraser`` versions of the models above,
            used to apply the learned phrases to token lists.
        diccionario: gensim ``Dictionary`` built from the phrased documents.
        tfidf: ``TfidfModel`` built from the statistics of ``diccionario``.
    """

    def __init__(self, filename):
        """Train the phrase models, the dictionary and the TF-IDF model.

        Args:
            filename: Path of the text file holding one document per line.
        """
        self.filename = filename
        # Learn bigrams from the lemmatized documents
        # (a higher threshold produces fewer phrases).
        self.bigram = gensim.models.Phrases(build_texts(self.filename), min_count=5, threshold=50)
        # Phraser wraps the trained model for applying it to documents.
        self.bigram_mod = gensim.models.phrases.Phraser(self.bigram)

        # Learn trigrams on top of the bigram-merged documents.
        self.trigram = gensim.models.Phrases(self.bigram_mod[build_texts(self.filename)], min_count=5, threshold=50)
        self.trigram_mod = gensim.models.phrases.Phraser(self.trigram)

        # Dictionary: maps every (phrased) token to an integer id and
        # collects document frequencies while streaming over the corpus.
        self.diccionario = gensim.corpora.Dictionary(self._phrased_texts())

        # The dictionary already holds the document frequencies and the
        # document count of this same corpus, so the TF-IDF model is built
        # from it instead of lemmatizing the whole file once more.
        self.tfidf = gensim.models.TfidfModel(dictionary=self.diccionario)

    def _phrased_texts(self):
        """Yield each document's tokens with bigram and trigram phrases merged."""
        for tokens in build_texts(self.filename):
            yield self.trigram_mod[self.bigram_mod[tokens]]

    @property
    def corpus_bow(self):
        """Fresh one-pass stream of the bag-of-words vector of each document."""
        return (self.diccionario.doc2bow(tokens) for tokens in self._phrased_texts())

    def __len__(self):
        # The corpus length is needed for visualizing with pyLDAvis.
        return self.diccionario.num_docs

    def __iter__(self):
        """Yield the sparse TF-IDF vector of each document, one at a time.

        A new pass over the file is started on every call, which makes the
        instance a re-iterable streamed corpus.
        """
        for bow in self.corpus_bow:
            yield self.tfidf[bow]

In [19]:
# Build the streamed TF-IDF corpus; this trains the phrase models and creates
# the dictionary and the TF-IDF model from the Lee corpus file
corpus_tfidf = TFIDF_Corpus(lee_data_file)



In [20]:
# Show the first document as an example (the loop stops after one item)
for c in corpus_tfidf:
    print(c)
    break

[(0, 0.12872502721369006), (1, 0.05823828190334948), (2, 0.07029327330225427), (3, 0.09065611929419445), (4, 0.07672275182325156), (5, 0.06153697834251972), (6, 0.1415584266049415), (7, 0.07077921330247075), (8, 0.07077921330247075), (9, 0.20397061560841767), (10, 0.08332014470159202), (11, 0.03136359321301515), (12, 0.04992408315877995), (13, 0.08332014470159202), (14, 0.10319705069331572), (15, 0.04290834240456335), (16, 0.07811518789507318), (17, 0.05679009004152465), (18, 0.05303332509683067), (19, 0.10319705069331572), (20, 0.05544927380368462), (21, 0.04811329921108218), (22, 0.06799020520280588), (23, 0.05090230731074704), (24, 0.10319705069331572), (25, 0.07407790974164098), (26, 0.1415584266049415), (27, 0.05544927380368462), (28, 0.05823828190334948), (29, 0.06153697834251972), (30, 0.07407790974164098), (31, 0.059812556084034345), (32, 0.159099975290492), (33, 0.10319705069331572), (34, 0.27576869743931665), (35, 0.06344323870986832), (36, 0.0838543868957644), (37, 0.1031970

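Recuerda que en el modelo BoW de `gensim` el primer elemento de cada tupla es el ID del término en el diccionario y el segundo, su frecuencia en el doc. Como aquí el corpus está transformado con TF-IDF, el segundo elemento es el peso TF-IDF del término en lugar de su frecuencia.  
`diccionario[ID]` devuelve el término con índice ID en el vocabulario: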

In [21]:
len(corpus_tfidf.diccionario.token2id)

5215

In [22]:
corpus_tfidf.tfidf.num_docs

300

## Topic modeling

### Modelo LSI
Este modelo ordena los temas y saca un listado ordenado. Hay que especificar el número de topics.


In [24]:
# Fit a 100-topic LSI model on the streamed TF-IDF vectors.
lsimodel = LsiModel(
    corpus=corpus_tfidf,
    id2word=corpus_tfidf.diccionario,
    num_topics=100,
)

In [25]:
# Project only the first TF-IDF document into the LSI space and print it
for c in corpus_tfidf:
    print(lsimodel[c])
    break

[(0, -0.18522639924440773), (1, -0.09941138087729842), (2, 8.67266639211608e-05), (3, -0.05342127327638134), (4, -0.3132242847892107), (5, 0.28510302553833017), (6, 0.19251842730111027), (7, 0.01592455349799725), (8, -0.03634128387015095), (9, 0.1015789180979052), (10, 0.001044617445644786), (11, 0.09436068524524494), (12, 0.07472529983482311), (13, -0.14724256657628074), (14, 0.009283053973501532), (15, 0.021091856710228493), (16, 0.08470418390550202), (17, -0.02242799233901667), (18, -0.010419264443288796), (19, -0.007482915013074953), (20, 0.05337415027255055), (21, -0.012551836680758153), (22, 0.10773371782371018), (23, -0.07444874105052922), (24, -0.09815831491747305), (25, 0.0025985507882866337), (26, -0.12068865148977245), (27, -0.030278162767596824), (28, 0.01284002136943284), (29, 0.006849772755406738), (30, -0.054444140250587046), (31, -0.04299803394687558), (32, 0.02136032346887217), (33, 0.03161889334153835), (34, -0.04431778701857877), (35, 0.033950272741759546), (36, 0.03

##  Búsqueda de documentos por temática (*information retrieval*)
Para buscar los documentos más similares a un documento dado, hay que trabajar con el espacio vectorial (*vector space*) generado por el algoritmo LSI. Primero, generamos una matriz LSI para todos los documentos del corpus. Para buscar el documento más parecido a un nuevo texto, calculamos su vector LSI y buscamos cuál es el más cercano dentro de la matriz LSI del corpus.

In [26]:
#import the class used to build the similarity index between corpus documents
from gensim.similarities import MatrixSimilarity

#create the transformed corpus: the LSI model applied to the TF-IDF corpus
lsi_corpus = lsimodel[corpus_tfidf]

In [27]:
lsi_corpus

<gensim.interfaces.TransformedCorpus at 0x1d76d7c40>

In [28]:
#first document; corpus_tfidf is a streamed corpus that cannot be indexed, so lsi_corpus cannot be indexed either and we iterate instead
for i in lsi_corpus:
    print(i)
    break

[(0, -0.18522639924440773), (1, -0.09941138087729842), (2, 8.67266639211608e-05), (3, -0.05342127327638134), (4, -0.3132242847892107), (5, 0.28510302553833017), (6, 0.19251842730111027), (7, 0.01592455349799725), (8, -0.03634128387015095), (9, 0.1015789180979052), (10, 0.001044617445644786), (11, 0.09436068524524494), (12, 0.07472529983482311), (13, -0.14724256657628074), (14, 0.009283053973501532), (15, 0.021091856710228493), (16, 0.08470418390550202), (17, -0.02242799233901667), (18, -0.010419264443288796), (19, -0.007482915013074953), (20, 0.05337415027255055), (21, -0.012551836680758153), (22, 0.10773371782371018), (23, -0.07444874105052922), (24, -0.09815831491747305), (25, 0.0025985507882866337), (26, -0.12068865148977245), (27, -0.030278162767596824), (28, 0.01284002136943284), (29, 0.006849772755406738), (30, -0.054444140250587046), (31, -0.04299803394687558), (32, 0.02136032346887217), (33, 0.03161889334153835), (34, -0.04431778701857877), (35, 0.033950272741759546), (36, 0.03

In [29]:
#the documents we want can be retrieved with the iteration tools from itertools
from itertools import islice

primer_doc = islice(lsi_corpus, 1) #returns a lazy islice iterator limited to the first document

In [30]:
next(primer_doc) #alternatively list(primer_doc)[0]

[(0, -0.18522639924440773),
 (1, -0.09941138087729842),
 (2, 8.67266639211608e-05),
 (3, -0.05342127327638134),
 (4, -0.3132242847892107),
 (5, 0.28510302553833017),
 (6, 0.19251842730111027),
 (7, 0.01592455349799725),
 (8, -0.03634128387015095),
 (9, 0.1015789180979052),
 (10, 0.001044617445644786),
 (11, 0.09436068524524494),
 (12, 0.07472529983482311),
 (13, -0.14724256657628074),
 (14, 0.009283053973501532),
 (15, 0.021091856710228493),
 (16, 0.08470418390550202),
 (17, -0.02242799233901667),
 (18, -0.010419264443288796),
 (19, -0.007482915013074953),
 (20, 0.05337415027255055),
 (21, -0.012551836680758153),
 (22, 0.10773371782371018),
 (23, -0.07444874105052922),
 (24, -0.09815831491747305),
 (25, 0.0025985507882866337),
 (26, -0.12068865148977245),
 (27, -0.030278162767596824),
 (28, 0.01284002136943284),
 (29, 0.006849772755406738),
 (30, -0.054444140250587046),
 (31, -0.04299803394687558),
 (32, 0.02136032346887217),
 (33, 0.03161889334153835),
 (34, -0.04431778701857877),
 (3

In [33]:
#build the similarity index from the LSI-transformed corpus
index = MatrixSimilarity(lsi_corpus)

In [34]:
index

<gensim.similarities.docsim.MatrixSimilarity at 0x1d76ac280>

Podemos ver la similitud de cualquier documento del corpus al resto de documentos

In [35]:
# Query the index with the LSI vector of the first corpus document and
# print every (document position, similarity) pair.
query_vec = next(islice(lsi_corpus, 1))
sims = index[query_vec]
print(list(enumerate(sims)))

[(0, 0.9999999), (1, 0.046523854), (2, 0.083559826), (3, -0.024700314), (4, 0.111388355), (5, 0.04393145), (6, 0.050796308), (7, 0.16055049), (8, 0.85660654), (9, 0.34105706), (10, 0.26832837), (11, 0.22784303), (12, -0.009869134), (13, 0.07089712), (14, 0.19053873), (15, 0.0014436096), (16, 0.057745647), (17, -0.012509026), (18, 0.0783613), (19, 0.39336553), (20, -0.010620497), (21, 0.14694999), (22, 0.012844235), (23, -0.0037965085), (24, 0.040526506), (25, 0.45405835), (26, -0.018272668), (27, 0.011634372), (28, 0.066841386), (29, 0.12583002), (30, -0.03196497), (31, 0.055385564), (32, 0.0943014), (33, 0.7800227), (34, 0.013477191), (35, 0.0394985), (36, 0.005104189), (37, 0.03823734), (38, 0.034502186), (39, 0.06341195), (40, 0.80324656), (41, 0.04257937), (42, 0.024793427), (43, 0.13223088), (44, 0.1884181), (45, 0.030847182), (46, 0.16265452), (47, 0.08870953), (48, 0.8516284), (49, 0.0714974), (50, 0.04258704), (51, 0.11310816), (52, 0.09289202), (53, -0.0035911575), (54, 0.0743

In [36]:
len(sims)

300

In [37]:
# Rank the documents by descending similarity and keep the first 10.
sims_sorted = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
print(sims_sorted[:10])

[(0, 0.9999999), (8, 0.85660654), (48, 0.8516284), (40, 0.80324656), (33, 0.7800227), (25, 0.45405835), (19, 0.39336553), (109, 0.37516898), (9, 0.34105706), (255, 0.32914102)]


Vemos el documento original para evaluar su parecido


In [38]:
import linecache

linecache.getline(lee_data_file, 1) #news item no. 0 (the first line of the file)

'Hundreds of people have been forced to vacate their homes in the Southern Highlands of New South Wales as strong winds today pushed a huge bushfire towards the town of Hill Top. A new blaze near Goulburn, south-west of Sydney, has forced the closure of the Hume Highway. At about 4:00pm AEDT, a marked deterioration in the weather as a storm cell moved east across the Blue Mountains forced authorities to make a decision to evacuate people from homes in outlying streets at Hill Top in the New South Wales southern highlands. An estimated 500 residents have left their homes for nearby Mittagong. The New South Wales Rural Fire Service says the weather conditions which caused the fire to burn in a finger formation have now eased and about 60 fire units in and around Hill Top are optimistic of defending all properties. As more than 100 blazes burn on New Year\'s Eve in New South Wales, fire crews have been called to new fire at Gunning, south of Goulburn. While few details are available at th

In [39]:
linecache.getline(lee_data_file, sims_sorted[1][0]+1) #most similar news item: rank 0 is the query document itself, and +1 turns the 0-based document position into a line number

'There has been welcome relief for firefighters in New South Wales overnight with milder weather allowing them to strengthen containment lines around the most severe fires. But fire authorities are not getting overly optimistic as dry and hot weather is forecast to continue. The weather bureau is forecasting temperatures in the high 30s and westerly winds until at least Friday, which means fire authorities are reluctant to get too excited about last night\'s favourable conditions. Marks Sullivan from the Rural Fire Service says fire fighters are remaining on guard. "A lot of fires that have been burning in the areas around Sydney and the north coast and further south have been burning within areas that are known and are contained," he said. "However, that\'s not to say that these fires won\'t pose a threat given the weather conditions that are coming up over the next few days." Despite the caution, the Rural Fire Service says most of the state\'s fires that threaten property are burnin

También podemos buscar el documento del corpus más similar a un nuevo documento calculando primero su vector BoW/TF-IDF y transformándolo después al espacio LSI:

In [42]:
# New query text, lemmatized with the same function used for the corpus documents
new_doc = "the new Pakistan government falled in the terrorist attack by the islamic group Hamas"
texto_lemmatizado = lemmatize_doc(new_doc)

In [43]:
# Apply to the new document the same steps used for the corpus documents:
# 1) merge bigram/trigram phrases with the trained phrase models
texto_new = corpus_tfidf.trigram_mod[corpus_tfidf.bigram_mod[texto_lemmatizado]]
# 2) convert the tokens to a bag-of-words vector with the corpus dictionary
corpus_bow_new = corpus_tfidf.diccionario.doc2bow(texto_new)
# 3) weight the bag-of-words vector with the corpus TF-IDF model
corpus_tfidf_new = corpus_tfidf.tfidf[corpus_bow_new]
# 4) project the TF-IDF vector with the LSI model
lsi_corpus_new = lsimodel[corpus_tfidf_new]

In [45]:
texto_new

['pakistan',
 'government',
 'fall',
 'terrorist_attack',
 'islamic',
 'group',
 'hamas']

In [47]:
corpus_bow_new

[(32, 1), (126, 1), (148, 1), (213, 1), (785, 1), (788, 1), (1034, 1)]

In [48]:
corpus_tfidf_new

[(32, 0.4127301483889559),
 (126, 0.2551354434978296),
 (148, 0.404193827943508),
 (213, 0.21108109209837742),
 (785, 0.4127301483889559),
 (788, 0.39614557094705677),
 (1034, 0.47890955651645445)]

In [49]:
lsi_corpus_new

[(0, -0.14100186907199255),
 (1, 0.07167561185530018),
 (2, -0.03361617438529427),
 (3, -0.02706467834937794),
 (4, 0.03465714532970966),
 (5, -0.01768978765645103),
 (6, -0.025121111608230506),
 (7, -0.013984775576135065),
 (8, -0.023764879598198393),
 (9, 0.09983343778626569),
 (10, 0.08103627413713693),
 (11, -0.00924080598767679),
 (12, -0.04267466381972078),
 (13, -0.013374876703527673),
 (14, 0.003178607643777977),
 (15, 0.043716632374400814),
 (16, 0.022924029222854733),
 (17, 0.0007342650059334096),
 (18, -0.0008635069801688228),
 (19, -0.09165248971323114),
 (20, -0.05091996514249644),
 (21, 0.05261153259868094),
 (22, 0.027999791170345817),
 (23, 0.12536494795073186),
 (24, -0.06079740229022334),
 (25, -0.018268192590219754),
 (26, 0.01843735847010293),
 (27, 0.02688459033813403),
 (28, -0.0356268712957837),
 (29, 0.008415190520503562),
 (30, 0.008180456397462267),
 (31, -0.02568925034725398),
 (32, 0.034625531361695466),
 (33, -0.00829438064285137),
 (34, -0.0089560208440439

Ahora buscamos en el índice cuáles son los documentos más parecidos dentro del corpus al nuevo documento:

In [50]:
# Query the similarity index with the LSI vector of the new document
sims = index[lsi_corpus_new]

In [52]:
# Rank the documents by descending similarity to the new document and show the first 10.
sims_sorted = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
print(sims_sorted[:10])

[(85, 0.61457443), (93, 0.59449697), (110, 0.5659039), (227, 0.5492055), (220, 0.54377335), (12, 0.53516614), (26, 0.49097526), (60, 0.47723424), (1, 0.45232993), (267, 0.39375085)]


El texto del documento más cercano es:

In [53]:
# Text of the top-ranked document; +1 turns the 0-based document position into a line number
linecache.getline(lee_data_file, sims_sorted[0][0]+1)

"Hamas militants have fought gun battles with Palestinian security forces in the Gaza Strip, trying to arrest one of the Islamic group's senior political leaders. Reports say the fight erupted in the Gaza Strip after dozens of Hamas members surrounded the home of Abdel-Aziz al-Rantissi when Palestinian police arrived to detain him. The Palestinian leader Yasser Arafat, under international pressure to crack down on militants after a wave of suicide bombings in Israel in the past month, has outlawed the military wings of Hamas and other groups and arrested dozens of militants. \n"