In [1]:
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import xml.etree.ElementTree as ET
import math
p = PorterStemmer()




## Procesamiento de el documento

In [2]:
def process(text):
    doc_nor= text.lower()
    doc_sw = remove_stopwords(doc_nor)
    doc_stem = p.stem_sentence(doc_sw)
    return doc_stem.split()

## Lista de Documentos Procesados

In [3]:
docDict = []
for index in range(1,332):
    number = ""
    if math.floor(index/10) == 0:
        number = "00"+str(index)
    else:
        if math.floor(index/100) == 0:
            number = "0"+str(index)
        else:
            if math.floor(index/1000) == 0:
                number = str(index)
    root = ET.parse('docs-raw-texts/wes2015.d'+number+'.naf').getroot()
    title = root.find('nafHeader').find('fileDesc').attrib['title']
    content = root.find('raw').text
    docDict.append(process(title))
print(docDict)

[['william', 'beaumont', 'human', 'digest'], ['selma', 'lagerlöf', 'wonder', 'adventur', 'niel', 'holgersson'], ['ferdinand', 'lessep', 'suez', 'canal'], ['walt', 'disney’', '‘steamboat', 'willie’', 'rise', 'mickei', 'mous'], ['eugen', 'wigner', 'structur', 'atom', 'nucleu'], ['eugenio', 'beltrami', 'non-euclidian', 'geometri'], ['bernard', 'mandevil', 'fabl', 'bee'], ['leo', 'baekeland', 'begin', 'plastic', 'ag'], ['dorothea', 'erxleben', '–', 'germany’', 'femal', 'medic', 'doctor'], ['sir', 'jame', 'young', 'simpson', 'chloroform'], ['loui', 'antoin', 'bougainvil', 'voyag', 'world'], ['robert', 'morison', 'classif', 'plant'], ['florenc', 'sabin', '–', 'prepar', 'ground', 'women', 'medic', 'scienc'], ['hermann', '‘klecks’', 'rorschach', 'eponym', 'test'], ['william', 'stukelei', 'mysteri', 'stoneheng'], ['adolph', 'sax', 'saxophon'], ['edvard', 'munch', 'munch', 'affair'], ['spyridon', 'marinato', 'discoveri', 'akrotiri'], ['daniel', 'rutherford', 'isol', 'nitrogen'], ['alexand', 'lip

## Generar Diccionario

In [4]:
dictionary = corpora.Dictionary(docDict)
dictionary.save('clase3.dict')
print(dictionary)

Dictionary(1078 unique tokens: ['beaumont', 'digest', 'human', 'william', 'adventur']...)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Generar Corpus

In [5]:
class MyCorpus(object):
    def __iter__(self):
        for index in range(1,332):
            number = ""
            if math.floor(index/10) == 0:
                number = "00"+str(index)
            else:
                if math.floor(index/100) == 0:
                    number = "0"+str(index)
                else:
                    if math.floor(index/1000) == 0:
                        number = str(index)
            root = ET.parse('docs-raw-texts/wes2015.d'+number+'.naf').getroot()
            title = root.find('nafHeader').find('fileDesc').attrib['title']
            content = root.find('raw').text
            yield dictionary.doc2bow(process(title))

corpus_memory_friendly = MyCorpus()#Todos mis documentos ahora estan representados como una bolsa de palabras
#Almaceno mi corpus
corpora.MmCorpus.serialize('corpus.mm', corpus_memory_friendly)  

## Cargar el Corpus

In [6]:
#Cargo mi corpus
corpus = corpora.MmCorpus('corpus.mm')
print(corpus) #No lo carga en memoria
#Para leer la representacion de bolda de palabras resultante de cada documento en el corpus
for doc in corpus:
    print(doc)

MmCorpus(331 documents, 1078 features, 1493 non-zero entries)
[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)]
[(4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0)]
[(10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0)]
[(14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 1.0), (20, 1.0)]
[(21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 1.0)]
[(26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0)]
[(30, 1.0), (31, 1.0), (32, 1.0), (33, 1.0)]
[(34, 1.0), (35, 1.0), (36, 1.0), (37, 1.0), (38, 1.0)]
[(39, 1.0), (40, 1.0), (41, 1.0), (42, 1.0), (43, 1.0), (44, 1.0), (45, 1.0)]
[(46, 1.0), (47, 1.0), (48, 1.0), (49, 1.0), (50, 1.0)]
[(51, 1.0), (52, 1.0), (53, 1.0), (54, 1.0), (55, 1.0)]
[(56, 1.0), (57, 1.0), (58, 1.0), (59, 1.0)]
[(44, 1.0), (45, 1.0), (60, 1.0), (61, 1.0), (62, 1.0), (63, 1.0), (64, 1.0), (65, 1.0)]
[(66, 1.0), (67, 1.0), (68, 1.0), (69, 1.0), (70, 1.0)]
[(3, 1.0), (71, 1.0), (72, 1.0), (73, 1.0)]
[(74, 1.0), (75, 1.0), (76, 1.0)]
[(77, 1.0), (78, 1.0), (79, 2.0)]
[(80, 1.0), (81, 1.

## Cargar de la memoria el Corpus y el Diccionario

In [7]:
dictionary = corpora.Dictionary.load('clase3.dict')
corpus = corpora.MmCorpus('corpus.mm')

## TF-IDF

In [8]:
tfidf = models.TfidfModel(corpus) 

## Construir la Matriz

In [9]:
index = similarities.MatrixSimilarity(tfidf[corpus])
index.save('clase3tfidf.index')

In [10]:
index = similarities.MatrixSimilarity.load('clase3tfidf.index')

## Construir el query

In [31]:
qbow_List = []
for index in range(1,36):
    number = ""
    if math.floor(index/10) == 0:
        number = "0"+str(index)
    else:
        if math.floor(index/100) == 0:
            number = str(index)
    root_query = ET.parse('queries-raw-texts/wes2015.q'+number+'.naf').getroot()
    query = root_query.find('raw').text
    # Query a Bag of Words
    query_doc_bow = dictionary.doc2bow(process(query))
    qbow_List.append(query_doc_bow)
qbow_List

[[(738, 1)],
 [(103, 1), (548, 1), (724, 1)],
 [],
 [(478, 1), (764, 1)],
 [(538, 1)],
 [(114, 1)],
 [(379, 1), (542, 1)],
 [(21, 1), (855, 1), (894, 1)],
 [(211, 1), (631, 1)],
 [(132, 1)],
 [],
 [(929, 1)],
 [(724, 1), (831, 1)],
 [(211, 1), (538, 1)],
 [],
 [(649, 1)],
 [(541, 1), (616, 1)],
 [(215, 1), (420, 1), (670, 1), (691, 1), (702, 1)],
 [(3, 1)],
 [(247, 1), (1030, 1)],
 [],
 [(47, 1), (929, 1)],
 [(724, 1), (752, 1), (753, 1)],
 [(691, 1), (865, 1)],
 [],
 [(367, 1), (558, 1), (991, 1)],
 [(125, 1), (504, 1)],
 [(34, 1), (157, 1), (503, 1), (504, 1)],
 [(157, 1), (158, 1)],
 [(378, 1), (617, 1)],
 [(929, 1)],
 [],
 [(102, 1), (720, 1)],
 [],
 [(435, 1)]]

## Encontrar Similitudes

In [80]:
sims_List = []
for indice, query in enumerate(qbow_List):
    print(query)
    sims = index[tfidf[query]]

[(738, 1)]


TypeError: 'int' object is not subscriptable

In [None]:
for i, p in enumerate(sims):
    if p != 0:
        print (str(i+1), p)