# Gensim tests
#### Inspired by https://radimrehurek.com/gensim/tutorial.html

In [None]:
import os
import logging
from collections import defaultdict
from pprint import pprint  # pretty-printer

In [None]:
from six import iteritems
from gensim import corpora, models, similarities

In [None]:
# Show log records at INFO level and above, each prefixed with a timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

## Corpora and Vector Spaces

In [None]:
# Toy corpus from the original gensim tutorial, kept for reference:
# documents = ["Human machine interface for lab abc computer applications",
#              "A survey of user opinion of computer system response time",
#              "The EPS user interface management system",
#              "System and human system engineering testing of EPS",
#              "Relation of user perceived response time to error measurement",
#              "The generation of random binary unordered trees",
#              "The intersection graph of paths in trees",
#              "Graph minors IV Widths of trees and well quasi ordering",
#              "Graph minors A survey"]

# Load the real corpus: one article per line. The context manager closes the
# file handle as soon as the text has been read instead of leaving it open.
with open("../data/articles.txt", "r") as articles_file:
    documents = articles_file.read().splitlines()

In [None]:
# List every source next to its line index in sources.txt.
# The context manager closes the file handle once the listing is done.
with open("../data/sources.txt", "r") as sources_file:
    for i, s in enumerate(sources_file.read().splitlines()):
        print(i, s)

In [None]:
# Remove common words and tokenize.
# Adjacent string literals are joined with no separator, so every fragment
# below except the last must end with a space. Without it the last word of
# one line fuses with the first word of the next ("w" + "-" became "w-"),
# and neither of the two intended stop words is filtered.
stoplist = set("for a an of the and to in on by from but at as or so it with "
               "this that those these there "
               "i you he she we they me him my your his her its our their m mr ms dr jr "
               "s d re do did got is no b f j k l t w "
               "- *".split())

# Lower-case each document, split it on whitespace and drop the stop words.
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# Count how often each remaining token occurs across the whole corpus ...
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1

# ... and keep only the tokens that occur more than once.
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

print(texts)

In [None]:
# Build the token <-> integer id mapping from the filtered token lists in `texts`.
dictionary = corpora.Dictionary(texts)
dictionary.save('../tmp/articles.dict')  # store the dictionary, for future reference
print(dictionary)
# print(dictionary.token2id)

In [None]:
# new_doc = "Human computer interaction"
# new_vec = dictionary.doc2bow(new_doc.lower().split())
# print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

In [None]:
# Convert every token list into its bag-of-words vector using the dictionary.
corpus = list(map(dictionary.doc2bow, texts))
# Write the vectors to disk so that later sections can reload them.
corpora.MmCorpus.serialize('../tmp/articles.mm', corpus)
print(corpus)

### Memory efficiency

In [None]:
class MyCorpus(object):
    """Stream '../data/articles.txt' as bag-of-words vectors, one line at a time."""

    def __iter__(self):
        """Yield one bag-of-words vector per line of the articles file.

        Each line is lower-cased, split on whitespace and converted with the
        module-level ``dictionary`` (looked up when the iteration runs).
        """
        # The context manager closes the handle when the iteration finishes
        # or the generator is discarded, instead of leaking it on every pass.
        with open('../data/articles.txt') as articles_file:
            for line in articles_file:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

In [None]:
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
# MyCorpus defines neither __repr__ nor __str__, so this prints the default
# object representation, not the vectors themselves.
print(corpus_memory_friendly)

In [None]:
# Each pass over the object re-reads the articles file through MyCorpus.__iter__.
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)

In [None]:
# Collect statistics about all tokens, streaming the file one line at a time.
# The context manager closes the handle once the dictionary has been built.
with open('../data/articles.txt') as articles_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in articles_file)

# Remove stop words and words that occur in only one document.
# Every fragment except the last ends with a space: adjacent string literals
# are concatenated with no separator, which previously produced the bogus
# entries "therei", "jrdo" and "gotb" and left "there", "i", "jr", "do",
# "got" and "b" unfiltered.
stoplist = set("for a an of the and to in by from at as with this that those these there "
               "i me you he she we they s d re my your his her our their m mr ms dr jr "
               "do did got "
               "b f k l".split())
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
# dictionary.dfs maps each token id to its document frequency.
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words with document frequency 1

dictionary.compactify()  # remove gaps in id sequence after words that were removed

# NOTE(review): this overwrites the '../tmp/articles.dict' saved earlier from
# `texts`, while '../tmp/articles.mm' was serialized with that earlier
# dictionary. The two dictionaries use different stop lists and frequency
# criteria, so the token ids in articles.mm may no longer match the
# dictionary loaded later -- confirm, and re-serialize the corpus if needed.
dictionary.save('../tmp/articles.dict')  # store the dictionary, for future reference

print(dictionary)

In [None]:
# All tokens known to the dictionary, one per line in sorted order
# (iterating the token2id mapping yields its keys).
print('\n'.join(sorted(dictionary.token2id)))
# The full token -> id mapping.
print(dictionary.token2id)

In [None]:
# create a toy corpus of 2 documents, as a plain Python list
# (this rebinds `corpus`, replacing the article vectors built earlier)
corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it

# write the toy corpus to its own file, separate from '../tmp/articles.mm'
corpora.MmCorpus.serialize('../tmp/corpus.mm', corpus)

In [None]:
# Read the toy corpus back from the file written by the serialize call above.
corpus = corpora.MmCorpus('../tmp/corpus.mm')

In [None]:
# Printing the corpus object itself shows a summary, not the documents.
print(corpus)

# one way of printing a corpus: load it entirely into memory
print(list(corpus)) 

# another way of doing it: print one document at a time, making use of the streaming interface
for doc in corpus:
    print(doc)

## Topics and Transformations

In [None]:
# Both files are loaded below, so require both before touching either one;
# checking only the dictionary would let a missing corpus file raise instead
# of showing the hint in the else branch.
if os.path.exists("../tmp/articles.dict") and os.path.exists("../tmp/articles.mm"):
    dictionary = corpora.Dictionary.load('../tmp/articles.dict')
    corpus = corpora.MmCorpus('../tmp/articles.mm')
    print("Used files generated from first tutorial")
else:
    # `dictionary` and `corpus` keep whatever values earlier cells gave them.
    print("Please run first tutorial to generate data set")

In [None]:
# The model is built from the bag-of-words corpus loaded in the previous cell.
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

In [None]:
# Hand-written bag-of-words vector: presumably (token_id, count) pairs for ids 0 and 1.
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors

In [None]:
# Apply the TF-IDF model to the whole corpus and print each transformed document.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

In [None]:
# Build a 5-topic LSI model on top of the TF-IDF vectors; id2word supplies the token strings.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

In [None]:
# Print the LSI representation of every document.
for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)

In [None]:
# Round-trip the model through disk: save it, then rebind `lsi` to the reloaded copy.
lsi.save('../tmp/model.lsi') # same for tfidf, lda, ...
lsi = models.LsiModel.load('../tmp/model.lsi')

## Similarity Queries

### Load the model

In [None]:
# Reload the dictionary and the article corpus from disk (no existence check here).
dictionary = corpora.Dictionary.load('../tmp/articles.dict')
corpus = corpora.MmCorpus('../tmp/articles.mm') # comes from the first tutorial, "From strings to vectors"
print(corpus)

In [None]:
# Rebind `lsi` to a 5-topic model built directly on the bag-of-words corpus
# (unlike the previous section, no TF-IDF step is applied first).
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=5)

In [None]:
# search_related_articles later loads its index from this same path.
index = similarities.MatrixSimilarity(lsi[corpus])  # transform corpus to LSI space and index it
index.save('../tmp/articles.index')                 # save the index

### Similarity query step by step

In [None]:
# Example query: tokenized the same way as the corpus (lower-case, whitespace split).
doc = "trump propaganda"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
print(vec_lsi)

In [None]:
# Same construction as in the "Load the model" section, repeated for the step-by-step walkthrough.
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it

In [None]:
# Round-trip the index through disk: save it, then rebind `index` to the reloaded copy.
index.save('../tmp/articles.index')
index = similarities.MatrixSimilarity.load('../tmp/articles.index')

In [None]:
# `sims` holds one similarity score per indexed document, in document order.
sims = index[vec_lsi] # perform a similarity query against the corpus
print(list(enumerate(sims))) # print (document_number, document_similarity) 2-tuples

In [None]:
# Rank the documents by descending similarity; sorting on the negated score
# puts the best match first. `sims` is rebound to (document number, score) pairs.
sims = sorted(
    enumerate(sims),
    key=lambda pair: -pair[1],
)
print(sims)

### Similarity query function

In [None]:
def search_related_articles(phrase):
    """Print every article ranked by similarity to 'phrase', best match first.

    The phrase is lower-cased, split on whitespace and converted with the
    module-level ``dictionary`` and ``lsi`` model. It is then compared against
    the index stored at '../tmp/articles.index'. Each output line shows the
    article number, the similarity as a percentage and the matching line of
    '../data/sources.txt'.

    :param phrase: query text
    :return: None; the ranking is printed
    """
    # load LSI space index
    index = similarities.MatrixSimilarity.load('../tmp/articles.index')
    # convert the query to LSI space
    vec_bow = dictionary.doc2bow(phrase.lower().split())
    vec_lsi = lsi[vec_bow]
    # perform a similarity query against the corpus
    sims = index[vec_lsi]
    # sorting on the negated score orders the results from best to worst match
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # load articles sources; the context manager closes the file handle,
    # which the bare open(...).read() used before left open on every call
    with open("../data/sources.txt", "r") as sources_file:
        sources = sources_file.read().splitlines()
    # print the results; line a_id of sources.txt is taken to describe article a_id
    for (a_id, rating) in sims:
        print("{:02d}: {:.1f}% # {}".format(a_id, rating*100, sources[a_id]))

In [None]:
# Same query as in the step-by-step walkthrough, now through the helper function.
search_related_articles("trump propaganda")

In [None]:
# Single-word query example.
search_related_articles("health")

In [None]:
# Another single-word query example.
search_related_articles("violence")