In [65]:
# Simple search engine straight from the gensim tutorials
import json
import glob
from gensim import corpora, models, similarities


class Story:
    """One corpus entry: an id plus its headline, body text and links."""

    def __init__(self, docid, headline, text, links):
        # Identification first, then the content fields.
        self.docid, self.headline = docid, headline
        self.text, self.links = text, links

        
def get_documents(corpus_loc):
    """Read every story from the JSON-lines files matching ``corpus_loc + "*json"``.

    Each non-blank line of each matching file is parsed as one JSON object
    that must provide the keys ``full_text``, ``headline`` and ``links``.

    Args:
        corpus_loc: Path prefix the glob pattern is appended to. It is
            concatenated as-is, so a directory needs its trailing separator.

    Returns:
        A list of ``Story`` objects whose ``docid`` equals their position in
        the list (0, 1, 2, ...).

    Raises:
        KeyError: if a record lacks one of the three expected keys.
        ValueError: if a non-blank line is not valid JSON.
    """
    output = []
    # glob returns files in arbitrary order; sorting keeps docids stable
    # from one run (or machine) to the next.
    for filename in sorted(glob.glob(corpus_loc + "*json")):
        with open(filename) as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                # Parse once and reuse, rather than once per field.
                record = json.loads(line)
                output.append(Story(len(output),
                                    record['headline'],
                                    record['full_text'],
                                    record['links']))
    return output

# Load the whole corpus into memory. The trailing slash matters: get_documents
# appends its "*json" glob pattern directly onto this string.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
documents = get_documents("/Volumes/bigone/lens_corpus/files/")

In [66]:
from pprint import pprint   # pretty-printer

# Tokenise every story: lowercase, split on whitespace, drop stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    kept = []
    for word in document.text.lower().split():
        if word not in stoplist:
            kept.append(word)
    texts.append(kept)

# Count corpus-wide occurrences so tokens that appear only once can be removed.
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] = frequency[token] + 1

pruned = []
for text in texts:
    pruned.append([token for token in text if frequency[token] > 1])
texts = pruned

# Map tokens to integer ids and persist the mapping.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/lens.dict')

# Bag-of-words vector per story, stored to disk for later use.
corpus = list(map(dictionary.doc2bow, texts))
corpora.MmCorpus.serialize('/tmp/lens.mm', corpus)

In [70]:
# Fit a 200-topic LSI model on the bag-of-words corpus.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=200)

# Turn the query into a bag-of-words vector using the same lowercase /
# whitespace tokenisation as the corpus, then project it into LSI space.
doc = "OPSB"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space

# Index the LSI-transformed corpus; the save/load round trip leaves a
# reusable copy of the index on disk.
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it
index.save('/tmp/lens.index')
index = similarities.MatrixSimilarity.load('/tmp/lens.index')

# Score every document against the query and keep the ten best
# (position in corpus, score) pairs, highest score first.
sims = index[vec_lsi] # perform a similarity query against the corpus
sims = sorted(enumerate(sims), key=lambda item: -item[1])
sims = sims[0:10]

# Build the docid lookup once instead of scanning `documents` for every hit.
# setdefault keeps the first story seen for a docid, matching a linear search.
story_by_docid = {}
for story in documents:
    story_by_docid.setdefault(story.docid, story)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
for s in sims:
    print(s)
    dcd = s[0]
    print(dcd)
    print(story_by_docid[dcd].headline)



(2911, 0.26134795)
2911
Orleans school board must agree on direction before hiring a superintendent
(3006, 0.25180304)
3006
Orleans Parish School Board narrows superintendent search to two finalists
(2332, 0.25106037)
2332
KIPP leadership to vote on returning to Orleans school board
(2915, 0.2210699)
2915
Ugliness in New Orleans school board meetings deters superintendent candidates
(2925, 0.21933365)
2925
School board needs clear direction, not superintendent who will tolerate ambiguity
(2920, 0.21889541)
2920
Complex education system requires that Orleans Parish School Board finds the right superintendent
(2379, 0.21759659)
2379
Vote tonight, but little support for moving Collegiate Academies to parish control
(2912, 0.21721315)
2912
School board should look for opportunity in upcoming court ruling on post-Katrina teacher layoffs
(2148, 0.21390912)
2148
Contrary to critics’ claims, administrative staffing at OPSB has dropped along with enrollment
(2360, 0.20800714)
2360
Morris Jeff s