# Gensim Tutorial: Topics and Transformations

## 1. Set logging

In [1]:
import logging
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

## 2. Transformation interface

In [2]:
from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('./deerwester.dict')
corpus = corpora.MmCorpus('./deerwester.mm')
print corpus

MmCorpus(9 documents, 12 features, 28 non-zero entries)


** Creating a transformation **

In [3]:
tfidf = models.TfidfModel(corpus)

** Transforming vectors **

In [7]:
# from now on, tfidf is treated as a read-only object that can be used to convert any vector from the old representation(bag of words integer 
# counts) to the new representation.
doc_bow = [(0, 1), (1, 1)]
print tfidf[doc_bow]

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [5]:
# to apply a transformation to a whole corpus.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print doc

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


** Transformations can also be serialized, one on top of another, in a sort of chain **

In [8]:
lsi = models.LsiModel(corpus_tfidf, id2word = dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]

In [9]:
lsi.print_topics(2)

[(0,
  u'0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  u'-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [10]:
for doc in corpus_lsi:
    print doc

[(0, 0.066007833960903414), (1, -0.52007033063618502)]
[(0, 0.19667592859142477), (1, -0.76095631677000419)]
[(0, 0.089926399724464465), (1, -0.72418606267525099)]
[(0, 0.075858476521781668), (1, -0.632055158600343)]
[(0, 0.10150299184980109), (1, -0.57373084830029486)]
[(0, 0.70321089393783087), (1, 0.16115180214025754)]
[(0, 0.87747876731198293), (1, 0.16758906864659384)]
[(0, 0.90986246868185772), (1, 0.14086553628718998)]
[(0, 0.61658253505692806), (1, -0.053929075663893544)]


In [12]:
lsi.save('./model.lsi')
lsi = models.LsiModel.load('./model.lsi')

## 3. Available transformations

** Term Frequency * Inverse Document Frequency, tfidf **

In [16]:
model_tfidf = tfidfmodel.TfidfModel(bow_corpus, normalize = True)

NameError: name 'tfidfmodel' is not defined

** Latent Semantic Indexing, LSI(LSA) **

In [17]:
model = lsimodel.LsiModel(tfidf_corpus, id2word = dictionary, num_topics =300)

NameError: name 'lsimodel' is not defined

In [18]:
model.add_documents(another_tfidf_corpus)
lsi_vec = model[tfidf_vec]

NameError: name 'model' is not defined

In [19]:
model.add_documents(more_documents)
lsi_vec = model[tfidf_vec]

NameError: name 'model' is not defined

** Random Projections, RP **

In [20]:
model = rpmodel.RpModel(tfidf_corpus, num_topics = 500)

NameError: name 'rpmodel' is not defined

In [21]:
model = ldamodel.LdaModel(bow_corpus, id2word = dictionary, num_topics = 100)

NameError: name 'ldamodel' is not defined

** Hierarchical Dirichlet Process, HDP **

In [22]:
model = hdpmodel.HdpModel(bow_corpus, id2word = dictionary)

NameError: name 'hdpmodel' is not defined