In [1]:
from gensim import corpora, models
from normalization import normalize_corpus
import numpy as np

# Eight short sentences used as the toy corpus for the topic-model demos:
# the first four are about animals, the last four about programming languages.
toy_corpus = [
    "The fox jumps over the dog",
    "The fox is very clever and quick",
    "The dog is slow and lazy",
    "The cat is smarter than the fox and the dog",
    "Python is an excellent programming language",
    "Java and Ruby are other programming languages",
    "Python and Java are very popular programming languages",
    "Python programs are smaller than Java programs"
]

## LSI topic model

In [2]:
# Normalize and tokenize the toy documents (normalize_corpus is a project helper).
norm_tokenized_corpus = normalize_corpus(toy_corpus, tokenize=True)
# NOTE(review): this bare expression is not the last statement of the cell,
# so the notebook does not display it.
norm_tokenized_corpus

# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(norm_tokenized_corpus)
print(dictionary.token2id)

# Bag-of-words form: one list of (token_id, count) pairs per document.
corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
print(corpus)

# Re-weight the bag-of-words corpus with TF-IDF.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

total_topics = 2

# Fit the LSI model on the TF-IDF corpus.
lsi = models.LsiModel(corpus_tfidf,
                      id2word=dictionary,
                      num_topics=total_topics)

# print_topics yields (topic index, formatted term string) pairs;
# topics are reported with 1-based numbers.
for index, topic in lsi.print_topics(total_topics):
    print('Topic #'+str(index+1))
    print(topic)
    print()



{'lazy': 5, 'excellent': 9, 'small': 17, 'ruby': 14, 'python': 12, 'slow': 6, 'programming': 11, 'cat': 7, 'popular': 15, 'program': 16, 'smarter': 8, 'language': 10, 'dog': 0, 'quick': 4, 'jump': 2, 'fox': 1, 'clever': 3, 'java': 13}
[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1)], [(0, 1), (5, 1), (6, 1)], [(0, 1), (1, 1), (7, 1), (8, 1)], [(9, 1), (10, 1), (11, 1), (12, 1)], [(10, 1), (11, 1), (13, 1), (14, 1)], [(10, 1), (11, 1), (12, 1), (13, 1), (15, 1)], [(12, 1), (13, 1), (16, 2), (17, 1)]]
Topic #1
0.459*"language" + 0.459*"programming" + 0.344*"python" + 0.344*"java" + 0.336*"popular" + 0.318*"excellent" + 0.318*"ruby" + 0.148*"program" + 0.074*"small" + -0.000*"clever"

Topic #2
-0.459*"fox" + -0.459*"dog" + -0.444*"jump" + -0.322*"cat" + -0.322*"smarter" + -0.208*"slow" + -0.208*"lazy" + -0.208*"clever" + -0.208*"quick" + -0.000*"program"



In [3]:
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    """Print the leading terms of the first ``total_topics`` topics of a model.

    Parameters
    ----------
    topic_model : object
        Any model exposing ``show_topic(index, topn=...)`` that returns a
        sequence of ``(term, weight)`` pairs (e.g. the gensim models trained
        in this notebook).
    total_topics : int
        Number of topics to print, starting from topic index 0.
    weight_threshold : float
        Terms whose absolute weight is below this value are dropped.
    display_weights : bool
        When True print ``(term, rounded weight)`` pairs, otherwise only the
        terms.
    num_terms : int or None
        Maximum number of terms to print per topic. A falsy value (``None`` or
        ``0``) prints everything the model returns by default.
    """
    for index in range(total_topics):
        if num_terms and num_terms > 0:
            # Ask the model for exactly as many terms as requested; relying on
            # show_topic's default term count silently capped larger requests.
            topic = topic_model.show_topic(index, topn=num_terms)
        else:
            topic = topic_model.show_topic(index)

        # Weights are rounded for display only after the threshold test.
        topic = [(word, round(wt, 2))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]

        if display_weights:
            print('Topic #' + str(index + 1) + ' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #' + str(index + 1) + ' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        print()


# Show the five leading terms of each LSI topic together with their weights.
print_topics_gensim(topic_model=lsi,
                    total_topics=total_topics,
                    num_terms=5,
                    display_weights=True)




Topic #1 with weights
[('language', 0.46), ('programming', 0.46), ('python', 0.34), ('java', 0.34), ('popular', 0.34)]

Topic #2 with weights
[('fox', -0.46), ('dog', -0.46), ('jump', -0.44), ('cat', -0.32), ('smarter', -0.32)]



## LSI custom built topic model

In [4]:
from utils import build_feature_matrix, low_rank_svd

# Normalize the toy documents (no tokenize flag here, unlike the gensim cell).
norm_corpus = normalize_corpus(toy_corpus)

# build_feature_matrix is a project helper; presumably it returns the fitted
# vectorizer and a documents-x-terms TF-IDF matrix -- TODO confirm in utils.
vectorizer, tfidf_matrix = build_feature_matrix(
    norm_corpus,
    feature_type='tfidf'
)

# Transpose so that rows correspond to terms (assuming the orientation above).
td_matrix = tfidf_matrix.transpose()

# Multiply by the (td_matrix > 0) mask so only strictly positive entries remain.
td_matrix = td_matrix.multiply(td_matrix > 0)

total_topics = 2
# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, get_feature_names()
# was removed in scikit-learn 1.2 in favour of get_feature_names_out() -- confirm
# the targeted version.
feature_names = vectorizer.get_feature_names()

# low_rank_svd is a project helper; presumably it returns (U, singular values,
# V^T) truncated to `singular_count` components -- TODO confirm.
u, s, vt = low_rank_svd(td_matrix, singular_count=total_topics)
# Scale each row of u^T by the matching entry of s; the result is passed to
# get_topics_terms_weights as one row of term weights per topic.
weights = u.transpose() * s[:, None]

In [5]:
def get_topics_terms_weights(weights, feature_names):
    """Rank the terms of every topic by decreasing absolute weight.

    Parameters
    ----------
    weights : numpy.ndarray
        2-D array with one row of term weights per topic.
    feature_names : sequence of str
        Term for each column of ``weights``.

    Returns
    -------
    list of numpy.ndarray
        One ``(n_terms, 2)`` array per topic whose rows are ``[term, weight]``.
        Stacking terms and weights into one array stores the weights as
        strings, so callers must convert them back with ``float``.
    """
    vocabulary = np.array(feature_names)
    topics = []
    for topic_weights, ascending in zip(weights, np.argsort(np.abs(weights))):
        # argsort is ascending; reverse to get the largest magnitude first.
        ranking = ascending[::-1]
        ranked_terms = vocabulary[ranking]
        ranked_weights = topic_weights[ranking]
        topics.append(np.vstack((ranked_terms, ranked_weights)).T)
    return topics


def print_topics_udf(topics, total_topics=1,
                     weight_threshold=0.0001,
                     display_weights=False,
                     num_terms=None):
    """Print the terms of the first ``total_topics`` entries of ``topics``.

    Parameters
    ----------
    topics : sequence
        Each entry is an iterable of ``(term, weight)`` pairs; the weight may
        be any value accepted by ``float`` (e.g. a numeric string).
    total_topics : int
        Number of topics to print, starting at index 0.
    weight_threshold : float
        Terms whose absolute weight is below this value are skipped.
    display_weights : bool
        When True print ``(term, rounded weight)`` pairs, otherwise terms only.
    num_terms : int or None
        Maximum number of terms printed per topic; a falsy value prints all.
    """
    for topic_no in range(total_topics):
        kept = []
        for term, raw_weight in topics[topic_no]:
            value = float(raw_weight)
            if abs(value) >= weight_threshold:
                # Round for display only after the threshold comparison.
                kept.append((term, round(value, 2)))

        if display_weights:
            print('Topic #' + str(topic_no + 1) + ' with weights')
            shown = kept
        else:
            print('Topic #' + str(topic_no + 1) + ' without weights')
            shown = [term for term, _ in kept]

        print(shown[:num_terms] if num_terms else shown)
        print()


# Rank the terms of each SVD topic and print those whose absolute weight is at
# least 0.15 (terms only, no weights).
topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 weight_threshold=0.15,
                 display_weights=False)


Topic #1 without weights
['dog', 'fox', 'jump', 'smarter', 'cat', 'quick', 'clever', 'slow', 'lazy']

Topic #2 without weights
['programming', 'language', 'python', 'java', 'popular', 'excellent', 'ruby', 'program']



In [6]:
def train_lsi_model_gensim(corpus, total_topics=2):
    """Fit a gensim LSI model on the TF-IDF-weighted form of ``corpus``.

    The documents are normalized and tokenized with ``normalize_corpus``,
    converted to bag-of-words vectors through a gensim ``Dictionary``,
    re-weighted with ``TfidfModel`` and passed to ``LsiModel``.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.
    total_topics : int
        Number of topics requested from the model.

    Returns
    -------
    gensim.models.LsiModel
        The trained model.
    """
    tokenized_docs = normalize_corpus(corpus, tokenize=True)
    vocabulary = corpora.Dictionary(tokenized_docs)
    bow_docs = list(map(vocabulary.doc2bow, tokenized_docs))
    weighting = models.TfidfModel(bow_docs)
    return models.LsiModel(weighting[bow_docs],
                           id2word=vocabulary,
                           num_topics=total_topics)





def train_lda_model_gensim(corpus, total_topics=2, random_state=42):
    """Fit a gensim LDA model on the TF-IDF-weighted form of ``corpus``.

    The documents are normalized and tokenized with ``normalize_corpus``,
    converted to bag-of-words vectors through a gensim ``Dictionary``,
    re-weighted with ``TfidfModel`` and passed to ``LdaModel``.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.
    total_topics : int
        Number of topics requested from the model.
    random_state : int, numpy.random.RandomState or None
        Seed forwarded to ``LdaModel``. LDA training is stochastic, so a fixed
        default keeps the printed topics reproducible across kernel restarts;
        pass ``None`` for unseeded behaviour.

    Returns
    -------
    gensim.models.LdaModel
        The trained model.
    """
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text)
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf,
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics,
                          random_state=random_state)
    return lda



# Train a two-topic gensim LDA model on the toy corpus and show the five
# leading terms of each topic with their weights.
lda_gensim = train_lda_model_gensim(toy_corpus,
                                    total_topics=2)

print_topics_gensim(topic_model=lda_gensim,
                    total_topics=2,
                    num_terms=5,
                    display_weights=True)


Topic #1 with weights
[('language', 0.08), ('programming', 0.08), ('popular', 0.07), ('python', 0.07), ('java', 0.07)]

Topic #2 with weights
[('dog', 0.08), ('jump', 0.07), ('program', 0.07), ('fox', 0.07), ('cat', 0.06)]



In [7]:
from sklearn.decomposition import LatentDirichletAllocation

# Rebuild the TF-IDF features of the toy corpus (build_feature_matrix is a
# project helper imported in an earlier cell).
norm_corpus = normalize_corpus(toy_corpus)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                    feature_type='tfidf')
total_topics = 2
# scikit-learn LDA with online learning; random_state is fixed so the fit is
# repeatable.
lda = LatentDirichletAllocation(n_components=total_topics,
                                max_iter=1000,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=42)
lda.fit(tfidf_matrix)

# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, get_feature_names()
# was removed in scikit-learn 1.2 in favour of get_feature_names_out() -- confirm
# the targeted version.
feature_names = vectorizer.get_feature_names()
# components_ holds one row of term weights per topic.
weights = lda.components_

# Rank the terms per topic and print the top eight with their weights.
topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 num_terms=8,
                 display_weights=True)


Topic #1 with weights
[('fox', 1.86), ('dog', 1.86), ('jump', 1.19), ('clever', 1.12), ('quick', 1.12), ('lazy', 1.12), ('slow', 1.12), ('cat', 1.06)]

Topic #2 with weights
[('programming', 1.8), ('language', 1.8), ('java', 1.64), ('python', 1.64), ('program', 1.3), ('ruby', 1.11), ('excellent', 1.11), ('popular', 1.06)]



In [8]:
from sklearn.decomposition import NMF

# Rebuild the TF-IDF features of the toy corpus (build_feature_matrix is a
# project helper imported in an earlier cell).
norm_corpus = normalize_corpus(toy_corpus)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                    feature_type='tfidf')
total_topics = 2
# NOTE(review): the `alpha` argument of NMF was deprecated in scikit-learn 1.0
# and removed in 1.2 (replaced by alpha_W / alpha_H) -- confirm the targeted
# version.
nmf = NMF(n_components=total_topics,
          random_state=42, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf_matrix)

# NOTE(review): get_feature_names() is likewise gone from scikit-learn
# vectorizers since 1.2 -- applies if `vectorizer` is one; confirm.
feature_names = vectorizer.get_feature_names()
# components_ holds one row of term weights per topic.
weights = nmf.components_

# Rank the terms per topic and print every term above the default threshold
# with its weight (num_terms=None means no cap).
topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 num_terms=None,
                 display_weights=True)




Topic #1 with weights
[('programming', 0.55), ('language', 0.55), ('python', 0.4), ('java', 0.4), ('popular', 0.24), ('excellent', 0.23), ('ruby', 0.23), ('program', 0.09), ('small', 0.03)]

Topic #2 with weights
[('dog', 0.57), ('fox', 0.57), ('jump', 0.35), ('smarter', 0.26), ('cat', 0.26), ('quick', 0.13), ('slow', 0.13), ('clever', 0.13), ('lazy', 0.13)]



In [9]:
import pandas as pd
import numpy as np

# Keep the loaded table under its own name so that CORPUS only ever refers to
# the array of review texts, instead of being a DataFrame first and an array
# afterwards.
reviews_df = pd.read_csv('amazon_skyrim_reviews.csv')
CORPUS = np.array(reviews_df['Reviews'])

# view sample review
print(CORPUS[12])

# Number of topics extracted by every model trained on the review corpus.
total_topics = 5

I base the value of a game on the amount of enjoyable gameplay I can get out of it and this one was definitely worth the price!


In [10]:
# Train gensim LSI on the review corpus and list the ten leading terms of each
# topic (terms only).
lsi_gensim = train_lsi_model_gensim(CORPUS,
                                    total_topics=total_topics)
print_topics_gensim(topic_model=lsi_gensim,
                    total_topics=total_topics,
                    num_terms=10,
                    display_weights=False)

Topic #1 without weights
['skyrim', 'one', 'quest', 'like', 'play', 'oblivion', 'go', 'get', 'time', 'level']

Topic #2 without weights
['recommend', 'love', 'ever', 'best', 'great', 'level', 'highly', 'buy', 'play', 'elder']

Topic #3 without weights
['recommend', 'fun', 'highly', 'love', 'ever', 'wonderful', 'scroll', 'elder', 'series', 'definitely']

Topic #4 without weights
['fun', 'scroll', 'elder', 'recommend', 'highly', 'wonderful', 'series', 'graphic', 'fan', 'everyone']

Topic #5 without weights
['fun', 'love', 'scroll', 'elder', '5', 'highly', 'dont', 'hour', 'hundred', 'series']



In [11]:
# Train gensim LDA on the review corpus and list the ten leading terms of each
# topic (terms only). This rebinds `lda_gensim` from the toy-corpus model.
lda_gensim = train_lda_model_gensim(CORPUS,
                                    total_topics=total_topics)
print_topics_gensim(topic_model=lda_gensim,
                    total_topics=total_topics,
                    num_terms=10,
                    display_weights=False)

Topic #1 without weights
['love', 'amazing', 'dont', 'ever', 'hundred', 'elder', 'say', 'system', 'gameplay', 'scroll']

Topic #2 without weights
['skyrim', 'play', 'one', 'best', 'great', 'ever', 'rpg', 'buy', 'like', 'character']

Topic #3 without weights
['good', 'one', 'play', 'really', 'love', 'time', 'make', 'want', 'like', 'highly']

Topic #4 without weights
['quest', 'get', 'good', 'thing', 'oblivion', 'one', 'like', 'time', 'go', 'much']

Topic #5 without weights
['fun', 'play', 'get', 'long', 'great', 'buy', 'hour', 'love', 'much', 'quest']



In [12]:
# Build the TF-IDF features of the review corpus once; the scikit-learn LDA and
# NMF cells that follow both fit on `tfidf_matrix`.
norm_corpus = normalize_corpus(CORPUS)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                    feature_type='tfidf')
# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, get_feature_names()
# was removed in scikit-learn 1.2 in favour of get_feature_names_out() -- confirm
# the targeted version.
feature_names = vectorizer.get_feature_names()

In [13]:
# scikit-learn LDA on the review TF-IDF matrix; same settings as the toy run
# except learning_offset (10 here instead of 50).
lda = LatentDirichletAllocation(n_components=total_topics,
                                max_iter=1000,
                                learning_method='online',
                                learning_offset=10.,
                                random_state=42)
lda.fit(tfidf_matrix)
# components_ holds one row of term weights per topic.
weights = lda.components_
topics = get_topics_terms_weights(weights, feature_names)

# List the ten leading terms of each topic (terms only).
print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 num_terms=10,
                 display_weights=False)

Topic #1 without weights
['estatic', 'booklet', 'wonder4ful', 'electricity', 'heat', 'trhats', 'amazingly', 'interfere', 'chirstmas', '12yr']

Topic #2 without weights
['game', 'play', 'get', 'one', 'skyrim', 'great', 'like', 'time', 'quest', 'much']

Topic #3 without weights
['de', 'crédito', 'pagar', 'momento', 'compras', 'responsabilidad', 'para', 'recomiendo', 'futuras', 'skyrimseguridad']

Topic #4 without weights
['booklet', 'estatic', 'wonder4ful', 'electricity', 'heat', 'trhats', 'amazingly', 'interfere', 'chirstmas', '12yr']

Topic #5 without weights
['estatic', 'booklet', 'wonder4ful', 'electricity', 'trhats', 'heat', 'amazingly', 'interfere', 'chirstmas', '12yr']



In [14]:
# NMF on the review TF-IDF matrix with the same settings as the toy run.
# NOTE(review): the `alpha` argument of NMF was deprecated in scikit-learn 1.0
# and removed in 1.2 (replaced by alpha_W / alpha_H) -- confirm the targeted
# version.
nmf = NMF(n_components=total_topics,
          random_state=42, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf_matrix)

# Re-reads the feature names from the same `vectorizer` used in the
# feature-building cell.
feature_names = vectorizer.get_feature_names()
# components_ holds one row of term weights per topic.
weights = nmf.components_

# List the ten leading terms of each topic (terms only).
topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 num_terms=10,
                 display_weights=False)

Topic #1 without weights
['game', 'get', 'skyrim', 'play', 'time', 'quest', 'like', 'one', 'go', 'much']

Topic #2 without weights
['game', 'recommend', 'love', 'great', 'highly', 'play', 'wonderful', 'like', 'would', 'graphic']

Topic #3 without weights
['scroll', 'elder', 'series', 'always', 'love', 'pass', 'franchise', 'buy', 'game', 'far']

Topic #4 without weights
['ever', 'best', 'game', 'play', 'rpg', 'one', 'ive', 'hour', 'great', 'definitely']

Topic #5 without weights
['fun', 'game', 'much', 'graphic', 'improvement', 'mission', 'expect', 'see', 'hour', 'couple']

