In [1]:
import numpy as np
import pandas as pd
import scipy
import operator
import nltk
import os, glob
import string
import copy
import copy
import pickle
import datetime
import joblib, multiprocessing
import utils as my_utils

from scipy import spatial
from collections import Counter
from scipy.special import gammaln
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
min_df = 5
max_df = .5
max_features = 50000

n_cores = -1
max_iter = 20
n_top_words = 10

In [3]:
datasets = glob.glob("datasets/*")

In [4]:
datasets = ['datasets/amazon_home_20000_dataset',
            'datasets/amazon_movies_20000_dataset',
            'datasets/amazon_kindle_20000_dataset']

In [5]:
def get_diversity_score(list):

    all_words = []

    for l in list:

        all_words += l

    temp = pd.DataFrame(all_words,columns=['a'])

    temp = temp['a'].value_counts().reset_index()

    temp.columns = ['word','tot_cnt']

    return temp[temp.tot_cnt == 1].shape[0]*1.0/temp.tot_cnt.sum()

In [8]:
for d in datasets:
    for n_topics in [5, 25, 50]:
        print(d, n_topics)

        dataset = pd.read_pickle(d)
        vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,
                                     stop_words="english", max_features=max_features,
                                     max_df=max_df, min_df=min_df)

        count_matrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()
        words = vectorizer.get_feature_names()

        vocabulary = dict(zip(words,np.arange(len(words))))

        model = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter, n_jobs=n_cores, verbose=0)

        dt_distribution = model.fit_transform(count_matrix)

        topic_words = {}
        for topic, comp in enumerate(model.components_):
            word_idx = np.argsort(comp)[::-1][:n_top_words]
            topic_words[topic] = [words[i] for i in word_idx]

        sample_df = []
        for topic, word in topic_words.items():
            sample_df.append(', '.join(word).split(", "))


        print(my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000), 
              silhouette_score(count_matrix, dt_distribution.argmax(axis=1)),
              davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)),
              my_utils.coherence_score(count_matrix, sample_df, vocabulary),
              model.score(count_matrix),
              model.perplexity(count_matrix),
              my_utils.coherence_score2(count_matrix, sample_df, vocabulary),
              get_diversity_score(sample_df))

datasets/amazon_home_20000_dataset 5
0.20484953318416485 -0.020598524253569072 13.213320272658683 -99.66533497403358 -2118173.6413778164 1217.2113232629647 0.05230238564676634 0.4
datasets/amazon_home_20000_dataset 25
0.4448792067092359 -0.06102645032803739 8.890327700798919 -129.04242225088126 -2180171.5796270506 1498.5567124619836 -0.11774748525356267 0.608
datasets/amazon_home_20000_dataset 50
0.5561192673727636 -0.07108510957448319 8.057126309116065 -137.42937467547853 -2248558.3669411303 1884.8920547315863 -0.042144197892468184 0.67
datasets/amazon_movies_20000_dataset 5
0.28134043896046335 -0.048343009603952256 12.715371224276112 -97.3411599641646 -2186702.9600289147 1769.92860387285 0.1894072164698331 0.48
datasets/amazon_movies_20000_dataset 25
0.4445355196115952 -0.062207597201462544 11.715722724418477 -120.31474874711526 -2263327.001538447 2300.207648165319 0.17000349835660133 0.468
datasets/amazon_movies_20000_dataset 50
0.5257447102773432 -0.07681291471320877 10.94598698218

In [None]:
dataset = pd.read_pickle("datasets/amazon_electronics_20000_dataset")

In [None]:
vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,
                             stop_words="english", max_features=max_features,
                             max_df=max_df, min_df=min_df)

In [None]:
count_matrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()
words = vectorizer.get_feature_names()

In [None]:
vocabulary = dict(zip(words,np.arange(len(words))))

In [None]:
model = LatentDirichletAllocation(n_components=n_topics, max_iter=20, n_jobs=n_cores, verbose=0)

In [None]:
model.fit(count_matrix)

In [None]:
topic_words = {}
for topic, comp in enumerate(model.components_):
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    topic_words[topic] = [words[i] for i in word_idx]

sample_df = []
for topic, word in topic_words.items():
    sample_df.append(', '.join(word).split(", "))

dt_distribution = model.transform(count_matrix)

In [None]:
print(my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000), 
      silhouette_score(count_matrix, dt_distribution.argmax(axis=1)),
      davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)),
      my_utils.coherence_score(count_matrix, sample_df, vocabulary),
      model.score(count_matrix),
      model.perplexity(count_matrix))

# Appendix

In [None]:
# print("H Score:", my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000))
# print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
# print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))
# print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
# print("Log Likelihood:", model.score(count_matrix))
# print("Perplexity:", model.perplexity(count_matrix))

In [None]:
# for k, model in zip(topics_grid, models_dump):

#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     print("\nK:", k)
#     print("Running Metrics...")
#     print("H Score:", my_utils.get_hscore_multi(dt_distribution, count_matrix, k, 3000))
#     print("Log Likelihood:", model.score(count_matrix))
#     print("Perplexity:", model.perplexity(count_matrix))
#     print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))
    
# #     print my_utils.coherence_score(count_matrix, sample_df, vocabulary), "\t", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)), "\t", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1))

In [None]:
# def get_evaluations_multi(model):
#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     h_score =  my_utils.get_hscore_multi(dt_distribution, count_matrix, k)
#     likelihood =  model.score(count_matrix)
#     perplexity = model.perplexity(count_matrix)
#     coherance_score = my_utils.coherence_score(count_matrix, sample_df, vocabulary)
#     silhouette = silhouette_score(count_matrix, dt_distribution.argmax(axis=1))
#     return [h_score, likelihood, perplexity, coherance_score, silhouette]

In [None]:
# for k, model in zip(topics_grid, models_dump):

#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     print("\nK:", k)
#     print("Running Metrics...")
#     print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [None]:
# print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [None]:
#print("H-Score:", my_utils.get_hscore(dt_distribution, count_matrix, k))

In [None]:
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE

# X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)

# X_embedded.shape

# plt.figure(figsize=(10, 10))
# plt.scatter([i[0] for i in X_embedded], [i[1] for i in X_embedded], c=dt_distribution.argmax(axis=1))