In [2]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

import pandas as ps
import numpy as np
import json

# Tunable settings shared by the NMF and LDA runs in this notebook.
n_top_words = 10    # how many terms print_top_words lists for each topic
n_topics = 3        # number of topics requested from each model
n_features = 10000  # cap handed to the vectorizers as max_features

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Must expose ``components_``, an iterable of per-topic weight
        vectors that support ``argsort()`` (e.g. rows of a NumPy array).
    feature_names : sequence
        Term labels, indexed by the positions that ``argsort()`` yields.
    n_top_words : int
        Number of terms to list for each topic.

    Returns
    -------
    None
        Output goes to stdout: a ``Topic #k:`` header and one line of
        space-separated terms per topic, followed by one blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so reverse it to put the heaviest terms
        # first and then keep the leading n_top_words positions.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        print(" ".join(feature_names[pos] for pos in strongest_first))
    # A single blank line closes the listing.
    print()


# Load the transcription table and flatten it into one text sample per
# transcribed segment.
print("Loading dataset...")
t0 = time()
df = ps.read_csv("data/transcription_table.csv")
# Every transcriptionText cell is a JSON array of objects; the value stored
# under 'onebest' in each object becomes one sample.
# A flat comprehension replaces sum(<lists>, []): that form re-copies the
# accumulated list for every row (quadratic in the number of rows) and raises
# TypeError on Python 3, where map() yields an iterator rather than a list.
data_samples = [segment['onebest']
                for transcription in df.transcriptionText
                for segment in json.loads(transcription)]
n_samples = len(data_samples)
print("done in %0.3fs." % (time() - t0))

# tf-idf weighted features feed the NMF model; both unigrams and bigrams
# are kept (ngram_range=(1, 2)).
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    ngram_range=(1, 2),
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Raw term counts feed the LDA model; no ngram_range is given here, so the
# vectorizer's default applies.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Factorise the tf-idf matrix with NMF. The n_features value reported in the
# message is the configured cap, not the size of the fitted vocabulary.
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(
    n_components=n_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
# Keep the transformed matrix so later cells can inspect it.
w_nmf = nmf.fit_transform(tfidf)
print("done in %0.3fs." % (time() - t0))

# List the top terms of each NMF component.
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit LDA on the raw term counts. The n_features value reported in the
# message is the configured cap, not the size of the fitted vocabulary.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# NOTE(review): newer scikit-learn releases name the n_topics argument
# n_components (as the NMF call in this notebook already does) -- confirm
# against the installed version before upgrading.
lda = LatentDirichletAllocation(
    n_topics=n_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
t0 = time()
# Keep the transformed matrix so later cells can inspect it.
w_lda = lda.fit_transform(tf)
print("done in %0.3fs." % (time() - t0))

# List the top terms of each LDA component.
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 0.027s.
Extracting tf-idf features for NMF...
done in 0.076s.
Extracting tf features for LDA...
done in 0.046s.
Fitting the NMF model with tf-idf features, n_samples=7571 and n_features=10000...
done in 0.058s.

Topics in NMF model:
Topic #0:
您好 请问有什么能帮您的吗 您好 966为您服务 966为您服务 966为您服务 请问有什么能帮您的吗 请问有什么能帮你了吗 您好 感谢耐心等待 感谢耐心等待 您好 九六为您服务 九六为您服务
Topic #1:
先生 还有其他能帮你的吗 先生 您好 先生 还有其他能帮你的吗 先生 我们这边是第三方支付平台 我们这边是第三方支付平台 还有其他的帮您的吗 还有其他能帮您的吗 先生 是的 先生 您是手机话费扣款了吗 先生
Topic #2:
您稍等 好的 好的 您稍等 嗯好的 嗯好的 您稍等 您稍等 我帮您查询 您稍等 我帮您进行查询 你好 我帮您查询 您稍等 我再帮您查询一下

Fitting LDA models with tf features, n_samples=7571 and n_features=10000...




done in 1.505s.

Topics in LDA model:
Topic #0:
先生 你好 好的 请问有什么能帮你的吗 您好先生 没有 不客气 请您稍等 请问有什么能帮你了吗 你好久了
Topic #1:
再见 请您稍后评价 谁知 感谢您的来电 没有了 还有其他能帮你的吗 感谢来电 还有其他能帮你了吗 对吧 女士
Topic #2:
您好 您稍等 还有其他能帮您的吗 请问有什么能帮您的吗 嗯好的 您需要吗 966为您服务 稍等 唉你好 谢谢



In [27]:
# Column index of the largest NMF weight for rows 1 through 9 of w_nmf.
w_nmf[1:10].argmax(axis=1)

array([0, 0, 0, 0, 5, 0, 0, 0, 0])

In [2]:
# Column index of the largest LDA weight for rows 1 through 9 of w_lda.
w_lda[1:10].argmax(axis=1)

NameError: name 'w_lda' is not defined

In [3]:
pwd  # IPython automagic (%pwd): shows the kernel's working directory, against which the relative data/ path is resolved

u'/home/jyonkov/dev'