In [8]:
import os, re

In [4]:
root_path = '20_newsgroups/'
#top_view folders
folders = [root_path + folder + '/' for folder in os.listdir(root_path)]
#20 classes
class_titles = os.listdir(root_path)

print(class_titles)
#list of all the files belonging to each class
files = {}
for folder, title in zip(folders, class_titles):
    files[title] = [folder + f for f in os.listdir(folder)]

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
train_test_ratio = 0.75

def train_test_split(ratio, classes, files):
    train_dict = {}
    test_dict = {}
    for cl in classes:
        train_cnt = int(ratio * len(files[cl]))
        train_dict[cl] = files[cl][:train_cnt]
        test_dict[cl] = files[cl][train_cnt:]
    return train_dict, test_dict

In [6]:
train_path, test_path = train_test_split(train_test_ratio, class_titles, files)

In [9]:
pattern = re.compile(r'([a-zA-Z]+|[0-9]+(\.[0-9]+)?)')

In [10]:
import string

def cleanupText(path):
    from string import punctuation, digits
    text_translated = ''
    try:
        f = open(path)
        raw = f.read().lower()
        text = pattern.sub(r' \1 ', raw.replace('\n', ' '))
        #text_translated = raw.translate( punctuation + digits)
        table = str.maketrans({key: None for key in string.punctuation})
        new_s = text.translate(table)     
        #text_translated2=text.translate(string.maketrans("",""), string.punctuation)
        text_translated = ' '.join([word for word in new_s.split(' ') if (word and len(word) > 1)])
    finally:
        f.close()
    return text_translated

In [11]:
train_arr = []
test_arr = []
train_lbl = []
test_lbl = []
for cl in class_titles:
    for path in train_path[cl]:
        train_arr.append(cleanupText(path))
        train_lbl.append(cl)
    for path in test_path[cl]:
        test_arr.append(cleanupText(path))
        test_lbl.append(cl)
        
print (len(train_arr))
print (len(test_arr))

14997
5000


In [12]:
news = [' '.join(filter(str.isalpha, raw.lower().split())) for raw in
        train_arr + test_arr]

# LDA Visualization

In [15]:
import lda
from sklearn.feature_extraction.text import CountVectorizer

n_topics = 20 # number of topics
n_iter = 100 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(news)

# train an LDA model
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 19997
INFO:lda:vocab_size: 33579
INFO:lda:n_words: 4244503
INFO:lda:n_topics: 20
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -49965538
INFO:lda:<10> log likelihood: -38250730
INFO:lda:<20> log likelihood: -36166475
INFO:lda:<30> log likelihood: -35532075
INFO:lda:<40> log likelihood: -35211962
INFO:lda:<50> log likelihood: -35011427
INFO:lda:<60> log likelihood: -34862684
INFO:lda:<70> log likelihood: -34758157
INFO:lda:<80> log likelihood: -34680710
INFO:lda:<90> log likelihood: -34616791
INFO:lda:<99> log likelihood: -34555161


In [16]:
import numpy as np

threshold = 0.5
_idx = np.amax(X_topics, axis=1) > threshold  # idx of doc that above the threshold
X_topics = X_topics[_idx]

In [17]:
import pyLDAvis
import pyLDAvis.sklearn

lda_data=pyLDAvis.sklearn.prepare(lda_model,cvz,cvectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]


In [18]:
pyLDAvis.display(lda_data)

In [19]:
del lda_data

# TSNE Visualization

In [20]:
from sklearn.manifold import TSNE

In [21]:
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

In [23]:
tsne_lda = tsne_model.fit_transform(X_topics)


[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 8644
[t-SNE] Computed conditional probabilities for sample 2000 / 8644
[t-SNE] Computed conditional probabilities for sample 3000 / 8644
[t-SNE] Computed conditional probabilities for sample 4000 / 8644
[t-SNE] Computed conditional probabilities for sample 5000 / 8644
[t-SNE] Computed conditional probabilities for sample 6000 / 8644
[t-SNE] Computed conditional probabilities for sample 7000 / 8644
[t-SNE] Computed conditional probabilities for sample 8000 / 8644
[t-SNE] Computed conditional probabilities for sample 8644 / 8644
[t-SNE] Mean sigma: 0.079169
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.433880
[t-SNE] Error after 375 iterations: 1.433880


In [28]:
import numpy as np
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

n_top_words = 15 # number of keywords we show

# 20 colors
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])


In [29]:
_lda_keys = []
for i in range(X_topics.shape[0]): 
    _lda_keys +=  X_topics[i].argmax(),

In [30]:
topic_summaries = []
topic_word = lda_model.topic_word_  # all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
  topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
  topic_summaries.append(' '.join(topic_words))

In [31]:
title = 'dataset View'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[_lda_keys][:num_example],
                 source=bp.ColumnDataSource({
                   "content": news[:num_example],
                   "topic_key": _lda_keys[:num_example]
                   }))

Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)


In [32]:
topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
  if not np.isnan(topic_coord).any():
    break
  topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

# plot crucial words
for i in range(X_topics.shape[1]):
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
save(plot_lda, '{}.htm'.format(title))

You can access Timestamp as pandas.Timestamp
  if pd and isinstance(obj, pd.tslib.Timestamp):


'C:\\Users\\FUJITSU\\Desktop\\blog\\dataset View.htm'