In [1]:
import pandas as pd
import numpy as np
import re
import time
import lda
import textmining
import pickle
from ggplot import *
import gensim

Using TensorFlow backend.


In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA

In [3]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [4]:
# Load the data set into a DataFrame; later cells read its 'summary' and 'title' columns.
# NOTE(review): the row printed at the end of this notebook shows an 'Unnamed: 0' column,
# which suggests the CSV was written together with its index -- consider index_col=0; confirm.
data = pd.read_csv('./pickles/data_pd.csv')

In [5]:
def bokeh_plot(data_in, num_points, x_col, y_col, title, tooltips):
    """Draw an interactive Bokeh scatter plot of a random sample of rows.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Frame containing ``x_col``, ``y_col`` and any columns referenced by
        ``tooltips``.
    num_points : int
        Maximum number of rows to plot. If the frame has fewer rows, all of
        them are plotted.
    x_col, y_col : str
        Names of the columns used for the x and y coordinates.
    title : str
        Title of the figure.
    tooltips
        Value assigned to the hover tool's ``tooltips`` attribute.

    The figure is shown in the notebook; nothing is returned.
    """
    # The permutation holds row *positions* (0..n-1), so the sample must be
    # taken with .iloc. Using .loc would interpret them as index labels and
    # pick the wrong rows (or raise KeyError) on any frame whose index is not
    # the default 0..n-1 range, e.g. after filtering.
    shuffle = np.random.permutation(data_in.shape[0])
    data_rdm = data_in.iloc[shuffle[:num_points], :].copy()
    output_notebook()
    plot_tfidf = bp.figure(plot_width=700, plot_height=600, title=title,
        tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
        x_axis_type=None, y_axis_type=None, min_border=1)
    plot_tfidf.scatter(x=x_col, y=y_col, source=data_rdm)
    # Configure the hover tool that was requested through the `tools` string.
    hover = plot_tfidf.select(dict(type=HoverTool))
    hover.tooltips = tooltips
    show(plot_tfidf)

In [6]:
def ggplot_plot(data_in, num_points, x_col, y_col, title):
    """Draw a static ggplot scatter plot of a random sample of rows.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Frame containing the ``x_col`` and ``y_col`` columns.
    num_points : int
        Maximum number of rows to plot. If the frame has fewer rows, all of
        them are plotted.
    x_col, y_col : str
        Names of the columns mapped to the x and y aesthetics.
    title : str
        Title of the chart.

    The chart is shown; nothing is returned.
    """
    # The permutation holds row *positions*, so sample with .iloc. Using .loc
    # would treat them as index labels and fail or mis-select on a frame
    # whose index is not the default 0..n-1 range.
    rndperm = np.random.permutation(data_in.shape[0])
    data_rdm = data_in.iloc[rndperm[:num_points], :].copy()
    chart = ggplot(data_rdm, aes(x=x_col, y=y_col)) \
            + geom_point(size=75, alpha=0.8) \
            + ggtitle(title)
    chart.show()
# ggplot_plot(data, 4000, 'pca-one', 'pca-two', 'TF-IDF Clustering')

In [7]:
def pca_plot(df, mat, n_components):
    """Project ``mat`` with PCA and scatter-plot the first two components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the projection. It is modified in place: the
        columns 'pca-one', 'pca-two' and 'pca-three' are added or overwritten.
        The hover tooltips reference its 'title' and 'authors' columns.
    mat
        Matrix passed to ``PCA.fit_transform``.
        NOTE(review): presumably one row per row of ``df`` -- confirm.
    n_components : int
        Number of principal components to fit. Must be at least 3 because
        three component columns are written to ``df``.

    Raises
    ------
    ValueError
        If ``n_components`` is smaller than 3.
    """
    # Three component columns are written below, so fewer than three
    # components cannot work (the original check allowed 2).
    if n_components < 3:
        raise ValueError('Needs to be >= 3 components')
    # Honour the caller's n_components instead of a hard-coded 10.
    ursvd = PCA(n_components=n_components)
    ursvd_res = ursvd.fit_transform(mat)
    df['pca-one'] = ursvd_res[:, 0]
    df['pca-two'] = ursvd_res[:, 1]
    df['pca-three'] = ursvd_res[:, 2]
    print('Variation per principal component: {}'.format(ursvd.explained_variance_ratio_))
    bokeh_plot(df, 4000, 'pca-one', 'pca-two', 'PCA Clustering', {"title": "@title", "authors":"@authors"})

In [8]:
def TSVD_plot(df, mat, n_components):
    """Project ``mat`` with TruncatedSVD and scatter-plot the first two components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the projection. It is modified in place: the
        columns 'pca-one', 'pca-two' and 'pca-three' are added or overwritten.
        The hover tooltips reference its 'title' and 'authors' columns.
    mat
        Matrix passed to ``TruncatedSVD.fit_transform``.
        NOTE(review): presumably one row per row of ``df`` -- confirm.
    n_components : int
        Number of components to fit. Must be at least 3 because three
        component columns are written to ``df``.

    Raises
    ------
    ValueError
        If ``n_components`` is smaller than 3.
    """
    # Three component columns are written below, so fewer than three
    # components cannot work (the original check allowed 2).
    if n_components < 3:
        raise ValueError('Needs to be >= 3 components')
    # Honour the caller's n_components instead of a hard-coded 10.
    ursvd = TruncatedSVD(n_components=n_components)
    ursvd_res = ursvd.fit_transform(mat)
    df['pca-one'] = ursvd_res[:, 0]
    df['pca-two'] = ursvd_res[:, 1]
    df['pca-three'] = ursvd_res[:, 2]
    print('Variation per principal component: {}'.format(ursvd.explained_variance_ratio_))
    bokeh_plot(df, 4000, 'pca-one', 'pca-two', 'PCA Clustering', {"title": "@title", "authors":"@authors"})

## TFIDF - then TruncatedSVD (PCA with Sparse Matrices)

In [140]:
# TF-IDF matrix of the paper summaries, with English stop words removed and
# the vectorizer's min_df / max_features limits set as below.
vz_mat = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_features=30000,
).fit_transform(data['summary'])

In [145]:
# TSVD_plot(data, vz_mat, 10)

## LDA
#### Assuming roughly 50 papers per overarching topic (so the number of topics is `len(data) / 50`)

In [10]:
# Build a term-document matrix from the summaries and write it to CSV.
tdm = textmining.TermDocumentMatrix()
for summary in data['summary'].values:
    # Tokenise and strip stop words, then hand the document back as one string.
    tokens = textmining.simple_tokenize_remove_stopwords(summary)
    tdm.add_doc(' '.join(tokens))
# NOTE(review): cutoff=60 presumably drops terms found in fewer than 60
# documents -- confirm against the textmining documentation.
tdm.write_csv('./pickles/lda_prep/tdm.csv', cutoff=60)

In [11]:
# Read the term-document matrix CSV back in, forcing every column to int64.
# NOTE(review): the name `file` shadows the Python 2 builtin of the same name; it is
# kept because later cells read this variable.
file = pd.read_csv('./pickles/lda_prep/tdm.csv', dtype = np.int64)

In [149]:
# Alternative kept from the original author:
# full_file = np.array(tdm.rows(cutoff=5))

# The CSV column headers are the vocabulary terms.
vocab = file.columns.values
# All rows are used (the original `iloc[0:, :]` slice skipped nothing).
# Original author's note: "NOTE THE ONE OFFSET" -- TODO confirm what it refers to.
X = np.array(file.values)

In [20]:
# Map each vocabulary position to its term. dict(enumerate(...)) replaces the
# manual counter loop and no longer leaves a stray counter variable `i`
# behind in the notebook namespace.
id2word = dict(enumerate(vocab))

In [147]:
# One topic per 50 documents (rounded down).
n_topics = len(data) // 50
# Original author's note: "NOTE THE ONE OFFSET ON THE SAVED FILES"
model = lda.LDA(n_topics=n_topics, n_iter=1500, random_state=1)
model.fit(X)

In [146]:
# Print the highest-weighted words of every fitted topic.
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8
vocab_arr = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # Indices of the n_top_words largest weights, largest first.
    top_idx = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_arr[top_idx]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

In [148]:
# Document-topic matrix of the fitted model: show its shape, then the
# highest-scoring topic for each of the first ten documents.
doc_topic = model.doc_topic_
print(doc_topic.shape)
for doc_idx in range(10):
    top_topic = doc_topic[doc_idx].argmax()
    print("{} (top topic: {})".format(doc_idx, top_topic))

In [155]:
# pca_plot(data, topic_dist, 10)

In [43]:
# Persist the LDA results. The `with` blocks guarantee that each file is
# flushed and closed; the original passed a bare open() handle that was
# never closed.
with open('./pickles/lda_topics.pickle', 'wb') as topics_out:
    pickle.dump(topic_word, topics_out)
with open('./pickles/lda_dist.pickle', 'wb') as dist_out:
    pickle.dump(doc_topic, dist_out)

#### Using Gensim (with the Mallet wrapper) instead of the `lda` package

In [9]:
# Load the stop-word list, one entry per line. The `with` block closes the
# file; the original left the handle open.
with open('./pickles/lda_prep/stopwords.txt') as stopword_file:
    stopwords = set(line.strip() for line in stopword_file)

# Lower-case each summary, split it on whitespace and drop the stop words.
texts = [
    [word for word in doc.lower().split() if word not in stopwords]
    for doc in data['summary'].values
]

In [10]:
# Save the tokenised, stop-word-filtered documents. The `with` block closes
# the file; the original passed a bare open() handle that was never closed.
with open('./pickles/lda_prep/text_stopped.pickle', 'wb') as texts_out:
    pickle.dump(texts, texts_out)

In [11]:
# Create the Gensim dictionary (token <-> integer id mapping) from the token lists.
dictionary = gensim.corpora.Dictionary(texts)
# Filter extremes with no_below=10; no_above=1.0 and keep_n=None leave the
# other two limits at their most permissive values.
dictionary.filter_extremes(keep_n=None, no_above=1.0, no_below=10)
# Convert every tokenised document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [12]:
# Save backups of the dictionary and of the bag-of-words corpus.
dictionary.save('./pickles/lda_prep/text_dict.txtdic')
# The `with` block closes the file; the original passed a bare open() handle
# that was never closed.
with open('./pickles/lda_prep/text_corpus.corpus', 'wb') as corpus_out:
    pickle.dump(corpus, corpus_out)

In [13]:
# Train an LDA model through Gensim's Mallet wrapper. The first argument is
# the path to the Mallet executable, relative to the notebook's directory.
# Note that this rebinds `model`, replacing the earlier `lda` model object.
model = gensim.models.wrappers.LdaMallet(
    '../../Mallet/bin/mallet',
    corpus=corpus,
    id2word=dictionary,
    num_topics=750,
    iterations=2000,
)
# Run every document through the trained model.
docs_corpus = model[corpus]

In [14]:
# Convert the per-document model output into a NumPy array (rebinding the same name).
# NOTE(review): a later cell slices this as docs_corpus[:, :, 1], so the result is assumed
# to be 3-D, i.e. every document carries the same number of (topic, weight) pairs -- confirm.
docs_corpus = np.array(docs_corpus) #Topic model corpus

In [15]:
# Retrieve the learnt topics with 15 words each, unformatted.
# NOTE(review): num_topics=-1 presumably requests all topics -- confirm
# against the Gensim documentation.
learnt_topics = model.show_topics(
    num_topics=-1,
    num_words=15,
    log=False,
    formatted=False,
)

In [17]:
# Persist the Mallet results. The `with` blocks guarantee that each file is
# flushed and closed; the original passed bare open() handles that were
# never closed.
with open('./pickles/mallet_learnt_topics.pickle', 'wb') as topics_out:
    pickle.dump(learnt_topics, topics_out)
# Only index 1 of the last axis of docs_corpus is stored.
# NOTE(review): presumably the topic weights, as opposed to the topic ids -- confirm.
with open('./pickles/mallet_doc_corpus.pickle', 'wb') as corpus_out:
    pickle.dump(docs_corpus[:, :, 1], corpus_out)

## Other Explorations

In [18]:
# List (position, title) pairs for every paper whose title contains the phrase.
[
    (position, paper_title)
    for position, paper_title in enumerate(data['title'].values)
    if 'Continuous State-Space Models for Optimal Sepsis Treatment' in paper_title
]

[(17060,
  'Continuous State-Space Models for Optimal Sepsis Treatment - a Deep\n  Reinforcement Learning Approach')]

In [19]:
# Show the full record at position 17060, the position reported by the title search.
print(data.iloc[17060])

Unnamed: 0                                                17060
title         Continuous State-Space Models for Optimal Seps...
summary       Sepsis is a leading cause of mortality in inte...
date                                                 1495559591
authors       ['Aniruddh Raghu', 'Matthieu Komorowski', 'Leo...
tags                                                  ['cs.LG']
Name: 17060, dtype: object
