
# 1) Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


This is an example of applying `sklearn.decomposition.NMF` and
`sklearn.decomposition.LatentDirichletAllocation` to a corpus
of documents to extract additive models of the topic structure of the
corpus.  The output is a list of topics, each represented as a list of
terms (weights are not shown).

Non-negative Matrix Factorization is applied with two different objective
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.

The default parameters (n_samples / n_features / n_components) should make
the example runnable in a few tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity of NMF is polynomial. For LDA, the time complexity is
proportional to (n_samples * iterations).


Source: http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html


#### Set paths

In [5]:
%matplotlib inline
import os  # imported here because this cell runs before the notebook's import cell

# Project root. Set the TEXT_ANALYSIS_ROOT environment variable to run the
# notebook on another machine; the original location remains the default.
root = os.environ.get("TEXT_ANALYSIS_ROOT",
                      "C:/Users/Badrul/Documents/DataScience/TextAnalysis")
rawDataPath = root + "/Data/RawData/PubMed"  # folder handed to load_files
dictPath = root + "/Data/tmp/datadictionary.dict"
corpusPath = root + "/Data/tmp/corpus.mm"
saveModelPath = root + "/Data/savedmodels"  # folder the pickled models are written to

#### Import Libraries

In [6]:
import pickle  # Used for saving models
from sklearn.datasets import load_files
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [7]:
# Parameters
n_features = 1000  # passed as max_features to both vectorizers
n_components = 10  # number of components for NMF models 1 and 2 and for the LDA model
n_top_words = 20  # number of terms print_top_words lists per topic
no_topics = 20  # number of components for NMF model 3

#### Load Dataset from folder

In [8]:
# Start the timer before reading the files so that the reported duration
# covers the actual load instead of a plain attribute access.
print("Loading dataset...")
t0 = time()
TrainDataset = load_files(rawDataPath, description=None, load_content=True,
                          encoding='latin1', decode_error='strict', shuffle=True,
                          random_state=42)
# The document texts; this is the input to both vectorizers.
data_samples = TrainDataset.data
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 0.000s.


#### Convert text to vectors using tf-idf (NMF) and raw term counts (LDA)

In [5]:
print("Extracting tf-idf features for NMF...")

# max_df=0.95: ignore terms that appear in more than 95% of the documents.
# min_df=2: ignore terms that appear in fewer than 2 documents.
# max_features: keep only the n_features most frequent terms across the corpus.
# stop_words='english': drop common English stop words.

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
# NMF is fitted on the tf-idf weighted document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(data_samples)
tfidf_feature_names_nmf = tfidf_vectorizer.get_feature_names()
print("done in %0.3fs." % (time() - t0))

# LDA is a probabilistic model over word counts, so it is given raw term counts
# rather than tf-idf weights. Same vocabulary settings as the tf-idf vectorizer.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
# NOTE(review): the last two documents are left out here, while the tf-idf
# matrix above uses every document -- presumably a hold-out; confirm.
tf = tf_vectorizer.fit_transform(data_samples[0:-2])
tf_feature_names_lda = tf_vectorizer.get_feature_names()
print("done in %0.3fs." % (time() - t0))




Extracting tf-idf features for NMF...
done in 0.010s.
Extracting tf features for LDA...
done in 0.008s.


In [6]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``; each row is treated as one topic
        and must support ``argsort()``.
    feature_names : sequence indexed by the column positions of ``components_``.
    n_top_words : int, number of terms printed per topic.

    One line ``Topic #<k>: term term ...`` is printed per topic, terms ordered
    from highest to lowest weight, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked_ids = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_id] for term_id in ranked_ids]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()


def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print each topic's top terms followed by its highest-weighted documents.

    Parameters
    ----------
    H : 2-D array; every row is treated as one topic and its columns index
        ``feature_names``.
    W : 2-D array read as ``W[:, topic_idx]``; its rows index ``documents``.
    feature_names : sequence of terms, one per column of ``H``.
    documents : sequence of documents, one per row of ``W``.
    no_top_words : int, number of terms printed per topic.
    no_top_documents : int, number of documents printed per topic.
    """
    # Imported locally: the notebook's import cell does not bring in numpy,
    # so relying on a global `np` fails on a fresh kernel.
    import numpy as np

    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

        # Rank documents by their weight for this topic, strongest first.
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]

        for doc_index in top_doc_indices:
            print(documents[doc_index])

#### NMF Method

In [7]:
# if you wish to time each stage use this code:
#t0 = time()
#print("done in %0.3fs." % (time() - t0))

# NOTE(review): newer scikit-learn releases replace NMF's `alpha` argument with
# alpha_W / alpha_H -- confirm against the installed version before upgrading.

# NMF model 1: default (Frobenius norm) objective.
# The message uses a printf-style %d placeholder, so it must be filled with the
# % operator; str.format() would leave the literal "%d" in the output.
print("Fitting the NMF model (Frobenius norm) with tf-idf features. n_features=%d..." % n_features)
nmf1 = NMF(n_components=n_components, random_state=1,alpha=.1, l1_ratio=.5).fit(tfidf)

# NMF model 2: generalized Kullback-Leibler divergence with the 'mu' solver.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features n_features=%d..." % n_features)
nmf2 = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,l1_ratio=.5).fit(tfidf)

# NMF Model 3: no_topics components, 'nndsvd' initialisation.
nmf3 = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)


Fitting the NMF model (Frobenius norm) with tf-idf features. n_features=%d...
Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features n_features=%d...


#### LDA Method

In [9]:
# LDA 1
# The message uses a printf-style %d placeholder, so it must be filled with the
# % operator; str.format() would leave the literal "%d" in the output.
print("Fitting LDA models with tf features, "
      " n_features=%d..." % n_features)
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
# Fit on the raw term-count matrix; as the last expression of the cell, the
# fitted estimator's repr is shown as the cell output.
lda.fit(tf)

Fitting LDA models with tf features,  n_features=%d...


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

#### Display Results

In [10]:

# NOTE(review): no_top_words is not used by any active statement in this cell.
no_top_words = 10
# display_topics(H, W, feature_names, documents, no_top_words, no_top_documents)
# takes six arguments; the three-argument calls previously kept here do not
# match that signature and stay disabled.

# Top n_top_words terms per topic for NMF model 1 (fitted on tf-idf features).
print("\nTopics in NMF model 1:")
print_top_words(nmf1, tfidf_feature_names_nmf, n_top_words)

# Top n_top_words terms per topic for the LDA model (fitted on term counts).
print("\nTopics in LDA model:")
print_top_words(lda, tf_feature_names_lda, n_top_words)


Topics in NMF model 1:
Topic #0: attributes stringelement label nlmcategory patients methods treatment results conclusions study stage line phase ii survival adjuvant cost treated months objective
Topic #1: cells cell differentiation regulatory stem expression kinase inhibitors protein bcl induced cancer therapies complex short targeted essential p21 combination transcription
Topic #2: cancer expression patients analysis 95 prognostic ci prognosis serum hr value metastasis level mir testing breast survival meta os 485
Topic #3: replication dna knockdown machinery chromosomal growth development drosophila factors phenotype loss essential cells involved tissues independent specific amplification gene particular
Topic #4: palliative care population hospice development countries european differences mortality early services rates gained evidence role medicine model negatively purpose conditions
Topic #5: hpv cervical viral transcriptional infection expression factors mir promoter carcinog

#### Save Models

In [13]:
# Save model
# Persist the fitted LDA model and NMF model 1 under saveModelPath.
# The `with` blocks close each file handle even if pickling raises, which
# guarantees the data is flushed to disk.
with open(saveModelPath + "/lda.p", "wb") as lda_file:
    pickle.dump(lda, lda_file)
with open(saveModelPath + "/nmf.p", "wb") as nmf_file:
    pickle.dump(nmf1, nmf_file)


### Use model to predict topic of new documents (in development)

In [None]:
# predict topics for test data
import numpy as np


# Re-use the vocabulary the vectorizer learned when the LDA model was trained.
# fit_transform would re-learn the vocabulary from these documents, so the
# columns could stop matching the features the model was fitted on.
tf = tf_vectorizer.transform(data_samples)
tf_feature_names_lda = tf_vectorizer.get_feature_names()

# doc-topic distribution as returned by the model (treated as unnormalized)
doc_topic_dist_unnormalized = np.matrix(lda.transform(tf))

# normalize the distribution (only needed if you want to work with the probabilities)
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)
# Index of the highest-weighted topic for every document (shown as cell output).
doc_topic_dist.argmax(axis=1)
# print(tf_feature_names_lda)

## 2) Named Entity Recognition

http://www.nltk.org/book/ch07.html

In [None]:
import nltk  # http://www.nltk.org/install.html

# ne_chunk expects a list of POS-tagged tokens, not a raw string, so the
# document is tokenized and tagged first.
# NOTE(review): this needs the NLTK tokenizer, tagger and named-entity chunker
# data packages to be installed (nltk.download) -- confirm on the target machine.
tokens = nltk.word_tokenize(data_samples[0])
tagged_tokens = nltk.pos_tag(tokens)
print(nltk.ne_chunk(tagged_tokens, binary=True))

## 3) Twitter

In [1]:
# Run pip install python-twitter
import os

import twitter

# SECURITY: API credentials must never be stored in the notebook. They are read
# from environment variables; a missing variable raises KeyError immediately
# instead of failing later with an opaque authentication error.
# Any key that has ever been committed in plain text should be treated as
# leaked and regenerated in the Twitter developer console.
access_token_key = os.environ["TWITTER_ACCESS_TOKEN_KEY"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]

# Authenticated API client used by the search cell.
bot = twitter.Api(consumer_key,consumer_secret,access_token_key,access_token_secret)

In [15]:
# Search tweets with a raw, URL-encoded query string. It asks for
# q="oncology startup", result_type=recent, since=2014-07-19 and count=100.
results = bot.GetSearch(
    raw_query="q=oncology%20startup&result_type=recent&since=2014-07-19&count=100")


In [17]:
print(results[0])  # show the first returned result as an example

{"created_at": "Sun Sep 24 00:22:52 +0000 2017", "favorite_count": 1, "hashtags": [{"text": "Cancer"}, {"text": "Startup"}], "id": 911747728765476864, "id_str": "911747728765476864", "lang": "en", "source": "<a href=\"https://dlvrit.com/\" rel=\"nofollow\">dlvr.it</a>", "text": "Rosemarie Truman: Freedom from #Cancer #Startup Challenge is Open for Entry! Join today\u2026 https://t.co/ZTqpDtcExY\u2026 https://t.co/ENssqS5IT1", "truncated": true, "urls": [{"expanded_url": "http://dlvr.it/Pq1vkj", "url": "https://t.co/ZTqpDtcExY"}, {"expanded_url": "https://twitter.com/i/web/status/911747728765476864", "url": "https://t.co/ENssqS5IT1"}], "user": {"created_at": "Thu Jul 07 11:49:16 +0000 2016", "default_profile": true, "description": "MedicBoard for Oncology specialists", "followers_count": 665, "id": 751020252247519232, "lang": "en", "listed_count": 314, "name": "Oncology Board", "profile_background_color": "F5F8FA", "profile_banner_url": "https://pbs.twimg.com/profile_banners/75102025224