### Import abstracts

In [1]:
import pandas as pd

In [2]:
# Load the PubMed clinical-oncology abstracts. The file also carries an
# "Unnamed: 0" integer column (see the .info() output below) — presumably a
# saved index; confirm before relying on it.
articles_df = pd.read_csv('pubmed_clinical_oncology.csv')

In [3]:
# Preview the first few rows.
articles_df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract
0,0,Marijuana use predicts onset of current little...,BACKGROUND: This study examined whether young ...
1,1,Decitabine in combination with donor lymphocyt...,The combination of 5-azacytidine (AZA) with do...
2,2,One-stage wedge osteotomy through posterolater...,PURPOSE: Osteotomy through anterior exposure i...
3,3,Tumor Treating Fields Utilization in a Gliobla...,BACKGROUND: Tumor treating fields (TTF) have b...
4,4,Allogeneic Stem Cell Transplantation for Patie...,Natural killer (NK)/T cell lymphoid malignancy...


In [4]:
# Check the row count, column dtypes and non-null counts.
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2709 entries, 0 to 2708
Data columns (total 3 columns):
Unnamed: 0    2709 non-null int64
title         2709 non-null object
abstract      2709 non-null object
dtypes: int64(1), object(2)
memory usage: 63.6+ KB


### Prepare abstracts for topic modelling

In [5]:
from spacy.lang.en import English

In [6]:
# spaCy English language object; tokenize() calls it on raw text and iterates
# over the resulting tokens.
parser = English()

def tokenize(text):
    """Split ``text`` into lower-cased tokens using the module-level ``parser``.

    Whitespace-only tokens are dropped, and any token the parser flags as
    URL-like is replaced by the literal placeholder ``'URL'``.

    Returns a list of strings in their original order.
    """
    return [
        'URL' if token.like_url else token.lower_
        for token in parser(text)
        if not token.orth_.isspace()
    ]

In [7]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

In [8]:
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` unchanged when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

In [9]:
# NLTK's English stop-word list, held as a set for the membership tests in the
# token-filtering functions.
en_stop = set(stopwords.words('english'))

def prepare_text_for_lda(text, custom_stopwords):
    """Turn raw ``text`` into a list of lemmatised unigram tokens.

    A token from ``tokenize(text)`` is kept only if it is longer than two
    characters, is not in ``en_stop``, is not in ``custom_stopwords`` and is
    purely alphabetic. Survivors are passed through ``get_lemma``.

    Args:
        text: the document to process.
        custom_stopwords: extra lower-case words to remove.

    Returns:
        The surviving lemmas, in document order.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 2
        and token not in en_stop
        and token not in custom_stopwords
        and token.isalpha()
    ]

Tokenizing abstracts:

In [10]:
# Tokenise every abstract; the extra stop words remove section headings such
# as "BACKGROUND:" that appear at the start of some abstracts.
abstract_tokens = [
    prepare_text_for_lda(abstract, ['background','methods','conclusions','objective'])
    for abstract in articles_df['abstract']
]

Create bigrams:

In [11]:
from nltk import bigrams

In [12]:
def prepare_text_for_lda_bigrams(text, custom_stopwords):
    """Turn raw ``text`` into lemmatised unigrams followed by their bigrams.

    A token from ``tokenize(text)`` is kept only if it is longer than two
    characters, is not in ``en_stop``, is not in ``custom_stopwords`` and is
    not made up solely of digits. Survivors are passed through ``get_lemma``.
    Bigrams are formed from adjacent surviving lemmas (so they can span words
    that were filtered out) and are written as ``first_second``.

    Args:
        text: the document to process.
        custom_stopwords: extra lower-case words to remove.

    Returns:
        All unigrams in document order, then all bigrams in document order.
    """
    unigrams = [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 2
        and token not in en_stop
        and token not in custom_stopwords
        and not token.isdigit()
    ]
    joined_pairs = ['_'.join(pair) for pair in bigrams(unigrams)]
    return unigrams + joined_pairs

In [13]:
# Unigram + bigram token list for every abstract, with the same extra stop
# words as the unigram pass.
bigram_tokens = [
    prepare_text_for_lda_bigrams(abstract, ['background','methods','conclusions','objective'])
    for abstract in articles_df['abstract']
]

Generate dictionaries and corpora:

In [18]:
from gensim import corpora, models

In [15]:
# Vocabulary for the 1-gram tokens. Per gensim's filter_extremes, this keeps
# tokens found in at least 15 abstracts and in at most 50% of them, capped at
# the 100000 most frequent.
dictionary = corpora.Dictionary(abstract_tokens)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words vector for each abstract, in the same order as abstract_tokens.
corpus = list(map(dictionary.doc2bow, abstract_tokens))

In [16]:
# Vocabulary for the combined 1- and 2-gram tokens, pruned with the same
# thresholds as the 1-gram dictionary.
dictionary_bigrams = corpora.Dictionary(bigram_tokens)
dictionary_bigrams.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words vector for each abstract, in the same order as bigram_tokens.
corpus_bigrams = list(map(dictionary_bigrams.doc2bow, bigram_tokens))

### Topic modelling

LDA model for 1-gram tokens:

In [22]:
NUM_TOPICS = 15

# Fit LDA on the 1-gram corpus; random_state is fixed so the fit is repeatable.
ldamodel = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=15,
    random_state=42,
)

In [23]:
# Show the ten highest-weighted terms of each topic, one (topic_id, terms) tuple per line.
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.018*"relate" + 0.011*"tumor" + 0.011*"factor" + 0.009*"mechanism" + 0.009*"glioma" + 0.009*"cause" + 0.009*"development" + 0.008*"involve" + 0.008*"case" + 0.008*"thyroid"')
(1, '0.023*"treatment" + 0.023*"trial" + 0.014*"dose" + 0.013*"phase" + 0.013*"therapy" + 0.011*"receive" + 0.011*"event" + 0.011*"adverse" + 0.011*"toxicity" + 0.010*"safety"')
(2, '0.017*"analysis" + 0.014*"expression" + 0.013*"tumor" + 0.012*"sample" + 0.012*"gene" + 0.011*"using" + 0.010*"identify" + 0.009*"level" + 0.009*"clinical" + 0.008*"base"')
(3, '0.018*"node" + 0.015*"group" + 0.014*"lymph" + 0.013*"tumor" + 0.012*"pain" + 0.008*"undergo" + 0.008*"total" + 0.008*"score" + 0.008*"symptom" + 0.008*"metastasis"')
(4, '0.039*"dose" + 0.024*"radiation" + 0.019*"volume" + 0.017*"treatment" + 0.015*"plan" + 0.014*"using" + 0.012*"radiotherapy" + 0.012*"hpv" + 0.011*"compare" + 0.010*"cervical"')
(5, '0.099*"mutation" + 0.037*"gene" + 0.031*"crc" + 0.020*"sequence" + 0.019*"variant" + 0.018*"risk" + 0.01

In [24]:
# c_v coherence of the 1-gram model, evaluated against the tokenised abstracts.
coherence_model_lda = models.CoherenceModel(
    model=ldamodel,
    texts=abstract_tokens,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4903329345243774


LDA model for 1- and 2-gram tokens:

In [44]:
NUM_TOPICS = 15

# Fit LDA on the 1- and 2-gram corpus. minimum_probability=0 is meant to stop
# low-probability topics being dropped when a document's topic distribution is
# queried (see the gensim LdaModel docs for the exact threshold behaviour).
ldamodel_bigrams = models.ldamodel.LdaModel(
    corpus=corpus_bigrams,
    id2word=dictionary_bigrams,
    num_topics=NUM_TOPICS,
    passes=15,
    minimum_probability=0,
    random_state=42,
)

In [45]:
# Show the ten highest-weighted terms of each bigram-model topic, one
# (topic_id, terms) tuple per line.
topics = ldamodel_bigrams.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.046*"cell" + 0.018*"expression" + 0.013*"tumor" + 0.009*"protein" + 0.009*"target" + 0.007*"role" + 0.007*"level" + 0.006*"show" + 0.006*"tissue" + 0.006*"pathway"')
(1, '0.028*"cell" + 0.018*"lymphoma" + 0.016*"acute" + 0.015*"disease" + 0.014*"leukemia" + 0.014*"transplantation" + 0.013*"relapse" + 0.011*"stem" + 0.011*"stem_cell" + 0.010*"aml"')
(2, '0.024*"treatment" + 0.018*"clinical" + 0.013*"therapy" + 0.010*"review" + 0.010*"inhibitor" + 0.009*"cell" + 0.009*"lung" + 0.009*"disease" + 0.008*"egfr" + 0.008*"response"')
(3, '0.020*"tumor" + 0.013*"imaging" + 0.012*"case" + 0.010*"mri" + 0.009*"using" + 0.007*"tissue" + 0.007*"diagnosis" + 0.006*"present" + 0.006*"method" + 0.006*"perform"')
(4, '0.042*"prostate" + 0.028*"prostate_cancer" + 0.016*"tumor" + 0.014*"melanoma" + 0.010*"therapy" + 0.010*"pca" + 0.010*"metastatic" + 0.010*"psa" + 0.009*"clinical" + 0.009*"antibody"')
(5, '0.019*"trial" + 0.014*"treatment" + 0.010*"chemotherapy" + 0.009*"receive" + 0.009*"include"

In [27]:
# c_v coherence of the 1- and 2-gram model. The score is stored under its own
# name so the 1-gram score held in `coherence_lda` is not overwritten and the
# two models can still be compared after both cells have run.
coherence_model_lda_bigrams = models.CoherenceModel(
    model=ldamodel_bigrams,
    texts=bigram_tokens,
    dictionary=dictionary_bigrams,
    coherence='c_v',
)
coherence_lda_bigrams = coherence_model_lda_bigrams.get_coherence()
print('\nCoherence Score: ', coherence_lda_bigrams)


Coherence Score:  0.6433348177510676


In [37]:
# Human-readable label for each topic id of the bigram LDA model; the position
# in the list is the topic id.
topic_labels = dict(enumerate([
    'Mechanistic studies',       # 0
    'Lymphoma/Leukemia',         # 1
    'Lung/Renal/Immunotherapy',  # 2
    'Diagnosis/Imaging',         # 3
    'Prostate/Melanoma',         # 4
    'Clinical trials',           # 5
    'Gene expression',           # 6
    'Screening/Lung cancer',     # 7
    'Risk factors',              # 8
    'Surgery',                   # 9
    'Breast cancer',             # 10
    'Treatment plan',            # 11
    'Radiation/Cervical',        # 12
    'Prognosis/Survival',        # 13
    'Lymph/Bone/Liver/Brain',    # 14
]))

### Visualise LDA model with pyLDAvis

In [28]:
import pyLDAvis.gensim

In [36]:
# Interactive topic browser for the bigram model, laid out with t-SNE.
# NOTE(review): with sort_topics=True pyLDAvis presumably renumbers topics by
# size, so the numbers shown in the visualisation may not match the gensim
# topic ids used as keys in topic_labels — confirm before reading labels off it.
lda_display = pyLDAvis.gensim.prepare(ldamodel_bigrams, corpus_bigrams, dictionary_bigrams, sort_topics=True, mds='tsne')
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Categorise new abstract

Get new abstract:

In [80]:
from metapub import PubMedFetcher

In [81]:
# PubMed client. No NCBI API key is configured, so (per the warning printed
# below) requests are throttled to 3 per second.
fetch = PubMedFetcher()

  requests_per_second))
No NCBI API key provided; throttling to 3 requests/second; see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/


In [82]:
# Fetch the article with this PubMed ID; its abstract is categorised below.
new_abstract = fetch.article_by_pmid(25403997)

In [83]:
# Show the raw abstract text that will be fed to the topic model.
new_abstract.abstract

'Overall survival (OS) of patients with acquired immunodeficiency syndrome (AIDS)-related Burkitt lymphoma (BL), diffuse large B-cell lymphoma (DLBCL) and plasmablastic lymphoma (PBL) was analysed in the German AIDS-related-Lymphoma-Cohort-Study. Of 291 patients prospectively included between January 2005 and December 2012, 154 had DLBCL, 103 BL and 34 PBL. Two-year OS rates were similar between BL (69%) and DLBCL patients (63%) but lower for PBL patients (43%). Intermediate (Hazard ratio [HR] 4·1 95% confidence interval [CI] 1·98-8·49) or high (HR 4·92 95% CI 2·1-11·61) International Prognostic Index, bone marrow involvement (HR 1·69 95% CI 1·00-2·84) and PBL histology (HR 2·24 95% CI 1·24-4·03) were independent predictors of mortality. '

Get topic distribution for new abstract:

In [84]:
# Get topic from bigram model: preprocess the new abstract exactly like the
# training abstracts, map it onto the training vocabulary, then infer topics.
new_abstract_tokens = prepare_text_for_lda_bigrams(
    new_abstract.abstract,
    ['background','methods','conclusions','objective'],
)
bow_vector_bigrams = dictionary_bigrams.doc2bow(new_abstract_tokens)
print(ldamodel_bigrams[bow_vector_bigrams])

[(0, 0.0012578624), (1, 0.31527385), (2, 0.0012578629), (3, 0.0012578632), (4, 0.0012578621), (5, 0.0012578642), (6, 0.001257863), (7, 0.0012578623), (8, 0.34800872), (9, 0.001257864), (10, 0.0012578649), (11, 0.0012578628), (12, 0.0012578629), (13, 0.32162303), (14, 0.0012578658)]


In [92]:
# Topic distribution of the new abstract as (topic_id, fraction) pairs.
new_abstract_topic_fractions = ldamodel_bigrams[bow_vector_bigrams]

# Look each label up by the topic id the model returned instead of pairing the
# two lists by position. The rows then stay correctly labelled even if the
# model leaves a topic out or topic_labels is declared in a different order.
new_abstract_topics = pd.DataFrame({
    'Topic': [topic_labels[topic_id] for topic_id, _ in new_abstract_topic_fractions],
    'Fraction': [fraction for _, fraction in new_abstract_topic_fractions],
})

In [99]:
# Display the labelled topic fractions for the new abstract.
new_abstract_topics

Unnamed: 0,Topic,Fraction
0,Mechanistic studies,0.001258
1,Lymphoma/Leukemia,0.315275
2,Lung/Renal/Immunotherapy,0.001258
3,Diagnosis/Imaging,0.001258
4,Prostate/Melanoma,0.001258
5,Clinical trials,0.001258
6,Gene expression,0.001258
7,Screening/Lung cancer,0.001258
8,Risk factors,0.347988
9,Surgery,0.001258


Get most similar abstracts:

In [148]:
from gensim.similarities import MatrixSimilarity

In [149]:
# Similarity index over the bag-of-words vectors of every training abstract
# (the raw term counts, not the LDA topic distributions).
index = MatrixSimilarity(corpus_bigrams, num_features=len(dictionary_bigrams))

In [150]:
# Similarity of the new abstract to every training abstract, in corpus order.
raw_similarities = index[bow_vector_bigrams]

# Sort the similarities: (document position, score) pairs, highest score first
similarities = sorted(enumerate(raw_similarities), key=lambda pair: -pair[1])

In [153]:
# `similarities` is sorted best-first, so position 0 is the closest match;
# position 2 selects the third-ranked abstract.
document_id, similarity = similarities[2]

# Read the text by column name rather than by column position, so the lookup
# does not depend on how many columns the CSV happens to contain.
print(articles_df['abstract'].iloc[document_id])

The immunodeficiency virus infection is known to increase the risk of malignancies, including lymphomas. We report a case of a 51-year-old male with a history of human immunodeficiency virus (HIV) infection, well-controlled on antiretroviral treatment, who presented with polyarthritis and hypercalcemia due to an elevated parathyroid-hormone-related peptide. Computer tomography (CT) revealed diffuse lymphadenopathy and a lymph node biopsy revealed large B-cell lymphoma. He was treated and responded well to rituximab, cyclophosphamide, doxorubicin, vincristine, and prednisone (R-CHOP) chemotherapy regimen. Our case highlights the importance of recognizing inflammatory arthritis as an initial manifestation of occult malignancy like large B-cell lymphoma, as the arthritis preceded his eventual diagnosis of lymphoma by several months.


### Plot t-SNE representation of LDA model

In [38]:
import numpy as np
from sklearn.manifold import TSNE

Collect the per-document topic distributions from the LDA model into a NumPy matrix (number_of_papers x number_of_topics):

In [48]:
# Document-topic matrix: one row per abstract, one column per topic id.
# Each cell is written at its topic id, so a topic the model leaves out of a
# document's result stays 0 instead of shifting the remaining columns or
# producing rows of unequal length.
hm = np.zeros((len(corpus_bigrams), ldamodel_bigrams.num_topics), dtype=np.float32)
for doc_position, bow in enumerate(corpus_bigrams):
    for topic_id, fraction in ldamodel_bigrams[bow]:
        hm[doc_position, topic_id] = fraction

In [52]:
# Spot-check the topic distribution of the abstract at position 1.
hm[1]

array([0.00064103, 0.49036646, 0.00064103, 0.00064103, 0.00064103,
       0.13750853, 0.00064103, 0.00064103, 0.00064103, 0.00064103,
       0.00064103, 0.00064103, 0.00064103, 0.36443266, 0.00064103],
      dtype=float32)

Run the t-SNE algorithm:

In [53]:
# Project the document-topic matrix to 2-D; the seed is fixed for repeatability.
tsne = TSNE(perplexity=30, random_state=42)

# x/y are the 2-D coordinates; hue is the id of each abstract's highest-weight topic.
tsne_embedding = pd.DataFrame(tsne.fit_transform(hm), columns=['x', 'y']).assign(
    hue=hm.argmax(axis=1)
)

In [71]:
# Preview the coordinates and dominant topic id of the first abstracts.
tsne_embedding.head()

Unnamed: 0,x,y,hue
0,-12.110913,-36.309227,8
1,-7.035795,7.389673,1
2,20.044956,-12.367356,3
3,18.713495,6.977576,2
4,-6.578784,7.544768,1


In [72]:
# Attach the label of each abstract's dominant topic; an id missing from
# topic_labels raises KeyError here.
tsne_embedding['topic'] = [topic_labels[i] for i in tsne_embedding['hue']]

In [73]:
# Preview again, now with the topic label column.
tsne_embedding.head()

Unnamed: 0,x,y,hue,topic
0,-12.110913,-36.309227,8,Risk factors
1,-7.035795,7.389673,1,Lymphoma/Leukemia
2,20.044956,-12.367356,3,Diagnosis/Imaging
3,18.713495,6.977576,2,Lung/Renal/Immunotherapy
4,-6.578784,7.544768,1,Lymphoma/Leukemia


Plot the t-SNE embedding:

In [61]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
# Configure Bokeh to render plots inline in the notebook.
output_notebook()

In [78]:
# One colour per topic id; indexed with the `hue` column below.
color_palette = [
    '#f44542', '#3630f2', '#46db25', '#f7b613', '#a710f2',
    '#f2ea0e', '#f9818d', '#84f9d2', '#6d0c10', '#0b296d',
    '#18750b', '#77550b', '#bba2f9', '#aaea2a', '#a82676',
]

# Per-point columns for the scatter plot: position, hover text and styling.
n_points = len(tsne_embedding)
point_data = {
    'x': tsne_embedding['x'],
    'y': tsne_embedding['y'],
    'title': articles_df['title'],
    'topic': tsne_embedding['topic'],
    'colors': [color_palette[topic_id] for topic_id in tsne_embedding['hue']],
    'alpha': [0.9] * n_points,
    'size': [7] * n_points,
}
source = ColumnDataSource(data=point_data)

# HTML tooltip showing the article title and its dominant topic label.
tooltip_html = """
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <div>
                <span style="font-size: 12px; font-weight: bold;">Title:</span>
                <span style="font-size: 12px">@title</span>
            </div>
            <div>
                <span style="font-size: 12px; font-weight: bold;">Topic:</span>
                <span style="font-size: 12px">@topic</span>
            </div>
        </div>
    </div>
    """
hover = HoverTool(tooltips=tooltip_html)

plot_tsne = figure(
    plot_width=700,
    plot_height=700,
    title='Pubmed abstracts on clinical oncology',
    tools=[hover, 'box_zoom', 'pan', 'reset'],
)

# Borderless circles, coloured by dominant topic.
plot_tsne.circle(
    'x', 'y',
    size='size',
    fill_color='colors',
    alpha='alpha',
    line_alpha=0,
    line_width=0.01,
    source=source,
)

show(plot_tsne, notebook_handle=True)

In [76]:
# Number of abstracts whose dominant topic is each label.
tsne_embedding['topic'].value_counts()

Mechanistic studies         630
Treatment plan              288
Prognosis/Survival          286
Lung/Renal/Immunotherapy    267
Diagnosis/Imaging           255
Clinical trials             215
Risk factors                211
Breast cancer               162
Lymphoma/Leukemia            95
Surgery                      72
Gene expression              69
Lymph/Bone/Liver/Brain       44
Radiation/Cervical           44
Prostate/Melanoma            38
Screening/Lung cancer        33
Name: topic, dtype: int64