In [None]:
# default_exp nlp

# nlp

Here we are going to start our exploration of just enough deep learning to be dangerous with topic-modeling.
This is the process of using, in our case, deep learning to group sentences by topics.
This process takes into account that different people may use related words to express the same concept.

## Queries

First we'll need to query the Pubmed website for some titles and abstracts.
We'll use NCBI's E-utilities, accessed through Biopython's `Entrez` module, to retrieve information.
I've made a shallow pubmed interface to automate some of the obnoxious hurdles.

In [2]:
#hide
#export

from itertools import islice
from Bio import Entrez
import pandas as pd
import numpy as np

from transformers import AutoModel, AutoTokenizer
from umap import UMAP
from fastai.text.all import *
import hdbscan





In [3]:
#hide
#export

# Some utility functions to extract pubmed info


def _get_abstract(pub_record):

    try:
        return 'abstract', ' '.join(pub_record['MedlineCitation']['Article']['Abstract']['AbstractText'])
    except:
        return 'abstract', None


def _get_journal(pub_record):

    try:
        return 'journal', pub_record['MedlineCitation']['Article']['Journal']['ISOAbbreviation']
    except:
        return 'journal', None

def _get_title(pub_record):

    try:
        return 'title', pub_record['MedlineCitation']['Article']['ArticleTitle']
    except:
        return 'title', None


def _get_authors(pub_record):

    try:
        author_list = pub_record['MedlineCitation']['Article']['AuthorList']
        return 'authors', ','.join(auth['LastName'] for auth in author_list)
    except :
        return 'authors', None

def _get_first_author(pub_record):

    try:
        author_list = pub_record['MedlineCitation']['Article']['AuthorList']
        return 'first_author', author_list[0]['LastName']
    except :
        return 'first_author', None


def _get_date(pub_record):

    try:
        date = pub_record['MedlineCitation']['Article']['ArticleDate'][0]
        return 'date', f'{date["Year"]}-{date["Month"]}-{date["Day"]}'
    except :
        return 'date', None

def _get_pmid(pub_record):

    try:
        return 'pmid', str(pub_record['MedlineCitation']['PMID'])
    except :
        return 'pmid', None

In [4]:
#hide
#export

class PubmedInterface(object):
    """Shallow wrapper around ``Bio.Entrez`` for searching and fetching PubMed records."""

    def __init__(self, email = 'wnd22@drexel.edu'):
        """Store the contact address on the ``Entrez`` module.

        Parameters
        ----------
        email : str
            Assigned to ``Entrez.email``. Note this is module-level state, so
            it applies to every later ``Entrez`` call in the process.
        """
        Entrez.email = email


    def query2ids(self, query, retmax = 1000):
        """ Query pubmed to get PMIDs.

        Parameters
        ----------
        query : str
            Search term passed to ``Entrez.esearch``.
        retmax : int
            Maximum number of ids to request.

        Returns
        -------
        list[str]
            The ``IdList`` of the search result, or ``[]`` if it is absent.
        """

        handle = Entrez.esearch(db = 'pubmed',
                                retmax = retmax,
                                term = query)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the network handle even if parsing fails.
            handle.close()

        return record.get('IdList', [])


    def ids2records(self, ids, batch_size = 100):
        """Fetch records for the given PMIDs and yield one summary dict per article.

        The ids are requested from ``Entrez.efetch`` in chunks of
        ``batch_size``; each entry of the response's ``'PubmedArticle'`` list
        is passed through :meth:`extract_info`. Because this is a generator,
        no request is made until iteration starts.

        Parameters
        ----------
        ids : iterable of str or int
            PMIDs to fetch. Each id is converted with ``str`` before joining.
        batch_size : int
            Number of ids sent per ``efetch`` request.

        Yields
        ------
        dict
            The result of :meth:`extract_info` for each fetched article.
        """

        id_iter = iter(ids)

        batch = list(islice(id_iter, batch_size))
        while batch:

            handle = Entrez.efetch(db="pubmed",
                                   id=",".join(str(pmid) for pmid in batch),
                                   retmode="xml")
            try:
                res = Entrez.read(handle)
            finally:
                # Release the network handle even if parsing fails.
                handle.close()

            for pub in res['PubmedArticle']:
                yield self.extract_info(pub)

            batch = list(islice(id_iter, batch_size))

    def extract_info(self, pub_record):
        """Flatten one article record into a ``{field: value}`` dict.

        Each module-level ``_get_*`` helper returns a ``(field, value)`` pair;
        the pairs are collected into a dict with the keys ``pmid``, ``date``,
        ``first_author``, ``journal``, ``title``, ``abstract`` and ``authors``.

        Parameters
        ----------
        pub_record : dict-like
            A single entry of the ``'PubmedArticle'`` list.

        Returns
        -------
        dict
        """

        field_funcs = [_get_pmid,
                       _get_date,
                       _get_first_author,
                       _get_journal,
                       _get_title,
                       _get_abstract,
                       _get_authors]

        return dict(f(pub_record) for f in field_funcs)


    def query2df(self, query, retmax = 1000, batch_size = 100):
        """Run a search and return all fetched records as a ``DataFrame``.

        Parameters
        ----------
        query : str
            Search term, forwarded to :meth:`query2ids`.
        retmax : int
            Maximum number of ids to request.
        batch_size : int
            Number of ids per fetch request, forwarded to :meth:`ids2records`.

        Returns
        -------
        pandas.DataFrame
            One row per fetched article.
        """

        pmids = self.query2ids(query, retmax=retmax)
        records = list(self.ids2records(pmids, batch_size=batch_size))
        return pd.DataFrame(records)

This `PubmedInterface` queries Pubmed using the `BioPython` library.
It also simplifies the extraction of common information from their complex XML format.

Use it like so.

In [5]:
# Search term sent to PubMed; reused by the `query2df` cell below.
QUERY = 'HIV-1 cellular tropism'

# Step 1: search for matching PMIDs (query2ids defaults to retmax=1000).
interface = PubmedInterface()
pmids = interface.query2ids(QUERY)
print(pmids[:5])

# Step 2: fetch the first five records and show the extracted pmid/title fields.
for pub in interface.ids2records(pmids[:5]):
    print(pub['pmid'], pub['title'])

['33191758', '33085078', '32992787', '32985522', '32843050']
33191758 Intrapatient Evolutionary Dynamics in an Individual Infected with HIV-1 CRF01_AE Who Experienced Periods of Treatment Failure.
33085078 Opsoclonus-myoclonus-ataxia syndrome associated with central nervous system HIV-1 escape phenomenon.
32992787 Macrophage Tropism in Pathogenic HIV-1 and SIV Infections.
32985522 Immune activation correlates with and predicts CXCR4 co-receptor tropism switch in HIV-1 infection.
32843050 First case of Dolutegravir and Darunavir/r multi drug-resistant HIV-1 in Cameroon following exposure to Raltegravir: lessons and implications in the era of transition to Dolutegravir-based regimens.


There is also a convenience method `query2df` that automates the whole process.


In [6]:
# Search and fetch in one call, requesting up to 10,000 PMIDs; this issues one
# efetch request per batch of ids (batch_size defaults to 100).
df = interface.query2df(QUERY, retmax=10_000)
df.head()

Unnamed: 0,pmid,date,first_author,journal,title,abstract,authors
0,33191758,2020-11-15,Peng,AIDS Res Hum Retroviruses,Intrapatient Evolutionary Dynamics in an Individual Infected with HIV-1 CRF01_AE Who Experienced Periods of Treatment Failure.,"Background While previous studies have analyzed cross-level CRF01_AE viral genomic data in populations, less is known about intrapatient viral evolutionary dynamics during antiviral treatment (ART) failure. Methods We longitudinally sampled plasma and peripheral blood mononuclear cells (PBMC) at different time points from one HIV-1 infected patient. The evolution of viral quasispecies was inferred from viral phylogenies. Results Prior to treatment, no drug resistant mutations were found in this patient's plasma, and all viruses had CCR5 tropism. Two months after treatment, the majority of ...","Peng,Xu,Huang,Zhu"
1,33085078,,Mendoza-Olivas,Rev Neurol,Opsoclonus-myoclonus-ataxia syndrome associated with central nervous system HIV-1 escape phenomenon.,"Opsoclonus-myoclonus-ataxia (OMA) syndrome is a rare neurological disorder characterized by involuntary conjugate saccadic eye movements, myoclonus, and ataxia. Few reports exist on patients with HIV and OMA. A 41-year-old man diagnosed with HIV-1 infection in 1997 coursed with multiple anti-retroviral schemes as a consequence of poor adherence. In 2008 he presented an HIV-1 viral load of 100,000 copies/mL and a CD4+ T cell count of 10 cells/mm3. In 2013 our patient arrived with an 11-month history of progressive opsoclonus and ataxia. He had undetectable plasma HIV-1 RNA load and CD4+ of ...","Mendoza-Olivas,Niembro-Ortega,Sierra-Madero,Soto-Ramírez,Rodríguez-Díaz,Fuentes-Romero,Hernández-Flores,Hernández-Martínez,Treviño-Frenk,Chiquete"
2,32992787,2020-09-25,Moeser,Viruses,Macrophage Tropism in Pathogenic HIV-1 and SIV Infections.,"Most myeloid lineage cells express the receptor and coreceptors that make them susceptible to infection by primate lentiviruses (SIVs and HIVs). However, macrophages are the only myeloid lineage cell commonly infected by SIVs and/or HIVs. The frequency of infected macrophages varies greatly across specific host and virus combinations as well as disease states, with infection rates being greatest in pathogenic SIV infections of non-natural hosts (i.e., Asian nonhuman primates (Asian NHPs)) and late in untreated HIV-1 infection. In contrast, macrophages from natural SIV hosts (i.e., African ...","Moeser,Nielsen,Joseph"
3,32985522,2020-09-28,Connell,Sci Rep,Immune activation correlates with and predicts CXCR4 co-receptor tropism switch in HIV-1 infection.,HIV-1 cell entry is mediated by binding to the CD4-receptor and chemokine co-receptors CCR5 (R5) or CXCR4 (X4). R5-tropic viruses are predominantly detected during early infection. A switch to X4-tropism often occurs during the course of infection. X4-tropism switching is strongly associated with accelerated disease progression and jeopardizes CCR5-based HIV-1 cure strategies. It is unclear whether host immunological factors play a causative role in tropism switching. We investigated the relationship between immunological factors and X4-tropism in a cross-sectional study in HIV-1 subtype C...,"Connell,Hermans,Wensing,Schellens,Schipper,van Ham,de Jong,Otto,Mathe,Moraba,Borghans,Papathanasopoulos,Kruize,Venter,Kootstra,Tempelman,Tesselaar,Nijhuis"
4,32843050,2020-08-26,Fokam,Antimicrob Resist Infect Control,First case of Dolutegravir and Darunavir/r multi drug-resistant HIV-1 in Cameroon following exposure to Raltegravir: lessons and implications in the era of transition to Dolutegravir-based regimens.,"Sub-Saharan African countries are transitioning to dolutegravir-based regimens, even for patients with extensive previous drug exposure, including first-generation integrase strand-transfer inhibitors (INSTI) such as raltegravir. Such exposure might have implications on cross-resistance to dolutegravir-based antiretroviral therapies (ART). We report a 65 years old Cameroonian, previously exposed to raltegravir, and failing on third-line treatment with multi-drug resistance to darunavir/r and dolutegravir. Genotypic resistance testing (GRT) and viral tropism were performed during monitoring...",


Great, now we have an easy to use `DataFrame` of our pubmed information.
Let's get into the deep learning.

## Huggingface Transformers
_Yes, the emoji._

This group has done a wonderful job encapsulating nearly all modern deep learning NLP techniques into only a handful of lines.

That was it.

You now have the ability to run any of the modern deep learning pipelines [huggingface doc link] and have access to thousands of pre-trained models [model-link].

We're going to use this one [model link].

It was trained by Microsoft on Pubmed abstracts. Here are a few videos describing BERT, what it is, how it works, and what it can be useful for.
[Link list].

Watch at least one of those before continuing.

Now that you're back, let's download a model.

In [16]:
#export




def mean_pooling_attention(token_embeddings, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    Parameters
    ----------
    token_embeddings : torch.Tensor
        Per-token vectors; dim 1 is the one that gets reduced.
    attention_mask : torch.Tensor
        Mask with one entry per token (non-zero = keep). It is given a
        trailing axis and expanded to the shape of ``token_embeddings``.

    Returns
    -------
    torch.Tensor
        Masked mean of ``token_embeddings`` over dim 1.
    """
    # Broadcast the mask over the embedding axis so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    weighted_total = (token_embeddings * weights).sum(1)

    # Clamp the denominator so a fully masked row divides by 1e-9, not zero.
    kept_tokens = weights.sum(1).clamp(min=1e-9)

    return weighted_total / kept_tokens


class TopicModelingInterface(object):
    """Embed text with a transformer, then cluster and project the embeddings.

    Pipeline: tokenizer + model -> mean-pooled sentence embeddings ->
    UMAP (``cluster_dim`` components) -> HDBSCAN labels, plus a separate
    UMAP (``viz_dim`` components) for plotting coordinates.
    """

    def __init__(self, tokenizer = None, model = None, model_name = None, bs=8,
                 cluster_dim = 10, viz_dim = 2, device = 'cuda',
                 min_cluster_size = 5):
        """
        Parameters
        ----------
        tokenizer : optional
            Ready-made tokenizer; loaded from ``model_name`` when omitted.
        model : optional
            Ready-made model; loaded from ``model_name`` when omitted.
        model_name : str, optional
            Identifier passed to ``from_pretrained``. Required whenever
            ``tokenizer`` or ``model`` is not supplied.
        bs : int
            Default number of texts embedded per forward pass.
        cluster_dim : int
            Number of UMAP components used as input to clustering.
        viz_dim : int
            Number of UMAP components used for visualisation.
        device : str
            Device the model and the tokenized batches are moved to.
        min_cluster_size : int
            Forwarded to ``hdbscan.HDBSCAN``.

        Raises
        ------
        ValueError
            If ``model_name`` is missing while it is needed to load the
            tokenizer or the model.
        """

        if model_name is None and (tokenizer is None or model is None):
            raise ValueError('model_name is required when tokenizer or model is not supplied')

        if tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = tokenizer

        if model is None:
            self.model = AutoModel.from_pretrained(model_name)
        else:
            self.model = model

        # text2embed moves every batch to `device`, so the model has to live
        # there too; otherwise a freshly loaded model stays on its default device.
        self.model.to(device)

        self.bs = bs
        self.device = device

        self.viz_dim = viz_dim
        self.cluster_dim = cluster_dim

        self.umap_cluster = UMAP(n_components=cluster_dim)
        # prediction_data=True is what lets approximate_predict label new
        # points in embed2cluster(fit=False).
        self.cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                       prediction_data=True)
        # Honour viz_dim instead of a hard-coded 2.
        self.umap_viz = UMAP(n_components=viz_dim)


    def text2embed(self, text, bs=None):
        """Embed one string or an iterable of strings.

        Parameters
        ----------
        text : str or iterable of str
            A single string returns a single vector; an iterable returns one
            row per item.
        bs : int, optional
            Batch size for this call; defaults to ``self.bs``.

        Returns
        -------
        torch.Tensor
            Mean-pooled embeddings (stacked row-wise for iterable input).
        """
        # isinstance (not an exact type check) so that str subclasses are
        # treated as one text rather than iterated character by character.
        if isinstance(text, str): return self.text2embed([text], bs=bs)[0]

        bs = self.bs if bs is None else bs

        it = iter(text)
        out_data = []

        with torch.no_grad():
            batch = list(islice(it, bs))
            while batch:
                tokens = self.tokenizer(batch, return_tensors='pt', padding='max_length',
                                        truncation = True,
                                        max_length = 512)
                tokens = tokens.to(self.device)
                res = self.model(**tokens)
                # res[0] is taken as the per-token output of the model
                # (presumably the last hidden state -- confirm for custom models).
                out_data.append(mean_pooling_attention(res[0], tokens['attention_mask']))

                batch = list(islice(it, bs))

        return torch.vstack(out_data)


    def embed2cluster(self, embed, fit = True):
        """Assign a cluster label to every embedding.

        Parameters
        ----------
        embed : array-like
            Embeddings, one row per item.
        fit : bool
            When true, fit the clustering UMAP and HDBSCAN on ``embed``.
            When false, reuse the fitted UMAP and label the reduced points
            with ``hdbscan.approximate_predict``.

        Returns
        -------
        array-like
            One label per row of ``embed``.
        """

        if fit:
            clst_data = self.umap_cluster.fit_transform(embed)
            self.cluster.fit(clst_data)
            return self.cluster.labels_

        # Predict in the same reduced space the clusterer was fitted on,
        # not on the raw embeddings.
        clst_data = self.umap_cluster.transform(embed)
        labels, _ = hdbscan.approximate_predict(self.cluster, clst_data)
        return labels


    def embed2xy(self, embed, fit = True):
        """Project embeddings to ``viz_dim`` plotting coordinates.

        Parameters
        ----------
        embed : array-like
            Embeddings, one row per item.
        fit : bool
            When true, fit the visualisation UMAP on ``embed``; when false,
            reuse the already fitted visualisation UMAP.

        Returns
        -------
        array-like
            Projected coordinates, one row per item.
        """

        if fit:
            return self.umap_viz.fit_transform(embed)
        # Use the visualisation reducer here, not the clustering one.
        return self.umap_viz.transform(embed)


    def process_df(self, df, col = 'text', fit = True):
        """Embed a text column and return cluster labels plus 2-D coordinates.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding the text; missing values are replaced with ``''``.
        col : str
            Name of the text column.
        fit : bool
            Forwarded to :meth:`embed2cluster` and :meth:`embed2xy`.

        Returns
        -------
        pandas.DataFrame
            Columns ``cluster``, ``X``, ``Y`` and ``label`` (the cluster as a
            string), indexed like ``df``. Only the first two projection
            columns are kept, whatever ``viz_dim`` is.
        """

        emb = self.text2embed(df[col].fillna('').tolist())
        # Convert once and share the array between both downstream steps.
        emb_np = emb.cpu().numpy()

        clusters = self.embed2cluster(emb_np, fit = fit)
        xy = self.embed2xy(emb_np, fit = fit)

        ndf = pd.DataFrame({'cluster': clusters,
                            'X': xy[:, 0],
                            'Y': xy[:, 1],
                            'label': [str(c) for c in clusters]}, index = df.index)

        return ndf
        

In [8]:
from transformers import AutoModel, AutoTokenizer


# Identifier of the pre-trained checkpoint to download (the PubMed-abstract
# model referred to in the text above).
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# Load the matching tokenizer and model weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Move the model to the GPU (this cell needs CUDA); the trailing ';' hides the
# returned model's repr in the notebook output.
model.to('cuda');

In [17]:
# Reuse the model/tokenizer loaded above; every other setting keeps its default
# (bs=8, cluster_dim=10, viz_dim=2, device='cuda', min_cluster_size=5).
tm = TopicModelingInterface(model = model, tokenizer = tokenizer)

In [22]:
# Embed the article titles, then fit the clustering and the 2-D projection on them.
# The result has columns cluster, X, Y, label and shares df's index.
clus_df = tm.process_df(df, col = 'title')
clus_df.head()

Unnamed: 0,cluster,X,Y,label
0,3,4.910263,2.697094,3
1,4,5.506819,0.132954,4
2,26,3.858279,0.395652,26
3,-1,6.237852,1.763779,-1
4,0,4.993605,3.655994,0


In [19]:
from bokeh.plotting import output_notebook, show


# Configure bokeh output for the notebook; `show` is used in the final cell.
output_notebook()

In [20]:
# export


from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.transform import factor_cmap
from bokeh.palettes import viridis


def make_pubmed_topic_figure(pub_data, clust_data):
    """Build a bokeh scatter plot of publications coloured by topic label.

    Parameters
    ----------
    pub_data : pandas.DataFrame
        Publication metadata; the hover tooltips read its ``title``,
        ``first_author`` and ``date`` columns.
    clust_data : pandas.DataFrame
        Clustering output with ``X``, ``Y`` and ``label`` columns. It is
        concatenated column-wise with ``pub_data``.

    Returns
    -------
    bokeh figure
        The figure with one scatter glyph added; it is not shown here.
    """
    # Fields displayed when hovering over a point.
    hover_fields = [('Title', '@title'),
                    ('First Author', '@first_author'),
                    ('Date', '@date')]

    merged = pd.concat([pub_data, clust_data], axis=1)

    # One colour per distinct label, in order of first appearance.
    topic_labels = merged['label'].unique().tolist()
    palette = viridis(len(topic_labels))

    plot = figure(tooltips = hover_fields)
    plot.scatter(x = "X", y = "Y",
                 source = ColumnDataSource(merged),
                 size = 10,
                 color = factor_cmap('label', palette, topic_labels))

    return plot


In [23]:
# Render the topic scatter plot; df and clus_df are joined column-wise inside the function.
show(make_pubmed_topic_figure(df, clus_df))