In [1]:
from pymed import PubMed
import pandas as pd
import nltk
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# PubMed client used by every query in this notebook; `tool` and `email` are
# the identification values passed to pymed's PubMed constructor.
# NOTE(review): the contact address is hard-coded and is published with the
# notebook — consider reading it from an environment variable instead.
email = "charles.lescure@polytechnique.edu"
pubmed = PubMed(tool="genes", email=email)

In [3]:
# Demo of pubmed: get journal info for 2 papers
def print_journal_info(paper):
    """Print a short summary of one PubMed result, one field per line.

    Prints, in order: title, abstract, journal, publication year and keywords.

    paper: result object yielded by `pubmed.query`. Only attribute access is
        used, so any object exposing these attributes works.

    Attributes the object does not carry are printed as `None` instead of
    raising AttributeError, and a `publication_date` without a `.year`
    (for example `None` or a raw string) is printed as-is.
    """
    for field in ("title", "abstract", "journal"):
        print(getattr(paper, field, None))

    # Only date-like values expose `.year`; otherwise fall back to the raw value.
    published = getattr(paper, "publication_date", None)
    print(getattr(published, "year", published))

    print(getattr(paper, "keywords", None))

# Run a small query (at most 2 results) and print the summary of each paper.
results = pubmed.query("PDL1", max_results=2)
for paper in results:
    print_journal_info(paper)

Circ-METTL15 contributes to the proliferation, metastasis, immune escape and restrains apoptosis in lung cancer by regulating miR-1299/PDL1 axis.
Circular RNAs (circRNAs) are important regulators in the pathogenesis of lung cancer. The study aims to explore the function and mechanism of circRNA methyltransferase-like 15 (circ-METTL15) in lung cancer development.
The expression of circ-METTL15, miR-1299 and programmed death-ligand 1 (PDL1) were investigated by qRT-PCR assay. Cell viability, colony formation, cell proliferation and invasion were determined by MTT, colony formation, EDU incorporation and transwell assays, respectively. Cell apoptosis was attested by flow cytometry and TUNEL assays. Interferon-γ (IFN-γ) and Tumour Necrosis Factor-α (TNF-α) production were tested by enzyme-linked immunosorbent assay (ELISA), and the survival rate of cancer cells was assessed by cytotoxicity analysis. The protein expression was examined by western blot or immunohistochemistry (IHC) assay. Th

In [4]:
# Terms looked up in abstracts to identify cancer types. Most are organ names;
# "melanoma" and "leukemia" are cancer names rather than organs.
organs_list = ["bladder", "brain", "breast", "cervical", "liver", "colon", "esophagus", "kidney",
                "lung", "ovarian", "pancreatic", "prostate", "rectum", "stomach", "testicular",
                "thyroid", "uterine", "melanoma", "leukemia"]

# Abstract words are lemmatized before being compared with the terms above, so
# every term must be left unchanged by the lemmatizer or it could never match.
# The loop prints True for each term that passes this check.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
for org in organs_list:
    print(org == lemmatizer.lemmatize(org))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [5]:
# Loading list of genes
def load_genes(genes_path):
    """Load every known gene name (official symbols and aliases) from a TSV file.

    genes_path: path to a tab-separated file with at least the columns
        `symbol` (one name per row) and `alias_symbol` (zero or more names
        separated by '|').

    Returns a list of str: all symbols in file order, followed by all aliases
    in file order. Duplicates are kept.

    Missing cells are skipped using pandas' own missing-value detection rather
    than by comparing against the text 'nan', so a literal "nan" token inside
    an alias list is preserved and a missing symbol never ends up in the
    result as a float NaN.
    """
    genes = pd.read_csv(genes_path, sep='\t', dtype='str')

    genes_list = genes['symbol'].dropna().tolist()
    for aliases in genes['alias_symbol'].dropna():
        genes_list.extend(aliases.split('|'))
    return genes_list

# Tab-separated gene table, resolved relative to the working directory.
genes_path = "./gene_with_protein_product.txt"
genes_list = load_genes(genes_path)

# Sanity checks: the first few names, a membership lookup for a name that is
# queried later in the notebook, and the total number of names loaded.
print(genes_list[:5])
print('PDL1' in genes_list)
print(len(genes_list))

['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2']
True
53745


In [6]:
# Main function
def get_types_and_genes(q, max_results=100, organs_list=organs_list, verbose=1, buzz_words={'cancer'}):
    """
    Query PubMed and count the cancer types and genes mentioned in the relevant abstracts.

    q: query
    max_results: max number of papers to get from PubMed
    organs_list: list of cancer types (most of them are organs, but not all of them)
    verbose: 0, 1 or 2 depending on how much we want to print
    buzz_words: words that flag an abstract as relevant. Abstracts without any buzz words
        won't be considered. They are lowercased before being compared with the
        (lowercased, lemmatized) words of each abstract.

    Returns a tuple (cancer_papers, org_count, gen_count):
        cancer_papers: list of the papers whose abstract contains at least one buzz word
        org_count: dict {cancer type: number of relevant papers mentioning it},
            ordered by decreasing count
        gen_count: dict {gene name: number of relevant papers mentioning it},
            ordered by decreasing count; a name equal to `q` is not counted

    Papers without an abstract are skipped. Gene names are matched exactly and
    case-sensitively against whole tokens of the abstract, using the module-level
    `genes_list`, so short names that coincide with ordinary abbreviations or
    words are counted as well.
    """
    papers = pubmed.query(q, max_results=max_results)
    gen_count = {}
    org_count = {}
    cancer_papers = []
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    # Abstract words are lowercased before matching, so the buzz words must be
    # too. Building a fresh set also leaves the (mutable) default untouched.
    buzz_words = {word.lower() for word in buzz_words}

    # Build the lookup sets once: intersecting with the raw lists would walk
    # every known gene name again for each paper.
    known_genes = set(genes_list)
    known_organs = set(organs_list)

    for paper in papers:
        abst = paper.abstract
        if abst is None:
            continue

        # Words without caps or punctuation, reduced to their canonical form
        lowered_tokens = set(tokenizer.tokenize(abst.lower()))
        words = {lemmatizer.lemmatize(w) for w in lowered_tokens}

        if buzz_words.isdisjoint(words):
            continue  # no buzz word: the abstract is not considered relevant

        if verbose > 1:
            print(paper.title)
        cancer_papers.append(paper)

        # Gene names are case-sensitive: remove punctuation but keep caps
        cased_tokens = set(tokenizer.tokenize(abst))
        genes = cased_tokens.intersection(known_genes)

        # Associated organs (presumably the cancer types the paper is about)
        organs = words.intersection(known_organs)

        # Each name counts at most once per paper because `organs` and `genes` are sets
        if organs:
            if verbose > 1:
                print(organs)
            for organ in organs:
                org_count[organ] = org_count.get(organ, 0) + 1
        if genes:
            if verbose > 1:
                print(genes)
            for gene in genes:
                if gene != q:
                    gen_count[gene] = gen_count.get(gene, 0) + 1

    if verbose > 0:
        print(f"Found {len(cancer_papers)} relevant papers")

    # Sort results by count to get most associated genes and cancer types first
    org_count = dict(sorted(org_count.items(), key=lambda x: -x[1]))
    gen_count = dict(sorted(gen_count.items(), key=lambda x: -x[1]))

    if verbose > 0:
        print(len(org_count), 'associated cancer types')
        print(org_count)
        print(len(gen_count), 'associated genes')
        print(gen_count)
    return cancer_papers, org_count, gen_count

In [7]:
# Baseline run: default buzz words ({'cancer'}) and default verbosity.
q = 'PDL1'
cancer_papers, org_count, gen_count = get_types_and_genes(q)

Found 60 relevant papers
16 associated cancer types
{'lung': 16, 'breast': 12, 'melanoma': 7, 'liver': 4, 'bladder': 4, 'prostate': 3, 'cervical': 3, 'brain': 2, 'colon': 2, 'kidney': 2, 'rectum': 1, 'testicular': 1, 'uterine': 1, 'pancreatic': 1, 'stomach': 1, 'leukemia': 1}
110 associated genes
{'PD1': 17, 'CTLA4': 7, 'CD4': 6, 'OS': 5, 'B': 4, 'LAG3': 4, 'IFN': 3, 'HCC': 3, 'M1': 3, 'TNF': 2, 'spatial': 2, 'CD274': 2, 'BRAF': 2, 'KRAS': 2, 'FOXP3': 2, 'p53': 2, 'TIGIT': 2, 'PDCD1': 2, 'P3': 2, 'METTL15': 1, 'RS': 1, 'DSP': 1, 'MB': 1, 'H1': 1, 'SAP': 1, 'B7': 1, 'CIT': 1, 'Ki': 1, 'LSD1': 1, 'DCR': 1, 'HR': 1, 'GO': 1, 'GPI': 1, 'PGK1': 1, 'RPE': 1, 'CD25': 1, 'HER2': 1, 'CSNK2A1': 1, 'COL5A1': 1, 'EFNA3': 1, 'CCNA2': 1, 'PDL2': 1, 'PSA': 1, 'CD5L': 1, 'EGFR': 1, 'p65': 1, 'NF': 1, 'SRF': 1, 'MDM4': 1, 'MDM2': 1, 'STC1': 1, 'ADP': 1, 'PARP1': 1, 'ER': 1, 'GZMB': 1, 'HAVCR2': 1, 'CXCL9': 1, 'AKT': 1, 'PI3K': 1, 'KCNJ12': 1, 'SLC6A11': 1, 'ALPP': 1, 'FOLR3': 1, 'IGFN1': 1, 'FOXM1': 1,

In [8]:
# Same query with a wider relevance filter: abstracts that mention "melanoma"
# or "leukemia" are kept even when the word "cancer" itself is absent.
q = 'PDL1'
buzz_words = {'cancer', 'melanoma', 'leukemia'}
cancer_papers, org_count, gen_count = get_types_and_genes(q, buzz_words=buzz_words)

Found 62 relevant papers
16 associated cancer types
{'lung': 16, 'breast': 12, 'melanoma': 9, 'liver': 4, 'bladder': 4, 'prostate': 3, 'cervical': 3, 'brain': 2, 'colon': 2, 'kidney': 2, 'rectum': 1, 'testicular': 1, 'uterine': 1, 'pancreatic': 1, 'stomach': 1, 'leukemia': 1}
110 associated genes
{'PD1': 19, 'CTLA4': 9, 'CD4': 6, 'OS': 5, 'B': 4, 'LAG3': 4, 'IFN': 3, 'HCC': 3, 'M1': 3, 'TNF': 2, 'spatial': 2, 'CD274': 2, 'BRAF': 2, 'KRAS': 2, 'FOXP3': 2, 'p53': 2, 'TIGIT': 2, 'PDCD1': 2, 'P3': 2, 'METTL15': 1, 'RS': 1, 'DSP': 1, 'MB': 1, 'H1': 1, 'SAP': 1, 'B7': 1, 'CIT': 1, 'Ki': 1, 'LSD1': 1, 'DCR': 1, 'HR': 1, 'GO': 1, 'GPI': 1, 'PGK1': 1, 'RPE': 1, 'CD25': 1, 'HER2': 1, 'CSNK2A1': 1, 'COL5A1': 1, 'EFNA3': 1, 'CCNA2': 1, 'PDL2': 1, 'PSA': 1, 'CD5L': 1, 'EGFR': 1, 'p65': 1, 'NF': 1, 'SRF': 1, 'MDM4': 1, 'MDM2': 1, 'STC1': 1, 'ADP': 1, 'PARP1': 1, 'ER': 1, 'GZMB': 1, 'HAVCR2': 1, 'CXCL9': 1, 'AKT': 1, 'PI3K': 1, 'KCNJ12': 1, 'SLC6A11': 1, 'ALPP': 1, 'FOLR3': 1, 'IGFN1': 1, 'FOXM1': 1,

In [9]:
# verbose=2 also prints, for each relevant paper, its title and the cancer
# types and genes found in its abstract.
q = 'PD1'
cancer_papers, org_count, gen_count = get_types_and_genes(q, verbose=2)

The potential combinational immunotherapiesfor treatment of hepatocellular carcinoma.
{'HCC', 'PD1'}
Cullin3 deficiency shapes tumor microenvironment and promotes cholangiocarcinoma in liver-specific Smad4/Pten mutant mice.
{'liver'}
Addition of immunotherapy to chemotherapy for metastatic triple-negative breast cancer: A systematic review and meta-analysis of randomized clinical trials.
{'breast'}
{'PD1', 'CI', 'HR', 'OS'}
Explore association of genes in PDL1/PD1 pathway to radiotherapy survival benefit based on interaction model strategy.
{'RS', 'OS'}
Multifaceted glycoadjuvant@AuNPs inhibits tumor metastasis through promoting T cell activation and remodeling tumor microenvironment.
{'melanoma'}
{'M1', 'PD1', 'DC', 'B'}
Circular RNA CELF1 drives immunosuppression and anti-PD1 therapy resistance in non-small cell lung cancer via the miR-491-5p/EGFR axis.
{'lung'}
{'CCK', 'EGFR'}
Tumor Microenvironment Profiles Reveal Distinct Therapy-Oriented Proteogenomic Characteristics in Colorecta

In [10]:
# Per-paper detail (verbose=2) for another gene query.
q = 'INSL4'
cancer_papers, org_count, gen_count = get_types_and_genes(q, verbose=2)

Exploration of the prognostic signature reflecting tumor microenvironment of lung adenocarcinoma based on immunologically relevant genes.
{'lung'}
{'VEGFD', 'BTK', 'INSL4', 'PIK3CG', 'PTPRC', 'INHA'}
INSL4 as prognostic marker for proliferation and invasiveness in Non-Small-Cell Lung Cancer.
{'lung'}
{'INSL4'}
Identification of a Novel Tumor Microenvironment-Associated Eight-Gene Signature for Prognosis Prediction in Lung Adenocarcinoma.
{'lung'}
{'MS4A1', 'SCN7A', 'STAP1', 'P2RX1', 'KLRB1', 'INSL4', 'Mast', 'B', 'IKZF3', 'ACSM5'}
Role of INSL4 Signaling in Sustaining the Growth and Viability of LKB1-Inactivated Lung Cancer.
{'lung'}
{'LKB1', 'INSL4', 'IGF'}
Relaxin-like peptides in male reproduction - a human perspective.
{'brain', 'prostate', 'testicular'}
{'INSL3', 'INSL4', 'INSL6', 'INSL5', 'H2'}
DNA methylation signatures identify biologically distinct thyroid cancer subtypes.
{'thyroid'}
{'NOTCH4', 'ZIC1', 'ADAMTS8', 'INSL4', 'MAP17', 'HOXB4', 'DPPA2', 'KISS1R', 'TCL1B'}
[Express