In [1]:
# this is a topic modeling script for synth bio papers on PMC
# this is a test notebook file
# this is an example query program
# code taken from https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
# Cred. Susan Li
from Bio import Entrez
import json

def search(query):
    Entrez.email = 'clp0216@bu.edu'
    handle = Entrez.esearch(db='pmc',
                            sort='relevance',
                            retmax='350',
                            retmode='xml',
                            term=query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

In [2]:
results1 = search('ACS Synth. Biol[Journal]')
ID1 = results1['IdList']
print(len(ID1))

329


In [3]:
results2 = search('Synth Syst Biotechnol[Journal]')
ID2 = results2['IdList']
print(len(ID2))

196


In [4]:
results3 = search('Front Mol Neurosci.[Journal]')
ID3 = results3['IdList']
print(len(ID3))

350


In [5]:

id_list = ID1+ID2+ID3
id_list = id_list[:350]
# papers = fetch_details(id_list)
# for i, paper in enumerate(papers['PubmedArticle']):
#      print("{}) {}".format(i+1, paper['MedlineCitation']['Article']['ArticleTitle']))


# train on curated models based upon selected journals idenified in bibliometric searches:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5533824/

In [6]:
import json
import xmltodict
import requests
payload = {'db':'pmc', 'id':id_list}
# response = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=4304705")
response = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", params=payload)

import xml.etree.ElementTree as ET
import xmltodict
from json_flatten import flatten
    
string_xml = response.text
tree = ET.fromstring(string_xml)
xml_str = ET.tostring(tree, method='xml')
data = xmltodict.parse(xml_str)

300

In [7]:
len(data['pmc-articleset']['article'])

350

In [8]:
# list(filter(lambda x:, data['pmc-articleset']['article']['body']['sec'][2]['sec'][1]))
# https://towardsdatascience.com/how-to-flatten-deeply-nested-json-objects-in-non-recursive-elegant-python-55f96533103d
from collections import MutableMapping
import numpy as np
from json_flatten import flatten
import os, glob
# for filename in glob.glob(os.getcwd() + "/text_result_*"):
#     os.remove(filename) 

kwds = {}
original_corpus = {}
# code to convert ini_dict to flattened dictionary
for i, paper in enumerate(data['pmc-articleset']['article']):
    try:
        pmid = paper['front']['article-meta']['article-id'][0]["#text"]
        flat_data = flatten(paper['body'])
        # apply filter for any value over 10 characters in length
        text_res = dict((key, flat_data[key]) for key in flat_data.keys() if (len(flat_data[key]) > 30) and ('{' not in flat_data[key]))
        concat_text = " ".join(list(text_res.values())) + "."
        concat_text = concat_text.replace("[]", "").replace("[,]", "")
        original_corpus[pmid] = concat_text
        kwds[pmid] = paper['front']['article-meta']['kwd-group']['kwd']
    except:
        pass
  

In [9]:
len(original_corpus.values())

179

In [10]:
from os import chdir
os.chdir('/Users/cullenpaulisick/Documents/EC552/Project/EC552SaturnLab/PracticeCorpus')
for item in original_corpus.items():
    with open('pmid_' + str(item[0]) + '.txt', 'w') as f:
        f.write(item[1])
    f.close()


In [3]:
def bow_corpus(original_corpus):
    docs = list(original_corpus)
    # Tokenize the documents.
    from nltk.tokenize import RegexpTokenizer
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.
    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 2] for doc in docs]

    # Lemmatize the documents.
    from nltk.stem.wordnet import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    
    # Compute bigrams.
    from gensim.models import Phrases

    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    # Remove rare and common tokens.
    from gensim.corpora import Dictionary

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur less than 20 documents, or more than 50% of the documents.
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    print(docs[0][:5])
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    return corpus, dictionary

In [4]:
def bow_string(doc):
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords
    from gensim.corpora import Dictionary
    import numpy as np
    doc = doc.lower()
    print(doc)
    print("Doc: ")
    print(doc)
    # tokenize string
    tokenizer = RegexpTokenizer(r'\w+')
    doc = doc.lower()  # Convert to lowercase.
    doc = tokenizer.tokenize(doc)  # Split into words.
    print("Doc after tokenization: ")
    print(doc)
    # remove stopwords
    stop_words = stopwords.words('english')
    user_terms = [token for token in doc if token not in stop_words]
    print("Doc after stopword filtering: ")
    print(user_terms)
    # convert terms into word-order agnostic 2D list
    user_array = np.array(user_terms)
    user_corpus = [np.roll(user_array, i) for i in range(len(user_terms))]
    user_corpus = [list(arr) for arr in user_corpus]
    # create dictionary and bag-of-words
    dictionary = Dictionary(user_corpus)
    print(user_corpus)
    text_corpus = [dictionary.doc2bow(doc) for doc in user_corpus]
    print("Corpus created from text: ")
    print(text_corpus)    


In [43]:
user_terms = ['I', 'Hate', 'the', 'Ravens']
user_array = np.array(user_terms)
user_corpus = [np.roll(user_array, i) for i in range(len(user_terms))]
user_corpus = [list(arr) for arr in user_corpus]
print(user_corpus)

[['I', 'Hate', 'the', 'Ravens'], ['Ravens', 'I', 'Hate', 'the'], ['the', 'Ravens', 'I', 'Hate'], ['Hate', 'the', 'Ravens', 'I']]


In [5]:
import math
length_corpus = len(list(original_corpus.values()))
train_cutoff = math.ceil(length_corpus*0.75)
train_kwds = list(kwds.values())[:train_cutoff]
train_kwds_id = list(kwds.keys())[:train_cutoff]
test_kwds = list(kwds.values())[train_cutoff:]
test_kwds_id = list(kwds.keys())[train_cutoff:]
train_data = list(original_corpus.values())[:train_cutoff]
test_data = list(original_corpus.values())[train_cutoff:]
training = bow_corpus(train_data)
testing = bow_corpus(test_data)

# Train LDA model.
from gensim.models import LdaModel

train_corpus = training[0]
test_corpus = testing[0]
dictionary = training[1]
# Set training parameters.
num_topics = 10
chunksize = 500
passes = 15
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.


# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=train_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


['cell', 'free', 'system', 'that', 'support']
Number of unique tokens: 8938
Number of documents: 136
['central', 'goal', 'synthetic', 'biology', 'build']
Number of unique tokens: 4546
Number of documents: 45


In [10]:
type(model)

gensim.models.ldamodel.LdaModel

In [49]:
bow_string("pveg promoter and long time optimization")

pveg promoter and long time optimization
Doc: 
pveg promoter and long time optimization
Doc after tokenization: 
['pveg', 'promoter', 'and', 'long', 'time', 'optimization']
Doc after stopword filtering: 
['pveg', 'promoter', 'long', 'time', 'optimization']
[['pveg', 'promoter', 'long', 'time', 'optimization'], ['optimization', 'pveg', 'promoter', 'long', 'time'], ['time', 'optimization', 'pveg', 'promoter', 'long'], ['long', 'time', 'optimization', 'pveg', 'promoter'], ['promoter', 'long', 'time', 'optimization', 'pveg']]
Corpus created from text: 
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [159]:
from gensim.test.utils import datapath

model.save('model1.gensim')

In [160]:
new_model = LdaModel.load('model1.gensim')

In [42]:
len(original_corpus)

47

In [6]:
top_topics = model.top_topics(train_corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -4.0540.
[([(0.016739553, 'library'),
   (0.0111876605, 'rb'),
   (0.007410834, 'biosensor'),
   (0.007115362, 'figure'),
   (0.006646982, 'oscillation'),
   (0.0065360283, 'repressor'),
   (0.0065265405, 'activator'),
   (0.006102303, 'circuit'),
   (0.005772312, 'feedback'),
   (0.005644147, 'riboswitch'),
   (0.0056192447, 'parameter'),
   (0.0047752857, 'induction'),
   (0.0047452287, 'iptg'),
   (0.0046593146, 'performance'),
   (0.0046215598, 'screening'),
   (0.0045057926, 'loop'),
   (0.0037863704, 'primary'),
   (0.0037785026, 'space'),
   (0.0037740006, 'facs'),
   (0.0033366706, 'strength')],
  -1.3579791546389646),
 ([(0.0066813813, 'glucose'),
   (0.006120781, 'riboswitch'),
   (0.0051763025, 'colony'),
   (0.0050295894, 'metabolic'),
   (0.00462217, 'module'),
   (0.004382617, 'genome'),
   (0.004258925, 'violacein'),
   (0.0039314055, 'yeast'),
   (0.0035941917, 'cultivation'),
   (0.0035560613, 'primer'),
   (0.0034598943, 'fragment'),
   (0.003

In [6]:
print(model[test_corpus[3]])
print(test_kwds[3])
print(test_kwds_id[3])

[(0, 0.07618819), (2, 0.017732281), (3, 0.22501023), (4, 0.48947626), (5, 0.07544329), (6, 0.040534757), (7, 0.058275856), (9, 0.013862014)]
['protein−protein interaction', 'synthetic biology', 'coiled coil', 'heterodimer']
22558529


In [157]:
model.print_topics(num_words=5)

[(0,
  '0.008*"genome" + 0.007*"primer" + 0.007*"fragment" + 0.006*"editing" + 0.006*"library"'),
 (1,
  '0.007*"riboswitch" + 0.005*"mutation" + 0.004*"library" + 0.003*"iptg" + 0.003*"lipid"'),
 (2,
  '0.006*"module" + 0.005*"biosensor" + 0.005*"base" + 0.005*"glucose" + 0.005*"metabolic"'),
 (3,
  '0.011*"core" + 0.010*"circuit" + 0.008*"copy" + 0.007*"cassette" + 0.006*"reporter"'),
 (4,
  '0.015*"circuit" + 0.012*"figure" + 0.009*"repressor" + 0.008*"activator" + 0.008*"oscillation"'),
 (5,
  '0.017*"light" + 0.016*"peptide" + 0.008*"substrate" + 0.008*"amino" + 0.008*"amino_acid"'),
 (6,
  '0.029*"hac" + 0.016*"alphoid" + 0.013*"array" + 0.010*"alphoid_hac" + 0.009*"cenp"'),
 (7,
  '0.012*"light" + 0.012*"noise" + 0.007*"parameter" + 0.007*"figure" + 0.005*"distribution"'),
 (8,
  '0.007*"glycosylation" + 0.004*"bacteria" + 0.004*"linked" + 0.004*"yeast" + 0.004*"ligand"'),
 (9,
  '0.011*"switch" + 0.008*"figure" + 0.006*"car" + 0.006*"drug" + 0.006*"light"')]

In [114]:
pprint(top_topics[2])

([(0.007511505, 'activation'),
  (0.007097461, 'switch'),
  (0.004782665, 'protease'),
  (0.0047546686, 'reporter'),
  (0.0046180845, 'core'),
  (0.004326322, 'peptide'),
  (0.0040891776, 'circuit'),
  (0.003087302, 'gfp'),
  (0.0029444091, 'figure'),
  (0.0028294383, 'ligand'),
  (0.002761806, 'fusion'),
  (0.0027580173, 'cleavage'),
  (0.0026285388, 'output'),
  (0.0024671573, 'car'),
  (0.0024269305, 'core_promoter'),
  (0.0024224874, 'nfat'),
  (0.00233404, 'input'),
  (0.0023046061, 'scaffold'),
  (0.0022477582, 'primer'),
  (0.002186634, 'secretion')],
 -2.980139296128298)


In [56]:
list(kwds.values())[7]

['ATAC-seq', 'heterologous gene', 'RNA-seq', 'locus', 'genome\nengineering']

In [2]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

user_terms = "I hope that the pVeg system functions as a promoter"
tokenizer = RegexpTokenizer(r'\w+')
user_terms = user_terms.lower()  # Convert to lowercase.
user_terms = tokenizer.tokenize(user_terms)  # Split into words.
stop_words = stopwords.words('english')
user_terms = [word for word in user_terms if word not in stop_words ]


In [10]:
user_terms

['hope', 'pveg', 'system', 'functions', 'promoter']