In [1]:
#!pip install xlrd
#!pip uninstall spacy
#!pip install spacy==2.2.2

In [2]:
import urllib.request
import io
import gzip
import os
from pathlib import Path
import pandas as pd
try:
    from Bio import Entrez
except ModuleNotFoundError:
    !pip install Bio
    from Bio import Entrez
import re
import random
import spacy

response = urllib.request.urlopen('ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz')
compressed_file = io.BytesIO(response.read())
decompressed_file = gzip.GzipFile(fileobj=compressed_file)

with open(Path(os.getcwd(), 'miRNA.xlsx'), 'wb') as outfile:
    outfile.write(decompressed_file.read())

mir_database = pd.read_excel('miRNA.xlsx')

mir_database_1 = mir_database.loc[:, ['Accession', 'ID']]
mir_database_2 = mir_database.loc[:, ['Mature1_Acc', 'Mature1_ID']].rename(columns = {'Mature1_Acc':'Accession', 'Mature1_ID':'ID'})
mir_database_3 = mir_database.loc[:, ['Mature2_Acc', 'Mature2_ID']].rename(columns = {'Mature2_Acc':'Accession', 'Mature2_ID':'ID'})

final_database = pd.concat([mir_database_1, mir_database_2, mir_database_3])

In [3]:
print(spacy.__version__)

2.3.5


In [4]:
def fetch_abstract(pmid):
    handle = Entrez.efetch(db='pubmed', id = pmid, retmode='xml')
    article = Entrez.read(handle)['PubmedArticle'][0]['MedlineCitation']['Article']
    if 'Abstract' in article:
            return article['Abstract']['AbstractText']
        
def concat_article(x):
    final_article = str()
    for i in range(len(x)):
        final_article = final_article + str(x[i]) + ' '
    return final_article

In [5]:
def get_literature(user_mir):
    
    filtered_database = final_database[final_database['Accession']  == user_mir]['ID']
    filtered_database = final_database[final_database['Accession']  == user_mir]['ID']

    if filtered_database.size == 1:
        mir = filtered_database.iloc[0]
        print('The accession number ' + user_mir + ' corresponds to miR ' + mir)
    else:
        print('miR accession is incorrect. Try again (caps sensitive)')

    Entrez.email = 'anonymous@gmail.com'
    esearch_query = Entrez.esearch(db="pubmed", term="mir-100", retmode="xml")
    esearch_result = Entrez.read(esearch_query)
    pmid_list = esearch_result['IdList']
    print("pmid's obtained: " + str(len(pmid_list)))
    
    abs_list = []

    for i in pmid_list:
        abs = fetch_abstract(i)
        abs_list.append(abs)
        
    abs_list = [concat_article(i) for i in abs_list if i is not None]
    
    return(abs_list)

In [6]:
training_mir = ['MI0000692', 'MI0000159', 'MI0000172', 'MI0000406', 'MI0000111', 'MI0000684', 'MI0000256', 'MI0000170', 'MI0000268', 'MI0002470']

In [7]:
all_abstracts = []

for i in training_mir:
    abstracts = get_literature(i)
    all_abstracts = all_abstracts + abstracts

The accession number MI0000692 corresponds to miR mmu-mir-100
pmid's obtained: 20
The accession number MI0000159 corresponds to miR mmu-mir-133a-1
pmid's obtained: 20
The accession number MI0000172 corresponds to miR mmu-mir-150
pmid's obtained: 20
The accession number MI0000406 corresponds to miR mmu-mir-106a
pmid's obtained: 20
The accession number MI0000111 corresponds to miR hsa-mir-105-1
pmid's obtained: 20
The accession number MI0000684 corresponds to miR mmu-mir-107
pmid's obtained: 20
The accession number MI0000256 corresponds to miR mmu-mir-122
pmid's obtained: 20
The accession number MI0000170 corresponds to miR mmu-mir-146a
pmid's obtained: 20
The accession number MI0000268 corresponds to miR hsa-mir-34a
pmid's obtained: 20
The accession number MI0002470 corresponds to miR hsa-mir-486-1
pmid's obtained: 20


In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize

all_sentences = []

for mir_abs in all_abstracts:
    abstr_sentences = sent_tokenize(mir_abs)
    all_sentences = all_sentences + abstr_sentences


In [9]:
random.shuffle(all_sentences)

In [10]:
training_sentences = all_sentences[0:int(.8 * len(all_sentences))]
testing_sentences = all_sentences[int(.8 * len(all_sentences)):len(all_sentences)]

In [11]:
def make_training_data(string):
    if len([i for i in re.finditer('mir-\d+[^\s|.|,|!|?| |:|;]*', string.lower())]) != 0:
        ent_list = []
        for i in re.finditer('mir-\d+[^\s|.|,|!|?| |:|;]*', string.lower()):
            ent_code = (i.start(), i.end(), 'miR')
            ent_list.append(ent_code)
            
    else:
        ent_list = []      
    return((string, {'entities' : ent_list}))    

In [12]:
training_data = [make_training_data(i) for i in training_sentences]

In [13]:
training_data[0:3]

[('Moreover, CCND1 was shown to be a novel target gene of miR-194 in GC.',
  {'entities': [(55, 62, 'miR')]}),
 ('PCa cells were transfected with NC-mimics or miR-100-5p mimics, inhibitor by using liposome transfection.',
  {'entities': [(45, 55, 'miR')]}),
 ('Expression of eca-miR-100 and eca-miR-1 was not different between groups.',
  {'entities': [(18, 25, 'miR'), (34, 39, 'miR')]})]

In [14]:
!pip install spacy-lookups-data



In [15]:
def train_spacy(data,iterations):
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [16]:
miRnlp = train_spacy(training_data, 10)

Starting iteration 0
{'ner': 1162.650378093236}
Starting iteration 1
{'ner': 309.35683442019365}
Starting iteration 2
{'ner': 102.06620370620945}
Starting iteration 3
{'ner': 158.36978899452725}
Starting iteration 4
{'ner': 37.76365348176094}
Starting iteration 5
{'ner': 294.65522586023627}
Starting iteration 6
{'ner': 47.67403523577628}
Starting iteration 7
{'ner': 29.274067601147642}
Starting iteration 8
{'ner': 55.70227766505165}
Starting iteration 9
{'ner': 17.553434002926913}


In [17]:
for i in testing_sentences[0:5]:
    print(i)
    doc = miRnlp(i)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print()

Because of the high number of miRNAs, we more closely evaluated the expression of six of them (miR-100-5p, miR-29a-3p, miR-130a-3p, miR-10a-5p, miR-10b-5p, miR-203a), and determined that their levels were dramatically changed by at least 50-fold at different time points of the experiment (p < 0.01).
Entities [('miR-100-5p', 'miR'), ('miR-29a-3p', 'miR'), ('miR-130a-3p', 'miR'), ('miR-10a-5p', 'miR'), ('miR-10b-5p', 'miR'), ('miR-203a)', 'miR')]

Because of the high number of miRNAs, we more closely evaluated the expression of six of them (miR-100-5p, miR-29a-3p, miR-130a-3p, miR-10a-5p, miR-10b-5p, miR-203a), and determined that their levels were dramatically changed by at least 50-fold at different time points of the experiment (p < 0.01).
Entities [('miR-100-5p', 'miR'), ('miR-29a-3p', 'miR'), ('miR-130a-3p', 'miR'), ('miR-10a-5p', 'miR'), ('miR-10b-5p', 'miR'), ('miR-203a)', 'miR')]

<i>p</i> < 0.05), whereas the expression of 16 miRNAs was significantly decreased (> 1.5-fold, adj.
