In [7]:
# Get articles :

In [8]:
!pip install xlrd



In [8]:
import urllib.request
import io
import gzip
import os
from pathlib import Path
import pandas as pd
try:
    from Bio import Entrez
except ModuleNotFoundError:
    !pip install Bio
    from Bio import Entrez
import re


response = urllib.request.urlopen('ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz')
compressed_file = io.BytesIO(response.read())
decompressed_file = gzip.GzipFile(fileobj=compressed_file)

with open(Path(os.getcwd(), 'miRNA.xlsx'), 'wb') as outfile:
    outfile.write(decompressed_file.read())

mir_database = pd.read_excel('miRNA.xlsx')

mir_database_1 = mir_database.loc[:, ['Accession', 'ID']]
mir_database_2 = mir_database.loc[:, ['Mature1_Acc', 'Mature1_ID']].rename(columns = {'Mature1_Acc':'Accession', 'Mature1_ID':'ID'})
mir_database_3 = mir_database.loc[:, ['Mature2_Acc', 'Mature2_ID']].rename(columns = {'Mature2_Acc':'Accession', 'Mature2_ID':'ID'})

final_database = pd.concat([mir_database_1, mir_database_2, mir_database_3])

In [2]:
def fetch_abstract(pmid):
    handle = Entrez.efetch(db='pubmed', id = pmid, retmode='xml')
    article = Entrez.read(handle)['PubmedArticle'][0]['MedlineCitation']['Article']
    if 'Abstract' in article:
            return article['Abstract']['AbstractText']
        
def concat_article(x):
    final_article = str()
    for i in range(len(x)):
        final_article = final_article + str(x[i]) + ' '
    return final_article

In [3]:
def get_literature(user_mir):
    
    filtered_database = final_database[final_database['Accession']  == user_mir]['ID']
    filtered_database = final_database[final_database['Accession']  == user_mir]['ID']

    if filtered_database.size == 1:
        mir = filtered_database.iloc[0]
        print('The accession number ' + user_mir + ' corresponds to miR ' + mir)
    else:
        print('miR accession is incorrect. Try again (caps sensitive)')

    Entrez.email = 'anonymous@gmail.com'
    esearch_query = Entrez.esearch(db="pubmed", term="mir-100", retmode="xml")
    esearch_result = Entrez.read(esearch_query)
    pmid_list = esearch_result['IdList']
    print("pmid's obtained: " + str(len(pmid_list)))
    
    abs_list = []

    for i in pmid_list:
        abs = fetch_abstract(i)
        abs_list.append(abs)
        
    abs_list = [concat_article(i) for i in abs_list if i is not None]
    
    return(abs_list)

In [12]:
training_mir = ['MI0000692', 'MI0000159', 'MI0000172', 'MI0000406', 'MI0000111']

In [4]:
training_mir = ['MI0000692']

In [5]:
all_abstracts = []

for i in training_mir:
    abstracts = get_literature(i)
    all_abstracts = all_abstracts + abstracts

The accession number MI0000692 corresponds to miR mmu-mir-100
pmid's obtained: 20


In [6]:
from nltk.tokenize import word_tokenize, sent_tokenize

all_sentences = []

for mir_abs in all_abstracts:
    abstr_sentences = sent_tokenize(mir_abs)
    all_sentences = all_sentences + abstr_sentences


In [23]:
all_sentences[12:20]

['After a myocardial infarction, the adult human heart lacks sufficient regenerative capacity to restore lost tissue, leading to heart failure progression.',
 'Finding novel ways to reprogram adult cardiomyocytes into a regenerative state is a major therapeutic goal.',
 'The epicardium, the outermost layer of the heart, contributes cardiovascular cell types to the forming heart and is a source of trophic signals to promote heart muscle growth during embryonic development.',
 'The epicardium is also essential for heart regeneration in zebrafish and neonatal mice and can be reactivated after injury in adult hearts to improve outcome.',
 'A recently identified mechanism of cell-cell communication and signalling is that mediated by extracellular vesicles (EVs).',
 'Here, we aimed to investigate epicardial signalling via EV release in response to cardiac injury and as a means to optimise cardiac repair and regeneration.',
 'We isolated epicardial EVs from mouse and human sources and targete

In [40]:
iter = re.finditer(r'mir[^\s|.|,|!|?| |:|;]*', "mir-100 and mir-100-5p are the same.")
[i for i in iter]
[i.start() for i in iter]
#[i.match() for i in iter]
for i in re.finditer(r'mir[^\s|.|,|!|?| |:|;]*', "mir-100 and mir-100-5p are the same."):
    print(i.span())
[i.span() for i in re.finditer(r'mir[^\s|.|,|!|?| |:|;]*', "mir-100 and mir-100-5p are the same.")]

(0, 7)
(12, 22)


[(0, 7), (12, 22)]

In [16]:
len(all_sentences)

1065

In [86]:
def make_training_data(string, pattern):
    counter = string.lower().find(pattern.lower())
    end_miR = len(string)
    
    while counter + 1 < len(string):
        counter += 1 
        if string[counter] in ".,!? :;":
            end_miR = counter
            counter = len(string)
            
    extracted_string = string[string.lower().find(pattern.lower()):end_miR]
    
    if string.lower().find(pattern.lower()) == -1 or not any(i.isdigit() for i in extracted_string):
        return((string, {'entities': []}))
    else:
        return((string, {'entities': [(string.lower().find(pattern.lower()), end_miR, 'miR')]}))


In [87]:
def check_training_data(string, pattern):
    counter = string.lower().find(pattern.lower())
    end_miR = len(string)
    
    while counter + 1 < len(string):
        counter += 1 
        if string[counter] in ".,!? :;":
            end_miR = counter
            counter = len(string)
           
    extracted_string = string[string.lower().find(pattern.lower()):end_miR]    
        
    if string.lower().find(pattern.lower()) == -1 or not any(i.isdigit() for i in extracted_string):
        return('')
    else:
        return(extracted_string)

In [88]:
all_sentences[0:5]

['MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.',
 'However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.',
 'In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.',
 'Among them, miR-100, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.',
 'MiR-100, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.']

In [89]:
print(type([make_training_data(i, 'miR') for i in all_sentences[0:5]][1]))
print()
print([check_training_data(i, 'miR') for i in all_sentences[0:5]])

<class 'tuple'>

['', 'miR-194', '', 'miR-100', 'MiR-100']


In [90]:
training_data = [make_training_data(i, 'miR') for i in all_sentences]

In [91]:
training_data[1]

('However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.',
 {'entities': [(25, 32, 'miR')]})

In [92]:
import spacy
import random

In [78]:
print(spacy.__version__)

2.3.5


In [96]:
!pip install spacy-lookups-data

Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.0-py2.py3-none-any.whl (93.4 MB)
[K     |████████████████████████████████| 93.4 MB 22 kB/s s eta 0:00:01   |█▌                              | 4.4 MB 4.5 MB/s eta 0:00:20     |███▏                            | 9.1 MB 4.5 MB/s eta 0:00:19     |███████████████                 | 44.0 MB 13.4 MB/s eta 0:00:04     |███████████████████████▍        | 68.1 MB 13.2 MB/s eta 0:00:02
Installing collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.0


In [97]:
def train_spacy(data,iterations):
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [100]:
miRnlp = train_spacy(training_data, 20)

Statring iteration 0
{'ner': 540.7753894802347}
Statring iteration 1
{'ner': 121.3353181946998}
Statring iteration 2
{'ner': 180.8156804720977}
Statring iteration 3
{'ner': 159.823294482903}
Statring iteration 4
{'ner': 109.46729700126087}
Statring iteration 5
{'ner': 93.3332711722388}
Statring iteration 6
{'ner': 61.53049516743217}
Statring iteration 7
{'ner': 42.50040410344927}
Statring iteration 8
{'ner': 12.63705921049174}
Statring iteration 9
{'ner': 30.73334057955599}
Statring iteration 10
{'ner': 46.06847185864277}
Statring iteration 11
{'ner': 0.0016748744729511566}
Statring iteration 12
{'ner': 0.02129791527256239}
Statring iteration 13
{'ner': 4.689143328467551e-09}
Statring iteration 14
{'ner': 38.90907874504627}
Statring iteration 15
{'ner': 31.50245819976662}
Statring iteration 16
{'ner': 24.160864319264878}
Statring iteration 17
{'ner': 27.48014735164433}
Statring iteration 18
{'ner': 26.16853955596753}
Statring iteration 19
{'ner': 17.831414355206892}


In [106]:
test_miR = ['MI0000684']

In [107]:
all_test_abstracts = []

for i in test_miR:
    abstracts = get_literature(i)
    all_test_abstracts = all_test_abstracts + abstracts

all_test_sentences = []

for mir_abs in all_test_abstracts:
    abstr_sentences = sent_tokenize(mir_abs)
    all_test_sentences = all_test_sentences + abstr_sentences

The accession number MI0000684 corresponds to miR mmu-mir-107
pmid's obtained: 20


In [111]:
for i in all_test_sentences[0:20]:
    doc = miRnlp(i)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []
Entities [('miR-194', 'miR')]
Entities []
Entities [('miR-100', 'miR')]
Entities [('MiR-100', 'miR')]
Entities [('miR-194', 'miR')]
Entities [('miR-194', 'miR')]
Entities [('miR-194', 'miR')]
Entities [('miR-194', 'miR')]
Entities [('miR-194', 'miR')]
Entities [('miR-100', 'miR')]
Entities [('MiR-194', 'miR')]
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []


In [108]:
all_test_sentences[0:5]

['MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.',
 'However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.',
 'In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.',
 'Among them, miR-100, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.',
 'MiR-100, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.']

In [112]:
test_string = 'However, the function of miR-193 in gastric cancer (GC) remains unclear and controversial.'

doc = miRnlp(test_string)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []
