In [1]:
# Get articles :

In [2]:
!pip install xlrd
!pip uninstall spacy
!pip install spacy==2.2.2



In [1]:
import urllib.request
import io
import gzip
import os
from pathlib import Path
import pandas as pd
try:
    from Bio import Entrez
except ModuleNotFoundError:
    !pip install Bio
    from Bio import Entrez
import re
import random
import spacy


response = urllib.request.urlopen('ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz')
compressed_file = io.BytesIO(response.read())
decompressed_file = gzip.GzipFile(fileobj=compressed_file)

with open(Path(os.getcwd(), 'miRNA.xlsx'), 'wb') as outfile:
    outfile.write(decompressed_file.read())

mir_database = pd.read_excel('miRNA.xlsx')

mir_database_1 = mir_database.loc[:, ['Accession', 'ID']]
mir_database_2 = mir_database.loc[:, ['Mature1_Acc', 'Mature1_ID']].rename(columns = {'Mature1_Acc':'Accession', 'Mature1_ID':'ID'})
mir_database_3 = mir_database.loc[:, ['Mature2_Acc', 'Mature2_ID']].rename(columns = {'Mature2_Acc':'Accession', 'Mature2_ID':'ID'})

final_database = pd.concat([mir_database_1, mir_database_2, mir_database_3])

In [2]:
print(spacy.__version__)

2.2.2


In [3]:
def fetch_abstract(pmid):
    handle = Entrez.efetch(db='pubmed', id = pmid, retmode='xml')
    article = Entrez.read(handle)['PubmedArticle'][0]['MedlineCitation']['Article']
    if 'Abstract' in article:
            return article['Abstract']['AbstractText']
        
def concat_article(x):
    final_article = str()
    for i in range(len(x)):
        final_article = final_article + str(x[i]) + ' '
    return final_article

In [4]:
def get_literature(user_mir):
    
    filtered_database = final_database[final_database['Accession']  == user_mir]['ID']
    filtered_database = final_database[final_database['Accession']  == user_mir]['ID']

    if filtered_database.size == 1:
        mir = filtered_database.iloc[0]
        print('The accession number ' + user_mir + ' corresponds to miR ' + mir)
    else:
        print('miR accession is incorrect. Try again (caps sensitive)')

    Entrez.email = 'anonymous@gmail.com'
    esearch_query = Entrez.esearch(db="pubmed", term="mir-100", retmode="xml")
    esearch_result = Entrez.read(esearch_query)
    pmid_list = esearch_result['IdList']
    print("pmid's obtained: " + str(len(pmid_list)))
    
    abs_list = []

    for i in pmid_list:
        abs = fetch_abstract(i)
        abs_list.append(abs)
        
    abs_list = [concat_article(i) for i in abs_list if i is not None]
    
    return(abs_list)

In [5]:
training_mir = ['MI0000692', 'MI0000159', 'MI0000172', 'MI0000406', 'MI0000111']

In [6]:
all_abstracts = []

for i in training_mir:
    abstracts = get_literature(i)
    all_abstracts = all_abstracts + abstracts

The accession number MI0000692 corresponds to miR mmu-mir-100
pmid's obtained: 20
The accession number MI0000159 corresponds to miR mmu-mir-133a-1
pmid's obtained: 20
The accession number MI0000172 corresponds to miR mmu-mir-150
pmid's obtained: 20
The accession number MI0000406 corresponds to miR mmu-mir-106a
pmid's obtained: 20
The accession number MI0000111 corresponds to miR hsa-mir-105-1
pmid's obtained: 20


In [7]:
from nltk.tokenize import word_tokenize, sent_tokenize

all_sentences = []

for mir_abs in all_abstracts:
    abstr_sentences = sent_tokenize(mir_abs)
    all_sentences = all_sentences + abstr_sentences


In [8]:
all_sentences[10]

'In conclusion, miR-100, miR-125b, miR-199a and miR-194 may have potential as prognostic and diagnostic biomarkers for GC.'

In [9]:
len(all_sentences)

1065

In [10]:
def make_training_data(string, pattern):
    counter = string.lower().find(pattern.lower())
    end_miR = len(string)
    
    while counter + 1 < len(string):
        counter += 1 
        if string[counter] in ".,!? :;":
            end_miR = counter
            counter = len(string)
            
    extracted_string = string[string.lower().find(pattern.lower()):end_miR]
    new_miR = re.sub(r'-(\d)*', '-' + str(random.randint(1,1200)), extracted_string, 1)
    new_string = string[0:string.lower().find(pattern.lower())] + new_miR + string[end_miR:len(string)]

    counter = new_string.lower().find(pattern.lower())
    end_miR = len(new_string)

    while counter + 1 < len(new_string):
        counter += 1 
        if new_string[counter] in ".,!? :;":
            end_miR = counter
            counter = len(new_string)

    if string.lower().find(pattern.lower()) == -1 or not any(i.isdigit() for i in extracted_string):
        return((string, {'entities': []}))
    else:
        return((new_string, {'entities': [(new_string.lower().find(pattern.lower()), end_miR, 'miR')]}))
       


In [11]:
all_sentences[0:5]

['MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.',
 'However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.',
 'In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.',
 'Among them, miR-100, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.',
 'MiR-100, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.']

In [12]:
print([make_training_data(i, 'miR') for i in all_sentences[0:5]])

[('MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.', {'entities': []}), ('However, the function of miR-68 in gastric cancer (GC) remains unclear and controversial.', {'entities': [(25, 31, 'miR')]}), ('In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.', {'entities': []}), ('Among them, miR-929, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.', {'entities': [(12, 19, 'miR')]}), ('MiR-122, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.', {'entities': [(0, 7, 'miR')]})]


In [13]:
training_data = [make_training_data(i, 'miR') for i in all_sentences]

In [14]:
training_data[1]

('However, the function of miR-319 in gastric cancer (GC) remains unclear and controversial.',
 {'entities': [(25, 32, 'miR')]})

In [15]:
print(spacy.__version__)

2.2.2


In [16]:
!pip install spacy-lookups-data



In [19]:
def train_spacy(data,iterations):
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [20]:
miRnlp = train_spacy(training_data, 10)

Starting iteration 0
{'ner': 698.1961718309033}
Starting iteration 1
{'ner': 129.51670173409732}
Starting iteration 2
{'ner': 69.77326219327692}
Starting iteration 3
{'ner': 41.705053622645}
Starting iteration 4
{'ner': 28.710898934038223}
Starting iteration 5
{'ner': 23.4670738215081}
Starting iteration 6
{'ner': 44.93986783116825}
Starting iteration 7
{'ner': 42.402259317974035}
Starting iteration 8
{'ner': 2.0013992651253156}
Starting iteration 9
{'ner': 194.08269852540838}


In [21]:
test_miR = ['MI0000684']

In [22]:
all_test_abstracts = []

for i in test_miR:
    abstracts = get_literature(i)
    all_test_abstracts = all_test_abstracts + abstracts

all_test_sentences = []

for mir_abs in all_test_abstracts:
    abstr_sentences = sent_tokenize(mir_abs)
    all_test_sentences = all_test_sentences + abstr_sentences

The accession number MI0000684 corresponds to miR mmu-mir-107
pmid's obtained: 20


In [23]:
for i in all_test_sentences[0:20]:
    doc = miRnlp(i)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []
Entities [('miR-194', 'miR')]
Entities []
Entities []
Entities []
Entities [('miR-194', 'miR')]
Entities [('miR-194', 'miR')]
Entities [('miR-194', 'miR')]
Entities [('miR-194', 'miR')]
Entities [('miR-194', 'miR')]
Entities []
Entities [('MiR-194', 'miR')]
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []
Entities []


In [None]:
all_test_sentences[0:5]

['MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.',
 'However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.',
 'In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.',
 'Among them, miR-100, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.',
 'MiR-100, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.']

In [None]:
test_string = 'However, the function of miR-193 in gastric cancer (GC) remains unclear and controversial.'

doc = miRnlp(test_string)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []
