In [7]:
# Get articles :

In [8]:
!pip install xlrd



In [9]:
import urllib.request
import io
import gzip
import os
from pathlib import Path
import pandas as pd
try:
    from Bio import Entrez
except ModuleNotFoundError:
    !pip install Bio
    from Bio import Entrez


response = urllib.request.urlopen('ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz')
compressed_file = io.BytesIO(response.read())
decompressed_file = gzip.GzipFile(fileobj=compressed_file)

with open(Path(os.getcwd(), 'miRNA.xlsx'), 'wb') as outfile:
    outfile.write(decompressed_file.read())

mir_database = pd.read_excel('miRNA.xlsx')

mir_database_1 = mir_database.loc[:, ['Accession', 'ID']]
mir_database_2 = mir_database.loc[:, ['Mature1_Acc', 'Mature1_ID']].rename(columns = {'Mature1_Acc':'Accession', 'Mature1_ID':'ID'})
mir_database_3 = mir_database.loc[:, ['Mature2_Acc', 'Mature2_ID']].rename(columns = {'Mature2_Acc':'Accession', 'Mature2_ID':'ID'})

final_database = pd.concat([mir_database_1, mir_database_2, mir_database_3])

In [10]:
def fetch_abstract(pmid):
    handle = Entrez.efetch(db='pubmed', id = pmid, retmode='xml')
    article = Entrez.read(handle)['PubmedArticle'][0]['MedlineCitation']['Article']
    if 'Abstract' in article:
            return article['Abstract']['AbstractText']
        
def concat_article(x):
    final_article = str()
    for i in range(len(x)):
        final_article = final_article + str(x[i]) + ' '
    return final_article

In [11]:
def get_literature(user_mir):
    
    filtered_database = final_database[final_database['Accession']  == user_mir]['ID']
    filtered_database = final_database[final_database['Accession']  == user_mir]['ID']

    if filtered_database.size == 1:
        mir = filtered_database.iloc[0]
        print('The accession number ' + user_mir + ' corresponds to miR ' + mir)
    else:
        print('miR accession is incorrect. Try again (caps sensitive)')

    Entrez.email = 'anonymous@gmail.com'
    esearch_query = Entrez.esearch(db="pubmed", term="mir-100", retmode="xml")
    esearch_result = Entrez.read(esearch_query)
    pmid_list = esearch_result['IdList']
    print("pmid's obtained: " + str(len(pmid_list)))
    
    abs_list = []

    for i in pmid_list:
        abs = fetch_abstract(i)
        abs_list.append(abs)
        
    abs_list = [concat_article(i) for i in abs_list if i is not None]
    
    return(abs_list)

In [12]:
training_mir = ['MI0000692', 'MI0000159', 'MI0000172', 'MI0000406', 'MI0000111']

In [13]:
all_abstracts = []

for i in training_mir:
    abstracts = get_literature(i)
    all_abstracts = all_abstracts + abstracts

The accession number MI0000692 corresponds to miR mmu-mir-100
pmid's obtained: 20
The accession number MI0000159 corresponds to miR mmu-mir-133a-1
pmid's obtained: 20
The accession number MI0000172 corresponds to miR mmu-mir-150
pmid's obtained: 20
The accession number MI0000406 corresponds to miR mmu-mir-106a
pmid's obtained: 20
The accession number MI0000111 corresponds to miR hsa-mir-105-1
pmid's obtained: 20


In [14]:
from nltk.tokenize import word_tokenize, sent_tokenize

all_sentences = []

for mir_abs in all_abstracts:
    abstr_sentences = sent_tokenize(mir_abs)
    all_sentences = all_sentences + abstr_sentences


In [15]:
all_sentences[10]

'In conclusion, miR-100, miR-125b, miR-199a and miR-194 may have potential as prognostic and diagnostic biomarkers for GC.'

In [16]:
len(all_sentences)

1065

In [17]:
def make_training_data(string, pattern):
    counter = string.lower().find(pattern.lower())
    end_miR = len(string)
    
    while counter + 1 < len(string):
        counter += 1 
        if string[counter] in ".,!? :;":
            end_miR = counter
            counter = len(string)
            
    extracted_string = string[string.lower().find(pattern.lower()):end_miR]
    
    if string.lower().find(pattern.lower()) == -1 or not any(i.isdigit() for i in extracted_string):
        return((string, {'entities': []}))
    else:
        return((string, {'entities': [(string.lower().find(pattern.lower()), end_miR, 'miR')]}))


In [18]:
def check_training_data(string, pattern):
    counter = string.lower().find(pattern.lower())
    end_miR = len(string)
    
    while counter + 1 < len(string):
        counter += 1 
        if string[counter] in ".,!? :;":
            end_miR = counter
            counter = len(string)
           
    extracted_string = string[string.lower().find(pattern.lower()):end_miR]    
        
    if string.lower().find(pattern.lower()) == -1 or not any(i.isdigit() for i in extracted_string):
        return('')
    else:
        return(extracted_string)

In [19]:
all_sentences[0:5]

['MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.',
 'However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.',
 'In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.',
 'Among them, miR-100, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.',
 'MiR-100, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.']

In [20]:
print([make_training_data(i, 'miR') for i in all_sentences[0:5]])
print()
print([check_training_data(i, 'miR') for i in all_sentences[0:5]])

[('MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.', {'entities': []}), ('However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.', {'entities': [(25, 32, 'miR')]}), ('In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.', {'entities': []}), ('Among them, miR-100, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.', {'entities': [(12, 19, 'miR')]}), ('MiR-100, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.', {'entities': [(0, 7, 'miR')]})]

['', 'miR-194', '', 'miR-100', 'MiR-100']


In [21]:
training_data = [make_training_data(i, 'miR') for i in all_sentences]

In [22]:
training_data[1]

('However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.',
 {'entities': [(25, 32, 'miR')]})

In [23]:
import spacy
import random

In [24]:
print(spacy.__version__)

2.3.5


In [25]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 3.4 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [26]:
nlp=spacy.load('en_core_web_sm')

# Getting the pipeline component
ner=nlp.get_pipe("ner")

In [27]:
for _, annotations in training_data:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])
    
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [28]:
print(spacy.__version__)


2.3.5


In [None]:
import random
from spacy.util import minibatch, compounding
from pathlib import Path

all_losses = []
# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 10 iterations
  for iteration in range(500):

    # shuufling examples  before every iteration
    random.shuffle(training_data)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        
    all_losses.append(losses)

In [37]:
len(all_losses)

100

In [38]:
all_losses

[{'ner': 18086.475824654102},
 {'ner': 18221.50495751947},
 {'ner': 18017.555027782917},
 {'ner': 18089.43132263422},
 {'ner': 18102.15681213609},
 {'ner': 18167.981379110715},
 {'ner': 18105.43503576517},
 {'ner': 18047.357963236147},
 {'ner': 18152.546090126038},
 {'ner': 18171.00736861126},
 {'ner': 18167.968371226045},
 {'ner': 18191.358203679323},
 {'ner': 18178.26528763771},
 {'ner': 18143.42500393145},
 {'ner': 18167.608760505915},
 {'ner': 18198.85860077123},
 {'ner': 18236.310033619404},
 {'ner': 18081.673486324027},
 {'ner': 18238.54442322254},
 {'ner': 18109.265405595303},
 {'ner': 18142.225919485092},
 {'ner': 18087.139151201307},
 {'ner': 18024.00379225734},
 {'ner': 18058.318527831783},
 {'ner': 18217.95513150096},
 {'ner': 18003.504776746035},
 {'ner': 17899.657508030534},
 {'ner': 18107.4003149271},
 {'ner': 18163.577620390803},
 {'ner': 18111.807903826237},
 {'ner': 18007.221433520317},
 {'ner': 18090.340340316296},
 {'ner': 18087.450000226498},
 {'ner': 18079.18424719

In [41]:
doc = nlp("This is a test sentence about miR-100.")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []


In [32]:
test_list = []

test_list.append({'thing1': 1})

test_list

[{'thing1': 1}]

In [52]:
test_miR = ['MI0000684']

In [57]:
all_test_abstracts = []

for i in test_miR:
    abstracts = get_literature(i)
    all_test_abstracts = all_test_abstracts + abstracts

all_test_sentences = []

for mir_abs in all_test_abstracts:
    abstr_sentences = sent_tokenize(mir_abs)
    all_test_sentences = all_test_sentences + abstr_sentences

The accession number MI0000684 corresponds to miR mmu-mir-107
pmid's obtained: 20


In [58]:
all_test_sentences[0:5]

['MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.',
 'However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.',
 'In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.',
 'Among them, miR-100, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.',
 'MiR-100, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.',
 'We also provide\xa0the\xa0first\xa0comprehensive transcriptome analysis of miR-194 in GC.',
 "Our data suggest that miR-194 tends to regulated target genes by binding to their 3' untranslated regions in a 7-mer-A1, 7-mer-m8 or 8-mer manner.",
 'KEGG pathway analysis sho

In [60]:
for doc in nlp.pipe(all_test_sentences[0:5]):
    # Print the document text and entitites
    print(doc.text)
    print(doc.ents, '\n')

MicroRNAs play critical roles in regulating target gene expression and multiple cellular processes in human cancer malignant progression.
() 

However, the function of miR-194 in gastric cancer (GC) remains unclear and controversial.
() 

In this study, we identified a series of miRNAs that can serve as prognostic biomarkers for GC by analysis of miRNA expression using TCGA (The Cancer Genome Atlas) data.
() 

Among them, miR-100, miR-125b, miR-199a and miR-194 were the 4 most promising prognostic biomarkers in GC due to their significant associations with various clinical characteristics of patients.
() 

MiR-100, miR-125b and miR-199a predicted poor prognosis in GC, while miR-194 predicted favorable prognosis in GC.
() 



In [None]:
from spacy.lang.en import English
nlp = English()