In [15]:
from pymed import PubMed

# Contact e-mail passed to the PubMed client below.
my_email = "norockderipa@yahoo.com"

# Create the PubMed client object that the query cells in this notebook use.
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="Protein Interaction Text Miner", email=my_email)


class Publication:
    """Plain record for one PubMed article plus a few derived fields.

    Attributes:
        pubmed_id: Identifier exactly as passed in.
        url: PubMed link built from the first line of ``pubmed_id``.
        title: Article title as passed in.
        publication_date: Publication date as passed in.
        abstract: Abstract text; may be ``None`` or empty.
        words: Number of whitespace-separated words in ``abstract``
            (0 when the abstract is ``None``, empty or only whitespace).
        keywords: Keywords as passed in (defaults to ``""``).
    """

    def __init__(self,
                 pubmed_id,
                 title,
                 publication_date,
                 abstract,
                 keywords=""):
        self.pubmed_id = pubmed_id
        # Only the first line of the id goes into the link.
        # NOTE(review): presumably the id can arrive as several
        # newline-separated ids -- confirm against the data source.
        self.url = "https://www.ncbi.nlm.nih.gov/pubmed/" + pubmed_id.split("\n")[0]
        self.title = title
        self.publication_date = publication_date
        self.abstract = abstract
        # Truthiness covers both None and "" without comparing identity to a
        # literal; split() with no argument ignores repeated or surrounding
        # whitespace, so a blank abstract counts as zero words.
        self.words = len(abstract.split()) if abstract else 0
        self.keywords = keywords

In [16]:
def queryPubmed(query, results):
    """Query PubMed and keep the articles whose abstract mentions the query.

    Args:
        query: Search string sent to PubMed. An article is only kept when
            this exact string (case-sensitive) also occurs in its abstract.
        results: Maximum number of articles to request from PubMed.

    Returns:
        A list of ``Publication`` objects with a non-empty abstract that
        contains ``query``. The number kept is also printed.
    """
    # Execute the query against the API. A separate name is used so the
    # ``results`` argument (the requested maximum) is not overwritten.
    articles = pubmed.query(query, max_results=results)

    publications = [
        Publication(article.pubmed_id,
                    article.title,
                    article.publication_date,
                    article.abstract,
                    # Articles without a ``keywords`` attribute get the same
                    # default that Publication itself uses.
                    getattr(article, 'keywords', ""))
        for article in articles
    ]

    # The word-count check runs first so the substring test never sees a
    # missing (None) abstract.
    publications = [publication for publication in publications
                    if publication.words != 0 and query in publication.abstract]

    print("Extracted", len(publications), "articles")

    return publications

In [17]:
# Load the protein vocabulary, one name per line. The ``with`` block closes
# the file on exit, so no explicit close() call is needed.
with open("protein_list.txt", "r", encoding="utf8") as myfile:
    lines = myfile.readlines()

# Strip the trailing newline (and any other trailing whitespace) of each entry.
proteins = [l.rstrip() for l in lines]
print(len(proteins),"proteins imported")

import nltk
nltk.download('words')
from nltk.corpus import words
# Word list from the NLTK 'words' corpus (236736 entries in the recorded run).
word_list = words.words()

# Separate list object holding the same entries, used as the "common words" vocabulary.
common = list(word_list)
print(len(common),"common words imported")

207468 proteins imported
236736 common words imported


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Frost\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [18]:
import pickle
import random
import re

import joblib
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

def wordSplitter(string, length):
    """Cut ``string`` into consecutive pieces of at most ``length`` words.

    The text is split on single spaces, and every run of ``length`` tokens is
    joined back together with a space. The final piece may be shorter.

    Args:
        string: Text to split.
        length: Number of space-separated tokens per piece.

    Returns:
        The pieces, in their original order, as a list of strings.
    """
    tokens = string.split(" ")

    chunks = []
    for start in range(0, len(tokens), length):
        chunks.append(" ".join(tokens[start:start + length]))

    return chunks



# Pre-trained model loaded from disk; PPI_Detection_Network() calls its
# ``predict`` method.
# NOTE(review): joblib.load unpickles the file -- only load model files from a
# trusted source.
PPIDetector = joblib.load('PPINetwork.pkl')

# Tokenizer object whose ``texts_to_sequences`` method turns text into the
# sequences fed to PPIDetector.
# NOTE(review): pickle.load can execute arbitrary code contained in the file --
# only load a tokenizer.pickle that comes from a trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle) 


def PPI_Detection_Network(text):
    """Score one text fragment with the PPI detector model.

    The fragment is converted to a sequence by ``tokenizer``, padded with
    ``maxlen=50`` and passed to ``PPIDetector.predict``. 0.1 is added to the
    first value of the first prediction before it is rounded.

    Args:
        text: The text fragment to classify.

    Returns:
        The rounded, shifted score.
    """
    sequences = tokenizer.texts_to_sequences([text])
    first_padded = pad_sequences(sequences, maxlen=50)[0]

    raw_score = PPIDetector.predict([[first_padded]])[0][0]

    return round(raw_score + 0.1)



def PPI_Estimator(abstract):
    """Decide whether an abstract as a whole describes an interaction.

    The abstract is cut into 50-word pieces, each piece is scored by
    ``PPI_Detection_Network``, and the mean score is rounded.

    Args:
        abstract: Abstract text to evaluate.

    Returns:
        ``True`` when the rounded mean of the per-piece scores is non-zero,
        ``False`` otherwise.
    """
    votes = [PPI_Detection_Network(chunk) for chunk in wordSplitter(abstract, 50)]

    mean_vote = sum(votes) / len(votes)

    return bool(round(mean_vote))

def getPossiblePPP(publications):
    """Keep the publications whose abstract passes ``PPI_Estimator``.

    Args:
        publications: Iterable of objects exposing an ``abstract`` attribute.

    Returns:
        A new list with the accepted publications, in their original order.
    """
    return [publication for publication in publications
            if PPI_Estimator(publication.abstract)]






import nltk

def processString(sentence):
    """Return the distinct lower-cased word tokens of ``sentence`` without stopwords.

    The text is lower-cased and split into runs of word characters, which
    drops punctuation. Tokens found in NLTK's English stopword list are then
    removed.

    Args:
        sentence: Text to tokenize.

    Returns:
        A set of token strings.
    """
    text = sentence.lower()

    # A single regexp tokenization is enough: the previous extra
    # nltk.word_tokenize() pass was discarded immediately and has been removed.
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(text)

    # NOTE(review): needs the NLTK 'stopwords' corpus; this notebook only
    # shows a download of 'words' -- confirm it is installed.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    return {token for token in tokens if token not in stopwords}



import joblib
import spacy
import scispacy
# spaCy pipeline "en_core_sci_sm"; word2vec() calls it to obtain word vectors.
nlp = spacy.load("en_core_sci_sm")

def word2vec(x):
    """Return the ``vector`` of the document that ``nlp`` builds from ``x``."""
    document = nlp(x)
    return document.vector


# Pre-trained model loaded from disk; getProteinPrediction() calls its
# ``predict`` method on word vectors.
# NOTE(review): joblib.load unpickles the file -- only load trusted model files.
proteinRecognitionNetwork = joblib.load('proteinRecognitionNetwork.pkl')

def getProteinPrediction(word):
    """Return the rounded model score for ``word``.

    The word's vector is wrapped in a one-row array and passed to
    ``proteinRecognitionNetwork.predict``; the first value of the first
    prediction is rounded and returned.
    """
    features = np.array([word2vec(word)])
    score = proteinRecognitionNetwork.predict(features)[0][0]
    return round(score)


def getCandidateProteins(abstract):
    """Return the tokens of ``abstract`` that the protein model accepts.

    The abstract is tokenized by ``processString``; each token for which
    ``getProteinPrediction`` returns a truthy value is kept.

    Args:
        abstract: Abstract text to scan.

    Returns:
        A list of accepted tokens, in the iteration order of the token
        collection returned by ``processString``.
    """
    return [token for token in processString(abstract)
            if getProteinPrediction(token)]



def findRelations(protein,abstract):
    """Find words in ``abstract`` predicted to interact with ``protein``.

    Steps:
      1. Lower-case both inputs and replace every occurrence of the protein
         name in the abstract with the placeholder ``PROTEINA``.
      2. Collect candidate protein words with ``getCandidateProteins``.
      3. For each candidate, replace its whole-word occurrences with
         ``PROTEINB`` and keep the candidate if ``PPI_Estimator`` accepts the
         rewritten abstract.

    Args:
        protein: Name of the protein of interest.
        abstract: Abstract text to analyse.

    Returns:
        The list of candidate words for which the estimator returned a
        truthy value. The raw candidate list is also printed.
    """
    placeholder_a = "PROTEINA"
    placeholder_b = "PROTEINB"

    protein = protein.lower()
    abstract = abstract.lower().replace(protein, placeholder_a)

    candidates = getCandidateProteins(abstract)

    print(candidates)

    # The protein of interest now appears only as the placeholder, and the
    # candidate tokens are lower-cased by processString, so the lower-cased
    # placeholder has to be excluded as well; otherwise the protein is tested
    # against itself.
    excluded = {protein, placeholder_a.lower()}
    candidates = [candidate for candidate in candidates
                  if candidate not in excluded]

    finalListOfProteins = []

    for candidate in candidates:
        # Replace whole words only: a plain substring replace would rewrite
        # every word that merely contains a short candidate such as "c".
        whole_word = re.compile(r"(?<!\w)" + re.escape(candidate) + r"(?!\w)")
        tempAbstract = whole_word.sub(placeholder_b, abstract)

        if PPI_Estimator(tempAbstract):
            finalListOfProteins.append(candidate)

    return finalListOfProteins
    

In [19]:
# Smoke test: request up to 50 articles for one protein and show the type of the result.
publications = queryPubmed(query="Dystrophin", results=50)

print(type(publications))






Extracted 6 articles
<class 'list'>


In [20]:
def checkArticlesForPPI(protein, results=500):
    """Collect interaction candidates for ``protein`` across PubMed abstracts.

    Args:
        protein: Protein name used both as the PubMed query and as the
            protein of interest when scanning each abstract.
        results: Maximum number of articles to request (default 500).

    Returns:
        A flat list with the candidates returned by ``findRelations`` for
        every retrieved abstract, in retrieval order (duplicates are kept).
    """
    publications = queryPubmed(query=protein, results=results)

    interactions = []

    for p in publications:
        # Scan for partners of the requested protein, not a fixed name.
        interactions += findRelations(protein, p.abstract)

    return interactions


In [21]:
# Run the full pipeline for dystrophin; as the last expression of the cell,
# the returned list is displayed as the cell output.
checkArticlesForPPI("Dystrophin")

Extracted 58 articles
['years', 'ef', 'provides', 'cell', 'functions', 'c', 'zz', 'two', 'dystroglycan', 'membrane', 'β', 'ww', 'protein', 'beta']
['diseases', 'mutations', 'proteinaopathies']
['cell', 'role', 'membrane', 'term', 'neurons', 'long', 'synapses', 'symptoms', 'mice', '427', '2j', 'kda', 'ca1', 'mdx', 'dmd']
['another', 'mutations', '356', 'x', 'lab', 'patients', '65', 'exons', 'proteinaopathies', 'six', '2', 'xp21', '15', 'allows', 'polymerase', 'mlpa', 'applications', '27', 'male', 'years', 'lead', '10', 'spot']
['60', 'processes', 'lamp2', 'adeno', 'transcription', 'factor', 'protein', 'tfeb', 'extent', '1α', 'mice', 'old', 'eb', 'p62', 'via', 'gastrocnemii', 'lc3ii', '6', '3', 'appears', 'mdx', 'dmd']
['moctn3', 'moctn1', 'disrupts', 'octn2', 'long', 'qrt', 'peroxisomal', 'mitochondrial', 'mice', 'transporter', 'octn1', 'octn3', 'acyl', 'cn', 'male', 'mrna', '10scsn', 'moctn2', 'membrane', '10snj', 'c57bl', 'l', 'mdx', 'protein', 'dmd', 'j']
['junctions', 'glycoprotein'

['co', 'high', '4998_5000', 'c', 'cag', 'p', '73', 'small', '1667', 'sub']


[]