In [2]:
import copy
import random
import re
from collections import Counter
from random import shuffle

import networkx as nx
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

In [3]:
def preProcess(sent):
    """Normalise a raw sentence into a space-separated string of tokens.

    The text is lower-cased, the characters ``. , ( ) ? ! ;`` are deleted,
    every ``/`` becomes a space, the result is split with the module-level
    ``tokenizer`` and tokens of a single character are discarded.

    Args:
        sent (str): raw sentence.

    Returns:
        str: the surviving tokens joined by single spaces.
    """
    lowered = sent.lower()
    without_punct = re.sub(r'[.,\(\)?!;]', "", lowered)
    spaced = without_punct.replace('/', ' ')

    kept = []
    for token in tokenizer.tokenize(spaced):
        # One-character tokens are dropped.
        if len(token) > 1:
            kept.append(token)
    return ' '.join(kept)


In [5]:
def makePaddedList(sent_contents, maxl, pad_symbol= '<pad>'):
    """Right-pad every sequence with ``pad_symbol`` up to length ``maxl``.

    Sequences already longer than ``maxl`` are copied unchanged (they are
    not truncated).

    Args:
        sent_contents: iterable of indexable sequences.
        maxl (int): target length.
        pad_symbol: filler appended to short sequences.

    Returns:
        list[list]: one new list per input sequence.
    """
    return [
        list(sent) + [pad_symbol] * max(maxl - len(sent), 0)
        for sent in sent_contents
    ]

In [6]:
def makeDistanceList(lista):
    """Collect the distinct items that occur in several groups of sequences.

    Args:
        lista: iterable of groups (for instance one group per data split),
            each group being an iterable of sequences of hashable items.

    Returns:
        list: every distinct item, in order of first appearance.
    """
    # A dict serves as an insertion-ordered set.  Walking the nested
    # structure directly avoids the quadratic ``sum(lista, [])`` flattening,
    # and the frequency counts the old code kept were never used.
    seen = {}
    for group in lista:
        for sent in group:
            for w in sent:
                seen.setdefault(w, None)
    return list(seen)


In [8]:
def mapWordToId(sent_contents, word2id):
    """Replace every token by its id from ``word2id``.

    Args:
        sent_contents: iterable of token sequences.
        word2id: mapping from token to id; a missing token raises
            ``KeyError``.

    Returns:
        list[list]: ids with the same nesting as the input.
    """
    return [[word2id[w] for w in sent] for sent in sent_contents]

In [9]:
def mapWordToId_list(sent_contents, word_list):
    """Replace every token by its position in ``word_list``.

    Args:
        sent_contents: iterable of token sequences.
        word_list: sequence of hashable vocabulary items; when an item is
            repeated, the position of its first occurrence is used.

    Returns:
        list[list[int]]: positions with the same nesting as the input.

    Raises:
        ValueError: if a token does not occur in ``word_list`` (the same
            exception ``list.index`` raises).
    """
    # Build the lookup table once instead of scanning ``word_list`` for
    # every single token.
    first_pos = {}
    for pos, word in enumerate(word_list):
        if word not in first_pos:
            first_pos[word] = pos

    mapped = []
    for sent in sent_contents:
        row = []
        for w in sent:
            try:
                row.append(first_pos[w])
            except KeyError:
                raise ValueError('%r is not in list' % (w,)) from None
        mapped.append(row)
    return mapped

In [2]:
def mapLabelToId_snp_positive(sent_lables, label_dict):
    """Translate each label into its id through ``label_dict``.

    Args:
        sent_lables: iterable of labels.
        label_dict: mapping from label to id; an unknown label raises
            ``KeyError``.

    Returns:
        list: one id per label, in input order.
    """
    ids = []
    for label in sent_lables:
        ids.append(label_dict[label])
    return ids
        
    


In [2]:
def mapLabelToId_snp(sent_lables, label_dict):
    """Binarise labels: ``'positive'`` becomes 1, anything else becomes 0.

    Args:
        sent_lables: iterable of labels.
        label_dict: accepted for signature compatibility; not used.

    Returns:
        list[int]: one 0/1 value per label.
    """
    binary = []
    for label in sent_lables:
        binary.append(1 if label == 'positive' else 0)
    return binary
        
    


In [10]:
def mapLabelToId_befree_EUADR(sent_lables, label_dict):
    """Binarise EU-ADR labels: ``'FA'`` becomes 0, every other label 1.

    Args:
        sent_lables: iterable of labels (the original note lists SA, NA and
            PA as the labels mapped to 1).
        label_dict: accepted for signature compatibility; not used.

    Returns:
        list[int]: one 0/1 value per label.
    """
    binary = []
    for label in sent_lables:
        binary.append(1 if label != 'FA' else 0)
    return binary
    


In [11]:
def mapLabelToId_befree(sent_lables, label_dict):
    """Convert BeFree association labels to integer ids.

    Args:
        sent_lables: iterable of hashable labels.
        label_dict: mapping from label to id.  With more than two entries it
            is used for a multi-class lookup; otherwise the labels are
            binarised and the mapping itself is not consulted.

    Returns:
        list[int]: one id per label.
    """
    if len(label_dict) > 2:
        # Labels missing from the mapping fall back to id 1 (the original
        # note says this only concerns a single sample labelled 'P').
        # An explicit default replaces the former bare ``except:``, which
        # also hid unrelated errors.
        return [label_dict.get(label, 1) for label in sent_lables]

    # Binary setting: Y, N ==> 1    F ==> 0
    return [int(label != 'F') for label in sent_lables]
    


In [12]:
def mapLabelToId(sent_lables, label_dict):
    """Convert labels to integer ids.

    With at most two entries in ``label_dict`` the labels are binarised
    (``'Negative'`` becomes 0, anything else 1) and the mapping is not
    consulted; with more entries each label is looked up in ``label_dict``.

    Args:
        sent_lables: iterable of labels.
        label_dict: mapping from label to id.

    Returns:
        list: one id per label.
    """
    if len(label_dict) <= 2:
        return [1 if label != 'Negative' else 0 for label in sent_lables]
    return [label_dict[label] for label in sent_lables]



In [15]:
def sample_positive_negative(pos_file,neg_file,neg_ratio,num_pos_samples,random_seed=1337):
    """Draw reproducible random samples from a positive and a negative CSV file.

    Rows whose gene mention equals the disease mention (case-insensitively)
    are removed before sampling.

    Args:
        pos_file: path of the positive CSV (read as latin-1); needs the
            columns ``gene_mention`` and ``disease_mention``.
        neg_file: path of the negative CSV, same layout.
        neg_ratio: number of negative rows per positive row; may be
            fractional.
        num_pos_samples (int): number of positive rows to draw.
        random_seed: seed handed to ``DataFrame.sample``.

    Returns:
        tuple: ``(pos_samples, neg_samples)`` DataFrames.

    Raises:
        ValueError: propagated from pandas when more rows are requested
            than remain after filtering.
    """
    def _load_distinct_pairs(path):
        # Read one file and drop rows where both mentions are the same text.
        frame = pd.read_csv(path, encoding='latin-1')
        differs = frame['gene_mention'].str.lower() != frame['disease_mention'].str.lower()
        return frame[differs]

    # ``sample`` needs an integer count; a fractional ratio would otherwise
    # hand it a float.
    num_neg_samples = int(round(neg_ratio * num_pos_samples))

    pos_samples = _load_distinct_pairs(pos_file).sample(num_pos_samples, random_state=random_seed)
    neg_samples = _load_distinct_pairs(neg_file).sample(num_neg_samples, random_state=random_seed)
    return pos_samples, neg_samples
    


In [1]:
def dataRead_snp(fname, ignore_neutral=False,keep_only_positive=False,read_confidence=False):
    """Read a tab-separated SNP/phenotype file and mark both entities in each sentence.

    For every kept row the character spans of the SNP and of the phenotype
    are replaced by the markers ``FIRST_ENTITY`` / ``SECOND_ENTITY`` and the
    sentence is normalised with ``preProcess`` (which lower-cases, so the
    markers end up as ``first_entity`` / ``second_entity``).

    Reading stops at the first sentence in which either marker cannot be
    found after preprocessing; that sentence is not returned, so all
    returned lists always have the same length.

    Args:
        fname: path of the TSV file.  Columns read: ``snp``, ``phenotype``,
            ``lable``, ``sentence``, ``snp_start``, ``snp_end``,
            ``pheno_start``, ``pheno_end`` and, when ``read_confidence`` is
            set, ``confidence``.
        ignore_neutral (bool): skip rows labelled ``'neutral'``.
        keep_only_positive (bool): skip rows not labelled ``'positive'``.
        read_confidence (bool): return the ``confidence`` column instead of
            the constant 0.

    Returns:
        tuple: ``(sent_contents, entity1_list, entity2_list, sent_lables,
        sent_confidences)`` as parallel lists; entity entries are
        ``[lower-cased mention, type]``.
    """
    print("Input File Reading")
    samples = pd.read_csv(fname, sep='\t')
    sent_contents = []
    sent_lables = []
    sent_confidences = []
    entity1_list = []
    entity2_list = []

    for _, sample in samples.iterrows():
        relation = sample['lable']
        confidence = sample['confidence'] if read_confidence else 0

        if keep_only_positive and relation != 'positive':
            continue
        if ignore_neutral and relation == 'neutral':
            continue

        e1 = sample['snp'].lower()
        e2 = sample['phenotype'].lower()
        sent = sample['sentence'].lower()

        snp_start = int(sample['snp_start'])
        snp_end = int(sample['snp_end'])
        pheno_start = int(sample['pheno_start'])
        pheno_end = int(sample['pheno_end'])

        # Substitute the mention that starts later first, so the offsets of
        # the earlier mention are still valid for the second substitution.
        if snp_start > pheno_start:
            sent = sent[:snp_start] + ' FIRST_ENTITY ' + sent[snp_end:]
            sent = sent[:pheno_start] + ' SECOND_ENTITY ' + sent[pheno_end:]
        else:
            sent = sent[:pheno_start] + ' SECOND_ENTITY ' + sent[pheno_end:]
            sent = sent[:snp_start] + ' FIRST_ENTITY ' + sent[snp_end:]
        sent = preProcess(sent)

        # Validate BEFORE storing anything: the old code appended the
        # sentence first, leaving sent_contents one element longer than the
        # other lists whenever the loop stopped here.
        tokens = sent.split()
        if 'first_entity' not in tokens or 'second_entity' not in tokens:
            print('input file contains illegal format ')
            break

        sent_contents.append(sent)
        entity1_list.append([e1, 'snp'])
        entity2_list.append([e2, 'phenotype'])
        sent_lables.append(relation)
        sent_confidences.append(confidence)

    return sent_contents, entity1_list, entity2_list, sent_lables,sent_confidences

In [17]:
def dataRead_befree(fname):
    """Read a BeFree gene-disease CSV and mark both entities in each sentence.

    For every row the character spans of the gene and of the disease are
    replaced by the markers ``FIRST_ENTITY`` / ``SECOND_ENTITY`` and the
    sentence is normalised with ``preProcess`` (which lower-cases, so the
    markers end up as ``first_entity`` / ``second_entity``).

    Reading stops at the first sentence in which either marker cannot be
    found after preprocessing; that sentence is not returned, so all
    returned lists always have the same length.

    Args:
        fname: path of the CSV file.  Columns read: ``gene_mention``,
            ``disease_mention``, ``associationType``, ``raw_sentence``,
            ``GENE_ENTITY_OFFSET`` and ``DISEASE_ENTITY_OFFSET`` (both
            ``"start#end"`` strings), ``geneId``, ``diseaseId``,
            ``geneSymbol``.

    Returns:
        tuple: ``(sent_contents, entity1_list, entity2_list, sent_lables,
        gene_id_list, disease_id_list, gene_symbol_list)`` as parallel
        lists; entity entries are ``[lower-cased mention, type]``.
    """
    print("Input File Reading")

    samples = pd.read_csv(fname)
    sent_contents = []
    sent_lables = []
    entity1_list = []
    entity2_list = []
    gene_id_list = []
    disease_id_list = []
    gene_symbol_list = []

    for _, sample in samples.iterrows():
        e1 = sample['gene_mention'].lower()
        e2 = sample['disease_mention'].lower()
        relation = sample['associationType']
        sent = sample['raw_sentence'].lower()

        # Offsets are stored as "start#end".
        offset_gene = [int(part) for part in sample['GENE_ENTITY_OFFSET'].split('#')]
        offset_disease = [int(part) for part in sample['DISEASE_ENTITY_OFFSET'].split('#')]

        # Substitute the mention that starts later first, so the offsets of
        # the earlier mention are still valid for the second substitution.
        if offset_gene[0] > offset_disease[0]:
            sent = sent[:offset_gene[0]] + ' FIRST_ENTITY ' + sent[offset_gene[1]:]
            sent = sent[:offset_disease[0]] + ' SECOND_ENTITY ' + sent[offset_disease[1]:]
        else:
            sent = sent[:offset_disease[0]] + ' SECOND_ENTITY ' + sent[offset_disease[1]:]
            sent = sent[:offset_gene[0]] + ' FIRST_ENTITY ' + sent[offset_gene[1]:]

        sent = preProcess(sent)

        # Validate BEFORE storing anything: the old code appended the
        # sentence first, leaving sent_contents one element longer than the
        # other lists whenever the loop stopped here.
        tokens = sent.split()
        if 'first_entity' not in tokens or 'second_entity' not in tokens:
            print('input file contains illegal format ')
            break

        sent_contents.append(sent)
        entity1_list.append([e1, 'gene'])
        entity2_list.append([e2, 'disease'])
        sent_lables.append(relation)
        gene_id_list.append(sample['geneId'])
        disease_id_list.append(sample['diseaseId'])
        gene_symbol_list.append(sample['geneSymbol'])

    return sent_contents, entity1_list, entity2_list, sent_lables, gene_id_list,disease_id_list,gene_symbol_list


In [18]:
def dataRead_befree_EUADR(fname):
    """Read an EU-ADR style TSV and mark both entities in each sentence.

    For every row the character spans of the two entities are replaced by
    the markers ``FIRST_ENTITY`` / ``SECOND_ENTITY`` and the sentence is
    normalised with ``preProcess`` (which lower-cases, so the markers end up
    as ``first_entity`` / ``second_entity``).

    Reading stops at the first sentence in which either marker cannot be
    found after preprocessing; that sentence is not returned, so all
    returned lists always have the same length.

    Args:
        fname: path of the tab-separated file (read with the ``latin``
            encoding and without default NA conversion).  Columns read:
            ``ENTITY1_TEXT``, ``ENTITY2_TEXT``, ``ASSOCIATION_TYPE``,
            ``SENTENCE``, ``ENTITY1_INI``, ``ENTITY1_END``, ``ENTITY2_INI``,
            ``ENTITY2_END``.

    Returns:
        tuple: ``(sent_contents, entity1_list, entity2_list, sent_lables)``
        as parallel lists; entity entries are ``[lower-cased mention,
        type]`` with entity 1 typed ``'gene'`` and entity 2 ``'disease'``.
    """
    print("Input File Reading")
    samples = pd.read_csv(fname, sep='\t', encoding='latin', keep_default_na=False)
    sent_contents = []
    sent_lables = []
    entity1_list = []
    entity2_list = []

    for _, sample in samples.iterrows():
        e1 = sample['ENTITY1_TEXT'].lower()
        e2 = sample['ENTITY2_TEXT'].lower()
        relation = sample['ASSOCIATION_TYPE']
        sent = sample['SENTENCE'].lower()

        offset_gene_start = int(sample['ENTITY1_INI'])
        offset_gene_end = int(sample['ENTITY1_END'])
        offset_disease_start = int(sample['ENTITY2_INI'])
        offset_disease_end = int(sample['ENTITY2_END'])

        # Substitute the mention that starts later first, so the offsets of
        # the earlier mention are still valid for the second substitution.
        if offset_gene_start > offset_disease_start:
            sent = sent[:offset_gene_start] + ' FIRST_ENTITY ' + sent[offset_gene_end:]
            sent = sent[:offset_disease_start] + ' SECOND_ENTITY ' + sent[offset_disease_end:]
        else:
            sent = sent[:offset_disease_start] + ' SECOND_ENTITY ' + sent[offset_disease_end:]
            sent = sent[:offset_gene_start] + ' FIRST_ENTITY ' + sent[offset_gene_end:]

        sent = preProcess(sent)

        # Validate BEFORE storing anything: the old code appended the
        # sentence first, leaving sent_contents one element longer than the
        # other lists whenever the loop stopped here.
        tokens = sent.split()
        if 'first_entity' not in tokens or 'second_entity' not in tokens:
            print('input file contains illegal format ')
            break

        sent_contents.append(sent)
        entity1_list.append([e1, 'gene'])
        entity2_list.append([e2, 'disease'])
        sent_lables.append(relation)

    return sent_contents, entity1_list, entity2_list, sent_lables


In [1]:
def dataRead(fname,max_length=2000):
    """Read a gene-disease CSV and mark both entities by text replacement.

    Every occurrence of the gene mention and of the disease mention in the
    lower-cased sentence is replaced by ``FIRST_ENTITY`` / ``SECOND_ENTITY``
    and the sentence is normalised with ``preProcess`` (which lower-cases,
    so the markers end up as ``first_entity`` / ``second_entity``).
    Sentences with more than 100 tokens are skipped.  Reading stops at the
    first sentence in which either marker cannot be found.

    Args:
        fname: path of the CSV file (read as latin-1).  Columns read:
            ``gene_mention``, ``disease_mention``, ``associationType``,
            ``raw_sentence``, ``geneId``, ``diseaseId``.
        max_length: currently NOT used; the length limit is the fixed value
            100 below.  NOTE(review): wiring this parameter in would change
            which sentences are kept under the default of 2000, so it is
            left untouched - confirm the intended limit.

    Returns:
        tuple: ``(sent_contents, entity1_list, entity2_list, sent_lables,
        gene_id_list, disease_id_list)`` as parallel lists; entity entries
        are ``[lower-cased mention, type]``.
    """
    print("Input File Reading")

    samples = pd.read_csv(fname, encoding='latin-1')
    sent_contents = []
    sent_lables = []
    entity1_list = []
    entity2_list = []
    gene_id_list = []
    disease_id_list = []

    for _, sample in samples.iterrows():
        e1 = sample['gene_mention'].lower()
        e2 = sample['disease_mention'].lower()
        relation = sample['associationType']
        sent = sample['raw_sentence'].lower()

        # When the gene mention contains the disease mention, the gene has
        # to be replaced first, otherwise it would be destroyed by the
        # disease replacement.
        if e1.find(e2) != -1:
            sent = sent.replace(e1, ' FIRST_ENTITY ')
            sent = sent.replace(e2, ' SECOND_ENTITY ')
        else:
            sent = sent.replace(e2, ' SECOND_ENTITY ')
            sent = sent.replace(e1, ' FIRST_ENTITY ')
        sent = preProcess(sent)

        tokens = sent.split()
        if len(tokens) > 100:
            continue
        # Explicit membership test instead of a bare ``except:`` around
        # ``list.index``.
        if 'first_entity' not in tokens or 'second_entity' not in tokens:
            print('input file contains illegal format ')
            break

        sent_contents.append(sent)
        entity1_list.append([e1, 'gene'])
        entity2_list.append([e2, 'disease'])
        sent_lables.append(relation)
        gene_id_list.append(sample['geneId'])
        disease_id_list.append(sample['diseaseId'])

    return sent_contents, entity1_list, entity2_list, sent_lables, gene_id_list,disease_id_list


In [1]:
def get_wordList_and_distances_snp(sent_list):
    """Tokenise sentences and compute token offsets relative to both entity markers.

    Each sentence is split on whitespace.  For every token position ``p``
    the offsets ``p - s1`` and ``p - s2`` are recorded as strings, where
    ``s1`` / ``s2`` are the positions of the first ``'first_entity'`` /
    ``'second_entity'`` token.

    Processing stops at the first sentence missing one of the markers: the
    sentence is printed and the results gathered so far are returned.

    Args:
        sent_list: iterable of preprocessed sentence strings.

    Returns:
        tuple: ``(word_list, d1_list, d2_list)`` - token lists, offsets to
        the first marker, offsets to the second marker.
    """
    word_list = []
    d1_list = []
    d2_list = []

    for sent in sent_list:
        tokens = sent.split()
        try:
            s1 = tokens.index('first_entity')
            s2 = tokens.index('second_entity')
        except ValueError:
            # Only a missing marker is expected here; other errors are no
            # longer swallowed.
            print(sent)
            break

        positions = range(len(tokens))
        word_list.append(tokens)
        # str(0) == '0', so the marker position needs no special case.
        d1_list.append([str(p - s1) for p in positions])
        d2_list.append([str(p - s2) for p in positions])

    return word_list, d1_list, d2_list


In [20]:
def get_wordList_and_distances_befree(sent_list):
    """Tokenise sentences and compute token offsets relative to both entity markers.

    Each sentence is split on whitespace.  For every token position ``p``
    the offsets ``p - s1`` and ``p - s2`` are recorded as strings, where
    ``s1`` / ``s2`` are the positions of the first ``'first_entity'`` /
    ``'second_entity'`` token.

    Processing stops at the first sentence missing one of the markers: the
    sentence is printed and the results gathered so far are returned.

    Args:
        sent_list: iterable of preprocessed sentence strings.

    Returns:
        tuple: ``(word_list, distance1_list, distance2_list)`` - token
        lists, offsets to the first marker, offsets to the second marker.
    """
    word_list = []
    distance1_list = []
    distance2_list = []

    for sent in sent_list:
        tokens = sent.split()
        try:
            s1 = tokens.index('first_entity')
            s2 = tokens.index('second_entity')
        except ValueError:
            # Only a missing marker is expected here; other errors are no
            # longer swallowed.
            print(sent)
            break

        positions = range(len(tokens))
        word_list.append(tokens)
        # str(0) == '0', so the marker position needs no special case.
        distance1_list.append([str(p - s1) for p in positions])
        distance2_list.append([str(p - s2) for p in positions])

    return word_list, distance1_list, distance2_list


In [21]:
def get_wordList_and_distances_Corpus(sent_list, entity1_list, entity2_list):
    """Tokenise sentences and compute token offsets relative to both entity markers.

    Each sentence is split on whitespace.  For every token position ``p``
    the offsets ``p - s1`` and ``p - s2`` are recorded as strings, where
    ``s1`` / ``s2`` are the positions of the first ``'first_entity'`` /
    ``'second_entity'`` token.

    Processing stops at the first sentence missing one of the markers: the
    sentence is printed and the results gathered so far are returned.

    Args:
        sent_list: iterable of preprocessed sentence strings.
        entity1_list: iterated in parallel with ``sent_list``; its values
            are not used, but the shortest of the three inputs bounds how
            many sentences are processed.
        entity2_list: same role as ``entity1_list``.

    Returns:
        tuple: ``(word_list, d1_list, d2_list)`` - token lists, offsets to
        the first marker, offsets to the second marker.
    """
    word_list = []
    d1_list = []
    d2_list = []

    # The entity lists are zipped only to keep the historical truncation to
    # the shortest input.
    for sent, _ent1, _ent2 in zip(sent_list, entity1_list, entity2_list):
        tokens = sent.split()
        try:
            s1 = tokens.index('first_entity')
            s2 = tokens.index('second_entity')
        except ValueError:
            # Only a missing marker is expected here; other errors are no
            # longer swallowed.
            print(sent)
            break

        positions = range(len(tokens))
        word_list.append(tokens)
        # str(0) == '0', so the marker position needs no special case.
        d1_list.append([str(p - s1) for p in positions])
        d2_list.append([str(p - s2) for p in positions])

    return word_list, d1_list, d2_list


In [22]:
def split_train_test_val(samples,train_frac,test_frac,val_frac,random_seed=1337):
    """Shuffle a pandas object and cut it into train / test / validation parts.

    The rows are shuffled with ``samples.sample(frac=1)`` and then cut at
    ``int(train_frac * n)`` and ``int((1 - val_frac) * n)``.

    Args:
        samples: pandas DataFrame or Series.
        train_frac (float): share of rows for the training part.
        test_frac (float): not used in the computation; the test part is
            whatever lies between the two cut points.
        val_frac (float): share of rows for the validation part.
        random_seed: seed for the shuffle.

    Returns:
        tuple: ``(train, test, val)``.
    """
    n_rows = len(samples)
    shuffled = samples.sample(frac=1, random_state=random_seed)

    train_end = int(train_frac * n_rows)
    test_end = int((1 - val_frac) * n_rows)

    # Positional slices give the same pieces as np.split(shuffled,
    # [train_end, test_end]) without routing a DataFrame through numpy.
    train = shuffled.iloc[:train_end]
    test = shuffled.iloc[train_end:test_end]
    val = shuffled.iloc[test_end:]
    return train, test, val

## Functions for Embeddings

In [23]:
def makeWordList(lista):
    """Build a vocabulary list from several groups of sequences.

    Args:
        lista: iterable of groups (for instance one group per data split),
            each group being an iterable of sequences of hashable items.

    Returns:
        list: ``'<pad>'`` and ``'<unkown>'`` (spelling kept as is, because
        it is a runtime value) followed by every distinct item in order of
        first appearance.
    """
    # A dict serves as an insertion-ordered set.  Walking the nested
    # structure directly avoids the quadratic ``sum(lista, [])`` flattening,
    # and the frequency counts the old code kept were never used.
    seen = {}
    for group in lista:
        for sent in group:
            for w in sent:
                seen.setdefault(w, None)
    return ['<pad>', '<unkown>'] + list(seen)

In [24]:
def create_dico(item_list):
    """
    Count how often each item occurs in a list of item sequences.

    ``item_list`` must be a ``list`` (checked with an assertion); the
    result maps each item to its number of occurrences.
    """
    assert type(item_list) is list
    counts = {}
    for items in item_list:
        for item in items:
            counts[item] = counts.get(item, 0) + 1
    return counts

In [25]:
def create_mapping(dico):
    """
    Build item -> id and id -> item mappings from a frequency dictionary.

    Ids are assigned by decreasing frequency; ties are broken by the
    ascending order of the items themselves.
    """
    ranked = [
        item
        for item, _count in sorted(dico.items(), key=lambda pair: (-pair[1], pair[0]))
    ]
    id_to_item = dict(enumerate(ranked))
    item_to_id = {item: idx for idx, item in enumerate(ranked)}
    return item_to_id, id_to_item


In [26]:
def word_mapping(Tr_word_list):
    """
    Build the word frequency dictionary and the word <-> id mappings.

    ``'<unknown>'`` and ``'<pad>'`` are inserted with the artificial counts
    10000000 and 10000001; a summary line is printed.  Returns
    ``(dico, word_to_id, id_to_word)``.
    """
    dico = create_dico(Tr_word_list)
    # Artificial counts for the two special tokens.
    dico.update({'<unknown>': 10000000, '<pad>': 10000000 + 1})

    word_to_id, id_to_word = create_mapping(dico)

    n_unique = len(dico)
    n_tokens = sum(map(len, Tr_word_list))
    print ("Found %i unique words (%i in total)" % (n_unique, n_tokens))
    return dico, word_to_id, id_to_word

In [3]:
def readWordEmb_fastText(dico_words,id_to_word,word_to_id, fname, embSize=300):
    """Build an embedding matrix from a fastText-style text vector file.

    The first line of the file must hold two integers; every following line
    is ``word v1 v2 ...`` separated by single spaces.  Rows of words absent
    from the file are filled with ``np.random.rand`` values.

    Args:
        dico_words: container of the vocabulary words (membership test and
            ``len`` are used).
        id_to_word: mapping ``row index -> word`` covering
            ``range(len(dico_words))``.
        word_to_id: mapping ``word -> row index``.
        fname: path of the vector file (decoded as UTF-8, undecodable bytes
            ignored).
        embSize (int): dimensionality of the vectors.

    Returns:
        numpy.ndarray: matrix of shape ``(len(dico_words), embSize)``.

    Raises:
        ValueError: if the header line does not hold exactly two integers,
            or a vector does not have ``embSize`` components.
    """
    found_words = set()
    c_found = 0
    # The context manager closes the file even when a line fails to parse;
    # the handle used to be left open.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line: parsed only to validate the format.
        _n_vectors, _dim = map(int, fin.readline().split())
        print ("Reading word vectors")
        word_emb_weight = np.zeros((len(dico_words), embSize))
        for line in fin:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if word in dico_words:
                word_emb_weight[word_to_id[word]] = list(map(float, tokens[1:]))
                found_words.add(word)
                c_found += 1

    count = 0
    for i in range(len(dico_words)):
        if id_to_word[i] not in found_words:
            count += 1
            word_emb_weight[i] = np.random.rand(embSize)

    print ("number of unknown word in word embedding", count)
    print ("number of known word in word embedding", c_found)

    return word_emb_weight


In [1]:
def readWordEmb(dico_words,id_to_word,word_to_id, fname, embSize=200,limit=None):
    """Build an embedding matrix from a binary word2vec file loaded with gensim.

    Rows of words missing from the pretrained model are filled with
    ``np.random.rand`` values.

    Args:
        dico_words: container of the vocabulary words (only ``len`` is
            used).
        id_to_word: mapping ``row index -> word`` covering
            ``range(len(dico_words))``.
        word_to_id: accepted for signature compatibility; not used.
        fname: path of the binary word2vec file.
        embSize (int): dimensionality of the vectors.
        limit: forwarded to ``KeyedVectors.load_word2vec_format``.

    Returns:
        numpy.ndarray: matrix of shape ``(len(dico_words), embSize)``.
    """
    from gensim.models.keyedvectors import KeyedVectors
    model = KeyedVectors.load_word2vec_format(fname, binary=True,limit=limit)
    print ("Reading word vectors")
    n_words = len(dico_words)
    word_emb_weight = np.zeros((n_words, embSize))
    count = 0
    for i in range(n_words):
        word = id_to_word[i]
        if word in model:
            word_emb_weight[i] = model[word]
        else:
            count += 1
            word_emb_weight[i] = np.random.rand(embSize)

    # gensim >= 4 replaced ``KeyedVectors.vocab`` by ``key_to_index``;
    # fall back to the old attribute on older releases.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab
    print ('Loaded %i pretrained embeddings.' % len(vocab))
    print ("number of unknown word in word embedding", count)

    return word_emb_weight

In [28]:
def findLongestSent(Tr_word_list, Te_word_list):
    """Return the length of the longest sequence across both lists.

    Raises ``ValueError`` when both lists are empty.
    """
    return max(map(len, Tr_word_list + Te_word_list))
 

In [29]:
def findSentLengths(tr_te_list):
    """Return, for every group in ``tr_te_list``, the lengths of its sequences."""
    return [list(map(len, group)) for group in tr_te_list]
 

In [None]:
def paddData(listL, maxl,padd_num=0):
    """Pad or truncate every row of every matrix to exactly ``maxl`` entries.

    Rows longer than ``maxl`` are cut to their first ``maxl`` entries, so
    each matrix becomes rectangular; without the truncation a single long
    row produced a ragged array.  This matches the redefinition of
    ``paddData`` further down in the notebook.

    Args:
        listL: iterable of matrices, each an iterable of indexable rows
            (presumably the word / distance batches - confirm with callers).
        maxl (int): target row length.
        padd_num: filler value for short rows.

    Returns:
        list[numpy.ndarray]: one array per input matrix.
    """
    rlist = []
    for mat in listL:
        fixed_rows = []
        for row in mat:
            n_keep = min(len(row), maxl)
            fixed = [row[k] for k in range(n_keep)]
            fixed.extend([padd_num] * (maxl - n_keep))
            fixed_rows.append(fixed)
        rlist.append(np.array(fixed_rows))
    return rlist

In [1]:
def paddData(listL, maxl,padd_num=0):
    """Pad or truncate every row of every matrix to exactly ``maxl`` entries.

    Args:
        listL: iterable of matrices, each an iterable of indexable rows
            (presumably the word / distance batches - confirm with callers).
        maxl (int): target row length; longer rows keep only their first
            ``maxl`` entries.
        padd_num: filler value for short rows.

    Returns:
        list[numpy.ndarray]: one array per input matrix.
    """
    def _fit(row):
        # Keep at most maxl leading entries, then fill up with padd_num.
        n_keep = min(len(row), maxl)
        return [row[k] for k in range(n_keep)] + [padd_num] * (maxl - n_keep)

    return [np.array([_fit(row) for row in mat]) for mat in listL]

In [31]:
def makeBalence(Tr_sent_contents, Tr_entity1_list, Tr_entity2_list, Tr_sent_lables, neg_ratio=3):
    """Down-sample the ``'false'`` class to a fixed multiple of the other samples.

    Samples labelled ``'false'`` are shuffled with the global ``random``
    generator and at most ``neg_ratio * (number of other samples)`` of them
    are kept.  The kept ``'false'`` samples come first in the output,
    followed by all other samples in their original order.

    Args:
        Tr_sent_contents: sentences.
        Tr_entity1_list: first-entity descriptions, parallel to the
            sentences.
        Tr_entity2_list: second-entity descriptions, parallel to the
            sentences.
        Tr_sent_lables: labels, parallel to the sentences.
        neg_ratio: number of ``'false'`` samples kept per other sample
            (default 3, the previously hard-coded value).

    Returns:
        tuple: ``(sent_contents, entity1_list, entity2_list, sent_lables)``
        as four parallel lists.
    """
    other = []
    clas = []
    for sample in zip(Tr_sent_contents, Tr_entity1_list, Tr_entity2_list, Tr_sent_lables):
        if sample[3] == 'false':
            other.append(sample)
        else:
            clas.append(sample)

    random.shuffle(other)

    kept = other[:int(neg_ratio * len(clas))] + clas

    sent_contents = [sample[0] for sample in kept]
    entity1_list = [sample[1] for sample in kept]
    entity2_list = [sample[2] for sample in kept]
    sent_lables = [sample[3] for sample in kept]
    return sent_contents, entity1_list, entity2_list, sent_lables

In [32]:
def create_validation_split(data, percent_validation, percent_test, seed=1337):
    """Shuffle ``data`` reproducibly and split it into train / validation / test.

    The global ``random`` generator is re-seeded with ``seed`` (a side
    effect visible to later ``random`` calls).  The shuffle now works on a
    shallow copy, so the caller's sequence keeps its order and repeated
    calls with the same arguments return the same split.

    Args:
        data: mutable sequence (e.g. a list) of samples.
        percent_validation: size of the validation part, in percent.
        percent_test: size of the test part, in percent.
        seed: seed for the shuffle.

    Returns:
        tuple: ``(train, validation, test)`` - note the order: validation
        comes before test.
    """
    shuffled = copy.copy(data)
    random.seed(seed)
    random.shuffle(shuffled)

    nrows = len(shuffled)
    test_len = int(percent_test / 100.0 * nrows)
    val_len = int(percent_validation / 100.0 * nrows)
    # Training gets whatever is left after validation and test.
    train_len = nrows - (val_len + test_len)
    val_end = train_len + val_len

    return shuffled[:train_len], shuffled[train_len:val_end], shuffled[val_end:]


In [1]:
def get_path(Snt, Gen=None, Dis=None):
    """
    Return the shortest path between two words in the dependency graph of a sentence.

    Every token of ``Snt`` is linked to each of its ``children`` in an
    undirected graph whose nodes are the tokens' text, so identical words
    share one node.  The result is the list of nodes from ``Gen`` to
    ``Dis``, or ``[]`` when no path exists.

    ``Snt`` must be an iterable of tokens exposing ``.children``
    (presumably a spaCy ``Doc`` - confirm with callers).

    Requirements:

    nltk.download('stopwords')
    sudo pip3 install networkx
    sudo pip3 install spacy
    sudo -H pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz
    sudo python3 -m spacy download en_core_web_lg

    """
    graph = nx.Graph(
        [(format(token), format(child)) for token in Snt for child in token.children]
    )
    try:
        return nx.shortest_path(graph, source=Gen, target=Dis)
    except nx.NetworkXNoPath:
        return []


In [4]:
def get_lowest_common_anssestor(Snt, Gen=None, Dis=None):
    """
    Return the lemmatised lowest common ancestor of two words in a dependency tree.

    A directed graph is built with one node per token text and an edge from
    each token to each of its ``children``; the lowest common ancestor of
    ``Gen`` and ``Dis`` in that graph is passed to
    ``mylem.lemmatize(..., pos='v')``.

    NOTE(review): ``mylem`` is not defined in this part of the notebook; it
    must exist at module level before this function is called (presumably a
    lemmatizer object - confirm).
    """
    tokens = list(Snt)
    nodes = [format(token) for token in tokens]
    edges = [
        (format(token), format(child))
        for token in tokens
        for child in token.children
    ]

    G = nx.DiGraph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)

    common = nx.algorithms.lowest_common_ancestor(G, Gen, Dis)
    return mylem.lemmatize(common, pos='v')

  

In [1]:
def get_counts(Snt, Gen=None, Dis=None):
    """
    Count the non-stop-words on the shortest dependency path between two words.

    Every token of ``Snt`` is linked to each of its ``children`` in an
    undirected graph whose nodes are the tokens' text.  The words on the
    shortest path from ``Gen`` to ``Dis`` are filtered against a stop-word
    list and counted.

    NOTE(review): the previous code returned ``nx.ancestors(graph, Dis)``
    right after computing the path, which made the counting below
    unreachable whenever a path existed.  That early return is treated as a
    leftover and removed so the function always returns a ``Counter`` -
    confirm that no caller relied on the ancestor set.

    Args:
        Snt: iterable of tokens exposing ``.children`` (presumably a spaCy
            ``Doc`` - confirm with callers).
        Gen: text of the source word.
        Dis: text of the target word.

    Returns:
        collections.Counter: word -> occurrences on the path; empty when no
        path exists.
    """
    edges = [
        (format(token), format(child))
        for token in Snt
        for child in token.children
    ]
    graph = nx.Graph(edges)
    try:
        path = nx.shortest_path(graph, source=Gen, target=Dis)
    except nx.NetworkXNoPath:
        path = []

    # ``stop`` was referenced but never defined in this part of the
    # notebook: use a module-level ``stop`` when one exists, otherwise fall
    # back to NLTK's English stop words (needs nltk.download('stopwords')).
    stop_words = globals().get('stop')
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # Frequency of the remaining words.
    return Counter(word for word in path if word not in stop_words)
