In [None]:
import math
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
!pip install owlready2
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from owlready2 import *
import seaborn as sns

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting owlready2
  Downloading Owlready2-0.37.tar.gz (23.8 MB)
[K     |████████████████████████████████| 23.8 MB 1.4 MB/s 
[?25hBuilding wheels for collected packages: owlready2
  Building wheel for owlready2 (setup.py) ... [?25l[?25hdone
  Created wheel for owlready2: filename=Owlready2-0.37-cp37-cp37m-linux_x86_64.whl size=20445991 sha256=20d2c7ebaee74d271f9d30d7f8faf9f8e193e04dd76926d6197a478942378c51
  Stored in directory: /root/.cache/pip/wheels/34/49/36/31062d59333455aa0cb4950141cab4945600ce61c26a44e9cd
Successfully built owlready2
Installing collected packages: owlready2
Successfully installed owlready2-0.37


In [None]:
def get_path(cl):
    path = cl.name
    while True:
        try:
            path = path + '/' + cl.is_a[0].name
        except IndexError:
            break
        cl = cl.is_a[0]
        if cl == 'owl.Thing':
          #the very first node
            break

    return '/'.join(path.split('/')[::-1])

In [None]:

def read_ontology(path):
    onto = get_ontology(path)
    onto.load()

    # Read classes
    classes = []

    for cl in onto.classes():
        classes.append(cl)

    classes = list(set(classes))

    '''# Read properties
    properties = []

    for prop in onto.properties():
        properties.append(prop)

    properties = list(set(properties))'''

    return classes
def get_classes_df(ont_path):
    data = []

    # Parse ontologies
    classes1 = read_ontology(ont_path)

    # Generate classes
    for class_ in classes1:
      data.append((class_.name.lower(), 
                     get_path(class_).lower(),''.join(class_.label).lower(),''.join(class_.comment).lower()))
    dataset = pd.DataFrame(data, columns=['Name','Path','label','comment'])

    return dataset

In [None]:
def sim_plot(sim_df,threshold):
  plot_df = sim_df
  plot_df[plot_df<threshold]=0.0
  rows = sim_df.shape[0]
  cols = sim_df.shape[1]

  plt.figure(figsize=((0.4*rows),(0.4*cols)))
  cmap = sns.diverging_palette(230, 20, as_cmap=True)
  cg = sns.heatmap(sim_df, fmt="g", cmap=cmap,linewidths=0.5, linecolor='black')

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import sys
sys.path.insert(0, '/content/drive/My Drive/Thesis2022')

In [None]:
human = get_classes_df('/content/drive/My Drive/Thesis2022/human.owl')
mouse = get_classes_df('/content/drive/My Drive/Thesis2022/mouse.owl')
#onto = get_classes_df('/content/drive/My Drive/Thesis2022/OntoWind.owl')
#sf_ont = get_classes_df('/content/drive/My Drive/Thesis2022/SF-ONT.owl')

In [None]:
mouse_dict = {}

for i in range(mouse.shape[0]):
  mouse_dict[mouse.iloc[i]['Name']] = mouse.iloc[i]['label']

human_dict = {}

for i in range(human.shape[0]):
  human_dict[human.iloc[i]['Name']] = human.iloc[i]['label']

In [None]:
path_list_human = []
parents_list_human = []
for i in human.Path:
  path = ['thing']
  split_list = i.split('/')[1:]
  for j in range(len(split_list)):
    path.append(human_dict[split_list[j]])
  path_list_human.append('/'.join(path))
  parents_list_human.append(path[-2])


In [None]:
human['Path'] = path_list_human
human['Parents']= parents_list_human 

In [None]:
human = human.apply(lambda x: x.str.replace('_', ' ') )
human = human.apply(lambda x: x.str.replace('-', ' ') )
human = human[human['Parents'] != 'thing']


In [None]:
path_list_mouse = []
parents_list_mouse = []
for i in mouse.Path:
  path = ['thing']
  split_list = i.split('/')[1:]
  for j in range(len(split_list)):
    path.append(mouse_dict[split_list[j]])
  path_list_mouse.append('/'.join(path))
  parents_list_mouse.append(path[len(path)-2])

In [None]:
mouse['Path'] = path_list_mouse
mouse['Parents']= parents_list_mouse 

In [None]:
mouse = mouse.apply(lambda x: x.str.replace('_', ' '))
mouse = mouse.apply(lambda x: x.str.replace('-', ' '))
mouse = mouse[mouse['Parents'] != 'thing']

In [None]:
new_human = human.sample(n=500)
new_mouse = mouse.sample(n=500)

In [None]:
def iterative_levenshtein(s, t, **weight_dict):
    """ 
        iterative_levenshtein(s, t) -> ldist
        ldist is the Levenshtein distance between the strings 
        s and t.
        For all i and j, dist[i,j] will contain the Levenshtein 
        distance between the first i characters of s and the 
        first j characters of t
        
        weight_dict: keyword parameters setting the costs for characters,
                     the default value for a character will be 1
    """

    rows = len(s)+1
    cols = len(t)+1
    
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 "
    
    w = dict( (x, (1, 1, 1)) for x in alphabet + alphabet.upper())
    if weight_dict:
        w.update(weight_dict)
    
    dist = [[0 for x in range(cols)] for x in range(rows)]

    # source prefixes can be transformed into empty strings 
    # by deletions:
    for row in range(1, rows):
        dist[row][0] = dist[row-1][0] + w[s[row-1]][0]

    # target prefixes can be created from an empty source string
    # by inserting the characters
    for col in range(1, cols):
        dist[0][col] = dist[0][col-1] + w[t[col-1]][1]
        
    for col in range(1, cols):
        for row in range(1, rows):
            deletes = w[s[row-1]][0]
            inserts = w[t[col-1]][1]
            subs = max( (w[s[row-1]][2], w[t[col-1]][2]))
            if s[row-1] == t[col-1]:
                subs = 0
            else:
                subs = subs

            dist[row][col] = min(dist[row-1][col] + deletes,
                                 dist[row][col-1] + inserts,
                                 dist[row-1][col-1] + subs) # substitution


    
 
    return dist[row][col]




def lexicalSimilarity(e1,e2):

    costs=(1,1,2)
  # Weights for operations on making words similar
  # adding, replacing, deleting 
    
    ci = len(e1)
    cj = len(e2)
    
    ed = iterative_levenshtein(e1,e2,costs=costs)

    comp_ = (min(ci,cj)-ed)/min(ci,cj)

    
    lex_sim = max(0,comp_)
    return(lex_sim)


def lexicalSimilarity2(e1,e2):
    
    lenght_e1 = len(e1)
    lenght_e2 = len(e2)
    
    ed = iterative_levenshtein(e1,e2)
    alfa = 1/3
    beta = 1/3
    #weights for combining similarity measures: nodes and labels respectively. might consider adding comments
    
    lex_sim = (alfa*(max(lenght_e1,lenght_e2) - ed))/(alfa*(max(lenght_e1,lenght_e2)-ed)
    +(beta*(lenght_e1+lenght_e2-2*max(lenght_e1,lenght_e2)+2*ed)))
    return(lex_sim)



def parents_lexical_sim(p1_list,p2_list):
  l1 = p1_list
  l2 = p2_list
#3.2.2 on the paper
  n = len(l1)
  m = len(l2)


  n_mat = [n+1-k for k in range(1,n+1)]
  m_mat = [m+1-k for k in range(1,m+1)]

  #Create empty matrix to fill with lex sim of parents
  M_lex = np.zeros((len(l1),len(l2)))

  #Iterate and fill matrix with lex sim of all parents 
  for i in range(len(l1)):
      u = l1[i]
      for j in range(len(l2)):
          v = l2[j]
          # similarity -> Lexical similarity
          M_lex[i,j] = lexicalSimilarity2(u,v)

  d = pd.DataFrame(M_lex,columns=l2,index=l1)
  n_parents = sum(list(d.max(axis=1)*n_mat))/(n*(n+1))
  m_parents = sum(list(i for i in d.max(axis=0)*m_mat))/(m*(m+1))
  
  return (n_parents+m_parents)

def structural_sim(p1,p2,param):
  a,b = param

  p1_list = p1.split('/')
  p2_list = p2.split('/')

  ci = p1_list[-1]
  cj = p2_list[-1]
  pi = p1_list
  pj = p2_list

  lex_sim_entity = lexicalSimilarity2(ci,cj)

  lex_sim_parents = parents_lexical_sim(pi,pj)

  struct_sim = (a*lex_sim_entity) + (b*lex_sim_parents)
  return struct_sim 


def struc_similarity_matrix(df1,df2,param):
  df1_ = df1
  df2_ = df2



  entity_list = df1_['Name'].values
  entity_list2 = df2_['Name'].values

  l1 = df1_['Path'].values
  l2 = df2_['Path'].values

  #Create empty matrix to fill
  M_sim = np.zeros((l1.shape[0],l2.shape[0]))
  c=0
  #Iterate and fill 
  for i in range(l1.shape[0]):
      u = l1[i]
      for j in range(l2.shape[0]):
          v = l2[j]
          #similarity -> structural similarity 
          c +=1
          M_sim[i,j] = structural_sim(u,v,param)

  DF_sim = pd.DataFrame(M_sim,columns=entity_list2,index=entity_list)

  return DF_sim

def sim_plot(sim_df,threshold):
  plot_df = sim_df
  plot_df[plot_df<threshold]=0.0
  rows = sim_df.shape[0]
  cols = sim_df.shape[1]

  plt.figure(figsize=((0.4*rows),(0.4*cols)))
  cmap = sns.diverging_palette(230, 20, as_cmap=True)
  cg = sns.heatmap(sim_df, fmt="g", cmap=cmap,linewidths=0.5, linecolor='black')

In [None]:
sim_matrix = struc_similarity_matrix(new_mouse,new_human,(0.5,0.5))

In [None]:
from google.colab import files

sim_matrix.to_csv('RandoutputFin.csv', encoding = 'utf-8-sig') 
files.download('RandoutputFin.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
sim_matrix

Unnamed: 0,nci c33352,nci c33709,nci c33051,nci c25439,nci c38518,nci c33632,nci c52973,nci c53155,nci c33409,nci c12675,...,nci c49249,nci c33166,nci c33375,nci c33138,nci c52941,nci c32222,nci c12261,nci c12408,nci c32999,nci c32231
ma 0001263,0.305460,0.224362,0.251242,0.310933,0.327367,0.289746,0.250982,0.349381,0.287073,0.331223,...,0.333038,0.288684,0.367528,0.284436,0.291135,0.328715,0.327155,0.287639,0.349385,0.362349
ma 0002018,0.583478,0.242868,0.298218,0.299188,0.273895,0.284691,0.682725,0.293249,0.320894,0.240083,...,0.283279,0.322877,0.345127,0.247270,0.531061,0.332192,0.302806,0.279397,0.305128,0.328514
ma 0001080,0.204492,0.187599,0.261142,0.254658,0.197568,0.258106,0.256313,0.217998,0.228625,0.289151,...,0.212148,0.269029,0.257348,0.199461,0.218094,0.237791,0.235640,0.268568,0.211817,0.272001
ma 0000695,0.427823,0.252706,0.374597,0.325801,0.385178,0.287048,0.385036,0.287534,0.360558,0.271473,...,0.287267,0.293027,0.362474,0.290749,0.405231,0.302985,0.268023,0.309003,0.308176,0.254760
ma 0001277,0.270018,0.347643,0.261959,0.283364,0.268606,0.310036,0.256671,0.347971,0.340528,0.284625,...,0.256112,0.281645,0.315365,0.258257,0.267294,0.262051,0.315314,0.269042,0.333302,0.266616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ma 0000147,0.294366,0.309801,0.274214,0.336500,0.266268,0.294789,0.310116,0.286472,0.287545,0.438509,...,0.314856,0.307744,0.334185,0.247285,0.272127,0.279422,0.275336,0.295754,0.281751,0.261356
ma 0000108,0.338817,0.283980,0.277635,0.278970,0.327953,0.306778,0.267854,0.358479,0.293136,0.305914,...,0.289291,0.314489,0.300773,0.255621,0.288323,0.337872,0.316332,0.296572,0.297925,0.307847
ma 0002071,0.491479,0.264538,0.293601,0.338656,0.262545,0.297332,0.571861,0.287046,0.284493,0.296497,...,0.251093,0.307429,0.261890,0.267011,0.468706,0.297526,0.312524,0.262361,0.271806,0.239012
ma 0001804,0.371209,0.326801,0.293261,0.357217,0.265686,0.344232,0.420472,0.276746,0.306565,0.288671,...,0.249181,0.328950,0.311452,0.287351,0.363698,0.303263,0.359464,0.274343,0.305214,0.253569


In [None]:
stacked_df = sim_matrix.stack().reset_index()
stacked_df

Unnamed: 0,level_0,level_1,0
0,ma 0001263,nci c33352,0.305460
1,ma 0001263,nci c33709,0.224362
2,ma 0001263,nci c33051,0.251242
3,ma 0001263,nci c25439,0.310933
4,ma 0001263,nci c38518,0.327367
...,...,...,...
249995,ma 0000482,nci c32222,0.334022
249996,ma 0000482,nci c12261,0.308390
249997,ma 0000482,nci c12408,0.309655
249998,ma 0000482,nci c32999,0.300001


In [None]:
stacked_df['entity_pair_names'] = stacked_df['level_0'] + ' / ' + stacked_df['level_1']
stacked_df.drop(['level_0','level_1'],axis=1,inplace=True)
stacked_df = stacked_df.rename(columns={0: 'Structural_similarity'})
stacked_df = stacked_df[['entity_pair_names', 'Structural_similarity']]

In [None]:
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity
0,ma 0001263 / nci c33352,0.305460
1,ma 0001263 / nci c33709,0.224362
2,ma 0001263 / nci c33051,0.251242
3,ma 0001263 / nci c25439,0.310933
4,ma 0001263 / nci c38518,0.327367
...,...,...
249995,ma 0000482 / nci c32222,0.334022
249996,ma 0000482 / nci c12261,0.308390
249997,ma 0000482 / nci c12408,0.309655
249998,ma 0000482 / nci c32999,0.300001


In [None]:
import re
dictOfStrings = {'1' : ' One',
                 '2': ' Two',
                 '3': ' Three',
                 '4' : ' Four',
                 '5': ' Five',
                 '6': ' Six',
                 '7' : ' Seven',
                 '8': ' Eight',
                 '9': ' Nine',
                 '10': 'Ten'}
for idx, j in enumerate(human['label'][:70]):
  if len(re.findall('[0-9]+', j)) != 0:
    for word, replacement in dictOfStrings.items():
      human['label'].values[idx] = re.sub(word, replacement, human['label'].values[idx])

In [None]:
import re
dictOfStrings = {'1' : ' One',
                 '2': ' Two',
                 '3': ' Three',
                 '4' : ' Four',
                 '5': ' Five',
                 '6': ' Six',
                 '7' : ' Seven',
                 '8': ' Eight',
                 '9': ' Nine',
                 '10': 'Ten'}
for idx, j in enumerate(mouse['label'][:70]):
  if len(re.findall('[0-9]+', j)) != 0:
    for word, replacement in dictOfStrings.items():
      mouse['label'].values[idx] = re.sub(word, replacement, mouse['label'].values[idx])

In [None]:
print(mouse['label'].values[5])


mammary gland


In [None]:
###----Jaccard-----###
def jaccard_similarity(x,y):
  """ returns the jaccard similarity between two lists """
  intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
  union_cardinality = len(set.union(*[set(x), set(y)]))
  return intersection_cardinality/float(union_cardinality)

In [None]:
def jac_similarity_matrix(df1_,df2_,param):
    



    entity_list = df1_['Name'].values
    entity_list2 = df2_['Name'].values

    label_list  = df1_['label'].values
    label_list2 = df2_['label'].values


    l1 = df1_['Path'].values
    l2 = df2_['Path'].values
    #print(l2)



    #Create empty matrix to fill
    M_sim = np.zeros((l1.shape[0],l2.shape[0]))
    #Iterate and fill 
    for i in range(l1.shape[0]):
      u = label_list[i]
      for j in range(l2.shape[0]):
          v = label_list2[j]
          #similarity -> structural similarity 
          M_sim[i,j] = jaccard_similarity(u,v)

    DF_sim = pd.DataFrame(M_sim,columns=entity_list2,index=entity_list)

    return DF_sim


In [None]:
sim_df_jac = jac_similarity_matrix(new_mouse,new_human,(0.5,0.5))

In [None]:
stacked_df_jac = sim_df_jac.stack().reset_index()

In [None]:
stacked_df_jac = stacked_df_jac.rename(columns={0: 'Jaccardian_similarity'})
stacked_df['Jaccardian_Similarity'] =  stacked_df_jac['Jaccardian_similarity']
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity
0,ma 0001263 / nci c33352,0.305460,0.533333
1,ma 0001263 / nci c33709,0.224362,0.133333
2,ma 0001263 / nci c33051,0.251242,0.153846
3,ma 0001263 / nci c25439,0.310933,0.400000
4,ma 0001263 / nci c38518,0.327367,0.470588
...,...,...,...
249995,ma 0000482 / nci c32222,0.334022,0.437500
249996,ma 0000482 / nci c12261,0.308390,0.571429
249997,ma 0000482 / nci c12408,0.309655,0.230769
249998,ma 0000482 / nci c32999,0.300001,0.562500


In [None]:
from difflib import SequenceMatcher

def sequence_similarity_matrix(df1_,df2_,param):
   

    entity_list = df1_['Name'].values
    entity_list2 = df2_['Name'].values

    label_list  = df1_['label'].values
    label_list2 = df2_['label'].values

    l1 = df1_['Path'].values
    l2 = df2_['Path'].values


    #Create empty matrix to fill
    M_sim = np.zeros((l1.shape[0],l2.shape[0]))
    #Iterate and fill 
    for i in range(l1.shape[0]):
      u = label_list[i]
      for j in range(l2.shape[0]):
          v = label_list2[j]
          #similarity -> structural similarity 
          M_sim[i,j] = SequenceMatcher(None, u, v).ratio()


    DF_sim = pd.DataFrame(M_sim,columns=entity_list2,index=entity_list)

    return DF_sim


In [None]:
sim_df_seq = sequence_similarity_matrix(new_mouse,new_human,(0.5,0.5))

In [None]:
stacked_df_seq = sim_df_seq.stack().reset_index()

In [None]:
stacked_df_seq = stacked_df_seq.rename(columns={0: 'Sequence_similarity'})
stacked_df['Sequence_Similarity'] =  stacked_df_seq['Sequence_similarity']
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity,Sequence_Similarity
0,ma 0001263 / nci c33352,0.305460,0.533333,0.315789
1,ma 0001263 / nci c33709,0.224362,0.133333,0.160000
2,ma 0001263 / nci c33051,0.251242,0.153846,0.210526
3,ma 0001263 / nci c25439,0.310933,0.400000,0.222222
4,ma 0001263 / nci c38518,0.327367,0.470588,0.324324
...,...,...,...,...
249995,ma 0000482 / nci c32222,0.334022,0.437500,0.413793
249996,ma 0000482 / nci c12261,0.308390,0.571429,0.352941
249997,ma 0000482 / nci c12408,0.309655,0.230769,0.086957
249998,ma 0000482 / nci c32999,0.300001,0.562500,0.205128


In [None]:
!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 9.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 63.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 65.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 9.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 19.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 12.8 MB/s 
[?25hCol

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
model = SentenceTransformer('all-MiniLM-L6-v2')

def sentence_transformers(p1,p2):
    embedding1 = model.encode(p1, convert_to_tensor=True)
    embedding2 = model.encode(p2, convert_to_tensor=True)
    # compute similarity scores of two embeddings
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_scores.item()

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
def transformers_similarity_matrix(df1_,df2_,param):

    entity_list = df1_['Name'].values
    entity_list2 = df2_['Name'].values

    label_list  = df1_['label'].values
    label_list2 = df2_['label'].values

    l1 = df1_['Path'].values
    l2 = df2_['Path'].values
    #print(l2)



    #Create empty matrix to fill
    M_sim = np.zeros((l1.shape[0],l2.shape[0]))
    #Iterate and fill 
    for i in range(l1.shape[0]):
      u = label_list[i]
      for j in range(l2.shape[0]):
          v = label_list2[j]
          #similarity -> structural similarity 
          M_sim[i,j] = sentence_transformers(u,v)

    DF_sim = pd.DataFrame(M_sim,columns=label_list2,index=label_list)

    return DF_sim

In [None]:
sim_df_trans = transformers_similarity_matrix(new_mouse,new_human, (0.5,0.5))

In [None]:
stacked_df_trans = sim_df_trans.stack().reset_index()

In [None]:
stacked_df_trans = stacked_df_trans.rename(columns={0: 'Transformers_similarity'})
stacked_df['Transformers_similarity'] =  stacked_df_trans['Transformers_similarity']
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity,Sequence_Similarity,Transformers_similarity
0,ma 0001263 / nci c33352,0.305460,0.533333,0.315789,0.227243
1,ma 0001263 / nci c33709,0.224362,0.133333,0.160000,0.273064
2,ma 0001263 / nci c33051,0.251242,0.153846,0.210526,0.066789
3,ma 0001263 / nci c25439,0.310933,0.400000,0.222222,0.283161
4,ma 0001263 / nci c38518,0.327367,0.470588,0.324324,0.176458
...,...,...,...,...,...
249995,ma 0000482 / nci c32222,0.334022,0.437500,0.413793,0.217082
249996,ma 0000482 / nci c12261,0.308390,0.571429,0.352941,0.089963
249997,ma 0000482 / nci c12408,0.309655,0.230769,0.086957,0.064388
249998,ma 0000482 / nci c32999,0.300001,0.562500,0.205128,0.142476


In [None]:
!pip install -U spacy

Collecting spacy
  Downloading spacy-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 26.9 MB/s 
[?25hCollecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting typing-extensions<4.0.0.0,>=3.7.4
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.9 MB/s 
[?25hCollecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (457 kB)
[K     |████████████████████████████████| 457 kB 67.6 MB/s 
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 79.4 MB/s 
[?25hCollecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting spacy-legacy<3.1.0,>=3.0.9
  

In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[K     |████████████████████████████████| 33.5 MB 249 kB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
from math import sqrt, pow, exp
 
def squared_sum(x):
  """ return 3 rounded square rooted value """
 
  return round(sqrt(sum([a*a for a in x])),3)
  
def cos_similarity(x,y):
    """ return cosine similarity between two lists """

    numerator = sum(a*b for a,b in zip(x,y))
    denominator = squared_sum(x)*squared_sum(y)
    return round(numerator/float(denominator),3)


In [None]:
import spacy
nlp = spacy.load('en_core_web_md')

import numpy as np
import scipy
from scipy.spatial import distance

def spacy_similarity_matrix(df1_,df2_,param):

    entity_list = df1_['Name'].values
    entity_list2 = df2_['Name'].values

    label_list  = df1_['label'].values
    label_list2 = df2_['label'].values

    l1 = df1_['Path'].values
    l2 = df2_['Path'].values

    #print(l2)



    #Create empty matrix to fill
    M_sim = np.zeros((l1.shape[0],l2.shape[0]))
    #Iterate and fill 
    for i in range(l1.shape[0]):
      u = nlp(label_list[i]).vector
      for j in range(l2.shape[0]):
          v = nlp(label_list2[j]).vector
          #similarity -> structural similarity 
          M_sim[i,j] = cos_similarity(u,v)

    DF_sim = pd.DataFrame(M_sim,columns=label_list2,index=label_list)

    return DF_sim
#scipy.spatial.distance.cosine

In [None]:
sim_df_spacy = spacy_similarity_matrix(new_mouse,new_human, (0.5,0.5))

  del sys.path[0]


In [None]:
stacked_df_spacy = sim_df_spacy.stack().reset_index()

In [None]:
stacked_df_spacy = stacked_df_spacy.rename(columns={0: 'Spacy_similarity'})
stacked_df['Spacy_similarity'] =  stacked_df_spacy['Spacy_similarity']
stacked_df.to_csv('the first_five.csv')


In [None]:
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity,Sequence_Similarity,Transformers_similarity,Spacy_similarity
0,ma 0001263 / nci c33352,0.305460,0.533333,0.315789,0.227243,0.383
1,ma 0001263 / nci c33709,0.224362,0.133333,0.160000,0.273064,0.237
2,ma 0001263 / nci c33051,0.251242,0.153846,0.210526,0.066789,0.237
3,ma 0001263 / nci c25439,0.310933,0.400000,0.222222,0.283161,0.155
4,ma 0001263 / nci c38518,0.327367,0.470588,0.324324,0.176458,0.191
...,...,...,...,...,...,...
249995,ma 0000482 / nci c32222,0.334022,0.437500,0.413793,0.217082,
249996,ma 0000482 / nci c12261,0.308390,0.571429,0.352941,0.089963,
249997,ma 0000482 / nci c12408,0.309655,0.230769,0.086957,0.064388,
249998,ma 0000482 / nci c32999,0.300001,0.562500,0.205128,0.142476,


In [None]:
from google.colab import files

stacked_df.to_csv('Features.csv', encoding = 'utf-8-sig') 
files.download('Features.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Alignment appending

In [5]:
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/Thesis2022/RandoutputFin (1).csv')

df['Unnamed: 0'] = df['Unnamed: 0'].str.replace(' ','_')
df['Unnamed: 0'] = df['Unnamed: 0'].str.upper()

col = []
for i in df.columns:
    z= i.upper()
    col+=[z.replace(' ','_')]
    
old_colnames = df.columns
new_colnames = col

col_rename_dict = {i:j for i,j in zip(old_colnames,new_colnames)}
df.rename(columns=col_rename_dict, inplace=True)

df.head(4)

Unnamed: 0,UNNAMED:_0,NCI_C33352,NCI_C33709,NCI_C33051,NCI_C25439,NCI_C38518,NCI_C33632,NCI_C52973,NCI_C53155,NCI_C33409,...,NCI_C49249,NCI_C33166,NCI_C33375,NCI_C33138,NCI_C52941,NCI_C32222,NCI_C12261,NCI_C12408,NCI_C32999,NCI_C32231
0,MA_0001263,0.30546,0.224362,0.251242,0.310933,0.327367,0.289746,0.250982,0.349381,0.287073,...,0.333038,0.288684,0.367528,0.284436,0.291135,0.328715,0.327155,0.287639,0.349385,0.362349
1,MA_0002018,0.583478,0.242868,0.298218,0.299188,0.273895,0.284691,0.682725,0.293249,0.320894,...,0.283279,0.322877,0.345127,0.24727,0.531061,0.332192,0.302806,0.279397,0.305128,0.328514
2,MA_0001080,0.204492,0.187599,0.261142,0.254658,0.197568,0.258106,0.256313,0.217998,0.228625,...,0.212148,0.269029,0.257348,0.199461,0.218094,0.237791,0.23564,0.268568,0.211817,0.272001
3,MA_0000695,0.427823,0.252706,0.374597,0.325801,0.385178,0.287048,0.385036,0.287534,0.360558,...,0.287267,0.293027,0.362474,0.290749,0.405231,0.302985,0.268023,0.309003,0.308176,0.25476


In [6]:
import re
matches = []
rows = []
for string1 in df['UNNAMED:_0']:
    file1 = open("/content/drive/My Drive/Thesis2022/reference_.rdf", "r")
    index = 0

    for line in file1:  
        index += 1 

        if string1 in line:
            #print('String', string1, 'Found In Line', index)
            next_line = next(file1)
            matches += [re.findall(r"#(\w+)", line)[0]+' / '+re.findall(r"#(\w+)", next_line)[0]]
            rows += [re.findall(r"#(\w+)", next_line)[0]]

  
# closing text file    
file1.close()

In [7]:
import re
matches2 = []
rows2 = []
for string1 in new_colnames:
    file1 = open("/content/drive/My Drive/Thesis2022/reference_.rdf", "r")
    index = 0
    prevLine = ""

    for line in file1:  
        index += 1 

        if string1 in line:
            #print('String', string1, 'Found In Line', index)
            #print(prevLine)
            #print(line)
            matches2 += [re.findall(r"#(\w+)", prevLine)[0]+' / '+re.findall(r"#(\w+)", line)[0]]
            rows2 += [re.findall(r"#(\w+)", prevLine)[0]]
        
            
        prevLine = line   

  
# closing text file    
file1.close()

In [8]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3
len(intersection(matches,matches2))

48

In [9]:
featuresDF = pd.read_csv('/content/drive/My Drive/Thesis2022/FeaturesFin.csv')
featuresDF['entity_pair_names'] = featuresDF['entity_pair_names'].str.replace(' ','_')
featuresDF['entity_pair_names'] = featuresDF['entity_pair_names'].str.upper()
featuresDF['entity_pair_names'] = featuresDF['entity_pair_names'].str.replace('_/_',' / ')
featuresDF["Alignment"] = " "
featuresDF

Unnamed: 0.1,Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity,Sequence_Similarity,Transformers_similarity,Spacy_similarity,Alignment
0,0,MA_0001263 / NCI_C33352,0.305460,0.533333,0.315789,0.227243,0.383,
1,1,MA_0001263 / NCI_C33709,0.224362,0.133333,0.160000,0.273064,0.237,
2,2,MA_0001263 / NCI_C33051,0.251242,0.153846,0.210526,0.066789,0.237,
3,3,MA_0001263 / NCI_C25439,0.310933,0.400000,0.222222,0.283161,0.155,
4,4,MA_0001263 / NCI_C38518,0.327367,0.470588,0.324324,0.176458,0.191,
...,...,...,...,...,...,...,...,...
249995,249995,MA_0000482 / NCI_C32222,0.334022,0.437500,0.413793,0.217082,,
249996,249996,MA_0000482 / NCI_C12261,0.308390,0.571429,0.352941,0.089963,,
249997,249997,MA_0000482 / NCI_C12408,0.309655,0.230769,0.086957,0.064388,,
249998,249998,MA_0000482 / NCI_C32999,0.300001,0.562500,0.205128,0.142476,,


In [10]:
for idx,i in enumerate(featuresDF['entity_pair_names']):
    if i in matches2:
        featuresDF.iat[idx, featuresDF.columns.get_loc('Alignment')] = '1'
    else:
        featuresDF.iat[idx, featuresDF.columns.get_loc('Alignment')] = '0'

In [None]:
featuresDF[featuresDF['Alignment'] == '1']

In [12]:
from google.colab import files

featuresDF.to_csv('FeaturesDF.csv', encoding = 'utf-8-sig', index=False) 
files.download('FeaturesDF.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#!pip install gensim

In [None]:
#import gensim
#import gensim.downloader as api
#model_gigaword = api.load("word2vec-google-news-300")


In [None]:
#cosine = model_gigaword.similarity("pericyte", "cranial nerve")
#cosine

In [None]:
#wordnnetsimilarity('timestamp','time')

0

In [None]:
#!pip install SPARQLWrapper

In [None]:
#from sematch.semantic.similarity import WordNetSimilarity
#nltk.download('wordnet_ic')

#def wordnet(a,b):
#    wns = WordNetSimilarity()

    # Computing English word similarity using Li method
#    return wns.word_similarity(a,b,'res')

In [None]:
#sim_df_wn = wordnet_similarity_matrix(human[:7], mouse[:7], (0.5,0.5))
#sim_df_wn