In [1]:
import math
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
!pip install owlready2
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from owlready2 import *
import seaborn as sns

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting owlready2
  Downloading Owlready2-0.37.tar.gz (23.8 MB)
[K     |████████████████████████████████| 23.8 MB 1.7 MB/s 
[?25hBuilding wheels for collected packages: owlready2
  Building wheel for owlready2 (setup.py) ... [?25l[?25hdone
  Created wheel for owlready2: filename=Owlready2-0.37-cp37-cp37m-linux_x86_64.whl size=20445991 sha256=29d3795350ae9ca5f14751afec3d471befb735670fa5d39adfca3c224a437234
  Stored in directory: /root/.cache/pip/wheels/34/49/36/31062d59333455aa0cb4950141cab4945600ce61c26a44e9cd
Successfully built owlready2
Installing collected packages: owlready2
Successfully installed owlready2-0.37


In [2]:
def get_path(cl):
    path = cl.name
    while True:
        try:
            path = path + '/' + cl.is_a[0].name
        except IndexError:
            break
        cl = cl.is_a[0]
        if cl == 'owl.Thing':
          #the very first node
            break

    return '/'.join(path.split('/')[::-1])

In [3]:

def read_ontology(path):
    onto = get_ontology(path)
    onto.load()

    # Read classes
    classes = []

    for cl in onto.classes():
        classes.append(cl)

    classes = list(set(classes))

    '''# Read properties
    properties = []

    for prop in onto.properties():
        properties.append(prop)

    properties = list(set(properties))'''

    return classes
def get_classes_df(ont_path):
    data = []

    # Parse ontologies
    classes1 = read_ontology(ont_path)

    # Generate classes
    for class_ in classes1:
      data.append((class_.name.lower(), 
                     get_path(class_).lower(),''.join(class_.label).lower(),''.join(class_.comment).lower()))
    dataset = pd.DataFrame(data, columns=['Name','Path','label','comment'])

    return dataset

In [4]:
def sim_plot(sim_df,threshold):
  plot_df = sim_df
  plot_df[plot_df<threshold]=0.0
  rows = sim_df.shape[0]
  cols = sim_df.shape[1]

  plt.figure(figsize=((0.4*rows),(0.4*cols)))
  cmap = sns.diverging_palette(230, 20, as_cmap=True)
  cg = sns.heatmap(sim_df, fmt="g", cmap=cmap,linewidths=0.5, linecolor='black')

In [5]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
import sys
sys.path.insert(0, '/content/drive/My Drive/Thesis2022')

In [8]:
human = get_classes_df('/content/drive/My Drive/Thesis2022/human.owl')
mouse = get_classes_df('/content/drive/My Drive/Thesis2022/mouse.owl')
#onto = get_classes_df('/content/drive/My Drive/Thesis2022/OntoWind.owl')
#sf_ont = get_classes_df('/content/drive/My Drive/Thesis2022/SF-ONT.owl')

In [9]:
mouse_dict = {}

for i in range(mouse.shape[0]):
  mouse_dict[mouse.iloc[i]['Name']] = mouse.iloc[i]['label']

human_dict = {}

for i in range(human.shape[0]):
  human_dict[human.iloc[i]['Name']] = human.iloc[i]['label']

In [10]:
path_list_human = []
parents_list_human = []
for i in human.Path:
  path = ['thing']
  split_list = i.split('/')[1:]
  for j in range(len(split_list)):
    path.append(human_dict[split_list[j]])
  path_list_human.append('/'.join(path))
  parents_list_human.append(path[-2])


In [11]:
human['Path'] = path_list_human
human['Parents']= parents_list_human 

In [12]:
human = human.apply(lambda x: x.str.replace('_', ' ') )
human = human.apply(lambda x: x.str.replace('-', ' ') )
human = human[human['Parents'] != 'thing']


In [13]:
path_list_mouse = []
parents_list_mouse = []
for i in mouse.Path:
  path = ['thing']
  split_list = i.split('/')[1:]
  for j in range(len(split_list)):
    path.append(mouse_dict[split_list[j]])
  path_list_mouse.append('/'.join(path))
  parents_list_mouse.append(path[len(path)-2])

In [14]:
mouse['Path'] = path_list_mouse
mouse['Parents']= parents_list_mouse 

In [15]:
mouse = mouse.apply(lambda x: x.str.replace('_', ' '))
mouse = mouse.apply(lambda x: x.str.replace('-', ' '))
mouse = mouse[mouse['Parents'] != 'thing']

In [16]:
def iterative_levenshtein(s, t, **weight_dict):
    """ 
        iterative_levenshtein(s, t) -> ldist
        ldist is the Levenshtein distance between the strings 
        s and t.
        For all i and j, dist[i,j] will contain the Levenshtein 
        distance between the first i characters of s and the 
        first j characters of t
        
        weight_dict: keyword parameters setting the costs for characters,
                     the default value for a character will be 1
    """

    rows = len(s)+1
    cols = len(t)+1
    
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 "
    
    w = dict( (x, (1, 1, 1)) for x in alphabet + alphabet.upper())
    if weight_dict:
        w.update(weight_dict)
    
    dist = [[0 for x in range(cols)] for x in range(rows)]

    # source prefixes can be transformed into empty strings 
    # by deletions:
    for row in range(1, rows):
        dist[row][0] = dist[row-1][0] + w[s[row-1]][0]

    # target prefixes can be created from an empty source string
    # by inserting the characters
    for col in range(1, cols):
        dist[0][col] = dist[0][col-1] + w[t[col-1]][1]
        
    for col in range(1, cols):
        for row in range(1, rows):
            deletes = w[s[row-1]][0]
            inserts = w[t[col-1]][1]
            subs = max( (w[s[row-1]][2], w[t[col-1]][2]))
            if s[row-1] == t[col-1]:
                subs = 0
            else:
                subs = subs

            dist[row][col] = min(dist[row-1][col] + deletes,
                                 dist[row][col-1] + inserts,
                                 dist[row-1][col-1] + subs) # substitution


    
 
    return dist[row][col]




def lexicalSimilarity(e1,e2):

    costs=(1,1,2)
  # Weights for operations on making words similar
  # adding, replacing, deleting 
    
    ci = len(e1)
    cj = len(e2)
    
    ed = iterative_levenshtein(e1,e2,costs=costs)

    comp_ = (min(ci,cj)-ed)/min(ci,cj)

    
    lex_sim = max(0,comp_)
    return(lex_sim)


def lexicalSimilarity2(e1,e2):
    
    lenght_e1 = len(e1)
    lenght_e2 = len(e2)
    
    ed = iterative_levenshtein(e1,e2)
    alfa = 1/3
    beta = 1/3
    #weights for combining similarity measures: nodes and labels respectively. might consider adding comments
    
    lex_sim = (alfa*(max(lenght_e1,lenght_e2) - ed))/(alfa*(max(lenght_e1,lenght_e2)-ed)
    +(beta*(lenght_e1+lenght_e2-2*max(lenght_e1,lenght_e2)+2*ed)))
    return(lex_sim)



def parents_lexical_sim(p1_list,p2_list):
  l1 = p1_list
  l2 = p2_list
#3.2.2 on the paper
  n = len(l1)
  m = len(l2)


  n_mat = [n+1-k for k in range(1,n+1)]
  m_mat = [m+1-k for k in range(1,m+1)]

  #Create empty matrix to fill with lex sim of parents
  M_lex = np.zeros((len(l1),len(l2)))

  #Iterate and fill matrix with lex sim of all parents 
  for i in range(len(l1)):
      u = l1[i]
      for j in range(len(l2)):
          v = l2[j]
          # similarity -> Lexical similarity
          M_lex[i,j] = lexicalSimilarity2(u,v)

  d = pd.DataFrame(M_lex,columns=l2,index=l1)
  n_parents = sum(list(d.max(axis=1)*n_mat))/(n*(n+1))
  m_parents = sum(list(i for i in d.max(axis=0)*m_mat))/(m*(m+1))
  
  return (n_parents+m_parents)

def structural_sim(p1,p2,param):
  a,b = param

  p1_list = p1.split('/')
  p2_list = p2.split('/')

  ci = p1_list[-1]
  cj = p2_list[-1]
  pi = p1_list
  pj = p2_list

  lex_sim_entity = lexicalSimilarity2(ci,cj)

  lex_sim_parents = parents_lexical_sim(pi,pj)

  struct_sim = (a*lex_sim_entity) + (b*lex_sim_parents)
  return struct_sim 


def struc_similarity_matrix(df1,df2,param):
  df1_ = df1
  df2_ = df2



  entity_list = df1_['Name'].values
  entity_list2 = df2_['Name'].values

  l1 = df1_['Path'].values
  l2 = df2_['Path'].values

  #Create empty matrix to fill
  M_sim = np.zeros((l1.shape[0],l2.shape[0]))
  c=0
  #Iterate and fill 
  for i in range(l1.shape[0]):
      u = l1[i]
      for j in range(l2.shape[0]):
          v = l2[j]
          #similarity -> structural similarity 
          c +=1
          M_sim[i,j] = structural_sim(u,v,param)

  DF_sim = pd.DataFrame(M_sim,columns=entity_list2,index=entity_list)

  return DF_sim

def sim_plot(sim_df,threshold):
  plot_df = sim_df
  plot_df[plot_df<threshold]=0.0
  rows = sim_df.shape[0]
  cols = sim_df.shape[1]

  plt.figure(figsize=((0.4*rows),(0.4*cols)))
  cmap = sns.diverging_palette(230, 20, as_cmap=True)
  cg = sns.heatmap(sim_df, fmt="g", cmap=cmap,linewidths=0.5, linecolor='black')

In [17]:
sim_matrix = struc_similarity_matrix(mouse[:70],human[:70],(0.5,0.5))

In [18]:
sim_matrix

Unnamed: 0,nci c13071,nci c12367,nci c12766,nci c33638,nci c12899,nci c13101,nci c32137,nci c32291,nci c32441,nci c32557,...,nci c12458,nci c12571,nci c12675,nci c33286,nci c33254,nci c33218,nci c41177,nci c49273,nci c52719,nci c52925
ma 0001902,0.275570,0.258785,0.282320,0.323964,0.233693,0.341364,0.281271,0.225452,0.456868,0.234228,...,0.246886,0.295531,0.238689,0.257778,0.309081,0.334328,0.253646,0.229816,0.279385,0.224431
ma 0000442,0.307856,0.330439,0.324051,0.302848,0.285828,0.335766,0.287515,0.325758,0.325501,0.275477,...,0.284548,0.293359,0.305676,0.326820,0.305070,0.428451,0.281969,0.303582,0.304172,0.279017
ma 0000510,0.283389,0.258488,0.256865,0.263003,0.232782,0.276175,0.212215,0.258993,0.250603,0.264701,...,0.247294,0.270776,0.257655,0.265097,0.267787,0.272486,0.253397,0.281619,0.418863,0.290789
ma 0000686,0.237218,0.226635,0.226970,0.250273,0.217425,0.263823,0.215208,0.266065,0.241571,0.257769,...,0.232808,0.237555,0.285476,0.275235,0.260025,0.234206,0.244387,0.257512,0.439954,0.284007
ma 0000885,0.315365,0.280494,0.290489,0.325710,0.260425,0.361657,0.264764,0.367991,0.312013,0.335934,...,0.302156,0.274457,0.310747,0.311429,0.304728,0.371384,0.291723,0.292412,0.291052,0.285936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ma 0001924,0.291707,0.224995,0.262205,0.292764,0.276306,0.310459,0.257960,0.267072,0.558098,0.297936,...,0.231028,0.285079,0.262146,0.296306,0.294989,0.255649,0.273285,0.285636,0.227699,0.265578
ma 0001860,0.299808,0.244155,0.280916,0.308886,0.258884,0.307045,0.320798,0.245802,0.209777,0.270637,...,0.294393,0.268199,0.225540,0.256527,0.278222,0.252866,0.256883,0.269460,0.212143,0.248539
ma 0000545,0.197987,0.413464,0.292692,0.277873,0.251530,0.237805,0.247704,0.266375,0.225352,0.264140,...,0.255881,0.269502,0.381995,0.242529,0.246655,0.258540,0.230298,0.221104,0.338659,0.280042
ma 0000531,0.220516,0.215793,0.292563,0.247432,0.219221,0.279357,0.255340,0.227698,0.320579,0.216999,...,0.217555,0.275846,0.200828,0.273782,0.281595,0.215976,0.261985,0.251546,0.236360,0.230303


In [19]:
stacked_df = sim_matrix.stack().reset_index()
stacked_df

Unnamed: 0,level_0,level_1,0
0,ma 0001902,nci c13071,0.275570
1,ma 0001902,nci c12367,0.258785
2,ma 0001902,nci c12766,0.282320
3,ma 0001902,nci c33638,0.323964
4,ma 0001902,nci c12899,0.233693
...,...,...,...
4895,ma 0000175,nci c33218,0.269659
4896,ma 0000175,nci c41177,0.281649
4897,ma 0000175,nci c49273,0.298697
4898,ma 0000175,nci c52719,0.267155


In [20]:
stacked_df['entity_pair_names'] = stacked_df['level_0'] + ' / ' + stacked_df['level_1']
stacked_df.drop(['level_0','level_1'],axis=1,inplace=True)
stacked_df = stacked_df.rename(columns={0: 'Structural_similarity'})
stacked_df = stacked_df[['entity_pair_names', 'Structural_similarity']]

In [21]:
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity
0,ma 0001902 / nci c13071,0.275570
1,ma 0001902 / nci c12367,0.258785
2,ma 0001902 / nci c12766,0.282320
3,ma 0001902 / nci c33638,0.323964
4,ma 0001902 / nci c12899,0.233693
...,...,...
4895,ma 0000175 / nci c33218,0.269659
4896,ma 0000175 / nci c41177,0.281649
4897,ma 0000175 / nci c49273,0.298697
4898,ma 0000175 / nci c52719,0.267155


In [22]:
import re
dictOfStrings = {'1' : ' One',
                 '2': ' Two',
                 '3': ' Three',
                 '4' : ' Four',
                 '5': ' Five',
                 '6': ' Six',
                 '7' : ' Seven',
                 '8': ' Eight',
                 '9': ' Nine',
                 '10': 'Ten'}
for idx, j in enumerate(human['label'][:70]):
  if len(re.findall('[0-9]+', j)) != 0:
    for word, replacement in dictOfStrings.items():
      human['label'].values[idx] = re.sub(word, replacement, human['label'].values[idx])

In [23]:
import re
dictOfStrings = {'1' : ' One',
                 '2': ' Two',
                 '3': ' Three',
                 '4' : ' Four',
                 '5': ' Five',
                 '6': ' Six',
                 '7' : ' Seven',
                 '8': ' Eight',
                 '9': ' Nine',
                 '10': 'Ten'}
for idx, j in enumerate(mouse['label'][:70]):
  if len(re.findall('[0-9]+', j)) != 0:
    for word, replacement in dictOfStrings.items():
      mouse['label'].values[idx] = re.sub(word, replacement, mouse['label'].values[idx])

In [24]:
print(mouse['label'].values[5])


lateral ventricle choroid plexus epithelium


In [25]:
###----Jaccard-----###
def jaccard_similarity(x,y):
  """ returns the jaccard similarity between two lists """
  intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
  union_cardinality = len(set.union(*[set(x), set(y)]))
  return intersection_cardinality/float(union_cardinality)

In [26]:
def jac_similarity_matrix(df1_,df2_,param):
    



    entity_list = df1_['Name'].values
    entity_list2 = df2_['Name'].values

    label_list  = df1_['label'].values
    label_list2 = df2_['label'].values


    l1 = df1_['Path'].values
    l2 = df2_['Path'].values
    #print(l2)



    #Create empty matrix to fill
    M_sim = np.zeros((l1.shape[0],l2.shape[0]))
    #Iterate and fill 
    for i in range(l1.shape[0]):
      u = label_list[i]
      for j in range(l2.shape[0]):
          v = label_list2[j]
          #similarity -> structural similarity 
          M_sim[i,j] = jaccard_similarity(u,v)

    DF_sim = pd.DataFrame(M_sim,columns=entity_list2,index=entity_list)

    return DF_sim


In [27]:
sim_df_jac = jac_similarity_matrix(mouse[:70],human[:70],(0.5,0.5))

In [28]:
stacked_df_jac = sim_df_jac.stack().reset_index()

In [29]:
stacked_df_jac = stacked_df_jac.rename(columns={0: 'Jaccardian_similarity'})
stacked_df['Jaccardian_Similarity'] =  stacked_df_jac['Jaccardian_similarity']
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity
0,ma 0001902 / nci c13071,0.275570,0.250000
1,ma 0001902 / nci c12367,0.258785,0.250000
2,ma 0001902 / nci c12766,0.282320,0.466667
3,ma 0001902 / nci c33638,0.323964,0.625000
4,ma 0001902 / nci c12899,0.233693,0.411765
...,...,...,...
4895,ma 0000175 / nci c33218,0.269659,0.307692
4896,ma 0000175 / nci c41177,0.281649,0.400000
4897,ma 0000175 / nci c49273,0.298697,0.333333
4898,ma 0000175 / nci c52719,0.267155,0.357143


In [30]:
from difflib import SequenceMatcher

def sequence_similarity_matrix(df1_,df2_,param):
   

    entity_list = df1_['Name'].values
    entity_list2 = df2_['Name'].values

    label_list  = df1_['label'].values
    label_list2 = df2_['label'].values

    l1 = df1_['Path'].values
    l2 = df2_['Path'].values


    #Create empty matrix to fill
    M_sim = np.zeros((l1.shape[0],l2.shape[0]))
    #Iterate and fill 
    for i in range(l1.shape[0]):
      u = label_list[i]
      for j in range(l2.shape[0]):
          v = label_list2[j]
          #similarity -> structural similarity 
          M_sim[i,j] = SequenceMatcher(None, u, v).ratio()


    DF_sim = pd.DataFrame(M_sim,columns=entity_list2,index=entity_list)

    return DF_sim


In [31]:
sim_df_seq = sequence_similarity_matrix(mouse[:70],human[:70],(0.5,0.5))

In [32]:
stacked_df_seq = sim_df_seq.stack().reset_index()

In [33]:
stacked_df_seq = stacked_df_seq.rename(columns={0: 'Sequence_similarity'})
stacked_df['Sequence_Similarity'] =  stacked_df_seq['Sequence_similarity']
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity,Sequence_Similarity
0,ma 0001902 / nci c13071,0.275570,0.250000,0.230769
1,ma 0001902 / nci c12367,0.258785,0.250000,0.228571
2,ma 0001902 / nci c12766,0.282320,0.466667,0.228571
3,ma 0001902 / nci c33638,0.323964,0.625000,0.400000
4,ma 0001902 / nci c12899,0.233693,0.411765,0.243902
...,...,...,...,...
4895,ma 0000175 / nci c33218,0.269659,0.307692,0.260870
4896,ma 0000175 / nci c41177,0.281649,0.400000,0.270270
4897,ma 0000175 / nci c49273,0.298697,0.333333,0.333333
4898,ma 0000175 / nci c52719,0.267155,0.357143,0.148148


In [34]:
!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 9.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 49.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 34.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 57.4 MB/s 
Collecting

In [35]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
model = SentenceTransformer('all-MiniLM-L6-v2')

def sentence_transformers(p1,p2):
    embedding1 = model.encode(p1, convert_to_tensor=True)
    embedding2 = model.encode(p2, convert_to_tensor=True)
    # compute similarity scores of two embeddings
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_scores.item()

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [36]:
def transformers_similarity_matrix(df1_,df2_,param):

    entity_list = df1_['Name'].values
    entity_list2 = df2_['Name'].values

    label_list  = df1_['label'].values
    label_list2 = df2_['label'].values

    l1 = df1_['Path'].values
    l2 = df2_['Path'].values
    #print(l2)



    #Create empty matrix to fill
    M_sim = np.zeros((l1.shape[0],l2.shape[0]))
    #Iterate and fill 
    for i in range(l1.shape[0]):
      u = label_list[i]
      for j in range(l2.shape[0]):
          v = label_list2[j]
          #similarity -> structural similarity 
          M_sim[i,j] = sentence_transformers(u,v)

    DF_sim = pd.DataFrame(M_sim,columns=label_list2,index=label_list)

    return DF_sim

In [37]:
sim_df_trans = transformers_similarity_matrix(human[:70], mouse[:70], (0.5,0.5))

In [38]:
stacked_df_trans = sim_df_trans.stack().reset_index()

In [39]:
stacked_df_trans = stacked_df_trans.rename(columns={0: 'Transformers_similarity'})
stacked_df['Transformers_similarity'] =  stacked_df_trans['Transformers_similarity']
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity,Sequence_Similarity,Transformers_similarity
0,ma 0001902 / nci c13071,0.275570,0.250000,0.230769,0.119843
1,ma 0001902 / nci c12367,0.258785,0.250000,0.228571,0.182627
2,ma 0001902 / nci c12766,0.282320,0.466667,0.228571,0.423646
3,ma 0001902 / nci c33638,0.323964,0.625000,0.400000,0.352093
4,ma 0001902 / nci c12899,0.233693,0.411765,0.243902,0.144147
...,...,...,...,...,...
4895,ma 0000175 / nci c33218,0.269659,0.307692,0.260870,0.211520
4896,ma 0000175 / nci c41177,0.281649,0.400000,0.270270,0.248115
4897,ma 0000175 / nci c49273,0.298697,0.333333,0.333333,0.090086
4898,ma 0000175 / nci c52719,0.267155,0.357143,0.148148,0.100052


In [40]:
!pip install -U spacy

Collecting spacy
  Downloading spacy-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 3.9 MB/s 
[?25hCollecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.1-py3-none-any.whl (27 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (457 kB)
[K     |████████████████████████████████| 457 kB 45.8 MB/s 
[?25hCollecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting thinc<8.1.0,>=8.0.14
  Downloading thinc-8.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (653 kB)
[K     |████████████████████████████████| 653 kB 39.7 MB/s 
[?25hCollecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydanti

In [41]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[K     |████████████████████████████████| 33.5 MB 2.1 MB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [42]:
from math import sqrt, pow, exp
 
def squared_sum(x):
  """ return 3 rounded square rooted value """
 
  return round(sqrt(sum([a*a for a in x])),3)
  
def cos_similarity(x,y):
    """ return cosine similarity between two lists """

    numerator = sum(a*b for a,b in zip(x,y))
    denominator = squared_sum(x)*squared_sum(y)
    return round(numerator/float(denominator),3)


In [43]:
import spacy
nlp = spacy.load('en_core_web_md')

import numpy as np
import scipy
from scipy.spatial import distance

def spacy_similarity_matrix(df1_,df2_,param):

    entity_list = df1_['Name'].values
    entity_list2 = df2_['Name'].values

    label_list  = df1_['label'].values
    label_list2 = df2_['label'].values

    l1 = df1_['Path'].values
    l2 = df2_['Path'].values

    #print(l2)



    #Create empty matrix to fill
    M_sim = np.zeros((l1.shape[0],l2.shape[0]))
    #Iterate and fill 
    for i in range(l1.shape[0]):
      u = nlp(label_list[i]).vector
      for j in range(l2.shape[0]):
          v = nlp(label_list2[j]).vector
          #similarity -> structural similarity 
          M_sim[i,j] = cos_similarity(u,v)

    DF_sim = pd.DataFrame(M_sim,columns=label_list2,index=label_list)

    return DF_sim
#scipy.spatial.distance.cosine

In [44]:
sim_df_spacy = spacy_similarity_matrix(human[:70], mouse[:70], (0.5,0.5))

  del sys.path[0]


In [45]:
stacked_df_spacy = sim_df_spacy.stack().reset_index()

In [46]:
stacked_df_spacy = stacked_df_spacy.rename(columns={0: 'Spacy_similarity'})
stacked_df['Spacy_similarity'] =  stacked_df_spacy['Spacy_similarity']
stacked_df.to_csv('the first_five.csv')


In [53]:
stacked_df

Unnamed: 0,entity_pair_names,Structural_similarity,Jaccardian_Similarity,Sequence_Similarity,Transformers_similarity,Spacy_similarity
0,ma 0001902 / nci c13071,0.275570,0.250000,0.230769,0.119843,0.377
1,ma 0001902 / nci c12367,0.258785,0.250000,0.228571,0.182627,0.269
2,ma 0001902 / nci c12766,0.282320,0.466667,0.228571,0.423646,0.473
3,ma 0001902 / nci c33638,0.323964,0.625000,0.400000,0.352093,0.499
4,ma 0001902 / nci c12899,0.233693,0.411765,0.243902,0.144147,0.079
...,...,...,...,...,...,...
4895,ma 0000175 / nci c33218,0.269659,0.307692,0.260870,0.211520,
4896,ma 0000175 / nci c41177,0.281649,0.400000,0.270270,0.248115,
4897,ma 0000175 / nci c49273,0.298697,0.333333,0.333333,0.090086,
4898,ma 0000175 / nci c52719,0.267155,0.357143,0.148148,0.100052,


In [51]:
from google.colab import files

stacked_df.to_csv('output.csv', encoding = 'utf-8-sig') 
files.download('output.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [47]:
#!pip install gensim

In [48]:
#import gensim
#import gensim.downloader as api
#model_gigaword = api.load("word2vec-google-news-300")


In [49]:
for i in human['label'][:70]:
  print(i)

face
mammary gland
parietal bone
stroma of the ovarian cortex
hypoglossal nucleus
prostatic portion of the male urethra
arachnoid villus
cerebral peduncle
deep palmar artery
external granular layer
hair root
superior parathyroid gland
superficial femoral artery
epithalamus
bone matrix
arteriole
bone marrow myeloid stem cell with predominant neutrophil differentiation
penis erectile tissue
skin of the extremity
inferior rectal artery
flexor carpi ulnaris
stratum conjunctum
ethmoid sinus
areola
peripheral nerve
maxilla
posterior chamber of the eye
male urethra
arch of the vertebra
cerumen
palmar fascia
external iliac artery
spiral tube of schchowa
shoulder girdle
s Five vertebra
thalamus
osteoclast
axilla
bone marrow stem cell at the earliest stage of myeloid differentiation
prepuce epithelium
foot skin
flexor digitorum profundus
hyoglossus muscle
reserve stem cell
pyloric gland
posterior ulnar vein
prostatic duct
breast fat pad
frontal sinus
peritoneal cavity
alpha granule
posterior eye

In [None]:
cosine = model_gigaword.similarity("pericyte", "cranial nerve")
cosine

In [54]:
def wordnnetsimilarity(e1,e2):
    
    s_e1 = wn.synsets(e1)
    s_e2 = wn.synsets(e2)

    if len(s_e1) == 0 or len(s_e2) == 0: # for classes that do not exist in wordnet dict
      return 0

    s_e1 = wn.synsets(e1)[0]
    s_e2 = wn.synsets(e2)[0]

    

    return s_e1.path_similarity(s_e2)


def label_coment_toekn_list(sentence):

  words = nltk.word_tokenize(sentence)
  clean_string= [word for word in words if word.isalnum()]

  stop_words = list(stopwords.words('english'))
  no_stop_words = [word for word in clean_string if word not in stop_words]



  return no_stop_words


def label_coment_wordnet_sim(c1,c2):
  l1 = label_coment_toekn_list(c1)
  l2 = label_coment_toekn_list(c2)

  n = len(l1)
  m = len(l2)


  if n == 0 or m == 0 :
    return (0)


  n_mat = [n+1-k for k in range(1,n+1)]
  m_mat = [m+1-k for k in range(1,m+1)]

  #Create empty matrix to fill with lex sim of parents
  M_lex = np.zeros((len(l1),len(l2)))

 #Iterate and fill matrix with lex sim of all parents 
  for i in range(3):
      u = l1[i]
      for j in range(3):
          v = l2[j]
          # similarity -> Lexical similarity
          M_lex[i,j] = wordnnetsimilarity(u,v)

         
  print(M_lex)
  d = pd.DataFrame(M_lex,columns=l2,index=l1).fillna(0)
  

  n_parents = sum(list(d.max(axis=1)*n_mat))/(n*(n+1))
  m_parents = sum(list(i for i in d.max(axis=0)*m_mat))/(m*(m+1))
  return (n_parents+m_parents)


def semantic_sim(row1,row2,param):
  b,c= param


  ni = row1['Name'].values
  nj = row2['Name'].values
  li = row1['label'].values
  lj = row2['label'].values
  ci = row1['comment'].values
  cj = row2['comment'].values

  #sim_entity = wordnnetsimilarity(ni,nj)

  sim_labels = label_coment_wordnet_sim(li,lj)

  sim_comments = label_coment_wordnet_sim(ci,cj)

  sem_sim =  ((b*sim_labels)/(b+c)) + ((c*sim_comments)/(b+c))

  return sem_sim 

In [57]:
wordnnetsimilarity('timestamp','time')

0