In [1]:
import pandas as pd
import pickle
from math import e

In [2]:
from gensim.models import Word2Vec


In [3]:
# In this notebook we look for untested compounds that could cause a given side effect.
# The sigmoid of the embedding similarity between an untested compound and a compound known to cause the side effect is used as the score.


In [4]:
# Load the pickled side-effect data; the next cell reads it through `.items()` as
# (side_effect, compounds) pairs.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this file
# must come from a trusted source.
with open('../data/meta/side_effects.pickle', 'rb') as file:
    side_eff_dict = pickle.load(file)


In [5]:
# One row per side effect: the key goes into `side_effect`, the value into `compounds`.
df = pd.DataFrame(
    list(side_eff_dict.items()),
    columns=['side_effect', 'compounds'],
)
df.head()

Unnamed: 0,side_effect,compounds
0,c0032584,"{db00740, db00736, db00448, db01132, db00313, ..."
1,c0424024,"{db00575, db01024, db01238}"
2,c0235309,"{db00740, db00921, db00523, db00422, db08815, ..."
3,c0014724,"{db08827, db00375, db00989, db00503, db06777, ..."
4,c0001824,"{db00740, db00190, db00441, db00934, db00301, ..."


In [6]:
# Load the saved gensim Word2Vec model; its `wv` vectors are queried below for similarity look-ups.
model=Word2Vec.load('../data/meta/compound_embeddings.bin')

In [7]:
# Sanity check: show the 5 keys most similar to 'db00014' together with their similarity values.
model.wv.most_similar('db00014',topn=5)

[('db01395', 0.866726279258728),
 ('db00123', 0.8640450239181519),
 ('db00328', 0.8638545870780945),
 ('db01047', 0.8620090484619141),
 ('db01559', 0.8619389533996582)]

In [8]:
def get_similar_compounds(compound,top=5):
    """Return the `top` nearest neighbours of `compound` from the embedding model.

    Delegates to `model.wv.most_similar`. Judging by the notebook output, the
    result is a list of `(compound_id, similarity)` pairs, most similar first.
    """
    nearest = model.wv.most_similar(compound, topn=top)
    return nearest
def sigm(a):
    """Logistic sigmoid, 1 / (1 + e**-a).

    Args:
        a: the value to squash (a similarity score in this notebook).

    Returns:
        The sigmoid of `a`, a value between 0 and 1.
    """
    try:
        return (1+e**-a)**-1
    except OverflowError:
        # e**-a only overflows for very negative Python numbers, where the
        # sigmoid is indistinguishable from its limit of 0.
        return 0.0
def wrapper(col2,top=5):
    """Suggest untested compounds that may cause the same side effect as `col2`.

    For every compound in `col2` the nearest neighbours are fetched with
    `get_similar_compounds`. Neighbours that are already in `col2` are dropped
    and the remaining candidates are ranked by similarity.

    Args:
        col2: iterable of compound ids already known to cause the side effect.
        top: maximum number of runner-up compounds to return.

    Returns:
        A 3-tuple `(best_compound, score, other_compounds)`:
        `best_compound` is the highest-ranked untested compound, `score` is
        `sigm` of its similarity, and `other_compounds` holds up to `top`
        next-best compound ids. If there is no untested candidate at all,
        `(None, nan, [])` is returned instead of raising.
    """
    known = list(col2)  # materialise once so one-shot iterables work too
    known_set = set(known)

    # The same candidate can be a neighbour of several known compounds with
    # different similarities; keep the highest one so the ranking and the
    # reported score do not depend on iteration order.
    best_similarity = {}
    for comp in known:
        for candidate, similarity in get_similar_compounds(comp):
            if candidate in known_set:
                continue
            if candidate not in best_similarity or similarity > best_similarity[candidate]:
                best_similarity[candidate] = similarity

    if not best_similarity:
        # Nothing to rank (empty input, or every neighbour is already known).
        return None, float('nan'), []

    # The candidate list is small, so sorting once at the end is cheap.
    ranked = sorted(best_similarity.items(), key=lambda item: item[1], reverse=True)
    best_compound, best_score = ranked[0]
    return best_compound, sigm(best_score), [candidate for candidate, _ in ranked[1:top+1]]

            
        

In [9]:

# Run wrapper() on each row's known compounds and spread its 3-tuple over three new columns.
df[['mostProbableUntestedCompound', 'score', 'otherPossibleCompounds']] = df['compounds'].apply(
    lambda known_compounds: pd.Series(wrapper(known_compounds))
)
df.head()

Unnamed: 0,side_effect,compounds,mostProbableUntestedCompound,score,otherPossibleCompounds
0,c0032584,"{db00740, db00736, db00448, db01132, db00313, ...",db06710,0.730506,"[db00769, db01406, db00676, db01173, db00894]"
1,c0424024,"{db00575, db01024, db01238}",db00880,0.719511,"[db00521, db00354, db01580, db01116, db00178]"
2,c0235309,"{db00740, db00921, db00523, db00422, db08815, ...",db00223,0.730743,"[db00959, db01130, db00324, db01013, db01063]"
3,c0014724,"{db08827, db00375, db00989, db00503, db06777, ...",db01069,0.730691,"[db00456, db01615, db09000, db08967, db00508]"
4,c0001824,"{db00740, db00190, db00441, db00934, db00301, ...",db00647,0.730709,"[db08801, db00420, db00902, db01615, db01614]"


In [11]:
# Write the prediction table to CSV without the index column.
# NOTE(review): the set- and list-valued columns are presumably written as their string
# representation — confirm that is acceptable for whoever reads this file back.
df.to_csv('../data/results/side_effect_compound_pred.csv',index=False)

In [12]:
# The score is the sigmoid of the similarity. The similarity is a cosine similarity in [-1, 1], so the score lies between roughly 0.27 and 0.73.

In [13]:
# We initially thought of using graph RAG with a graph database and then creating texts based on the graph.
# However, there are no graph databases which offer such services without some kind of subscription, so I changed my approach and decided to use embeddings.
# I didn't use an LLM because the model would focus on the text patterns and grammar more than on the compound names. However, I still want to check whether
# this assumption is true by using the story-based data: because of its very structured format, all rows would have only 2 patterns, which might make
# it hard for the model to focus on the compound IDs.
