# Step 7: Calculate distance between drug and disease

In [1]:
import os
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
import modin.pandas as pd
import re

In [3]:
# Load the rows to be scored; the cells below read the 'disease', 'drug' and
# 'sentences' columns (presumably written by an earlier pipeline step — confirm).
df_drug_disease_sentences = pd.read_csv('drug_disease_sentences.csv')


    import ray
    ray.init()



In [4]:
# Clean sentences before calculating distance:
def clean_sentence(text):
    """Return `text` with every non-alphanumeric, non-whitespace character removed.

    Dropped characters are deleted outright rather than replaced by a space,
    so tokens on either side of punctuation fuse together
    (e.g. "COVID-19" -> "COVID19"). Whitespace is kept untouched.
    """
    kept_chars = []
    for ch in text:
        if ch.isalnum() or ch.isspace():
            kept_chars.append(ch)
    return ''.join(kept_chars)

In [5]:
# Get minimum (absolute) distance between word1 and word2 in a sentence
def get_distance(w1, w2, sentence):
    """Return the minimum token distance between `w1` and `w2` in `sentence`.

    The sentence is split on whitespace and every occurrence of both words is
    considered, so repeated mentions yield the closest pair rather than the
    distance between the first occurrences only.

    Args:
        w1: First token to look for (exact, case-sensitive match).
        w2: Second token to look for (exact, case-sensitive match).
        sentence: Whitespace-separated text to search.

    Returns:
        The smallest absolute difference between a position of `w1` and a
        position of `w2`, or -1 when either word is absent from the sentence.
        If `w1 == w2` and the word is present, the result is 0.
    """
    last_w1 = None  # most recent index at which w1 was seen
    last_w2 = None  # most recent index at which w2 was seen
    best = -1       # -1 doubles as the "not found" sentinel

    # Single pass: the closest pair always consists of some occurrence and the
    # latest occurrence of the other word seen so far.
    for idx, token in enumerate(sentence.split()):
        if token == w1:
            last_w1 = idx
        if token == w2:
            last_w2 = idx
        if last_w1 is not None and last_w2 is not None:
            gap = abs(last_w1 - last_w2)
            if best == -1 or gap < best:
                best = gap
    return best

In [6]:
def process_disease_drug_rows(data):
    """Compute the disease-to-drug token distance for one row.

    Args:
        data: A row-like mapping exposing 'disease', 'drug' and 'sentences'.

    Returns:
        The value of get_distance() for the lower-cased, cleaned disease name,
        drug name and sentence; -1 when any of the three fields is not a
        string (e.g. NaN from a missing CSV cell), matching the -1 sentinel
        get_distance() uses for "not found".
    """
    fields = (data['disease'], data['drug'], data['sentences'])

    # Missing cells come through as float NaN / None; .lower() would raise on
    # them, so report "no distance" instead of aborting the whole apply().
    if not all(isinstance(value, str) for value in fields):
        return -1

    disease, drug, sentence = (clean_sentence(value.lower()) for value in fields)
    return get_distance(disease, drug, sentence)

In [10]:
from tqdm import tqdm

In [11]:
# Row-wise apply: adds a 'distance' column. A value of -1 means get_distance
# did not find both cleaned terms as whole tokens of the cleaned sentence.
df_drug_disease_sentences['distance'] = df_drug_disease_sentences.apply(process_disease_drug_rows, axis=1)



In [12]:
# Non-null count per column (sanity check that every row received a distance).
df_drug_disease_sentences.count()

disease      4220
drug         4220
sentences    4220
distance     4220
dtype: int64

In [27]:
# Inspect the rows where disease and drug were not both found (distance == -1).
df_drug_disease_sentences[df_drug_disease_sentences['distance'] == -1].values

array([['hypertension', 'Selexipag',
        'The TRITON trial (Efficacy and Safety of Initial Triple Versus Initial Dual Oral Combination Therapy in Patients With Newly Diagnosed Pulmonary Arterial Hypertension)-which compared triple therapy with tadalafil, macitentan, and selexipag versus double therapy with tadalafil and macitentanshowed no statistical differences in the primary end point (change in pulmonary vascular resistance, or PVR) and multiple secondary end points (6-minute walk distance [6MWD], NT pro-brain natriuretic peptide, and no worsening in functional class) between the two groups.',
        -1],
       ['COVID-19', 'Gilteritinib',
        "174 The presence of another small molecule AXL inhibitor, bemcentinib, in the RECOVERY COVID-19 Phase II clinical trial COVID-19 patients backed up gilteritinib's antiviral efficacy.",
        -1],
       ['cholera', 'Phthalocyanine',
        'All the reagents used were obtained from Sigma-Aldrich South Africa, and include the foll

In [28]:
# Persist the frame, now including the 'distance' column, without the index.
df_drug_disease_sentences.to_csv('drug_disease_sentences_with_distance.csv', index=False)

## Applying on datasets #1 & #2

In [113]:
# Load the two datasets; the cells below select a 'Manual Label' column from both.
dataframe = pd.read_csv('Dataset/dataset_1_junaed.csv')
dataframe2 = pd.read_csv('Dataset/dataset_2_hamada.csv')

In [114]:
# Keep only the four columns used in the rest of this section.
dataframe2 = dataframe2[['Disease Name', 'Drug Name', 'Common Sentence', 'Manual Label']]

In [115]:
# Drop rows whose 'Manual Label' is -1 so they are excluded from the binary
# classification data.
dataframe2 = dataframe2[dataframe2['Manual Label'] != -1]

In [116]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the rows of both frames the same way, keeping each frame's own index.
dataframe = pd.concat([dataframe, dataframe2])



In [117]:
# Restrict the combined frame to the same four columns (drops any extra
# columns dataset 1 may carry).
dataframe = dataframe[['Disease Name', 'Drug Name', 'Common Sentence', 'Manual Label']]

In [121]:
# Summary statistics of the numeric columns (here: the manual labels).
dataframe.describe()

Unnamed: 0,Manual Label
count,178.0
mean,0.61236
std,0.488586
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Total 178 labels


In [122]:
# Align column names with the keys process_disease_drug_rows reads. Rebinding
# the returned frame avoids mutating, in place, a frame derived by selection.
dataframe = dataframe.rename(columns={'Disease Name': 'disease', 'Drug Name': 'drug', 'Common Sentence': 'sentences'})

In [123]:
# Drop rows with a missing sentence: there is nothing to measure a distance in.
dataframe = dataframe[dataframe.sentences.notna()]

In [124]:
# assign() builds a new frame instead of writing a column into the filtered
# frame produced by the notna() selection (avoids setting values on a slice).
dataframe = dataframe.assign(distance=dataframe.apply(process_disease_drug_rows, axis=1))



In [125]:
# Discard rows where no distance could be computed (-1 sentinel from get_distance).
dataframe = dataframe[dataframe.distance != -1]

In [126]:
# Show the labelled rows that have a valid distance.
dataframe

Unnamed: 0,disease,drug,sentences,Manual Label,distance
1,thrombosis,thalidomide,. christmas disease-national survey (biggs a...,0.0,385
4,influenza,imatinib,. 5mg 00521183 dacarbazine iv pws 200mg/v...,0.0,60
6,dermatitis,peanut,". he had past history of asthma, allergic rh...",0.0,11
7,schizophrenia,glucose,. troglitazone is a ligand to both pparα and...,0.0,14
9,rhinitis,peanut,". he had past history of asthma, allergic rh...",0.0,14
...,...,...,...,...,...
28,covid-19,chloroquine,. this has led in china to include chloroqui...,1.0,10
35,covid-19,creatinine,". nevertheless, the renal function of patien...",0.0,12
37,covid-19,fedratinib,". in summary, jak2 inhibitor fedratinib can ...",1.0,23
38,covid-19,oseltamivir,. â€¢ oseltamivir: los inhibidores de la neu...,0.0,23
