In [3]:
# Importing libraries
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, average_precision_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Load train and test files

train =  pd.read_table('/medisyn-labs/train.tsv',sep='\t', header=None)
test = pd.read_table('medisyn-labs/test.tsv',sep='\t', header=None)

In [4]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


In [5]:
### Adding headers to train and test files
train.columns = ['Customer_Identifier','Medicine_Name','Rating','Effectiveness','Side_Effects','Condition','Benefit_Review','Side_Effect_Review','Overall_Review']
test.columns = ['Customer_Identifier','Medicine_Name','Rating','Effectiveness','Side_Effects','Condition','Benefit_Review','Side_Effect_Review','Overall_Review']

In [6]:
# Number of rows and cols in train and test set
print("The dimensions of training data is {}".format(train.shape))
print("The dimensions of testing data is {}".format(test.shape))

The dimensions of training data is (3107, 9)
The dimensions of testing data is (1036, 9)


In [7]:
# Data Preprocessing and Embeddings

# Load SBERT model for embeddings
model  = SentenceTransformer('all-MiniLM-L6-v2')

# Combine fields that capture illness and context
train["combined_text"] = (train['Condition'].fillna('') + ' . ' +
            train['Benefit_Review'].fillna('') + ' . ' +
            train['Side_Effect_Review'].fillna('') + ' . ' +
            train['Overall_Review'].fillna(''))
#Encode using SBERT
embeddings = model.encode(train['combined_text'].tolist(),show_progress_bar=True, convert_to_tensor=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/98 [00:00<?, ?it/s]

In [8]:
## Converting effectiveness and side effects labels into rating on scale of 1-5
train['Effectiveness_Rating'] = train['Effectiveness'].map({'Highly Effective':1,'Considerably Effective':2,'Moderately Effective':3,
                                                           'Marginally Effective':4,'Ineffective':5})
test['Effectiveness_Rating'] = test['Effectiveness'].map({'Highly Effective':1,'Considerably Effective':2,'Moderately Effective':3,
                                                           'Marginally Effective':4,'Ineffective':5})
train['Side_Effects_Rating'] = train['Side_Effects'].map({'No Side Effects':1,'Mild Side Effects':2,'Moderate Side Effects':3,
                                                           'Severe Side Effects':4,'Extremely Severe Side Effects':5})
test['Side_Effects_Rating'] = test['Side_Effects'].map({'No Side Effects':1,'Mild Side Effects':2,'Moderate Side Effects':3,
                                                           'Severe Side Effects':4,'Extremely Severe Side Effects':5})

In [9]:
#Normaize effectivess and side effect ratings between 0-1
scaler = MinMaxScaler()
train[["effectiveness_norm", "side_effect_norm"]] = scaler.fit_transform(train[["Effectiveness_Rating", "Side_Effects_Rating"]])
test[["effectiveness_norm", "side_effect_norm"]] = scaler.transform(test[["Effectiveness_Rating", "Side_Effects_Rating"]])


In [20]:
## Defining recommendation scoring function

def recommend_medicine(test_input,alpha=0.9,beta=0.05,gamma=0.05,top_k=1):
    #Encode the testing sample
    test_embedding = model.encode(test_input,convert_to_tensor=True)
    #compute cosine similarities with all medicines
    similarity_scores = util.cos_sim(test_embedding,embeddings)[0].cpu().numpy()
    #compute combined score
    scores = (alpha * similarity_scores +
              beta * train["effectiveness_norm"].values -
              gamma * train["side_effect_norm"].values)
    #get top k recommendations
    top_indices = np.argsort(scores)[::-1][:top_k]
    top_medicines = train.iloc[top_indices][["Medicine_Name", "Condition",
                                          "Rating", "Effectiveness_Rating",
                                          "Side_Effects_Rating"]].copy()
    top_medicines["Score"] = scores[top_indices]
    return top_medicines

In [21]:
#Inference on test data
query = test.Condition.iloc[0]
print("The condition is {}".format(query))
recommendations = recommend_medicine(query)
print("Top Recommended Medicines:")
print(recommendations)

The condition is sinus infection
Top Recommended Medicines:
     Medicine_Name        Condition  Rating  Effectiveness_Rating  \
2776          zmax  sinus infection       5                     3   

      Side_Effects_Rating     Score  
2776                    2  0.622914  


In [12]:
### Removing nulls from overall_review col in test data
test = test.dropna(subset=['Overall_Review'],inplace=False)
test.isnull().sum()

Customer_Identifier      0
Medicine_Name            0
Rating                   0
Effectiveness            0
Side_Effects             0
Condition                0
Benefit_Review           5
Side_Effect_Review      23
Overall_Review           0
Effectiveness_Rating     0
Side_Effects_Rating      0
effectiveness_norm       0
side_effect_norm         0
dtype: int64

In [28]:
#Evaluate recommended medicines
def evaluate_recommendation_system(df, alpha=0.9, beta=0.05, gamma=0.05, top_k=1):
    y_true,y_pred = [],[]
    for i,row in df.iterrows():
        #testing query
        query = row['Condition']+" "+row['Overall_Review']
        recommended = recommend_medicine(query, alpha, beta, gamma, top_k)
        recommended_names = recommended["Medicine_Name"].tolist()

        #Ground truth labels
        true_labels = row['Medicine_Name']
        y_true.append(true_labels)
        y_pred.append(recommended_names)

        #Convert predictions to binary
        correct = sum([true in preds for true, preds in zip(y_true, y_pred)])
        precision_at_k = correct / (len(df) * top_k)
        recall_at_k = correct / len(df)
        f_score_at_k = (2*precision_at_k*recall_at_k)/(precision_at_k+recall_at_k) if (precision_at_k+recall_at_k) else 0


    print(f"Precision@{top_k}: {precision_at_k:.2f}")
    print(f"Recall@{top_k}: {recall_at_k:.2f}")
    print(f"Fscore@{top_k}: {f_score_at_k:.2f}")

evaluate_recommendation_system(test)


Precision@1: 0.31
Recall@1: 0.31
Fscore@1: 0.31
