In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Load Data

In [2]:
# Read the drug / side-effect table from a CSV located in the working directory.
data = pd.read_csv(filepath_or_buffer='drugs_side_effects_drugs_com.csv')

In [3]:
# Keep only rows that name both a condition and a drug.
# The index is rebuilt as 0..n-1 so that index labels equal row positions:
# later cells index row-aligned numpy matrices with values taken from
# `data.index`, which is only valid when no label gaps are left by the drop.
# Rebinding (instead of inplace=True) also keeps this cell safe to re-run.
data = data.dropna(subset=['medical_condition', 'drug_name']).reset_index(drop=True)

In [6]:
# Fetch the NLTK resources needed by the preprocessing step.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load its models from the
# 'punkt_tab' package rather than 'punkt'; fetch both so the notebook runs
# on a fresh environment regardless of the installed NLTK version.
nltk.download('punkt_tab')

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise free text into a space-separated string of content words.

    The text is lower-cased and tokenised; tokens are kept only when they are
    purely alphabetic and are not in ``stop_words``.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            having no text.

    Returns:
        The kept tokens joined by single spaces, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    kept_words = []
    for word in word_tokenize(text.lower()):
        if word.isalpha() and word not in stop_words:
            kept_words.append(word)
    return ' '.join(kept_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arisawn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Arisawn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Add a cleaned copy of each free-text column. preprocess_text returns ''
# for any value that is not a string, so non-text cells become empty strings.
for raw_column, clean_column in (
    ('medical_condition', 'processed_medical_condition'),
    ('side_effects', 'processed_side_effects'),
):
    data[clean_column] = data[raw_column].apply(preprocess_text)

In [8]:
# Fit one vectorizer per text column so that each fitted vocabulary stays
# paired with the matrix it produced. Re-fitting a single shared vectorizer
# would discard the medical-condition vocabulary, leaving no way to transform
# new condition text into the feature space of `tfidf_matrix_medical`.
tfidf_vectorizer_medical = TfidfVectorizer()
tfidf_matrix_medical = tfidf_vectorizer_medical.fit_transform(data['processed_medical_condition'])

tfidf_vectorizer_side_effects = TfidfVectorizer()
tfidf_matrix_side_effects = tfidf_vectorizer_side_effects.fit_transform(data['processed_side_effects'])

# Backward-compatible alias: the name `tfidf_vectorizer` previously ended up
# fitted on the side-effects column, so it keeps pointing at that vectorizer.
tfidf_vectorizer = tfidf_vectorizer_side_effects

In [9]:
# Pairwise row-to-row cosine similarity of each TF-IDF matrix with itself:
# entry [i, j] compares row i with row j of the same matrix.
cosine_sim_medical, cosine_sim_side_effects = (
    cosine_similarity(tfidf_matrix, tfidf_matrix)
    for tfidf_matrix in (tfidf_matrix_medical, tfidf_matrix_side_effects)
)

In [12]:
def recommend_drugs(condition, num_recommendations=5):
    """Rank drugs by similarity to the first drug listed for ``condition``.

    The first row of ``data`` whose ``medical_condition`` equals ``condition``
    (ignoring case and surrounding whitespace) is used as the reference row.
    Every row is then scored with the equally weighted mean of its
    medical-condition and side-effect cosine similarity to that reference row.

    Args:
        condition: Name of the medical condition to look up.
        num_recommendations: Maximum number of drugs to return.

    Returns:
        A list of ``(drug_name, score)`` tuples ordered by descending score.
        The reference row itself takes part in the ranking. When no row
        matches ``condition`` a message is printed and ``[]`` is returned.
    """
    condition_lower = condition.strip().lower()
    is_match = (data['medical_condition'].str.strip().str.lower() == condition_lower).to_numpy()
    matching_positions = is_match.nonzero()[0]

    if len(matching_positions) == 0:
        print(f"No drugs found for medical condition: {condition}")
        return []

    # Use the row *position*, not the index label: the similarity matrices are
    # positional arrays, and labels stop matching positions as soon as rows
    # have been dropped from `data` without resetting its index.
    reference_position = matching_positions[0]

    # Equal-weight blend of the two similarity rows for the reference drug.
    combined_scores = (
        0.5 * cosine_sim_medical[reference_position]
        + 0.5 * cosine_sim_side_effects[reference_position]
    )

    # Stable sort on the negated scores: descending order, ties kept in row order.
    ranked_positions = (-combined_scores).argsort(kind='stable')[:num_recommendations]

    drug_names = data['drug_name'].to_numpy()
    recommended_drugs = [
        (drug_names[position], float(combined_scores[position]))
        for position in ranked_positions
    ]

    return recommended_drugs

In [13]:
# Demo query: print a heading followed by one line per recommended drug.
recommended_drugs = recommend_drugs('headache')
report_lines = ["Recommended drugs for headache:"]
report_lines.extend(
    f"- {drug} (Similarity: {similarity:.2f})"
    for drug, similarity in recommended_drugs
)
print("\n".join(report_lines))

No drugs found for medical condition: headache
Recommended drugs for headache:
