# Create a Dummy Dataset


In [None]:
import pandas as pd
import random

# Define possible values for each column
diseases = ["Diabetes", "Asthma", "Hypertension", "Cancer", "COVID-19", "Flu", "Tuberculosis", "Arthritis"]
symptoms_list = {
    "Diabetes": ["Fatigue", "Thirst", "Frequent Urination", "Blurred Vision"],
    "Asthma": ["Wheezing", "Cough", "Shortness of Breath"],
    "Hypertension": ["Headache", "Dizziness", "Chest Pain"],
    "Cancer": ["Weight Loss", "Fatigue", "Pain", "Fever"],
    "COVID-19": ["Fever", "Cough", "Loss of Taste", "Shortness of Breath"],
    "Flu": ["Fever", "Body Aches", "Sore Throat", "Runny Nose"],
    "Tuberculosis": ["Cough", "Weight Loss", "Night Sweats", "Fever"],
    "Arthritis": ["Joint Pain", "Stiffness", "Swelling", "Fatigue"]
}
drugs = {
    "Diabetes": ["Metformin", "Insulin", "Glipizide"],
    "Asthma": ["Salbutamol", "Budesonide", "Montelukast"],
    "Hypertension": ["Amlodipine", "Losartan", "Lisinopril"],
    "Cancer": ["Cisplatin", "Doxorubicin", "Paclitaxel"],
    "COVID-19": ["Remdesivir", "Molnupiravir", "Paxlovid"],
    "Flu": ["Oseltamivir", "Zanamivir", "Baloxavir"],
    "Tuberculosis": ["Isoniazid", "Rifampin", "Ethambutol"],
    "Arthritis": ["Ibuprofen", "Naproxen", "Methotrexate"]
}
molecular_structures = {
    "Metformin": "C4H11N5",
    "Insulin": "C257H383N65O77S6",
    "Glipizide": "C21H27N5O4S",
    "Salbutamol": "C13H21NO3",
    "Budesonide": "C25H34O6",
    "Montelukast": "C35H36ClNO3S",
    "Amlodipine": "C20H25ClN2O5",
    "Losartan": "C22H23ClN6O",
    "Lisinopril": "C21H31N3O5",
    "Cisplatin": "PtCl2(NH3)2",
    "Doxorubicin": "C27H29NO11",
    "Paclitaxel": "C47H51NO14",
    "Remdesivir": "C27H35N6O8P",
    "Molnupiravir": "C13H19N3O7",
    "Paxlovid": "C23H32F3N5O4",
    "Oseltamivir": "C16H28N2O4",
    "Zanamivir": "C12H20N4O7",
    "Baloxavir": "C27H23F2N3O7S",
    "Isoniazid": "C6H7N3O",
    "Rifampin": "C43H58N4O12",
    "Ethambutol": "C10H24N2O2",
    "Ibuprofen": "C13H18O2",
    "Naproxen": "C14H14O3",
    "Methotrexate": "C20H22N8O5"
}
mechanism_of_action = ["Lowers glucose", "Relaxes airways", "Reduces inflammation", "Kills bacteria", "Prevents virus replication"]
side_effects = ["Nausea", "Dizziness", "Fatigue", "Headache", "None"]
related_diseases = {
    "Diabetes": ["Obesity", "PCOS"],
    "Asthma": ["COPD", "Bronchitis"],
    "Hypertension": ["Stroke", "Heart Disease"],
    "Cancer": ["Leukemia", "Lymphoma"],
    "COVID-19": ["Pneumonia", "SARS"],
    "Flu": ["Pneumonia", "Bronchitis"],
    "Tuberculosis": ["Pneumonia", "HIV"],
    "Arthritis": ["Osteoporosis", "Gout"]
}

# Generate 100,000 rows of data
data = []
for _ in range(100000):
    disease = random.choice(diseases)
    symptoms = random.sample(symptoms_list[disease], k=random.randint(1, 3))
    age_range = random.choice(["10-20", "21-30", "31-40", "41-50", "51-60", "61-70", "71-80", "81-90"])
    gender = random.choice(["M", "F"])
    bmi = round(random.uniform(18, 35), 1)
    smoker = random.choice(["Yes", "No"])
    exercise = random.choice(["Rare", "Moderate", "Frequent"])
    comorbidities = random.choice(related_diseases[disease])
    drug = random.choice(drugs[disease])
    effectiveness = random.randint(50, 95)
    molecular_structure = molecular_structures[drug]
    moa = random.choice(mechanism_of_action)
    side_effect = random.choice(side_effects)
    related_disease = ", ".join(random.sample(related_diseases[disease], k=1))

    # Ensure 3 symptom columns
    symptom_1, symptom_2, symptom_3 = (symptoms + [None, None, None])[:3]

    data.append([disease, symptom_1, symptom_2, symptom_3, age_range, gender, bmi, smoker, exercise, comorbidities, drug, effectiveness, molecular_structure, moa, side_effect, related_disease])

# Create a DataFrame
df = pd.DataFrame(data, columns=[
    "Disease", "Symptom 1", "Symptom 2", "Symptom 3", "Age Range", "Gender", "BMI", "Smoker", "Exercise", "Comorbidities",
    "Recommended Drug", "Effectiveness (%)", "Molecular Structure", "Mechanism of Action", "Side Effects", "Related Diseases"
])

file_path = "/content/drug_prediction_dataset_3.csv"
df.to_csv(file_path, index=False)
file_path


'/content/drug_prediction_dataset_3.csv'

## Now we have split the dataset into output csv and the csv which will be use for creating ml

## Load Model

In [None]:
# Import all the tools we need

# Regular EDA and plotting Librarries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#Model Evaluations
from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import RocCurveDisplay,roc_curve,roc_auc_score
from sklearn import metrics

In [None]:
df = pd.read_csv("/content/drug_prediction_dataset_3.csv")
df.head()

Unnamed: 0,Disease,Symptom 1,Symptom 2,Symptom 3,Age Range,Gender,BMI,Smoker,Exercise,Comorbidities,Recommended Drug,Effectiveness (%),Molecular Structure,Mechanism of Action,Side Effects,Related Diseases
0,Hypertension,Dizziness,,,31-40,F,27.8,No,Moderate,Stroke,Amlodipine,57,C20H25ClN2O5,Prevents virus replication,,Heart Disease
1,COVID-19,Shortness of Breath,,,41-50,M,27.7,No,Moderate,Pneumonia,Molnupiravir,79,C13H19N3O7,Kills bacteria,Nausea,SARS
2,COVID-19,Cough,,,81-90,M,28.6,Yes,Moderate,Pneumonia,Remdesivir,58,C27H35N6O8P,Reduces inflammation,Fatigue,Pneumonia
3,Flu,Fever,,,71-80,F,32.0,No,Rare,Bronchitis,Oseltamivir,68,C16H28N2O4,Prevents virus replication,Headache,Bronchitis
4,COVID-19,Cough,Loss of Taste,,71-80,F,25.1,Yes,Frequent,SARS,Molnupiravir,51,C13H19N3O7,Lowers glucose,Headache,Pneumonia


In [None]:
df = df.drop(columns=["BMI"])

In [None]:
df["Symptom 1"]= df["Symptom 1"].fillna("None")
df["Symptom 2"]= df["Symptom 2"].fillna("None")
df["Symptom 3"]= df["Symptom 3"].fillna("None")

In [None]:
df.isna().sum()

Unnamed: 0,0
Disease,0
Symptom 1,0
Symptom 2,0
Symptom 3,0
Age Range,0
Gender,0
Smoker,0
Exercise,0
Comorbidities,0
Recommended Drug,0


In [None]:
df["Side Effects"]= df["Side Effects"].fillna("None")

In [None]:
df.isna().sum()

Unnamed: 0,0
Disease,0
Symptom 1,0
Symptom 2,0
Symptom 3,0
Age Range,0
Gender,0
Smoker,0
Exercise,0
Comorbidities,0
Recommended Drug,0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


# Define categorical columns for encoding
categorical_cols = ["Symptom 1", "Symptom 2", "Symptom 3", "Smoker", "Exercise", "Comorbidities", "Age Range", "Gender"]

# Encode categorical features using Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode the target variable (Recommended Drug)
drug_encoder = LabelEncoder()
df["Recommended Drug"] = drug_encoder.fit_transform(df["Recommended Drug"])

# Define features (X) and target (y)
X = df[["Symptom 1", "Symptom 2", "Symptom 3", "Smoker", "Exercise", "Comorbidities", "Age Range", "Gender"]]
y = df["Recommended Drug"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)
    model_scores = {}

    for name, model in models.items():
        print(f"Training {name}...")

        # Train the model
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=1)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=1)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=1)

        # Store the results
        model_scores[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1
        }

        print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    return model_scores



In [None]:


import pandas as pd
import re
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import random
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
# Tune Logistic Regression
import numpy as np
from sklearn.model_selection import RandomizedSearchCV # Import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "LightGBM": LGBMClassifier()

}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

# ... (your data preprocessing code up to the point where you have transformed_x and y)


# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
y = label_encoder.fit_transform(y)

# Print the mapping between original labels and encoded labels
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the models
scores = fit_and_score(models, X_train, X_test, y_train, y_test)
scores



{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23}
Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training KNN...
Training Random Forest...
Training SVM...
Training Decision Tree...
Training Gradient Boosting...
Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002749 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 265
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 9
[LightGBM] [Info] Start training from score -3.170383
[LightGBM] [Info] Start training from score -3.182664
[LightGBM] [Info] Start training from score -3.164743
[LightGBM] [Info] Start training from score -3.143652
[LightGBM] [Info] Start training from score -3.199073
[LightGBM] [Info] Start training from score -3.179355
[LightGBM] [Info] Start training from score -3.205528
[LightGBM] [Info] Start training from score -3.170383
[LightGBM] [Info] Start training from score -3.182062
[LightGBM] [Info] Start trai

{'Logistic Regression': 0.11115,
 'KNN': 0.32085,
 'Random Forest': 0.3304,
 'SVM': 0.22345,
 'Decision Tree': 0.3302,
 'Gradient Boosting': 0.32465,
 'LightGBM': 0.3273}

In [52]:
import pandas as pd
import random

def predict_drug(disease, symptoms, age_range, gender, bmi, smoker, exercise, comorbidities, dataset):
    # Filter by disease
    filtered_df = dataset[dataset['Disease'] == disease]

    # Further filter by symptoms (check if symptoms exist in the Symptom columns)
    for symptom in symptoms:
        filtered_df = filtered_df[(filtered_df['Symptom 1'] == symptom) |
                                  (filtered_df['Symptom 2'] == symptom) |
                                  (filtered_df['Symptom 3'] == symptom)]

    # Further filter by additional parameters
    filtered_df = filtered_df[filtered_df['Age Range'] == age_range]
    filtered_df = filtered_df[filtered_df['Gender'] == gender]
    filtered_df = filtered_df[filtered_df['Smoker'] == smoker]
    filtered_df = filtered_df[filtered_df['Exercise'] == exercise]
    filtered_df = filtered_df[filtered_df['Comorbidities'] == comorbidities]

    # If multiple matches, select the first one
    if not filtered_df.empty:
        recommended_drug = filtered_df.iloc[0]['Recommended Drug']
        effectiveness_score = random.randint(50, 95)  # Generate random effectiveness score
        return recommended_drug, effectiveness_score
    else:
        return "No matching drug found. Try adjusting your inputs.", None

# Load the dataset
dataset = pd.read_csv("ML_dataset_3.csv")

# Example user input
user_disease = "COVID-19"
user_symptoms = ["Cough", "Loss of Taste"]
user_age_range = "31-40"
user_gender = "F"
user_bmi = 25.0  # Not used for filtering in this version
user_smoker = "No"
user_exercise = "Moderate"
user_comorbidities = "Pneumonia"

# Predict the drug
predicted_drug, effectiveness = predict_drug(user_disease, user_symptoms, user_age_range, user_gender, user_bmi, user_smoker, user_exercise, user_comorbidities, dataset)
print(f"Recommended Drug: {predicted_drug}, Effectiveness Score: {effectiveness if effectiveness else 'N/A'}")


Recommended Drug: Remdesivir, Effectiveness Score: 72
