In [5]:
# Import all the tools we need

# Regular EDA and plotting Librarries
import numpy as np
import pandas as pd


# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#Model Evaluations
from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import RocCurveDisplay,roc_curve,roc_auc_score
from sklearn import metrics

In [6]:
df = pd.read_csv("ML_dataset_3.csv")
df.head()

Unnamed: 0,Disease,Symptom 1,Symptom 2,Symptom 3,Age Range,Gender,BMI,Smoker,Exercise,Comorbidities,Recommended Drug
0,Hypertension,Dizziness,,,31-40,F,27.8,No,Moderate,Stroke,Amlodipine
1,COVID-19,Shortness of Breath,,,41-50,M,27.7,No,Moderate,Pneumonia,Molnupiravir
2,COVID-19,Cough,,,81-90,M,28.6,Yes,Moderate,Pneumonia,Remdesivir
3,Flu,Fever,,,71-80,F,32.0,No,Rare,Bronchitis,Oseltamivir
4,COVID-19,Cough,Loss of Taste,,71-80,F,25.1,Yes,Frequent,SARS,Molnupiravir


In [10]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
Disease,Hypertension,COVID-19,COVID-19,Flu,COVID-19,Asthma,Diabetes,Tuberculosis,Flu,Arthritis,...,Arthritis,Asthma,Hypertension,Hypertension,Diabetes,Hypertension,Cancer,Diabetes,Diabetes,Asthma
Symptom 1,Dizziness,Shortness of Breath,Cough,Fever,Cough,Shortness of Breath,Frequent Urination,Weight Loss,Body Aches,Joint Pain,...,Stiffness,Cough,Headache,Headache,Blurred Vision,Headache,Fever,Frequent Urination,Fatigue,Shortness of Breath
Symptom 2,,,,,Loss of Taste,Wheezing,Blurred Vision,Night Sweats,Sore Throat,Fatigue,...,Fatigue,Wheezing,Dizziness,,,Chest Pain,Pain,Thirst,,Wheezing
Symptom 3,,,,,,Cough,,,,Stiffness,...,Swelling,,,,,Dizziness,Weight Loss,,,Cough
Age Range,31-40,41-50,81-90,71-80,71-80,41-50,61-70,51-60,21-30,81-90,...,21-30,31-40,41-50,31-40,51-60,31-40,71-80,21-30,61-70,51-60
Gender,F,M,M,F,F,F,F,F,M,F,...,M,F,M,F,M,F,M,F,F,F
Smoker,No,No,Yes,No,Yes,No,Yes,No,Yes,Yes,...,Yes,No,Yes,Yes,Yes,No,Yes,No,Yes,No
Exercise,Moderate,Moderate,Moderate,Rare,Frequent,Frequent,Moderate,Frequent,Moderate,Frequent,...,Rare,Moderate,Moderate,Moderate,Frequent,Moderate,Rare,Frequent,Rare,Frequent
Comorbidities,Stroke,Pneumonia,Pneumonia,Bronchitis,SARS,Bronchitis,Obesity,HIV,Pneumonia,Gout,...,Osteoporosis,COPD,Stroke,Heart Disease,PCOS,Stroke,Leukemia,PCOS,Obesity,Bronchitis
Recommended Drug,Amlodipine,Molnupiravir,Remdesivir,Oseltamivir,Molnupiravir,Budesonide,Insulin,Rifampin,Baloxavir,Methotrexate,...,Naproxen,Budesonide,Amlodipine,Amlodipine,Metformin,Losartan,Paclitaxel,Metformin,Metformin,Montelukast


In [7]:
df["Symptom 1"]= df["Symptom 1"].fillna("None")
df["Symptom 2"]= df["Symptom 2"].fillna("None")
df["Symptom 3"]= df["Symptom 3"].fillna("None")

In [8]:
df = df.drop(columns=["BMI"])

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)
    model_scores = {}

    for name, model in models.items():
        print(f"Training {name}...")

        # Train the model
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=1)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=1)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=1)

        # Store the results
        model_scores[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1
        }

        print(f"{name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    return model_scores

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


# Define categorical columns for encoding
categorical_cols = ["Symptom 1", "Symptom 2", "Symptom 3", "Smoker", "Exercise", "Comorbidities", "Age Range", "Gender"]

# Encode categorical features using Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode the target variable (Recommended Drug)
drug_encoder = LabelEncoder()
df["Recommended Drug"] = drug_encoder.fit_transform(df["Recommended Drug"])

# Define features (X) and target (y)
X = df[["Symptom 1", "Symptom 2", "Symptom 3", "Smoker", "Exercise", "Comorbidities", "Age Range", "Gender"]]
y = df["Recommended Drug"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:


import pandas as pd
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import random
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
# Tune Logistic Regression
import numpy as np
from sklearn.model_selection import RandomizedSearchCV # Import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "LightGBM": LGBMClassifier()

}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

# ... (your data preprocessing code up to the point where you have transformed_x and y)


# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
y = label_encoder.fit_transform(y)

# Print the mapping between original labels and encoded labels
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the models
scores = fit_and_score(models, X_train, X_test, y_train, y_test)
scores


{np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2), np.int64(3): np.int64(3), np.int64(4): np.int64(4), np.int64(5): np.int64(5), np.int64(6): np.int64(6), np.int64(7): np.int64(7), np.int64(8): np.int64(8), np.int64(9): np.int64(9), np.int64(10): np.int64(10), np.int64(11): np.int64(11), np.int64(12): np.int64(12), np.int64(13): np.int64(13), np.int64(14): np.int64(14), np.int64(15): np.int64(15), np.int64(16): np.int64(16), np.int64(17): np.int64(17), np.int64(18): np.int64(18), np.int64(19): np.int64(19), np.int64(20): np.int64(20), np.int64(21): np.int64(21), np.int64(22): np.int64(22), np.int64(23): np.int64(23)}
Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training KNN...
Training Random Forest...
Training SVM...
Training Decision Tree...
Training Gradient Boosting...
Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 8
[LightGBM] [Info] Start training from score -3.170383
[LightGBM] [Info] Start training from score -3.182664
[LightGBM] [Info] Start training from score -3.164743
[LightGBM] [Info] Start training from score -3.143652
[LightGBM] [Info] Start training from score -3.199073
[LightGBM] [Info] Start training from score -3.179355
[LightGBM] [Info] Start training from score -3.205528
[LightGBM] [Info] Start training from score -3.170383
[LightGBM] [Info] Start training from score -3.182062
[LightGBM] [Info] Start train

{'Logistic Regression': 0.124,
 'KNN': 0.3311,
 'Random Forest': 0.3302,
 'SVM': 0.28345,
 'Decision Tree': 0.33345,
 'Gradient Boosting': 0.32435,
 'LightGBM': 0.33075}

In [17]:
# prompt: create a pkl file of Random Forest model

import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd


# ... (your data preprocessing code as provided in the prompt)

# Define categorical columns for encoding
categorical_cols = ["Symptom 1", "Symptom 2", "Symptom 3", "Smoker", "Exercise", "Comorbidities", "Age Range", "Gender"]

# Encode categorical features using Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode the target variable (Recommended Drug)
drug_encoder = LabelEncoder()
df["Recommended Drug"] = drug_encoder.fit_transform(df["Recommended Drug"])

# Define features (X) and target (y)
X = df[["Symptom 1", "Symptom 2", "Symptom 3", "Smoker", "Exercise", "Comorbidities", "Age Range", "Gender"]]
y = df["Recommended Drug"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a RandomForestClassifier (you can adjust parameters as needed)
rf_classifier = RandomForestClassifier(random_state=42)  #Example, can be tuned
rf_classifier.fit(X_train, y_train)

# Save the trained model to a pickle file
filename = 'random_forest_model.pkl'
pickle.dump(rf_classifier, open(filename, 'wb'))