*******************************************************************
Using Random Forest Classifier for Supervised Machine Learning to create an OTC medicine Reccommendation System
*******************************************************************

In [1]:
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.model_selection import GridSearchCV


In [2]:
dp = r"C:\Users\anany\OneDrive\Desktop\combined_file_otc.csv"
df = pd.read_csv(dp)
df.head()

Unnamed: 0,symptom1,symptom2,symptom3,symptom4,OTC Medicine
0,Nausea,Muscle pain,Heartburn,Diarrhea,Antacid (Rolaids)
1,Sinus pressure,Acid reflux,Muscle pain,Fever,Electrolyte solution (Pedialyte)
2,Hangover,Allergic rash,Dry cough,Insomnia,Cold-EEZE
3,Common cold,Heartburn,Runny nose,Cough,Bismuth subsalicylate
4,Headache,Acid reflux,Nausea,Menstrual cramps,Antacid (Tums)


In [3]:
# Split dataset into features and target variable
X = df[['symptom1', 'symptom2', 'symptom3', 'symptom4']]
y = df['OTC Medicine']

# Encode target variable to numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X = pd.get_dummies(X, columns=['symptom1', 'symptom2', 'symptom3', 'symptom4'])



In [4]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Ensure one-hot encoding is consistent between training and testing datasets
missing_cols_train = set(X_train.columns) - set(X_test.columns)
missing_cols_test = set(X_test.columns) - set(X_train.columns)

for col in missing_cols_train:
    X_test[col] = 0

for col in missing_cols_test:
    X_train[col] = 0

# Ensure the order of columns is the same
X_test = X_test[X_train.columns]

In [6]:
rf = RandomForestClassifier()

# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20)
}

rand_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=5, cv=5, random_state=42)

# Fit the RandomizedSearchCV object to the training data
rand_search.fit(X_train, y_train)

# Get the best model
best_rf = rand_search.best_estimator_

# Predict on the test set using the best model
y_pred = best_rf.predict(X_test)

# Convert the predictions back to original categorical labels
y_pred_labels = label_encoder.inverse_transform(y_pred)





In [7]:
score = best_rf.score(X_train, y_train)
score

0.944553396327331

In [8]:
joblib.dump(best_rf, 'best_otc_recommendation_model.pkl')

['best_otc_recommendation_model.pkl']

In [9]:
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']

********************************************
LOADING AND RUNNING THE MODEL SAVED AND CREATED IN THE ABOVE CODE (OTC MEDCINE RECOMMENDATION MODEL)
************************************************

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the dataset
dp = r"C:\Users\anany\OneDrive\Desktop\combined_file_otc.csv"
df = pd.read_csv(dp)

# Split dataset into features and target variable
X = df[['symptom1', 'symptom2', 'symptom3', 'symptom4']]
y = df['OTC Medicine']

# Encode target variable to numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X = pd.get_dummies(X, columns=['symptom1', 'symptom2', 'symptom3', 'symptom4'])

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure one-hot encoding is consistent between training and testing datasets
missing_cols_train = set(X_train.columns) - set(X_test.columns)
missing_cols_test = set(X_test.columns) - set(X_train.columns)

for col in missing_cols_train:
    X_test[col] = 0

for col in missing_cols_test:
    X_train[col] = 0

# Ensure the order of columns is the same
X_test = X_test[X_train.columns]

# Load the trained model
best_rf = joblib.load('best_otc_recommendation_model.pkl')

def predict_medicine(symptoms):
    # Create a DataFrame from symptoms
    df = pd.DataFrame([symptoms], columns=['symptom1', 'symptom2', 'symptom3', 'symptom4'])
    
    # One-hot encode the symptoms
    df = pd.get_dummies(df, columns=['symptom1', 'symptom2', 'symptom3', 'symptom4'])
    
    # Align columns with training data
    df = df.reindex(columns=X_train.columns, fill_value=0)
    
    # Predict the medicine
    y_pred = best_rf.predict(df)
    
    # Convert the prediction to original categorical label
    medicine = label_encoder.inverse_transform(y_pred)[0]
    
    return medicine

# Test the prediction function
symptoms1 = ['Fever', 'Cough', 'Headache', 'Fatigue']
medicine = predict_medicine(symptoms1)
print(f"The recommended OTC medicine for the symptoms {symptoms1} is {medicine}.")


symptoms2 = ['Itching', 'Skin Rash', 'Patches', 'Boils']
medicine = predict_medicine(symptoms2)
print(f"The recommended OTC medicine for the symptoms {symptoms2} is {medicine}.")


The recommended OTC medicine for the symptoms ['Fever', 'Cough', 'Headache', 'Fatigue'] is Cold-EEZE.
The recommended OTC medicine for the symptoms ['Itching', 'Skin Rash', 'Patches', 'Boils'] is Miconazole.
