# Title: Medicine Recommendation System with Machine Learning

# Description:

This notebook implements a machine learning system that predicts diseases based on symptom input and provides relevant health recommendations including descriptions, precautions, medications, diet, and workout suggestions.

### Installing Required Libraries

In [1]:
!pip install fuzzywuzzy
!pip install scikit-learn
!pip install python-Levenshtein  # Speeds up fuzzy matching

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rap

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from fuzzywuzzy import process
import pickle
import time
import warnings
warnings.filterwarnings("ignore")

### Loading and Exploring the Training Dataset

In [3]:
# Load the training dataset
dataset = pd.read_csv('training.csv')

### Data Preprocessing

In [4]:
# Split features and target
X = dataset.drop('prognosis', axis=1)
y = dataset['prognosis']

# Encode disease labels
le = LabelEncoder()
le.fit(y)
Y = le.transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=20)

print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

Training set shape: (3442, 131), (3442,)
Test set shape: (1476, 131), (1476,)


### Training Multiple Models and Enhanced Evaluation

In [5]:
# Dictionary of models to evaluate
models = {
    'SVC': SVC(kernel='linear', probability=True),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
    'MultinomialNB': MultinomialNB()
}

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Scale data for MultinomialNB
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_imputed), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_imputed), columns=X_test.columns)

# Evaluation function
def evaluate_model(name, model, X_tr, X_te):
    start_time = time.time()
    model.fit(X_tr, y_train)
    predictions = model.predict(X_te)
    processing_time = (time.time() - start_time) * 1000  # Convert to milliseconds

    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average='weighted')

    print(f"\n{name} Model Evaluation:")
    print(f"  - Accuracy: {accuracy:.4f}")
    print(f"  - Precision: {precision:.4f}")
    print(f"  - Recall: {recall:.4f}")
    print(f"  - F1 Score: {f1:.4f}")
    print(f"  - Processing Time: {processing_time:.1f} ms")
    print(f"{'='*40}")

    return accuracy, model, processing_time

# Store results for comparison
model_results = {}

# Train and evaluate all models
for name, model in models.items():
    if name == 'MultinomialNB':
        accuracy, fitted_model, proc_time = evaluate_model(name, model, X_train_scaled, X_test_scaled)
        model_results[name] = {'accuracy': accuracy, 'model': fitted_model, 'scaled': True, 'time': proc_time}
    else:
        accuracy, fitted_model, proc_time = evaluate_model(name, model, X_train_imputed, X_test_imputed)
        model_results[name] = {'accuracy': accuracy, 'model': fitted_model, 'scaled': False, 'time': proc_time}


SVC Model Evaluation:
  - Accuracy: 1.0000
  - Precision: 1.0000
  - Recall: 1.0000
  - F1 Score: 1.0000
  - Processing Time: 1296.2 ms

RandomForest Model Evaluation:
  - Accuracy: 1.0000
  - Precision: 1.0000
  - Recall: 1.0000
  - F1 Score: 1.0000
  - Processing Time: 786.7 ms

KNeighbors Model Evaluation:
  - Accuracy: 1.0000
  - Precision: 1.0000
  - Recall: 1.0000
  - F1 Score: 1.0000
  - Processing Time: 372.0 ms

MultinomialNB Model Evaluation:
  - Accuracy: 1.0000
  - Precision: 1.0000
  - Recall: 1.0000
  - F1 Score: 1.0000
  - Processing Time: 123.6 ms


### Find and Save the Best Model

In [6]:
# Select the best model (MultinomialNB from previous analysis)
best_model_name = 'MultinomialNB'
best_model = model_results[best_model_name]['model']
best_accuracy = model_results[best_model_name]['accuracy']
requires_scaling = model_results[best_model_name].get('scaled', False)

# Create the components dictionary with preprocessing tools
components = {
    'model': best_model,
    'imputer': imputer,
    'label_encoder': le,
    'requires_scaling': requires_scaling,
    'scaler': scaler if requires_scaling else None,
    'feature_names': list(X.columns)
}

# Save the model and preprocessing components to a pickle file
with open('best_model.pkl', 'wb') as file:
    pickle.dump(components, file)

In [7]:
# Load the saved model components
def load_model(model_path='best_model.pkl'):
    with open(model_path, 'rb') as file:
        components = pickle.load(file)
    return components

# Check symptom relevance for a disease
def check_symptom_relevance(disease, user_symptoms, training_data):
    # Get all symptoms for the given disease from training data
    disease_symptoms = []
    disease_row = training_data[training_data['prognosis'] == disease]

    if not disease_row.empty:
        # Get symptom columns that are 1 for this disease
        disease_symptoms = [col for col in disease_row.columns[:-1]
                           if disease_row[col].values[0] == 1]

    # Calculate how many of the user's symptoms are common for this disease
    common_symptoms = [s for s in user_symptoms if s in disease_symptoms]

    # Calculate percentage match
    match_percentage = (len(common_symptoms) / len(user_symptoms)) * 100 if user_symptoms else 0

    return {
        "common_symptoms": common_symptoms,
        "match_percentage": match_percentage
    }

### Loading All Necessary Datasets for Recommendation System

In [8]:
# Clean column names to handle case and space inconsistencies
def clean_columns(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
    return df

# Load and clean all relevant datasets
def load_datasets():
    datasets = {
        'description': pd.read_csv('description.csv'),
        'precaution': pd.read_csv('precaution.csv'),
        'diet': pd.read_csv('diet.csv'),
        'medication': pd.read_csv('medication.csv'),
        'workout': pd.read_csv('workout.csv'),
        'severity': pd.read_csv('Symptom-severity.csv'),
        'training': pd.read_csv('training.csv')
    }

    # Clean all dataframes
    for key in datasets:
        datasets[key] = clean_columns(datasets[key])

    return datasets

# Load datasets
datasets = load_datasets()

# Extract all symptoms from severity dataset for fuzzy matching
all_symptoms = datasets['severity']['symptom'].str.lower().str.strip().unique().tolist()

### Enhanced Symptom Matching with Feedback

In [9]:
# Enhanced symptom matching function with score and feedback
def match_symptom(input_symptom, min_score=60):
    input_symptom = input_symptom.lower().strip()
    match, score = process.extractOne(input_symptom, all_symptoms)

    if score > min_score:
        return match, score
    else:
        return None, score

# Converts a list of symptoms to a binary feature vector (FIXED: now uses binary values instead of severity weights)
def symptoms_to_vector(user_symptoms, feature_names, severity_data):
    input_vector = [0] * len(feature_names)

    for symptom in user_symptoms:
        if symptom in feature_names:
            index = feature_names.index(symptom)
            input_vector[index] = 1  # Mark symptom as present (1) regardless of severity

    return np.array([input_vector])

### Comprehensive Diagnosis Function with Error Handling

In [19]:
# Diagnose function that handles top alternative diagnoses
def diagnose(symptoms_list, components, datasets, top_n=3, min_symptoms=2, min_confidence=40):
    try:
        # Check if symptoms were provided
        if not symptoms_list or all(not s.strip() for s in symptoms_list):
            return {"Error": "No symptoms provided. Please enter at least one symptom."}

        # Access model components
        model = components['model']
        imputer = components['imputer']
        le = components['label_encoder']
        scaler = components['scaler']
        requires_scaling = components['requires_scaling']
        feature_names = components['feature_names']

        # Match symptoms using fuzzy matching
        matched_symptoms = []
        unmatched_symptoms = []

        for sym in symptoms_list:
            match, score = match_symptom(sym)
            if match:
                matched_symptoms.append(match)
            else:
                unmatched_symptoms.append((sym, score))

        # Check minimum symptoms requirement
        if len(matched_symptoms) < min_symptoms:
            return {"Warning": f"Please provide at least {min_symptoms} symptoms for more accurate prediction. Currently matched: {len(matched_symptoms)} symptoms"}

        # Convert symptoms to feature vector (now uses binary values)
        vector = symptoms_to_vector(matched_symptoms, feature_names, datasets['severity'])

        # Apply preprocessing
        vector_imputed = imputer.transform(vector)
        if requires_scaling:
            vector_processed = scaler.transform(vector_imputed)
        else:
            vector_processed = vector_imputed

        # Get prediction and confidence
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(vector_processed)[0]
            # Get top N predictions
            top_indices = proba.argsort()[-top_n:][::-1]
            top_diseases = [(le.inverse_transform([idx])[0], proba[idx] * 100) for idx in top_indices]

            # Primary prediction
            pred_index = top_indices[0]
            disease = le.inverse_transform([pred_index])[0]
            confidence = proba[pred_index] * 100

            # Check confidence threshold
            if confidence < min_confidence:
                return {"Warning": f"Low confidence prediction ({confidence:.1f}%). Please provide more symptoms for better accuracy."}
        else:
            pred_index = model.predict(vector_processed)[0]
            disease = le.inverse_transform([pred_index])[0]
            confidence = None
            top_diseases = [(disease, None)]

        # Get disease details for primary prediction
        desc = datasets['description'][datasets['description']['disease'] == disease]['description'].values
        precautions_list = datasets['precaution'][datasets['precaution']['disease'] == disease].values.flatten()[1:]
        meds = datasets['medication'][datasets['medication']['disease'] == disease].values.flatten()[1:]
        diets_list = datasets['diet'][datasets['diet']['disease'] == disease].values.flatten()[1:]
        workouts_list = datasets['workout'][datasets['workout']['disease'] == disease].values.flatten()[1:]

        # Calculate symptom relevance score for the disease
        symptom_relevance = check_symptom_relevance(disease, matched_symptoms, datasets['training'])

        # Handle mixed types safely by converting to string first
        result = {
            "Disease": disease,
            "Confidence": f"{confidence:.1f}%" if confidence is not None else "Not available",
            "Description": str(desc[0]) if len(desc) else "No description found.",
            "Precautions": [str(p) for p in precautions_list if p and str(p).strip() != "nan"],
            "Medications": [str(m) for m in meds if m and str(m).strip() != "nan"],
            "Diet": [str(d) for d in diets_list if d and str(d).strip() != "nan"],
            "Workouts": [str(w) for w in workouts_list if w and str(w).strip() != "nan"],
            "MatchedSymptoms": matched_symptoms,
            "TopAlternatives": top_diseases[1:],  # Exclude the primary prediction
            "SymptomRelevance": symptom_relevance
        }

        return result

    except Exception as e:
        return {"Error": f"An error occurred during diagnosis: {str(e)}. Please try again."}

### User Interaction and Output Formatting

In [20]:
# Display results function for CLI use
def display_results(result):
    if "Error" in result:
        print(f"\n ERROR: {result['Error']}")
        return
    elif "Warning" in result:
        print(f"\n WARNING: {result['Warning']}")
        return

    print("\n" + "="*50)
    print(f" PREDICTED DISEASE: {result['Disease']}")
    print(f" CONFIDENCE: {result['Confidence']}")

    # Display symptom relevance information
    if "SymptomRelevance" in result:
        match_pct = result["SymptomRelevance"]["match_percentage"]
        print(f" SYMPTOM MATCH: {match_pct:.1f}% of your symptoms match this disease")

    print("="*50)

    print("\n DESCRIPTION:")
    print(result["Description"])

    print("\n PRECAUTIONS:")
    for i, p in enumerate(result["Precautions"], 1):
        print(f"{i}. {p}")

    print("\n MEDICATIONS:")
    if result["Medications"]:
        for i, m in enumerate(result["Medications"], 1):
            print(f"{i}. {m}")
    else:
        print("No specific medications listed. Please consult a healthcare professional.")

    print("\n RECOMMENDED WORKOUTS:")
    if result["Workouts"] and result["Workouts"][0] != "None":
        for i, w in enumerate(result["Workouts"], 1):
            print(f"{i}. {w}")
    else:
        print("No specific workouts listed. Rest may be recommended.")

    print("\n DIETARY RECOMMENDATIONS:")
    if result["Diet"] and result["Diet"][0] != "None":
        for i, d in enumerate(result["Diet"], 1):
            print(f"{i}. {d}")
    else:
        print("No specific diet recommendations listed.")

    print("\n MATCHED SYMPTOMS:")
    for i, s in enumerate(result["MatchedSymptoms"], 1):
        print(f"{i}. {s}")

    # Display alternative diagnoses
    if "TopAlternatives" in result and result["TopAlternatives"]:
        print("\n ALTERNATIVE POSSIBLE DIAGNOSES:")
        for i, (disease, conf) in enumerate(result["TopAlternatives"], 1):
            conf_str = f"{conf:.1f}%" if conf is not None else "N/A"
            print(f"{i}. {disease} (Confidence: {conf_str})")

    print("\n" + "="*50)
    print(" DISCLAIMER: This is not a medical diagnosis. Please consult a healthcare professional.")
    print("="*50)

### Interactive User Interface

In [23]:
if __name__ == "__main__":
    print("\n WELCOME TO THE DISEASE PREDICTION SYSTEM \n")
    print("This system uses machine learning to predict potential diseases based on symptoms.")
    print("Please note this is for educational purposes only and is not a substitute for professional medical advice.\n")

    # Get user input
    user_input = input("Enter your symptoms separated by commas (e.g., headache, dizziness, nausea): ").split(',')
    user_symptoms = [s.strip() for s in user_input]

    # Load model and datasets
    components = load_model()
    datasets = load_datasets()

    # Run diagnosis
    result = diagnose(user_symptoms, components, datasets)

    # Display results
    display_results(result)

    print("\nThank you for using the Disease Prediction System. Stay healthy!")


 WELCOME TO THE DISEASE PREDICTION SYSTEM 

This system uses machine learning to predict potential diseases based on symptoms.
Please note this is for educational purposes only and is not a substitute for professional medical advice.

Enter your symptoms separated by commas (e.g., headache, dizziness, nausea): headache, fever, sweating

 PREDICTED DISEASE: Malaria
 CONFIDENCE: 97.0%
 SYMPTOM MATCH: 66.7% of your symptoms match this disease

 DESCRIPTION:
Malaria is a mosquito-borne infectious disease affecting humans and other animals.

 PRECAUTIONS:
1. Consult nearest hospital
2. avoid oily food
3. avoid non veg food
4. keep mosquitos out

 MEDICATIONS:
1. ['Antimalarial drugs', 'Antipyretics', 'Antiemetic drugs', 'IV fluids', 'Blood transfusions']

 RECOMMENDED WORKOUTS:
1. Stay hydrated
2. Malaria
3. Consume nutrient-rich foods
4. Malaria
5. Include protein-rich foods
6. Malaria
7. Consume foods rich in antioxidants
8. Malaria
9. Limit fatty and greasy foods
10. Malaria
11. Avoid a

### Check Scikit-learn Version

In [13]:
import sklearn
print(f"scikit-learn version: {sklearn.__version__}")

scikit-learn version: 1.6.1


### Preparing for Flask Implementation

In [14]:
from flask import Flask, request, jsonify, render_template

app = Flask(__name__)

# Load the model once when starting the server
components = pickle.load(open('best_model.pkl', 'rb'))
model = components['model']
imputer = components['imputer']
le = components['label_encoder']
scaler = components['scaler']
requires_scaling = components['requires_scaling']
feature_names = components['feature_names']
datasets = load_datasets()

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    symptoms = request.form.getlist('symptoms')
    result = diagnose(symptoms, components, datasets)
    return jsonify(result)

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
