In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Disease Predictor**

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder # Import label encoder

# Load the testing dataset
df = pd.read_csv('/content/drive/MyDrive/MRC/Training.csv')

# Separate features (symptoms) and target (disease)
X = df.drop('prognosis', axis=1)
y = df['prognosis']

# Encode the target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y) # creates numerical labels.

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42) # use encoded labels

# Create and train an XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report for more detailed evaluation
print(classification_report(y_test, y_pred))

# Print the encoded labels
print(label_encoder.classes_) # Shows the mapping of encoded labels to disease names.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        23
           6       1.00      1.00      1.00        33
           7       1.00      1.00      1.00        23
           8       1.00      1.00      1.00        21
           9       1.00      1.00      1.00        15
          10       1.00      1.00      1.00        23
          11       1.00      1.00      1.00        26
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        29
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        19
          16       1.00      1.00      1.00        28
          17 

# **Saving Disease Predictor**

In [46]:
import pickle

# Save the model
with open('/content/drive/MyDrive/MRC/Divit HRC Disease Prediction.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the label encoder
with open('/content/drive/MyDrive/MRC/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# **Recommendation System**

**Loading Libraries and Creating Dataset**

In [47]:
import pandas as pd
import ast
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the datasets
description_df = pd.read_csv('/content/drive/MyDrive/MRC/description.csv')
diets_df = pd.read_csv('/content/drive/MyDrive/MRC/diets.csv')
precautions_df = pd.read_csv('/content/drive/MyDrive/MRC/precautions_df.csv')
medications_df = pd.read_csv('/content/drive/MyDrive/MRC/medications.csv')
symptoms_df = pd.read_csv('/content/drive/MyDrive/MRC/symtoms_df.csv')
severity_df = pd.read_csv('/content/drive/MyDrive/MRC/Symptom-severity.csv')
workout_df = pd.read_csv('/content/drive/MyDrive/MRC/workout_df.csv')

# Group by disease and aggregate symptoms (before merging)
symptoms_columns = ['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4']
symptoms_df['Symptoms'] = symptoms_df[symptoms_columns].apply(lambda row: [item for item in row if isinstance(item, str) and item != 'nan'], axis=1) #create list of symptoms per row.
symptoms_grouped = symptoms_df.groupby('Disease')['Symptoms'].apply(sum).reset_index() #sum the lists of symptoms per disease.

# Merge the datasets
merged_df = pd.merge(description_df, symptoms_grouped, on='Disease', how='left') # Merge symptoms first
merged_df = pd.merge(merged_df, diets_df, on='Disease', how='left')
merged_df = pd.merge(merged_df, precautions_df, on='Disease', how='left')
merged_df = pd.merge(merged_df, medications_df, on='Disease', how='left')
merged_df = pd.merge(merged_df, workout_df, left_on='Disease', right_on='disease', how='left')
merged_df.drop('disease', axis=1, inplace=True)
merged_df.fillna('Not Available', inplace=True)

# Convert list-like strings to lists
merged_df['Diet'] = merged_df['Diet'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
merged_df['Medication'] = merged_df['Medication'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Clean symptom list formatting and handle NaN values
merged_df['Symptoms'] = merged_df['Symptoms'].apply(lambda x: [s.strip() if isinstance(s, str) else s for s in x] if isinstance(x, list) else x)

# Integrate severity
def add_severity(symptom_list):
    if isinstance(symptom_list, list):
        # Remove duplicates and NaN values
        symptom_list = [symptom for symptom in symptom_list if isinstance(symptom, str) and symptom != 'nan']
        symptom_list = list(set(symptom_list))
        return [(symptom, severity_df[severity_df['Symptom'] == symptom]['weight'].values[0] if symptom in severity_df['Symptom'].values else 'Not Available') for symptom in symptom_list]
    else:
        return symptom_list

merged_df['Symptoms'] = merged_df['Symptoms'].apply(add_severity)

# Consolidate precautions into a list
precaution_columns = ['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']
merged_df['Precautions'] = merged_df[precaution_columns].apply(lambda row: [x for x in row if x != 'Not Available'], axis=1)

# Remove duplicate precautions
merged_df['Precautions'] = merged_df['Precautions'].apply(lambda x: list(set(x)))

# Drop the original precaution columns
merged_df.drop(precaution_columns, axis=1, inplace=True)

# Save merged_df to disease_model.csv
merged_df.to_csv('/content/drive/MyDrive/MRC/disease_model.csv', index=False)

**Building the Recommendation Engine**

In [48]:
def get_recommendations(disease_name):
    disease_data = merged_df[merged_df['Disease'] == disease_name].iloc[0]

    print(f"Disease: {disease_data['Disease']}")
    print(f"Description: {disease_data['Description']}")

    # Sort symptoms by severity
    symptoms = disease_data['Symptoms']
    if isinstance(symptoms, str) and symptoms != "Not Available": #Added check here.
        try:
            symptom_list = eval(symptoms)
            symptoms = sorted(symptom_list, key=lambda x: x[1] if isinstance(x, tuple) and len(x) > 1 and isinstance(x[1], (int, float)) else 0, reverse=True)
            print("Symptoms:")
            for item in symptoms:
                if isinstance(item, tuple) and len(item) == 2:
                    symptom, severity = item
                    print(f"- {symptom} (Severity: {severity})")
                else:
                    print(f"- Invalid symptom entry: {item}")
        except (SyntaxError, NameError):
            print(f"Error evaluating symptoms for {disease_name}: {symptoms}")
    else:
        print("Symptoms: Not Available")

    print(f"Diet: {disease_data['Diet']}")
    print(f"Medications: {disease_data['Medication']}")
    print(f"Precautions: {disease_data['Precautions']}")
    print(f"Workout: {disease_data['workout']}")

# **Example Usage**

In [49]:
# Example usage
get_recommendations('Fungal infection')

Disease: Fungal infection
Description: Fungal infection is a common skin condition caused by fungi.
Symptoms: Not Available
Diet: ['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']
Medications: ['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']
Precautions: ['use clean cloths', 'use detol or neem in bathing water', 'keep infected area dry', 'bath twice']
Workout: Avoid sugary foods


In [50]:
import pandas as pd
import pickle

# Load the saved XGBoost model and label encoder
with open('/content/drive/MyDrive/MRC/Divit HRC Disease Prediction.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

with open('/content/drive/MyDrive/MRC/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

# Create a list of all symptoms from the training data
symptoms = X_train.columns.tolist()

# Load the merged_df and severity_df
severity_df = pd.read_csv('/content/drive/MyDrive/MRC/Symptom-severity.csv')
merged_df = pd.read_csv('/content/drive/MyDrive/MRC/disease_model.csv')

def find_disease_by_symptoms(user_symptoms, merged_df, severity_df):
    disease_scores = {}

    for index, row in merged_df.iterrows():
        disease = row['Disease']
        symptoms = row['Symptoms']
        score = 0

        if isinstance(symptoms, str) and symptoms != "Not Available": #Added check here.
            try:
                symptom_list = eval(symptoms)
                for user_symptom in user_symptoms:
                    for s, severity in symptom_list:
                        if s == user_symptom:
                            if isinstance(severity, (int, float)):
                                score += severity
                            else:
                                score += 1 #if severity is not available, add 1.
            except (SyntaxError, NameError):
                print(f"Error evaluating symptoms for {disease}: {symptoms}")
        elif isinstance(symptoms, str) and symptoms == "Not Available":
            continue #skip if symptoms are not available.
        else:
            print(f"Unexpected symptom data type for {disease}: {type(symptoms)}")

        if score > 0: # Only add diseases with a score greater than 0
            disease_scores[disease] = score

    ranked_diseases = sorted(disease_scores.items(), key=lambda x: x[1], reverse=True)

    return ranked_diseases

def predict_disease(user_symptoms, xgb_model, merged_df, severity_df, symptom_list, label_encoder, confidence_threshold=0.8):
    """Predicts disease using XGBoost and falls back to severity matching."""

    # One-hot encode user symptoms
    user_symptoms_encoded = pd.Series([1 if symptom in user_symptoms else 0 for symptom in symptom_list]).values.reshape(1, -1)

    # Predict using XGBoost
    probabilities = xgb_model.predict_proba(user_symptoms_encoded)[0]
    predicted_disease_encoded = probabilities.argmax()
    predicted_disease = label_encoder.inverse_transform([predicted_disease_encoded])[0]
    confidence = probabilities[predicted_disease_encoded]

    if confidence >= confidence_threshold:
        return [(predicted_disease, confidence)] # Return with confidence score.

    else:
        # Fallback to severity-weighted matching
        results = find_disease_by_symptoms(user_symptoms, merged_df, severity_df)
        return results

# Example usage
user_symptoms = ['itching', 'skin_rash', 'headache']

predictions = predict_disease(user_symptoms, xgb_model, merged_df, severity_df, symptoms, label_encoder)

print("Predictions:")
for disease, score in predictions:
    print(f"Disease: {disease}, Score: {score}")

# Get recommendations for the top predicted disease
if predictions:
    top_disease = predictions[0][0]
    get_recommendations(top_disease)

Predictions:
Disease: Fungal infection, Score: 4
Disease: Drug Reaction, Score: 4
Disease: Chicken pox, Score: 4
Disease: Hypertension, Score: 3
Disease: Migraine, Score: 3
Disease: Paralysis (brain hemorrhage), Score: 3
Disease: Malaria, Score: 3
Disease: Dengue, Score: 3
Disease: Typhoid, Score: 3
Disease: Hypoglycemia, Score: 3
Disease: Acne, Score: 3
Disease: Psoriasis, Score: 3
Disease: Impetigo, Score: 3
Disease: Chronic cholestasis, Score: 1
Disease: Jaundice, Score: 1
Disease: Hepatitis B, Score: 1
Disease: Fungal infection
Description: Fungal infection is a common skin condition caused by fungi.
Symptoms:
- dischromic_patches (Severity: 6)
- nodal_skin_eruptions (Severity: 4)
- skin_rash (Severity: 3)
- itching (Severity: 1)
Diet: ['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']
Medications: ['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']
Precautions: ['use clean cloths', 'use detol or neem in bathing water', 'keep infec