In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import joblib

# Set random seed for reproducibility
np.random.seed(42)


In [2]:
# Load symptom severity data
severity_df = pd.read_csv('Symptom-severity.csv')
symptoms_list = severity_df['Symptom'].tolist()
severity_weights = dict(zip(severity_df['Symptom'], severity_df['weight']))

print("Number of symptoms:", len(symptoms_list))
print("\nFirst 10 symptoms:", symptoms_list[:10])


Number of symptoms: 133

First 10 symptoms: ['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing', 'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity', 'ulcers_on_tongue']


In [3]:
# Create synthetic disease-symptom dataset
diseases = [
    'Common Cold',
    'Pneumonia',
    'Dengue',
    'Typhoid',
    'Hepatitis',
    'Jaundice'
]

# Define typical symptoms for each disease
disease_symptoms = {
    'Common Cold': ['continuous_sneezing', 'runny_nose', 'headache', 'cough', 'fatigue'],
    'Pneumonia': ['high_fever', 'breathlessness', 'chest_pain', 'cough', 'fatigue'],
    'Dengue': ['high_fever', 'headache', 'joint_pain', 'fatigue', 'nausea'],
    'Typhoid': ['high_fever', 'headache', 'abdominal_pain', 'diarrhoea', 'fatigue'],
    'Hepatitis': ['yellowish_skin', 'dark_urine', 'nausea', 'loss_of_appetite', 'abdominal_pain'],
    'Jaundice': ['yellowish_skin', 'yellowing_of_eyes', 'dark_urine', 'fatigue', 'weight_loss']
}

# Create training data
data = []
for disease in diseases:
    symptoms = disease_symptoms[disease]
    # Create 50 samples per disease with different symptom combinations
    for _ in range(50):
        # Randomly select 3-5 symptoms from the disease's symptom list
        num_symptoms = np.random.randint(3, len(symptoms) + 1)
        selected_symptoms = np.random.choice(symptoms, num_symptoms, replace=False)
        
        # Create a row with all symptoms set to 0
        row = {symptom: 0 for symptom in symptoms_list}
        
        # Set selected symptoms to their severity weights
        for symptom in selected_symptoms:
            row[symptom] = severity_weights.get(symptom, 1)
        
        # Add disease label
        row['Disease'] = disease
        data.append(row)

# Convert to DataFrame
df = pd.DataFrame(data)

print("Dataset shape:", df.shape)
print("\nSample data:")
print(df.head())


Dataset shape: (300, 133)

Sample data:
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        0          0                     0                    4          0   
1        0          0                     0                    4          0   
2        0          0                     0                    4          0   
3        0          0                     0                    4          0   
4        0          0                     0                    4          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

In [4]:
# Prepare features and target
X = df.drop('Disease', axis=1)
y = df['Disease']

# Label encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# Save feature names and label encoder
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'feature_names.joblib')
joblib.dump(le, 'label_encoder.joblib')


Training set shape: (240, 132)
Testing set shape: (60, 132)


['label_encoder.joblib']

In [6]:
# Train and save Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_score = rf_model.score(X_test, y_test)
print("Random Forest accuracy:", rf_score)

# Train and save Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_score = nb_model.score(X_test, y_test)
print("Naive Bayes accuracy:", nb_score)

# Save the models
joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(nb_model, 'naive_bayes_model.joblib')


Random Forest accuracy: 1.0
Naive Bayes accuracy: 1.0


['naive_bayes_model.joblib']