In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load your dataset
df = pd.read_csv('DiseaseAndSymptoms.csv')

# Fill missing symptom columns with a placeholder
df.fillna('None', inplace=True)

# Combine symptoms into a single list
symptom_columns = [f'Symptom_{i}' for i in range(1, 18)]  # Explicitly list the symptom columns
df['symptom_list'] = df[symptom_columns].values.tolist()

# Convert the list of symptoms to a string
df['symptom_str'] = df['symptom_list'].apply(lambda x: ' '.join(x))

# One-hot encode symptoms
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['symptom_str'])

# Encode the disease labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Disease'])

In [3]:
unique_diseases = df['Disease'].unique()

print(unique_diseases)

['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes ' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension ' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 'hepatitis A' 'Hepatitis B' 'Hepatitis C'
 'Hepatitis D' 'Hepatitis E' 'Alcoholic hepatitis' 'Tuberculosis'
 'Common Cold' 'Pneumonia' 'Dimorphic hemmorhoids(piles)' 'Heart attack'
 'Varicose veins' 'Hypothyroidism' 'Hyperthyroidism' 'Hypoglycemia'
 'Osteoarthristis' 'Arthritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Acne' 'Urinary tract infection' 'Psoriasis' 'Impetigo']


In [3]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred) * 100

# Generate classification report
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, output_dict=True)

# Convert report metrics to percentage form
for key in report.keys():
    if key != 'accuracy':
        for metric in report[key]:
            report[key][metric] = report[key][metric] * 100

# Print accuracy and classification report
print(f"Accuracy: {accuracy:.2f}%")
print("\nClassification Report:")
for key, values in report.items():
    if key == 'accuracy':
        continue
    print(f"\nClass: {key}")
    for metric, value in values.items():
        print(f"{metric.capitalize()}: {value:.2f}%")

Accuracy: 100.00%

Classification Report:

Class: (vertigo) Paroymsal  Positional Vertigo
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 1800.00%

Class: AIDS
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 3000.00%

Class: Acne
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 2400.00%

Class: Alcoholic hepatitis
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 2500.00%

Class: Allergy
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 2400.00%

Class: Arthritis
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 2300.00%

Class: Bronchial Asthma
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 3300.00%

Class: Cervical spondylosis
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 2300.00%

Class: Chicken pox
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 2100.00%

Class: Chronic cholestasis
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%
Support: 1500.00%

Class: Co

In [4]:
import joblib

# Save the model to a file
joblib.dump(model, 'disease_prediction_model.pkl')

# Save the label encoder to a file (to decode predictions later)
joblib.dump(label_encoder, 'label_encoder.pkl')

# Save the vectorizer to a file (to transform input text later)
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']