In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the disease/symptom table from the CSV file (path is relative to the
# notebook's working directory)
df = pd.read_csv(filepath_or_buffer='ghana_disease_symptoms.csv')

In [None]:
# Gather every distinct non-missing value found in the columns after the
# first one (the first column is skipped by the [1:] slice)
all_symptoms = set()
for col in df.columns[1:]:
    all_symptoms |= set(df[col].dropna().unique())

In [None]:
# Map each disease name to the list of symptoms recorded for it.
# The row[1:] slice treats every column after the first as a symptom slot;
# missing (NaN) slots are dropped.
disease_symptoms = {}
for _, row in df.iterrows():
    symptoms = [s for s in row[1:] if pd.notna(s)]
    disease = row['Disease']
    if disease in disease_symptoms:
        # The same disease appeared on an earlier row: merge the new symptoms
        # into the existing list instead of silently overwriting it.
        known = disease_symptoms[disease]
        known.extend(s for s in symptoms if s not in known)
    else:
        disease_symptoms[disease] = symptoms

In [None]:
# Turn each disease's symptom list into a row of 0/1 indicator features.
# The dict's keys and values iterate in the same order, so row i of X
# belongs to y[i].
y = [*disease_symptoms]
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(disease_symptoms.values())

In [None]:
# Assemble one table: a 0/1 column per symptom plus the disease label
symptoms_df = pd.DataFrame(X, columns=mlb.classes_).assign(Disease=y)

In [None]:
# Separate the symptom indicator columns (features) from the disease
# label (target)
y = symptoms_df['Disease']
X = symptoms_df.drop(columns='Disease')

In [None]:
# Convert disease names to integer class ids; `le` keeps the mapping so
# predictions can be decoded back to names later
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Build a 100-tree random forest with a fixed seed, then fit it on the
# training portion
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the held-out rows.
# NOTE(review): disease_symptoms is keyed by disease, so every disease
# contributes exactly one row; held-out diseases were never seen during
# training, which makes this accuracy figure of limited meaning.
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
# classification_report raises ValueError when target_names is longer than
# the set of labels it finds in y_test/y_pred, which happens whenever the
# test split does not contain every disease. Passing the full range of
# encoded labels keeps the report aligned with le.classes_.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

In [None]:
def predict_disease(symptoms):
    """Predict the most likely disease for a set of reported symptoms.

    Parameters
    ----------
    symptoms : str or iterable of str
        Symptom names spelled exactly as they appear in ``mlb.classes_``.
        A single string is treated as one symptom rather than being
        iterated character by character. Names the binarizer does not
        know are ignored.

    Returns
    -------
    The first element of ``le.inverse_transform`` applied to the model's
    prediction, i.e. the decoded disease label.
    """
    # A bare string would otherwise be walked character by character, match
    # nothing, and silently yield a prediction from an all-zero vector.
    if isinstance(symptoms, str):
        symptoms = [symptoms]

    # Build the name -> column lookup once instead of converting
    # mlb.classes_ to a list and scanning it for every symptom.
    column_of = {name: idx for idx, name in enumerate(mlb.classes_)}

    # Create input vector
    input_vector = np.zeros(len(mlb.classes_))
    for symptom in symptoms:
        idx = column_of.get(symptom)
        if idx is not None:
            input_vector[idx] = 1

    # Make prediction. A labelled single-row frame is passed so the columns
    # line up by name with the DataFrame the model was fitted on.
    features = pd.DataFrame([input_vector], columns=mlb.classes_)
    prediction = model.predict(features)
    return le.inverse_transform(prediction)[0]

In [None]:
# Demo: run the predictor on a small hand-written symptom list
test_symptoms = [
    'Fever',
    'Headache',
    'Fatigue',
    'Muscle Pain',
]
predicted_disease = predict_disease(test_symptoms)

print(f"\nPredicted Disease: {predicted_disease}")