In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [10]:
import pandas as pd
import random

# Define symptom pool
symptoms_pool = [
    "Fever", "Cough", "Loss of appetite", "Limping", "Swelling", "Pain", "Vomiting",
    "Diarrhea", "Dehydration", "Rash", "Itching", "Sneezing", "Watery eyes",
    "Lethargy", "Weight loss", "Bruising", "Hair loss", "Nasal discharge",
    "Ear infection", "Excessive drooling", "Red eyes", "Shivering", "Anxiety", "Aggression"
]

# Disease-symptom mapping (more realistic)
disease_symptom_map = {
    "Flu": ["Fever", "Cough", "Sneezing", "Lethargy", "Watery eyes"],
    "Leg Injury": ["Limping", "Swelling", "Pain", "Bruising"],
    "Food Poisoning": ["Vomiting", "Diarrhea", "Dehydration", "Loss of appetite"],
    "Skin Infection": ["Rash", "Itching", "Hair loss", "Red eyes"],
    "Allergy": ["Itching", "Sneezing", "Watery eyes", "Rash"],
    "Worm Infestation": ["Weight loss", "Vomiting", "Diarrhea", "Lethargy"],
    "Respiratory Infection": ["Cough", "Nasal discharge", "Sneezing", "Fever"],
    "Muscle Strain": ["Limping", "Pain", "Shivering", "Lethargy"],
    "Eye Infection": ["Red eyes", "Watery eyes", "Discharge", "Lethargy"],
    "Ear Mite Infestation": ["Ear infection", "Itching", "Aggression", "Shivering"],
    "Dental Disease": ["Loss of appetite", "Excessive drooling", "Pain", "Aggression"],
    "Parvovirus": ["Vomiting", "Diarrhea", "Dehydration", "Fever"],
    "Arthritis": ["Limping", "Pain", "Lethargy", "Shivering"]
}

# Generate 10,000 rows of more meaningful data
data = []
for _ in range(1001):
    disease = random.choice(list(disease_symptom_map.keys()))
    core_symptoms = disease_symptom_map[disease]
    
    # Choose 2–4 core symptoms
    chosen_symptoms = random.sample(core_symptoms, k=random.randint(2, min(4, len(core_symptoms))))
    
    # Add 0–1 noise symptoms (from unrelated symptoms)
    noise_symptoms = list(set(symptoms_pool) - set(core_symptoms))
    if random.random() < 0.3:  # 30% chance of adding noise
        chosen_symptoms.append(random.choice(noise_symptoms))
    
    # Ensure 3 symptoms per row (pad if needed)
    while len(chosen_symptoms) < 3:
        chosen_symptoms.append(random.choice(core_symptoms))
    random.shuffle(chosen_symptoms)
    
    # Add row
    data.append(chosen_symptoms[:3] + [disease])

# Create DataFrame
df = pd.DataFrame(data, columns=["Symptom1", "Symptom2", "Symptom3", "Disease"])
df

Unnamed: 0,Symptom1,Symptom2,Symptom3,Disease
0,Watery eyes,Shivering,Red eyes,Eye Infection
1,Itching,Hair loss,Rash,Skin Infection
2,Rash,Itching,Ear infection,Allergy
3,Aggression,Shivering,Ear infection,Ear Mite Infestation
4,Itching,Aggression,Shivering,Ear Mite Infestation
...,...,...,...,...
996,Limping,Pain,Swelling,Leg Injury
997,Weight loss,Vomiting,Lethargy,Worm Infestation
998,Pain,Swelling,Limping,Leg Injury
999,Fever,Cough,Sneezing,Respiratory Infection


In [11]:
df_onehot = pd.get_dummies(df, columns=['Symptom1', 'Symptom2', 'Symptom3'])

In [12]:
X = df_onehot.drop(columns=['Disease']).astype('float32')
y = df_onehot['Disease']

In [13]:
from sklearn.model_selection import train_test_split

# Convert features to float
X = df_onehot.drop(columns=['Disease']).astype('float32')

# Encode target (label encoding or one-hot)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df_onehot['Disease'])  # y will be integer labels

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# --- Logistic Regression ---
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

# --- Random Forest ---
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

# --- Support Vector Machine ---
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

Logistic Regression Accuracy: 0.8258706467661692
Random Forest Accuracy: 0.7711442786069652
SVM Accuracy: 0.8308457711442786
