In [14]:
# 🐾 Animal Disease Prediction - Training Notebook
#
# Fits a random-forest classifier on the animal disease CSV, prints its
# accuracy on a held-out split, and saves the model and helper artifacts
# with joblib.

# Step 1: Install dependencies
# (this cell contains no install commands; the packages imported below must
# already be available to the kernel)

# Step 2: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Step 3: Load the dataset
df = pd.read_csv('animal_disease_dataset_200.csv')
df.head()  # NOTE: not the cell's last expression, so Jupyter does not render this preview

# Step 4: Preprocessing
# One-hot encode the 'animal' column; the features are every remaining column
# except the 'disease' target and the 'recommendation' column.
df_encoded = pd.get_dummies(df, columns=['animal'])
X = df_encoded.drop(columns=['disease', 'recommendation'])
y = df['disease']

# Step 5: Train-Test Split
# 20% of the rows are held out; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Model Training
# fit() returns the fitted estimator, so construction and training are chained.
model = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)
preds = model.predict(X_test)

# Step 7: Accuracy
# Reported twice: once as the raw fraction, once as a percentage.
accuracy = accuracy_score(y_test, preds)
print("✅ Model Accuracy:", accuracy)
print(f"📊 Accuracy: {accuracy * 100:.2f}%")

# Step 8: Save model and helpers
# Besides the model itself, persist the training column order, a
# disease -> recommendation lookup, and the accuracy computed above.
feature_columns = X.columns.tolist()
recommendations_by_disease = df.set_index('disease')['recommendation']
recommendation_dict = recommendations_by_disease.to_dict()

joblib.dump(model, 'animal_disease_model.pkl')
joblib.dump(feature_columns, 'model_columns.pkl')
joblib.dump(recommendation_dict, 'recommendations.pkl')
joblib.dump(accuracy, 'model_accuracy.pkl')  # last expression: its return value is the cell output


✅ Model Accuracy: 0.175
📊 Accuracy: 17.50%


['model_accuracy.pkl']

In [15]:
# Step 6.5: Predict Probabilities
# Class-probability estimates from the fitted model for the test rows.
probs = model.predict_proba(X_test)

# Print the predicted label and its highest class probability for the first
# few test samples.
N_PREVIEW = 5  # change to any number of test samples you want to inspect
for sample_idx in range(N_PREVIEW):
    top_pred = preds[sample_idx]
    top_confidence = probs[sample_idx].max()
    print(f"🦠 Predicted Disease: {top_pred} | 🎯 Confidence: {top_confidence * 100:.2f}%")


🦠 Predicted Disease: Feline Calicivirus | 🎯 Confidence: 29.80%
🦠 Predicted Disease: Equine Influenza | 🎯 Confidence: 42.59%
🦠 Predicted Disease: Bloat | 🎯 Confidence: 38.40%
🦠 Predicted Disease: Rabies | 🎯 Confidence: 46.18%
🦠 Predicted Disease: Lyme Disease | 🎯 Confidence: 33.61%


In [16]:
# Build a per-sample report: true label, predicted label, and the highest
# class probability the model assigned to that sample.
results_df = pd.DataFrame(
    {
        # Drop the original row labels so the report is indexed 0..n-1.
        'True Label': y_test.reset_index(drop=True),
        'Predicted Disease': preds,
        # Row-wise maximum over the class probabilities.
        'Confidence': probs.max(axis=1),
    }
)

# Write the full report to disk, then preview the first rows as the cell output.
results_df.to_csv("prediction_confidence_report.csv", index=False)
results_df.head()


Unnamed: 0,True Label,Predicted Disease,Confidence
0,Feline Calicivirus,Feline Calicivirus,0.298
1,Tetanus,Equine Influenza,0.425933
2,Anthrax,Bloat,0.384
3,Kennel Cough,Rabies,0.461767
4,Rabies,Lyme Disease,0.3361


In [4]:
# Compute and persist summary metrics for the fitted model on the test split.
#
# Every metric function this cell calls is imported here, and the predictions
# are derived from `model` and `X_test` directly, so the cell does not depend
# on names left over from an earlier kernel session (`y_pred` and the
# precision/recall/F1 functions are not defined anywhere else in the notebook).
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

y_pred = model.predict(X_test)

# zero_division=0 scores ill-defined per-class metrics as 0 -- the same value
# the default setting uses -- without emitting an UndefinedMetricWarning each time.
performance_metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
    'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
    'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
    # Rows and columns follow model.classes_; stored as nested lists alongside
    # the label order so the matrix can be read back without the model.
    'confusion_matrix': confusion_matrix(y_test, y_pred, labels=model.classes_).tolist(),
    'labels': list(model.classes_)
}
joblib.dump(performance_metrics, "performance_metrics.pkl")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['performance_metrics.pkl']