In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib


## Load the dataset

In [44]:
data = pd.read_csv(r"C:/Users/devya/OneDrive/Desktop/AI-Integration-Project/animal_dis_me.csv")

## Analyze the dataset

In [45]:
data.head(10)

Unnamed: 0,sudden_death,blood_from_nose,trembling,difficult_breathing,blood_from_openings,fever,loss_of_appetite,dullness,swelling,recumnency,...,encephalitis,septicaemia,infertility,nacrotic_foci,diarrhea,weight_loss,shivering,drooling,excessive_urination,prognosis
0,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
1,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
2,1,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
3,1,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
4,1,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
5,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
6,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
7,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
8,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax
9,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anthrax


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 38 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   sudden_death         129 non-null    int64 
 1   blood_from_nose      129 non-null    int64 
 2   trembling            129 non-null    int64 
 3   difficult_breathing  129 non-null    int64 
 4   blood_from_openings  129 non-null    int64 
 5   fever                129 non-null    int64 
 6   loss_of_appetite     129 non-null    int64 
 7   dullness             129 non-null    int64 
 8   swelling             129 non-null    int64 
 9   recumnency           129 non-null    int64 
 10  profuse_salivation   129 non-null    int64 
 11  vesicles             129 non-null    int64 
 12  lameness             129 non-null    int64 
 13  change_in_behaviour  129 non-null    int64 
 14  furious              129 non-null    int64 
 15  dumbness             129 non-null    int64 
 16  nasal_di

In [47]:
data.isnull().sum()

sudden_death           0
blood_from_nose        0
trembling              0
difficult_breathing    0
blood_from_openings    0
fever                  0
loss_of_appetite       0
dullness               0
swelling               0
recumnency             0
profuse_salivation     0
vesicles               0
lameness               0
change_in_behaviour    0
furious                0
dumbness               0
nasal_discharge        0
eye_discharge          0
haemorrages            0
lethargy               0
enteritis              0
abortion               0
no_breed               0
unwillingness          0
stiffness              0
eraction               0
mastication            0
paralysis              0
encephalitis           0
septicaemia            0
infertility            0
nacrotic_foci          0
diarrhea               0
weight_loss            0
shivering              0
drooling               0
excessive_urination    0
prognosis              0
dtype: int64

# Preprocess data

### New Dataset

In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 38 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   sudden_death         129 non-null    int64 
 1   blood_from_nose      129 non-null    int64 
 2   trembling            129 non-null    int64 
 3   difficult_breathing  129 non-null    int64 
 4   blood_from_openings  129 non-null    int64 
 5   fever                129 non-null    int64 
 6   loss_of_appetite     129 non-null    int64 
 7   dullness             129 non-null    int64 
 8   swelling             129 non-null    int64 
 9   recumnency           129 non-null    int64 
 10  profuse_salivation   129 non-null    int64 
 11  vesicles             129 non-null    int64 
 12  lameness             129 non-null    int64 
 13  change_in_behaviour  129 non-null    int64 
 14  furious              129 non-null    int64 
 15  dumbness             129 non-null    int64 
 16  nasal_di

# Missing Values currently

In [60]:
df = data.dropna(subset=['prognosis'])

In [49]:
print(data.columns)

Index(['sudden_death', 'blood_from_nose', 'trembling', 'difficult_breathing',
       'blood_from_openings', 'fever', 'loss_of_appetite', 'dullness',
       'swelling', 'recumnency', 'profuse_salivation', 'vesicles', 'lameness',
       'change_in_behaviour', 'furious', 'dumbness', 'nasal_discharge',
       'eye_discharge', 'haemorrages', 'lethargy', 'enteritis', 'abortion',
       'no_breed', 'unwillingness', 'stiffness', 'eraction ', 'mastication',
       'paralysis', 'encephalitis', 'septicaemia', 'infertility',
       'nacrotic_foci', 'diarrhea', 'weight_loss', 'shivering', 'drooling',
       'excessive_urination', 'prognosis'],
      dtype='object')


In [50]:
print(data['prognosis'].dtype)

object


In [51]:
print(data['prognosis'].value_counts())


listeriosis                14
bovine_ephemeral_fever     14
blue_tongue                12
west_nile_virus            11
bovine_tuberculosis        11
anthrax                    10
johnes_disease              9
chronic_wasting_disease     9
tetanus                     8
vibriosis                   8
foot_and_mouth              7
black_leg                   6
rabies                      5
pox                         2
brucellosis                 2
no_disease                  1
Name: prognosis, dtype: int64


## Label the target variable

In [61]:
X = df.drop('prognosis', axis=1)  # All columns except 'prognosis'
y = df['prognosis']  

## Shape issue??

## Split the data into training and testing data

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Initialize the Random Forest Model

## Reduce the number of estimators for better scores: 100 to 10?

In [63]:
rf_model = RandomForestClassifier(n_estimators=10, random_state=42, class_weight='balanced')  

## Train the random forest model

In [64]:
rf_model.fit(X_train, y_train)

## Evaluate the Model using the Test Data

In [65]:
y_pred = rf_model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8461538461538461

Classification Report:
                          precision    recall  f1-score   support

                anthrax       1.00      1.00      1.00         1
              black_leg       1.00      1.00      1.00         2
            blue_tongue       1.00      1.00      1.00         2
 bovine_ephemeral_fever       1.00      1.00      1.00         3
    bovine_tuberculosis       0.00      0.00      0.00         2
chronic_wasting_disease       1.00      1.00      1.00         3
         foot_and_mouth       1.00      1.00      1.00         2
         johnes_disease       0.50      0.50      0.50         2
            listeriosis       1.00      1.00      1.00         3
                    pox       1.00      1.00      1.00         1
                 rabies       1.00      0.50      0.67         2
                tetanus       1.00      1.00      1.00         1
              vibriosis       1.00      1.00      1.00         1
        west_nile_virus       0.33

## Here is another way to run the model

In [31]:
from sklearn.model_selection import LeaveOneOut, cross_val_score


In [32]:
loo = LeaveOneOut()

In [33]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

In [34]:
scores = cross_val_score(rf_model, features, labels, cv=loo)

In [35]:
print("LOOCV Accuracy:", scores.mean())

LOOCV Accuracy: 0.0
