### 1. Cloning the dataset

In [1]:
!git clone https://github.com/Alvin-Tan-Yi-Tung/Animal_Disease_Prediction.git

Cloning into 'Animal_Disease_Prediction'...
remote: Enumerating objects: 49, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 49 (delta 18), reused 40 (delta 12), pack-reused 0 (from 0)[K
Receiving objects: 100% (49/49), 249.29 KiB | 1.66 MiB/s, done.
Resolving deltas: 100% (18/18), done.


### 2. Import neccessary libraries

In [2]:
import pandas as pd
import numpy as np

### Data Preparation

In [3]:
import pandas as pd

# Load the detailed outbreak report (Dataset 1)
df_detailed = pd.read_csv('/content/Animal_Disease_Prediction/backend/Outbreak_Data.csv')

# Load the aggregated outbreak counts (Dataset 2)
# df_counts = pd.read_csv('/content/Animal_Disease_Prediction/backend/Total_Outbreaks_State.csv')

# Merge both datasets on 'State' and 'Disease Type'
# df_merged = pd.merge(df_detailed, df_counts, on=['State', 'Disease Type'], how='left')

# Fill missing counts with zero (in case no historical count available)
# df_merged['Count of State'] = df_merged['Count of State'].fillna(0)

# Preview merged dataset
# print(df_merged.head())
print(df_detailed.head())


   Reference Administrative divisions  \
0  ob_150996                   Sepang   
1  ob_150997                   Sepang   
2  ob_150998             Kuala Langat   
3  ob_151002                   Sepang   
4  ob_151003             Kuala Langat   

                                    Location Epidemiological unit  \
0                LOT 3336 SG BELANKAN (S443)                 Farm   
1            LOT 3330 SUNGAI BELANKAN (S445)                 Farm   
2          NO.4,LORONG KUIL KG.TUMBUK (SE20)                 Farm   
3  LOT 1871 LDG TUMBUK 42800 TG SEPAT (SE50)                 Farm   
4                 LOT 1871 LDG TUMBUK (S130)                 Farm   

   Number of outbreaks Start Date End Date     State      Report  \
0                    1  28/1/2025      NaN  Selangor  EVENT 4158   
1                    1   4/2/2025      NaN  Selangor  EVENT 4158   
2                    1   5/2/2025      NaN  Selangor  EVENT 4158   
3                    1   7/2/2025      NaN  Selangor  EVENT 4158  

### Data Preprocessing and data cleaning

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the merged dataset (from Step 1)
df = pd.read_csv('/content/Animal_Disease_Prediction/backend/Outbreak_Data.csv')

# Clean Dates
df['Start Date'] = pd.to_datetime(df['Start Date'], format='%d/%m/%Y', errors='coerce')
df = df.dropna(subset=['Start Date'])  # Remove rows with invalid dates

In [8]:
# Remove any duplicate rows in the dataset
print("Shape of the DataFrame before dropping duplicates:", df.shape)

df = df.drop_duplicates()

print("Shape of the DataFrame after dropping duplicates:", df.shape)

Shape of the DataFrame before dropping duplicates: (1396, 10)
Shape of the DataFrame after dropping duplicates: (1396, 10)


In [9]:
# Remove rows where key columns are missing (State, Disease Type, Start Date)
df = df.dropna(subset=['State', 'Disease Type', 'Start Date'])

print("Shape of the DataFrame after remove missing values:", df.shape)

Shape of the DataFrame after remove missing values: (1396, 10)


In [11]:
# Remove any invalid number of outbreaks
df = df[df['Number of outbreaks'].notnull()]

print("Shape of the DataFrame after cleaning:", df.shape)

Shape of the DataFrame after cleaning: (1396, 10)


In [12]:
# Feature Engineering: Time Features: Year, Month, Day
df['Year'] = df['Start Date'].dt.year
df['Month'] = df['Start Date'].dt.month
df['Day'] = df['Start Date'].dt.day

# Encode Categories (State and Disease Type)
le_disease = LabelEncoder()
le_state = LabelEncoder()

df['Disease_Code'] = le_disease.fit_transform(df['Disease Type'])
df['State_Code'] = le_state.fit_transform(df['State'])

# Create an 'Outbreak Occurred' binary target (1 = outbreak reported)
df['Outbreak'] = (df['Number of outbreaks'] > 0).astype(int)

# Preview cleaned and feature-engineered data
print(df.head())

   Reference Administrative divisions  \
0  ob_150996                   Sepang   
1  ob_150997                   Sepang   
2  ob_150998             Kuala Langat   
3  ob_151002                   Sepang   
4  ob_151003             Kuala Langat   

                                    Location Epidemiological unit  \
0                LOT 3336 SG BELANKAN (S443)                 Farm   
1            LOT 3330 SUNGAI BELANKAN (S445)                 Farm   
2          NO.4,LORONG KUIL KG.TUMBUK (SE20)                 Farm   
3  LOT 1871 LDG TUMBUK 42800 TG SEPAT (SE50)                 Farm   
4                 LOT 1871 LDG TUMBUK (S130)                 Farm   

   Number of outbreaks Start Date End Date     State      Report  \
0                    1 2025-01-28      NaN  Selangor  EVENT 4158   
1                    1 2025-02-04      NaN  Selangor  EVENT 4158   
2                    1 2025-02-05      NaN  Selangor  EVENT 4158   
3                    1 2025-02-07      NaN  Selangor  EVENT 4158  

In [17]:
# SHow the total counts of the label encoding
display(df['Disease_Code'].value_counts())
display(df['State_Code'].value_counts())

Unnamed: 0_level_0,count
Disease_Code,Unnamed: 1_level_1
6,831
5,322
1,186
2,49
7,4
4,2
0,1
3,1


Unnamed: 0_level_0,count
State_Code,Unnamed: 1_level_1
12,830
7,106
11,81
4,71
2,61
6,46
13,42
5,37
9,28
10,28


In [18]:
# Mapping between the original label and label encoding
print(dict(zip(le_state.classes_, le_state.transform(le_state.classes_))))
print(dict(zip(le_disease.classes_, le_disease.transform(le_disease.classes_))))

{'Johor': np.int64(0), 'Kedah': np.int64(1), 'Kelantan': np.int64(2), 'Kuala Lumpur': np.int64(3), 'Melaka': np.int64(4), 'Negeri Sembilan': np.int64(5), 'Pahang': np.int64(6), 'Perak': np.int64(7), 'Perak ': np.int64(8), 'Perlis': np.int64(9), 'Pulau Pinang': np.int64(10), 'Sabah': np.int64(11), 'Sarawak': np.int64(12), 'Selangor': np.int64(13), 'Terengganu': np.int64(14)}
{'African horse sickness virus': np.int64(0), 'African swine fever virus': np.int64(1), 'Highly pathogenic avian influenza virus': np.int64(2), 'Infectious myonecrosis virus': np.int64(3), 'Influenza A virus': np.int64(4), 'Lumpy skin disease virus': np.int64(5), 'Rabies virus': np.int64(6), 'Tilapia lake virus': np.int64(7)}


In [19]:
# Save the cleaned dataset to a new CSV file
df.to_csv('cleaned_animal_disease_data.csv', index=False, encoding='utf-8-sig')

### Data Training

In [21]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv('cleaned_animal_disease_data.csv')

# Sort data chronologically (important for time-based prediction)
df = df.sort_values(by='Start Date').reset_index(drop=True)

print(df.head())

  Reference Administrative divisions          Location Epidemiological unit  \
0      7433             Kuala Lumpur  Pasir Wardieburn              Village   
1      7434                    Perak  Changkat Tualang              Village   
2      7435                    Perak       Bukit Merah              Village   
3      7437             Pulau Pinang   Permatang Bogak              Village   
4      7436                    Perak      Titi Gantung              Village   

   Number of outbreaks  Start Date   End Date         State     Report  \
0                    1  2006-02-06  22/3/2006  Kuala Lumpur  EVENT 120   
1                    1  2006-03-11  22/3/2006         Perak  EVENT 120   
2                    1  2006-03-16  22/3/2006         Perak  EVENT 120   
3                    1  2006-03-18  22/3/2006  Pulau Pinang  EVENT 120   
4                    1  2006-03-21  22/3/2006         Perak  EVENT 120   

                              Disease Type  Year  Month  Day  Disease_Code  \
0 

In [23]:
feature_cols = ['State_Code', 'Disease_Code', 'Year', 'Month', 'Day']
X = df[feature_cols].values
y = df['Outbreak'].values

print("✅ Feature shape:", X.shape)
print("✅ Target shape:", y.shape)

✅ Feature shape: (1396, 5)
✅ Target shape: (1396,)


In [24]:
# Create Sequences for Transformer (Sliding Window)
import numpy as np

# Define sequence length (e.g., 3 months)
sequence_length = 3

X_seq = []
y_seq = []

# Create sliding windows
for i in range(len(X) - sequence_length):
    X_seq.append(X[i:i+sequence_length])
    y_seq.append(y[i+sequence_length])  # predict next time step

X_seq = np.array(X_seq)
y_seq = np.array(y_seq)

print("✅ Sequence shape:", X_seq.shape)  # (samples, time_steps, features)
print("✅ Target shape:", y_seq.shape)


✅ Sequence shape: (1393, 3, 5)
✅ Target shape: (1393,)


In [25]:
train_size = int(0.8 * len(X_seq))

X_train, X_test = X_seq[:train_size], X_seq[train_size:]
y_train, y_test = y_seq[:train_size], y_seq[train_size:]

print(f"✅ Training samples: {X_train.shape[0]}")
print(f"✅ Testing samples: {X_test.shape[0]}")


✅ Training samples: 1114
✅ Testing samples: 279


In [26]:
# Transformer Model Setup
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model Parameters
input_dim = X_train.shape[2]
d_model = 64
nhead = 4
num_layers = 2
sequence_length = X_train.shape[1]

# defines the neural network model
class OutbreakTransformer(nn.Module):
    def __init__(self):
        super(OutbreakTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(d_model * sequence_length, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = x.reshape(x.size(0), -1)
        x = self.fc_out(x)
        return self.sigmoid(x)

model = OutbreakTransformer().to(device)


In [27]:
# Prepare data for PyTorch
# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1).to(device)

# DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [28]:
# Training Loop
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/10 - Loss: 0.0310
Epoch 2/10 - Loss: 0.0018
Epoch 3/10 - Loss: 0.0007
Epoch 4/10 - Loss: 0.0004
Epoch 5/10 - Loss: 0.0003
Epoch 6/10 - Loss: 0.0002
Epoch 7/10 - Loss: 0.0001
Epoch 8/10 - Loss: 0.0001
Epoch 9/10 - Loss: 0.0001
Epoch 10/10 - Loss: 0.0001


In [33]:
# Predictions & Results
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor).cpu().numpy()
    preds_binary = (outputs > 0.5).astype(int)

# Print 5 example predictions
for i in range(5):
    print(f"📍 State Code: {X_test[i, -1, 0]}")
    print(f"🚨 Predicted Outbreak: {preds_binary[i][0]} | True: {int(y_test[i])}")
    print("-" * 40)


📍 State Code: 5
🚨 Predicted Outbreak: 1 | True: 1
----------------------------------------
📍 State Code: 4
🚨 Predicted Outbreak: 1 | True: 1
----------------------------------------
📍 State Code: 4
🚨 Predicted Outbreak: 1 | True: 1
----------------------------------------
📍 State Code: 12
🚨 Predicted Outbreak: 1 | True: 1
----------------------------------------
📍 State Code: 13
🚨 Predicted Outbreak: 1 | True: 1
----------------------------------------


**Notes:**

Currently have some issue in prediction, as our dataset is too small and not enough data for it to predict.

Then another reason will be since our number of outbreaks for the location is always 1, so it will alway predict true as it does not have 0.