# FYP - Vector Borne Disease Prediction with xAI

## Introduction
Vectors are living organisms that can transmit infectious pathogens between humans, or from animals to humans. Many of these vectors are bloodsucking insects, which ingest disease-producing microorganisms during a blood meal from an infected host (human or animal) and later transmit it into a new host, after the pathogen has replicated. Often, once a vector becomes infectious, they are capable of transmitting the pathogen for the rest of their life during each subsequent bite/blood meal.

Vector-borne diseases are human illnesses caused by parasites, viruses and bacteria that are transmitted by vectors. Every year there are more than 700,000 deaths from vector borne diseases such as malaria, dengue, yellow fever, Japanese encephalitis and West Nile Fever.

## Libraries

In [1]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.model_selection import GridSearchCV
from tabulate import tabulate

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [2]:
from google.colab import drive

In [None]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

## Set-Up

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data_folder = '/content/drive/MyDrive/VB Disease Dataset'

## Dataset Analysis

In [5]:
# Read the dataset
train = pd.read_csv('/content/drive/MyDrive/VB Disease Dataset/train.csv')
test = pd.read_csv('/content/drive/MyDrive/VB Disease Dataset/test.csv')

In [6]:
data = train

In [7]:
print(data.head())

   id  sudden_fever  headache  mouth_bleed  nose_bleed  muscle_pain  \
0   0           1.0       1.0          0.0         1.0          1.0   
1   1           0.0       0.0          0.0         0.0          0.0   
2   2           0.0       1.0          1.0         1.0          0.0   
3   3           0.0       0.0          1.0         1.0          1.0   
4   4           0.0       0.0          0.0         0.0          0.0   

   joint_pain  vomiting  rash  diarrhea  ...  breathing_restriction  \
0         1.0       1.0   0.0       1.0  ...                    0.0   
1         0.0       1.0   0.0       1.0  ...                    0.0   
2         1.0       1.0   1.0       1.0  ...                    1.0   
3         1.0       0.0   1.0       0.0  ...                    0.0   
4         0.0       0.0   0.0       1.0  ...                    0.0   

   toe_inflammation  finger_inflammation  lips_irritation  itchiness  ulcers  \
0               0.0                  0.0              0.0        0

In [8]:
id_df = data.pop('id')

In [9]:
data.head()

Unnamed: 0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lyme_disease
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tungiasis
2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,Lyme_disease
3,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zika
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,Rift_Valley_fever


In [10]:
print(data.shape)

(707, 65)


In [11]:
print(data.describe())

       sudden_fever    headache  mouth_bleed  nose_bleed  muscle_pain  \
count    707.000000  707.000000   707.000000  707.000000   707.000000   
mean       0.503536    0.449788     0.459689    0.487977     0.517680   
std        0.500341    0.497825     0.498725    0.500209     0.500041   
min        0.000000    0.000000     0.000000    0.000000     0.000000   
25%        0.000000    0.000000     0.000000    0.000000     0.000000   
50%        1.000000    0.000000     0.000000    0.000000     1.000000   
75%        1.000000    1.000000     1.000000    1.000000     1.000000   
max        1.000000    1.000000     1.000000    1.000000     1.000000   

       joint_pain    vomiting        rash    diarrhea  hypotension  ...  \
count  707.000000  707.000000  707.000000  707.000000   707.000000  ...   
mean     0.449788    0.441301    0.487977    0.390382     0.393211  ...   
std      0.497825    0.496894    0.500209    0.488181     0.488809  ...   
min      0.000000    0.000000    0.000000 

In [12]:
target_types = list(train['prognosis'].unique())
target_types

['Lyme_disease',
 'Tungiasis',
 'Zika',
 'Rift_Valley_fever',
 'West_Nile_fever',
 'Malaria',
 'Chikungunya',
 'Plague',
 'Dengue',
 'Yellow_Fever',
 'Japanese_encephalitis']

## EDA

In [13]:
print(f'[INFO] Shapes:'
      f'\n train: {train.shape}'
      f'\n test: {test.shape}\n')

print(f'[INFO] Any missing values:'
      f'\n train: {train.isna().any().any()}'
      f'\n test: {test.isna().any().any()}')

[INFO] Shapes:
 train: (707, 65)
 test: (303, 65)

[INFO] Any missing values:
 train: False
 test: False


In [14]:
# Create figure
fig = px.histogram(train['prognosis'], color_discrete_sequence=['#636EFA'])

# Set Title and x/y axis labels
fig.update_layout(
    xaxis_title="Disease",
    yaxis_title="Frequency",
    showlegend=False,
    font=dict(size=14),
    title={
        'text': "Train Prognosis Distribution",
        'y': 0.95,
        'x': 0.5
    }
)

# Display
fig.show()


In [15]:
# Create figure
fig = px.imshow(train.corr())

# Set Title and x/y axis labels
fig.update_layout(
    showlegend=False,
    font=dict(size=14),
    title={
        'text': "Train Dataset Correlation",
        'y': 0.95,
        'x': 0.49
    }
)

# Display
fig.show()

ValueError: could not convert string to float: 'Lyme_disease'

## Pre-Processing


### Basic Pre-processing

In [16]:
# Remove any missing values
data = data.dropna()

# Convert 'prognosis' text to lowercase
data['prognosis'] = data['prognosis'].str.lower()

# Remove punctuation from 'prognosis' text
data['prognosis'] = data['prognosis'].str.replace('[^\w\s]', '', regex=True)

# Display the modified DataFrame
print(data.head())


   sudden_fever  headache  mouth_bleed  nose_bleed  muscle_pain  joint_pain  \
0           1.0       1.0          0.0         1.0          1.0         1.0   
1           0.0       0.0          0.0         0.0          0.0         0.0   
2           0.0       1.0          1.0         1.0          0.0         1.0   
3           0.0       0.0          1.0         1.0          1.0         1.0   
4           0.0       0.0          0.0         0.0          0.0         0.0   

   vomiting  rash  diarrhea  hypotension  ...  breathing_restriction  \
0       1.0   0.0       1.0          1.0  ...                    0.0   
1       1.0   0.0       1.0          0.0  ...                    0.0   
2       1.0   1.0       1.0          1.0  ...                    1.0   
3       0.0   1.0       0.0          1.0  ...                    0.0   
4       0.0   0.0       1.0          0.0  ...                    0.0   

   toe_inflammation  finger_inflammation  lips_irritation  itchiness  ulcers  \
0           

### Target Encoding

Tokenization, stop words removal, lemmatization, and conversion of text into numerical representation using TF-IDF vectorization.

In [None]:
# corpus = data['prognosis'].values.astype(str)

# # Tokenize the text
# corpus = [word_tokenize(text) for text in corpus]

# # Remove stop words
# stop_words = set(stopwords.words('english'))
# corpus = [[word for word in text if word not in stop_words] for text in corpus]

# # Lemmatize the text
# lemmatizer = WordNetLemmatizer()
# corpus = [[lemmatizer.lemmatize(word) for word in text] for text in corpus]

# # Convert text to numerical representation
# vectorizer = TfidfVectorizer()
# text_vectors = vectorizer.fit_transform([' '.join(text) for text in corpus])

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(text_vectors, data['prognosis'], test_size=0.2, random_state=42)


In [None]:
# print(corpus)

[['lyme_disease'], ['tungiasis'], ['lyme_disease'], ['zika'], ['rift_valley_fever'], ['lyme_disease'], ['west_nile_fever'], ['tungiasis'], ['malaria'], ['chikungunya'], ['plague'], ['zika'], ['dengue'], ['malaria'], ['chikungunya'], ['malaria'], ['zika'], ['yellow_fever'], ['tungiasis'], ['lyme_disease'], ['chikungunya'], ['chikungunya'], ['west_nile_fever'], ['zika'], ['lyme_disease'], ['japanese_encephalitis'], ['yellow_fever'], ['plague'], ['rift_valley_fever'], ['chikungunya'], ['japanese_encephalitis'], ['tungiasis'], ['west_nile_fever'], ['tungiasis'], ['zika'], ['chikungunya'], ['tungiasis'], ['rift_valley_fever'], ['japanese_encephalitis'], ['west_nile_fever'], ['zika'], ['zika'], ['dengue'], ['dengue'], ['zika'], ['chikungunya'], ['yellow_fever'], ['chikungunya'], ['lyme_disease'], ['yellow_fever'], ['tungiasis'], ['chikungunya'], ['japanese_encephalitis'], ['west_nile_fever'], ['lyme_disease'], ['japanese_encephalitis'], ['plague'], ['dengue'], ['malaria'], ['zika'], ['yellow

In [None]:
# # X_test head()
# print(X_test[:5].toarray())

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]


In [None]:
# # Reverse TF-IDF vectorization
# original_text = vectorizer.inverse_transform(X_test)

# # Convert text lists back to strings
# original_text = [' '.join(text) for text in original_text]

# # Print the original 'prognosis' column after preprocessing
# print(original_text)


['dengue', 'west_nile_fever', 'west_nile_fever', 'japanese_encephalitis', 'rift_valley_fever', 'tungiasis', 'japanese_encephalitis', 'rift_valley_fever', 'plague', 'plague', 'plague', 'west_nile_fever', 'west_nile_fever', 'west_nile_fever', 'west_nile_fever', 'chikungunya', 'dengue', 'rift_valley_fever', 'malaria', 'malaria', 'lyme_disease', 'chikungunya', 'yellow_fever', 'yellow_fever', 'japanese_encephalitis', 'malaria', 'lyme_disease', 'japanese_encephalitis', 'plague', 'plague', 'west_nile_fever', 'chikungunya', 'tungiasis', 'lyme_disease', 'japanese_encephalitis', 'lyme_disease', 'rift_valley_fever', 'malaria', 'rift_valley_fever', 'rift_valley_fever', 'yellow_fever', 'west_nile_fever', 'chikungunya', 'west_nile_fever', 'lyme_disease', 'plague', 'west_nile_fever', 'rift_valley_fever', 'yellow_fever', 'plague', 'west_nile_fever', 'plague', 'japanese_encephalitis', 'lyme_disease', 'chikungunya', 'japanese_encephalitis', 'yellow_fever', 'dengue', 'malaria', 'tungiasis', 'rift_valley_

In [17]:
target_types = list(data['prognosis'].unique())
target_types

['lyme_disease',
 'tungiasis',
 'zika',
 'rift_valley_fever',
 'west_nile_fever',
 'malaria',
 'chikungunya',
 'plague',
 'dengue',
 'yellow_fever',
 'japanese_encephalitis']

In [18]:
out_mapping = {}
for index , i in enumerate(target_types):
    out_mapping[i] = index;
out_mapping
data['prognosis'] = data['prognosis'].replace(out_mapping)
data.head()

Unnamed: 0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0
3,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,3


### Correlation Matrix

In [19]:
correlation_matrix = data.corr()

# Set up the matplotlib figure
plt.figure(figsize=(35,30))

# Create the heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")

# Add a title
plt.title('Correlation Heatmap')

# Show the plot
plt.show()

Output hidden; open in https://colab.research.google.com to view.

### Dataset split

In [20]:
X = data.drop(['prognosis'],axis=1)
Y = data['prognosis']
X_train , X_test , Y_train , Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## Model Building

### Random Forests & Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Train Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, Y_train)

# Predictions using Naive Bayes classifier
nb_predictions = nb_classifier.predict(X_test)

# Evaluate Naive Bayes classifier
nb_accuracy = accuracy_score(Y_test, nb_predictions)
print("Naive Bayes Classifier Accuracy:", nb_accuracy)
print("Naive Bayes Classifier Report:\n", classification_report(Y_test, nb_predictions, zero_division=0))

# Train Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, Y_train)

# Predictions using Random Forest classifier
rf_predictions = rf_classifier.predict(X_test)

# Evaluate Random Forest classifier
rf_accuracy = accuracy_score(Y_test, rf_predictions)
print("\nRandom Forest Classifier Accuracy:", rf_accuracy)
print("Random Forest Classifier Report:\n", classification_report(Y_test, rf_predictions, zero_division=0))

Naive Bayes Classifier Accuracy: 0.3028169014084507
Naive Bayes Classifier Report:
               precision    recall  f1-score   support

           0       0.33      0.45      0.38        11
           1       0.56      0.75      0.64        12
           2       0.20      0.15      0.17        13
           3       0.20      0.17      0.18        12
           4       0.50      0.11      0.18        18
           5       0.13      0.30      0.18        10
           6       0.75      0.75      0.75        12
           7       0.00      0.00      0.00        16
           8       0.30      0.50      0.37         6
           9       0.22      0.53      0.31        15
          10       0.00      0.00      0.00        17

    accuracy                           0.30       142
   macro avg       0.29      0.34      0.29       142
weighted avg       0.28      0.30      0.26       142


Random Forest Classifier Accuracy: 0.323943661971831
Random Forest Classifier Report:
               p

### 5 Fold Cross Validation

In [22]:
from sklearn.model_selection import cross_val_score

# 1. Cross-validation
# Perform 5-fold cross-validation for Naive Bayes
nb_cv_scores = cross_val_score(nb_classifier, X_train, Y_train, cv=5)
print("Cross-validation scores for Naive Bayes:", nb_cv_scores)
print("Mean cross-validation score for Naive Bayes:", nb_cv_scores.mean())

# Perform 5-fold cross-validation for Random Forest
rf_cv_scores = cross_val_score(rf_classifier, X_train, Y_train, cv=5)
print("\nCross-validation scores for Random Forest:", rf_cv_scores)
print("Mean cross-validation score for Random Forest:", rf_cv_scores.mean())

# 2. Hyperparameter Tuning
# Define hyperparameters to tune for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 3. Perform grid search with 5-fold cross-validation to find the best parameters
rf_grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, verbose=1, n_jobs=-1)
rf_grid_search.fit(X_train, Y_train)

# 4. Print the best parameters found by grid search
print("\nBest parameters for Random Forest:", rf_grid_search.best_params_)

# 5. Evaluate on Test Set
# For Naive Bayes classifier
nb_classifier.fit(X_train, Y_train)
nb_predictions = nb_classifier.predict(X_test)

# For Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, Y_train)
rf_predictions = rf_classifier.predict(X_test)

# Evaluate Naive Bayes classifier on the test set
nb_test_predictions = nb_classifier.predict(X_test)
nb_test_accuracy = accuracy_score(Y_test, nb_test_predictions)
print("\nNaive Bayes Classifier Test Accuracy:", nb_test_accuracy)

# Evaluate Random Forest classifier on the test set
rf_test_predictions = rf_classifier.predict(X_test)
rf_test_accuracy = accuracy_score(Y_test, rf_test_predictions)
print("Random Forest Classifier Test Accuracy:", rf_test_accuracy)

Cross-validation scores for Naive Bayes: [0.30973451 0.28318584 0.33628319 0.2920354  0.34513274]
Mean cross-validation score for Naive Bayes: 0.31327433628318585

Cross-validation scores for Random Forest: [0.28318584 0.27433628 0.30973451 0.20353982 0.3539823 ]
Mean cross-validation score for Random Forest: 0.28495575221238945
Fitting 5 folds for each of 81 candidates, totalling 405 fits

Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Naive Bayes Classifier Test Accuracy: 0.3028169014084507
Random Forest Classifier Test Accuracy: 0.323943661971831


### Model Analysis with GridSearchCV

In [23]:
# Support Vector Classifier
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

svm_grid_search = GridSearchCV(SVC(random_state=42), svm_param_grid, refit=True, verbose=3, cv=3)
svm_grid_search.fit(X_train, Y_train)

# Random Forest Classifier
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, refit=True, verbose=3, cv=3)
rf_grid_search.fit(X_train, Y_train)

# Naive Bayes Classifier (No hyperparameters to tune)
nb_classifier = MultinomialNB()
# No hyperparameters for Naive Bayes, fit the model directly
nb_classifier.fit(X_train, Y_train)

# Evaluate models
svm_best_model = svm_grid_search.best_estimator_
rf_best_model = rf_grid_search.best_estimator_

svm_accuracy = accuracy_score(Y_test, svm_best_model.predict(X_test))
rf_accuracy = accuracy_score(Y_test, rf_best_model.predict(X_test))
nb_accuracy = accuracy_score(Y_test, nb_classifier.predict(X_test))

print("SVM Classifier:")
print("Best Parameters Found:", svm_grid_search.best_params_)
print("Best Cross-validation Score:", svm_grid_search.best_score_)
print("Accuracy of the best model:", svm_accuracy * 100, "%")
print(classification_report(Y_test, svm_best_model.predict(X_test)))

print("\nRandom Forest Classifier:")
print("Best Parameters Found:", rf_grid_search.best_params_)
print("Best Cross-validation Score:", rf_grid_search.best_score_)
print("Accuracy of the best model:", rf_accuracy * 100, "%")
print(classification_report(Y_test, rf_best_model.predict(X_test)))

print("\nNaive Bayes Classifier:")
print("Accuracy of the model:", nb_accuracy * 100, "%")
print(classification_report(Y_test, nb_classifier.predict(X_test)))


Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.328 total time=   0.1s
[CV 2/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.298 total time=   0.0s
[CV 3/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.293 total time=   0.1s
[CV 1/3] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.132 total time=   0.1s
[CV 2/3] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.128 total time=   0.1s
[CV 3/3] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.133 total time=   0.1s
[CV 1/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.196 total time=   0.0s
[CV 2/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.165 total time=   0.0s
[CV 3/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.197 total time=   0.0s
[CV 1/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.328 total time=   0.0s
[CV 2/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.298 total time=   0.0s
[CV 3/3] END ..C=0.1, gamma=auto, kernel=linear;


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [24]:
import joblib

# Save the trained model to a file
joblib.dump(rf_grid_search.best_estimator_, 'random_forest_model.pkl')


['random_forest_model.pkl']

### Model Evaluation & Comparision

In [28]:
models = {
    'm_regression':LogisticRegression(max_iter=4000),
    'm_forest':RandomForestClassifier(max_depth=5, min_samples_leaf=4, random_state=0),
    'SVC':SVC(kernel='linear', C=1.0),
    'KNN':KNeighborsClassifier(n_neighbors=3),
    'XGBoost': XGBClassifier(learning_rate =0.1, n_estimators=600, max_depth=5)
}
scores = ['Accuracy']
res = np.zeros(shape=(len(models),len(scores)))

In [29]:
for i,key in enumerate(models):
    mod = models[key].fit(X_train,Y_train)
    y_p = mod.predict(X_test)

    res[i][0] = accuracy_score(y_p,Y_test)

In [30]:
table_np = tabulate(res, headers=scores, showindex=list(models.keys()), tablefmt='pretty')
print(table_np)

+--------------+--------------------+
|              |      Accuracy      |
+--------------+--------------------+
| m_regression | 0.2887323943661972 |
|   m_forest   | 0.2676056338028169 |
|     SVC      | 0.2887323943661972 |
|     KNN      | 0.2746478873239437 |
|   XGBoost    | 0.3380281690140845 |
+--------------+--------------------+


## Model Save

In [31]:
import joblib

# Save the SVC model to a file
joblib.dump(models['XGBoost'], 'XGBoost.pkl')


['XGBoost.pkl']

## Testing

In [25]:
# Create an inverse mapping from numerical labels to text labels
inv_mapping = {v: k for k, v in out_mapping.items()}

print(inv_mapping)

{0: 'lyme_disease', 1: 'tungiasis', 2: 'zika', 3: 'rift_valley_fever', 4: 'west_nile_fever', 5: 'malaria', 6: 'chikungunya', 7: 'plague', 8: 'dengue', 9: 'yellow_fever', 10: 'japanese_encephalitis'}


In [26]:
from joblib import load

# # Load the saved Random Forest model
# loaded_rf_model = load('random_forest_model.joblib')

# # Use the loaded model for prediction
# predicted_labels = loaded_rf_model.predict(X_test)


In [None]:
# # Get predicted labels for the first few samples
# predicted_labels = rf_best_model.predict(X_test)
# predicted_probabilities = rf_best_model.predict_proba(X_test)

# # Convert numerical labels to text labels using the inverse mapping
# predicted_diseases = [inv_mapping[label] for label in predicted_labels]

# # Print predicted diseases and probabilities for the first few samples
# for i in range(5):  # Adjust the range as needed
#     print("Sample", i+1, "predicted disease:", predicted_diseases[i])
#     print("Sample", i+1, "predicted probability:", max(predicted_probabilities[i]) * 100, "%")
