# Import of libraries and data visualization

In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input directory
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)

# Print one path per line
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vector-borne-disease-prediction/testt.csv
/kaggle/input/vector-borne-disease-prediction/trainn.csv


In [3]:
# Locations of the Kaggle CSV files: training split first, then the test split
train_file_path = '/kaggle/input/vector-borne-disease-prediction/trainn.csv'
test_file_path = '/kaggle/input/vector-borne-disease-prediction/testt.csv'

In [4]:
# Read both CSV files into DataFrames (training split first, then the test split)
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

# Preview the top rows of the training DataFrame
train_data.head()

Unnamed: 0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
0,0,1,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Chikungunya
1,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,Chikungunya
2,0,1,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,Chikungunya
3,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,Chikungunya
4,1,0,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,Chikungunya


In [5]:
# Show the disease label stored in the 'prognosis' column of the test data
test_data.loc[:, 'prognosis']

0               Chikungunya
1                    Dengue
2         Rift Valley fever
3              Yellow Fever
4                      Zika
5                   Malaria
6     Japanese encephalitis
7           West Nile fever
8                    Plague
9                 Tungiasis
10             Lyme disease
Name: prognosis, dtype: object

# Display of the frequency of symptoms

In [6]:
# Every training column other than the 'prognosis' target is treated as a symptom
symptom_columns = list(filter(lambda column: column != 'prognosis', train_data.columns))

# Column-wise sums of the symptom columns, used as each symptom's frequency
symptom_frequencies = train_data.loc[:, symptom_columns].sum()

# Two-column table (Symptom, Frequency) consumed by the bar chart below
symptom_frequencies_df = (
    symptom_frequencies
    .rename_axis('Symptom')
    .reset_index(name='Frequency')
)

# Bar chart of the symptom frequencies
fig = px.bar(
    symptom_frequencies_df,
    x='Symptom',
    y='Frequency',
    template='plotly_dark',
    color_discrete_sequence=['#1192AA'],
)

# Axis titles, centred plot title, slanted tick labels and bar spacing
fig.update_layout(
    xaxis_title="Symptom",
    yaxis_title="Frequency",
    showlegend=False,
    font={'size': 14},
    title=dict(text="Symptom Frequency Distribution", y=0.95, x=0.5),
    xaxis={'tickangle': -45},
    bargap=0.2,
)

# Render the figure
fig.show()

# Symptom presence across different diseases

In [7]:
# Reshape the training data from wide to long format.
#   id_vars    -> 'prognosis' is kept as the identifier column
#   var_name   -> the original column names go into a new 'Symptom' column
#   value_name -> the cell values go into a new 'Presence' column
data_melted = train_data.melt(id_vars='prognosis', var_name='Symptom', value_name='Presence')
print(data_melted)

# Binarise 'Presence': values greater than 0 become 1, everything else becomes 0.
# A vectorised comparison is used instead of calling a Python lambda once per
# element; the resulting integer column is the same.
data_melted['Presence'] = data_melted['Presence'].gt(0).astype('int64')

          prognosis        Symptom  Presence
0       Chikungunya   sudden_fever         0
1       Chikungunya   sudden_fever         1
2       Chikungunya   sudden_fever         0
3       Chikungunya   sudden_fever         0
4       Chikungunya   sudden_fever         1
...             ...            ...       ...
16123  Lyme disease  bullseye_rash         1
16124  Lyme disease  bullseye_rash         1
16125  Lyme disease  bullseye_rash         1
16126  Lyme disease  bullseye_rash         1
16127  Lyme disease  bullseye_rash         1

[16128 rows x 3 columns]


In [8]:
# For every (disease, symptom) pair keep the largest 'Presence' value found in
# the melted data
presence_by_pair = data_melted.groupby(['prognosis', 'Symptom'], observed=False)['Presence'].max()

# Pivot the symptoms into columns (one row per disease); combinations without
# any rows are filled with 0
heatmap_data = presence_by_pair.unstack(fill_value=0)

# Heatmap of symptom presence per disease
fig = px.imshow(
    heatmap_data,
    template='plotly_dark',
    color_continuous_scale='Viridis',
    labels={'x': "Symptom", 'y': "Disease", 'color': "Presence"},
)

# Axis titles, font size and a centred plot title for readability
fig.update_layout(
    xaxis_title="Symptom",
    yaxis_title="Disease",
    font={'size': 14},
    title=dict(text="Symptoms Presence in Different Diseases", y=0.95, x=0.5),
)

# Render the heatmap
fig.show()

In [84]:
# Display the test DataFrame (symptom columns plus the 'prognosis' label) for inspection
test_data

Unnamed: 0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
0,1,0,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,Chikungunya
1,1,0,0,0,1,1,1,1,0,1,...,0,0,0,0,0,0,1,0,0,Dengue
2,1,1,1,1,0,1,0,1,0,1,...,0,1,0,1,0,0,0,0,0,Rift Valley fever
3,1,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Yellow Fever
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Zika
5,1,1,0,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,Malaria
6,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Japanese encephalitis
7,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,West Nile fever
8,1,1,1,1,1,0,1,1,0,1,...,1,1,1,1,0,0,0,0,0,Plague
9,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,1,1,1,0,0,Tungiasis


# Development of the multinomial logistic regression model

In [13]:
# Encoder that converts the categorical 'prognosis' labels into integer class ids.
# After fitting, label_encoder.classes_ holds the disease names; a name's position
# in that array is its class id.
label_encoder = LabelEncoder()

# Encode the target into its own Series instead of overwriting
# train_data['prognosis']. Overwriting made this cell non-idempotent: running it
# a second time refitted the encoder on the already-encoded integers (losing the
# disease names), and it also changed the DataFrame that earlier cells display.
y_train = pd.Series(
    label_encoder.fit_transform(train_data['prognosis']),
    index=train_data.index,
    name='prognosis',
)

# 'X_train' contains every training column except the 'prognosis' target
X_train = train_data.drop(columns=['prognosis'])

# 'X_test' contains the test columns with the 'prognosis' label removed, so the
# model only sees the features when predicting
X_test = test_data.drop(columns=['prognosis'])
print(X_test)

    sudden_fever  headache  mouth_bleed  nose_bleed  muscle_pain  joint_pain  \
0              1         0            0           0            1           1   
1              1         0            0           0            1           1   
2              1         1            1           1            0           1   
3              1         1            0           1            1           0   
4              0         0            1           0            0           1   
5              1         1            0           1            1           0   
6              0         0            0           1            1           1   
7              0         1            0           0            0           1   
8              1         1            1           1            1           0   
9              0         1            0           0            0           0   
10             1         1            1           0            0           1   

    vomiting  rash  diarrhea  hypotensi

In [14]:
# Split the original training data into training and validation sets:
# 80% is used for fitting and 20% for validation (fixed random_state so the
# split is reproducible)
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Logistic regression for multi-class classification.
# 'solver' is 'lbfgs' and 'max_iter' is raised to 1000 to give the optimiser
# enough iterations to converge.
# The 'multi_class' argument is intentionally not passed: it is deprecated in
# recent scikit-learn releases, and with the 'lbfgs' solver a multinomial model
# is already what gets fitted when there are more than two classes.
model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Fit on the training part of the split
model.fit(X_train_split, y_train_split)

# Evaluate on the held-out validation set
y_pred_val = model.predict(X_val)
print(classification_report(y_val, y_pred_val))

# Predict the encoded class ids for the test set
y_pred_test = model.predict(X_test)

# Convert the predicted class ids back to disease names using the label encoder
predicted_diseases = label_encoder.inverse_transform(y_pred_test)

# Print the PREDICTED disease for each test patient.
# The loop previously iterated over test_data['prognosis'], which printed the
# true labels and never showed the model's predictions.
for i, disease in enumerate(predicted_diseases):
    print(f"Paciente {i}: {disease}")


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      0.83      0.91         6
           2       0.75      1.00      0.86         6
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         4
           6       0.75      0.75      0.75         4
           7       1.00      1.00      1.00         4
           8       1.00      0.57      0.73         7
           9       0.40      1.00      0.57         2
          10       0.67      0.50      0.57         4

    accuracy                           0.86        51
   macro avg       0.87      0.88      0.85        51
weighted avg       0.90      0.86      0.87        51

Paciente 0: Chikungunya
Paciente 1: Dengue
Paciente 2: Rift Valley fever
Paciente 3: Yellow Fever
Paciente 4: Zika
Paciente 5: Malaria
Paciente 6: Japanese encephalitis
Paciente 7: West N

# Saving the files to implement the app in Streamlit

In [18]:
# Map each encoded class id to its disease name.
# The mapping is derived from the fitted encoder rather than typed by hand:
# the id of a class is its position in label_encoder.classes_, which is the
# sorted order of the labels, so a hand-written dictionary in any other order
# (for example the row order of the test file) would assign the wrong disease
# name to the model's predictions.
index_to_disease = {
    index: str(disease) for index, disease in enumerate(label_encoder.classes_)
}

# Save the index_to_disease dictionary to a file using joblib
joblib.dump(index_to_disease, 'index_to_disease.pkl')

# Save the trained logistic regression model to a file using joblib
joblib.dump(model, 'model.pkl')

# Save the label encoder to a file using joblib
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [17]:
# Column labels of the training feature matrix, in their original order
feature_names = X_train.columns

# Persist the column labels to 'feature_names.pkl' with joblib
joblib.dump(value=feature_names, filename='feature_names.pkl')

['feature_names.pkl']