# Disease Recommendation/Prediction Application

## Import Data

In [113]:
import pandas as pd

# Load the datasets
dataset_df = pd.read_csv('resources/dataset.csv')
symptom_description_df = pd.read_csv('resources/symptom_Description.csv')
symptom_precaution_df = pd.read_csv('resources/symptom_precaution.csv')
symptom_severity_df = pd.read_csv('resources/Symptom-severity.csv')

In [115]:
# Display the head and tail of each dataset
print("Head and Tail of dataset.csv:")
print(dataset_df.head())  # Display the first few rows
print(dataset_df.tail())  # Display the last few rows

print("\nHead and Tail of symptom_Description.csv:")
print(symptom_description_df.head())  # Display the first few rows
print(symptom_description_df.tail())  # Display the last few rows

print("\nHead and Tail of symptom_precaution.csv:")
print(symptom_precaution_df.head())  # Display the first few rows
print(symptom_precaution_df.tail())  # Display the last few rows

print("\nHead and Tail of Symptom-severity.csv:")
print(symptom_severity_df.head())  # Display the first few rows
print(symptom_severity_df.tail())  # Display the last few rows

Head and Tail of dataset.csv:
            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom

In [117]:
# Display all the columns of each dataset to understand their structure.

# Display the columns of dataset.csv
print("Columns of dataset.csv:")
print(dataset_df.columns)

# Display the columns of symptom_Description.csv
print("\nColumns of symptom_Description.csv:")
print(symptom_description_df.columns)

# Display the columns of symptom_precaution.csv
print("\nColumns of symptom_precaution.csv:")
print(symptom_precaution_df.columns)

# Display the columns of Symptom-severity.csv
print("\nColumns of Symptom-severity.csv:")
print(symptom_severity_df.columns)


Columns of dataset.csv:
Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')

Columns of symptom_Description.csv:
Index(['Disease', 'Description'], dtype='object')

Columns of symptom_precaution.csv:
Index(['Disease', 'Precaution_1', 'Precaution_2', 'Precaution_3',
       'Precaution_4'],
      dtype='object')

Columns of Symptom-severity.csv:
Index(['Symptom', 'weight'], dtype='object')


## Data Cleaning - Handling Missing Values and Standardizing Symptom Names

In [120]:
# Handling missing values in dataset.csv
# Replace empty symptom columns with 'None' to indicate the absence of further symptoms for that disease.
dataset_df.fillna('None', inplace=True)

# Standardizing symptom names in dataset.csv by stripping any leading/trailing spaces
for col in dataset_df.columns:
    if 'Symptom' in col:
        dataset_df[col] = dataset_df[col].str.strip()

In [122]:
# Standardizing symptom names in Symptom-severity.csv by stripping any leading/trailing spaces
symptom_severity_df['Symptom'] = symptom_severity_df['Symptom'].str.strip()

# Display the cleaned data (optional, to verify the cleaning process)
print("Cleaned dataset.csv (Head):")
print(dataset_df.head())

print("\nCleaned Symptom-severity.csv (Head):")
print(symptom_severity_df.head())

Cleaned dataset.csv (Head):
            Disease  Symptom_1             Symptom_2             Symptom_3  \
0  Fungal infection    itching             skin_rash  nodal_skin_eruptions   
1  Fungal infection  skin_rash  nodal_skin_eruptions   dischromic _patches   
2  Fungal infection    itching  nodal_skin_eruptions   dischromic _patches   
3  Fungal infection    itching             skin_rash   dischromic _patches   
4  Fungal infection    itching             skin_rash  nodal_skin_eruptions   

             Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0  dischromic _patches      None      None      None      None      None   
1                 None      None      None      None      None      None   
2                 None      None      None      None      None      None   
3                 None      None      None      None      None      None   
4                 None      None      None      None      None      None   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 

## Combining Datsets

### In this step, we're focused on combining the `dataset.csv` and `Symptom_Severity.csv` files. The main goal is to map the symptoms in our dataset to their corresponding severity weights and calculate a total severity score for each disease.

#### Why are we only using `dataset.csv` and `Symptom_Severity.csv` at this stage?

- **Mapping Severity**: We need to map symptoms to their severity before merging with other data like descriptions or precautions. This step ensures that we accurately understand the seriousness of each symptom before incorporating additional information, which might complicate the analysis.

- **Calculating Total Severity (`weight_total`)**: Calculating the total severity for each disease helps us understand the overall impact of the symptoms. By assigning a numerical value to the severity of each symptom, we create a quantifiable measure that reflects how severe a disease might be based on its symptoms. This numerical value (or score) is essential for making precise predictions in the later stages of the project.

#### Why is this approach beneficial for deep learning?

- **Feature Importance**: Deep learning models perform better when they have access to relevant, well-structured features. In this case, the total severity score (`weight_total`) becomes a crucial feature. It encapsulates the cumulative impact of all symptoms, allowing the model to more effectively differentiate between diseases based on their symptom severity. 

- **Simplified Data for Training**: By focusing on these two datasets first, we reduce the complexity of the data that the deep learning model will be trained on. A model with less noisy or irrelevant data can learn more effectively, leading to better performance and more accurate predictions.

- **Improved Model Accuracy**: Deep learning models benefit from having clearly defined and meaningful inputs. The severity mapping and total severity calculation provide the model with a strong foundation to recognize patterns and make connections between symptoms and diseases. This leads to a more accurate and reliable model, which is critical in healthcare applications.

In summary, this approach is necessary to ensure that the deep learning model has the right data to learn from, which in turn enhances its ability to make accurate predictions. By carefully preparing and focusing on the most important aspects of the data, we set the stage for building a powerful and effective model.


In [102]:
# Check for duplicates in Symptom-severity.csv
duplicates = symptom_severity_df[symptom_severity_df.duplicated(subset='Symptom', keep=False)]
print("Duplicate entries in Symptom-severity.csv (if any):")
print(duplicates)

# Remove duplicates by keeping the first occurrence
symptom_severity_df = symptom_severity_df.drop_duplicates(subset='Symptom', keep='first')

# Initialize the weight_total column with zeros
dataset_df['weight_total'] = 0

# Iterate through each symptom column to map the severity and calculate the total weight
for col in dataset_df.columns:
    if 'Symptom' in col:
        dataset_df[col] = dataset_df[col].map(symptom_severity_df.set_index('Symptom')['weight']).fillna(0)
        dataset_df['weight_total'] += dataset_df[col]

# Display the combined dataset with weight_total (optional, to verify the combination process)
print("Combined dataset with weight_total (Head):")
print(dataset_df.head())

Duplicate entries in Symptom-severity.csv (if any):
Empty DataFrame
Columns: [Symptom, weight]
Index: []
Combined dataset with weight_total (Head):
            Disease  Symptom_1  Symptom_2  Symptom_3  Symptom_4  Symptom_5  \
0  Fungal infection        0.0        0.0        0.0        0.0        0.0   
1  Fungal infection        0.0        0.0        0.0        0.0        0.0   
2  Fungal infection        0.0        0.0        0.0        0.0        0.0   
3  Fungal infection        0.0        0.0        0.0        0.0        0.0   
4  Fungal infection        0.0        0.0        0.0        0.0        0.0   

   Symptom_6  Symptom_7  Symptom_8  Symptom_9  Symptom_10  Symptom_11  \
0        0.0        0.0        0.0        0.0         0.0         0.0   
1        0.0        0.0        0.0        0.0         0.0         0.0   
2        0.0        0.0        0.0        0.0         0.0         0.0   
3        0.0        0.0        0.0        0.0         0.0         0.0   
4        0.0      

# Combining Datasets and Calculating Symptom Severity

In this step, we're focused on combining the `dataset.csv` and `Symptom_Severity.csv` files. The main goal is to map the symptoms in our dataset to their corresponding severity weights and calculate a total severity score for each disease.

### Why are we only using `dataset.csv` and `Symptom_Severity.csv` at this stage?

- **Mapping Severity**: We need to map symptoms to their severity before merging with other data like descriptions or precautions.
- **Calculating Total Severity (`weight_total`)**: Calculating the total severity for each disease helps us understand the overall impact of the symptoms.

This focused approach allows us to simplify the merging process by handling the most critical data first.


In [28]:
# Display cleaned data for verification (show first few rows)
print("Cleaned dataset.csv - Head")
print(dataset.head())  # Display the first few rows of the cleaned dataset

print("\nCleaned Symptom_Severity.csv - Head")
print(symptom_severity.head())  # Display the first few rows of the cleaned Symptom_Severity dataset

Cleaned dataset.csv - Head
            Disease  Symptom_1             Symptom_2             Symptom_3  \
0  Fungal infection    itching             skin_rash  nodal_skin_eruptions   
1  Fungal infection  skin_rash  nodal_skin_eruptions   dischromic _patches   
2  Fungal infection    itching  nodal_skin_eruptions   dischromic _patches   
3  Fungal infection    itching             skin_rash   dischromic _patches   
4  Fungal infection    itching             skin_rash  nodal_skin_eruptions   

             Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0  dischromic _patches      None      None      None      None      None   
1                 None      None      None      None      None      None   
2                 None      None      None      None      None      None   
3                 None      None      None      None      None      None   
4                 None      None      None      None      None      None   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 S

In [109]:
# Save the combined dataset to 'resources/combined_dataset.csv'
combined_dataset_path = 'resources/combined_dataset.csv'
dataset_df.to_csv(combined_dataset_path, index=False)

## Exploratory Data Analysis (EDA)