In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

data_set_df = pd.read_csv('../../datasets/main_dataset.csv')
data_set_df = data_set_df.fillna('')
print("First few rows of the main dataset:")
print(data_set_df.head())

First few rows of the main dataset:
            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches                                                     
1                                                                           
2                                                                           
3                                                                           
4                                                                           

  Symptom_10 S

In [2]:
symptoms = [f'Symptom_{i}' for i in range(1, 18)]
unique_symptoms_main = pd.unique(data_set_df[symptoms].values.ravel('K'))

In [3]:
symptom_severity_path = '../../datasets/Symptom-severity.csv'
ss_data = pd.read_csv(symptom_severity_path)

In [4]:
unique_symptoms_severity = pd.unique(ss_data['Symptom'].values)

unique_symptoms_main = [symptom for symptom in unique_symptoms_main if isinstance(symptom, str)]
unique_symptoms_severity = [symptom for symptom in unique_symptoms_severity if isinstance(symptom, str)]

combined_unique_symptoms = pd.unique(pd.concat([pd.Series(unique_symptoms_main), pd.Series(unique_symptoms_severity)]))

print("Combined unique symptoms from both datasets:")
print(combined_unique_symptoms)

Combined unique symptoms from both datasets:
['itching' ' skin_rash' ' continuous_sneezing' ' shivering'
 ' stomach_pain' ' acidity' ' vomiting' ' indigestion' ' muscle_wasting'
 ' patches_in_throat' ' fatigue' ' weight_loss' ' sunken_eyes' ' cough'
 ' headache' ' chest_pain' ' back_pain' ' weakness_in_limbs' ' chills'
 ' joint_pain' ' yellowish_skin' ' constipation'
 ' pain_during_bowel_movements' ' breathlessness' ' cramps' ' weight_gain'
 ' mood_swings' ' neck_pain' ' muscle_weakness' ' stiff_neck'
 ' pus_filled_pimples' ' burning_micturition' ' bladder_discomfort'
 ' high_fever' ' nodal_skin_eruptions' ' ulcers_on_tongue'
 ' loss_of_appetite' ' restlessness' ' dehydration' ' dizziness'
 ' weakness_of_one_body_side' ' lethargy' ' nausea' ' abdominal_pain'
 ' pain_in_anal_region' ' sweating' ' bruising' ' cold_hands_and_feets'
 ' anxiety' ' knee_pain' ' swelling_joints' ' blackheads'
 ' foul_smell_of urine' ' skin_peeling' ' blister' ' dischromic _patches'
 ' watering_from_eyes' ' ex

In [5]:
le_symptoms = LabelEncoder()
le_symptoms.fit(combined_unique_symptoms)

print("Classes of the encoder for symptoms: ")
print(le_symptoms.classes_)

Classes of the encoder for symptoms: 
['' ' abdominal_pain' ' abnormal_menstruation' ' acidity'
 ' acute_liver_failure' ' altered_sensorium' ' anxiety' ' back_pain'
 ' belly_pain' ' blackheads' ' bladder_discomfort' ' blister'
 ' blood_in_sputum' ' bloody_stool' ' blurred_and_distorted_vision'
 ' breathlessness' ' brittle_nails' ' bruising' ' burning_micturition'
 ' chest_pain' ' chills' ' cold_hands_and_feets' ' coma' ' congestion'
 ' constipation' ' continuous_feel_of_urine' ' continuous_sneezing'
 ' cough' ' cramps' ' dark_urine' ' dehydration' ' depression'
 ' diarrhoea' ' dischromic _patches' ' distention_of_abdomen' ' dizziness'
 ' drying_and_tingling_lips' ' enlarged_thyroid' ' excessive_hunger'
 ' extra_marital_contacts' ' family_history' ' fast_heart_rate' ' fatigue'
 ' fluid_overload' ' foul_smell_of urine' ' headache' ' high_fever'
 ' hip_joint_pain' ' history_of_alcohol_consumption' ' increased_appetite'
 ' indigestion' ' inflammatory_nails' ' internal_itching'
 ' irregular

In [6]:
for col in symptoms:
    data_set_df[col] = le_symptoms.transform(data_set_df[col].fillna(''))

print("First rows with the encoded symptom columns:")
print(data_set_df.head())

First rows with the encoded symptom columns:
            Disease  Symptom_1  Symptom_2  Symptom_3  Symptom_4  Symptom_5  \
0  Fungal infection        186        100         73         33          0   
1  Fungal infection        100         73         33          0          0   
2  Fungal infection        186         73         33          0          0   
3  Fungal infection        186        100         33          0          0   
4  Fungal infection        186        100         73          0          0   

   Symptom_6  Symptom_7  Symptom_8  Symptom_9  Symptom_10  Symptom_11  \
0          0          0          0          0           0           0   
1          0          0          0          0           0           0   
2          0          0          0          0           0           0   
3          0          0          0          0           0           0   
4          0          0          0          0           0           0   

   Symptom_12  Symptom_13  Symptom_14  Symptom_

In [7]:
ss_data['Symptom'] = le_symptoms.transform(ss_data['Symptom'].apply(str).fillna(''))

print("Encoded symptoms using the symptoms encoder:")
print(ss_data.head())

Encoded symptoms using the symptoms encoder:
   Symptom  weight
0      186       1
1      232       3
2      204       4
3      156       4
4      228       5


In [8]:
le_disease = LabelEncoder()

data_set_df['Disease'] = le_disease.fit_transform(data_set_df['Disease'])

print("First few rows of the dataset with encoded Disease column:")
print(data_set_df.head())

First few rows of the dataset with encoded Disease column:
   Disease  Symptom_1  Symptom_2  Symptom_3  Symptom_4  Symptom_5  Symptom_6  \
0       15        186        100         73         33          0          0   
1       15        100         73         33          0          0          0   
2       15        186         73         33          0          0          0   
3       15        186        100         33          0          0          0   
4       15        186        100         73          0          0          0   

   Symptom_7  Symptom_8  Symptom_9  Symptom_10  Symptom_11  Symptom_12  \
0          0          0          0           0           0           0   
1          0          0          0           0           0           0   
2          0          0          0           0           0           0   
3          0          0          0           0           0           0   
4          0          0          0           0           0           0   

   Symptom_13  

In [9]:
encoded_main_dataset_path = '../../datasets/encoded_main_dataset.csv'
data_set_df.to_csv(encoded_main_dataset_path, index=False)
print(f"Encoded main dataset saved to {encoded_main_dataset_path}")

Encoded main dataset saved to ../../datasets/encoded_main_dataset.csv


In [10]:
encoded_symptom_severity_path = '../../datasets/encoded_symptom_severity.csv'
ss_data.to_csv(encoded_symptom_severity_path, index=False)
print(f"Encoded symptom severity data saved to {encoded_symptom_severity_path}")

Encoded symptom severity data saved to ../../datasets/encoded_symptom_severity.csv
