In [4]:
import os
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [7]:
# Raw data set location. The folder defaults to the path this notebook was
# authored with; set the EMERGENCY_SORTING_DATA_DIR environment variable to
# point at the data-sets folder when running on another machine.
RAW_DATA_PATH = Path(
    os.environ.get(
        "EMERGENCY_SORTING_DATA_DIR",
        "/home/abdeldjalil-hani/Desktop/emergency-sorting-system/data-sets",
    )
) / "0. DATA.xlsx"

df = pd.read_excel(RAW_DATA_PATH)

In [8]:
# Inspect the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6962 entries, 0 to 6961
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              6962 non-null   int64  
 1   age                             6962 non-null   int64  
 2   gender                          6961 non-null   float64
 3   chest pain type                 6962 non-null   int64  
 4   cholesterol                     6962 non-null   int64  
 5   exercise angina                 6962 non-null   int64  
 6   plasma glucose                  6962 non-null   float64
 7   skin_thickness                  6962 non-null   int64  
 8   bmi                             6962 non-null   float64
 9   hypertension                    6962 non-null   int64  
 10  heart_disease                   6962 non-null   int64  
 11  Residence_type                  6962 non-null   object 
 12  smoking_status                  69

In [9]:
def preprocess_data(df):
    """Clean the raw records and label-encode the categorical columns.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:
      1. Missing 'Risk Factors' values become 'No Risk Factor' and missing
         'gender' values become 1.
      2. The 'ID' column is dropped.
      3. 'Blood Pressure (mmHg)' strings of the form "a/b" are split into the
         integer columns 'blood_pressure' (a) and 'heart_pressure' (b), and
         the original text column is dropped.
      4. Every categorical column is replaced by its LabelEncoder integer
         codes, and the class -> code mapping is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records containing at least the columns named above.

    Returns
    -------
    tuple
        ``(processed_df, label_encoders)`` where ``label_encoders`` maps each
        categorical column name to the LabelEncoder fitted on it.

    Raises
    ------
    KeyError
        If an expected column is missing.
    ValueError
        If a blood-pressure part cannot be converted to an integer.
    """
    # Copy first so the caller's frame keeps its missing values and its
    # original columns; assigning into `df` directly would alter it in place.
    df = df.copy()

    df['Risk Factors'] = df['Risk Factors'].fillna('No Risk Factor')
    df['gender'] = df['gender'].fillna(1)
    df = df.drop(columns=['ID'])

    # n=1 keeps exactly two parts, so a malformed reading such as "120/80/5"
    # fails loudly in astype(int) instead of being silently truncated.
    pressure_parts = df['Blood Pressure (mmHg)'].str.split('/', n=1, expand=True)
    df['blood_pressure'] = pressure_parts[0].astype(int)
    df['heart_pressure'] = pressure_parts[1].astype(int)
    df = df.drop(columns=['Blood Pressure (mmHg)'])

    label_encoders = {}
    categorical_columns = ['Residence_type', 'smoking_status', 'Symptom', 'Consciousness', 'Risk Factors', 'Massive Bleeding', 'Respiratory Distress']

    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
        print(f"Encoding for '{col}':")
        # classes_ is sorted and unique, so each class's code is its position.
        for code, cls in enumerate(le.classes_):
            print(f"  {cls} --> {code}")
        print()

    return df, label_encoders

In [11]:
# Destination of the encoded data set. The folder defaults to the path this
# notebook was authored with; set the EMERGENCY_SORTING_DATA_DIR environment
# variable to write next to the data on another machine.
PREPROCESSED_DATA_PATH = Path(
    os.environ.get(
        "EMERGENCY_SORTING_DATA_DIR",
        "/home/abdeldjalil-hani/Desktop/emergency-sorting-system/data-sets",
    )
) / "1. PreProcessed DATA.xlsx"

# Run the cleaning/encoding pipeline and persist the result (index omitted).
df_processed, encoders = preprocess_data(df)
df_processed.to_excel(PREPROCESSED_DATA_PATH, index=False)

Encoding for 'Residence_type':
  Rural --> 0
  Urban --> 1

Encoding for 'smoking_status':
  Unknown --> 0
  formerly smoked --> 1
  never smoked --> 2
  smokes --> 3

Encoding for 'Symptom':
  Abdominal pain --> 0
  Abdominal pain, Chest pain --> 1
  Abdominal pain, Difficulty breathing --> 2
  Abdominal pain, Fever --> 3
  Abdominal pain, Headache --> 4
  Abdominal pain, Weakness --> 5
  Chest pain --> 6
  Chest pain, Abdominal pain --> 7
  Chest pain, Difficulty breathing --> 8
  Chest pain, Fever --> 9
  Chest pain, Headache --> 10
  Chest pain, Weakness --> 11
  Difficulty breathing --> 12
  Difficulty breathing, Abdominal pain --> 13
  Difficulty breathing, Chest pain --> 14
  Difficulty breathing, Fever --> 15
  Difficulty breathing, Headache --> 16
  Difficulty breathing, Weakness --> 17
  Fever --> 18
  Fever, Abdominal pain --> 19
  Fever, Chest pain --> 20
  Fever, Difficulty breathing --> 21
  Fever, Headache --> 22
  Fever, Weakness --> 23
  Headache --> 24
  Headache, Abd

In [12]:
# Check the processed frame: 'ID' is gone, 'gender' has no missing values and
# the encoded categorical columns are now integer-typed.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6962 entries, 0 to 6961
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             6962 non-null   int64  
 1   gender                          6962 non-null   float64
 2   chest pain type                 6962 non-null   int64  
 3   cholesterol                     6962 non-null   int64  
 4   exercise angina                 6962 non-null   int64  
 5   plasma glucose                  6962 non-null   float64
 6   skin_thickness                  6962 non-null   int64  
 7   bmi                             6962 non-null   float64
 8   hypertension                    6962 non-null   int64  
 9   heart_disease                   6962 non-null   int64  
 10  Residence_type                  6962 non-null   int64  
 11  smoking_status                  6962 non-null   int64  
 12  Symptom                         69