In [3]:
import pandas as pd

df = pd.read_csv("patient_priority.csv")

print("Shape:", df.shape)
print("\nColumns:")
print(df.columns)

print("\nNull counts:")
print(df.isnull().sum())

print("\nTriage value counts:")
print(df["triage"].value_counts())

Shape: (6962, 18)

Columns:
Index(['Unnamed: 0', 'age', 'gender', 'chest pain type', 'blood pressure',
       'cholesterol', 'max heart rate', 'exercise angina', 'plasma glucose',
       'skin_thickness', 'insulin', 'bmi', 'diabetes_pedigree', 'hypertension',
       'heart_disease', 'Residence_type', 'smoking_status', 'triage'],
      dtype='object')

Null counts:
Unnamed: 0             0
age                    0
gender                 1
chest pain type        0
blood pressure         0
cholesterol            0
max heart rate         0
exercise angina        0
plasma glucose         0
skin_thickness         0
insulin                0
bmi                    0
diabetes_pedigree      0
hypertension           0
heart_disease          0
Residence_type         0
smoking_status         0
triage               410
dtype: int64

Triage value counts:
triage
yellow    5637
green      440
orange     346
red        129
Name: count, dtype: int64


In [9]:
import pandas as pd
import numpy as np

df = pd.read_csv("patient_priority.csv")
print("Original shape:", df.shape)

if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

df["triage"] = df["triage"].astype(str).str.strip().str.lower()

before = df.shape[0]
df = df.dropna(subset=["triage"])
print("Dropped rows with missing triage:", before - df.shape[0])

df["gender"] = df["gender"].astype(str).str.strip().str.lower()
mode_gender = df["gender"].mode()[0]
df["gender"] = df["gender"].fillna(mode_gender)

gender_map = {"male": 1, "m": 1, "female": 0, "f": 0}
df["gender"] = df["gender"].map(gender_map)

df = df.rename(columns={
    "chest pain type": "chest_pain_type",
    "blood pressure": "blood_pressure",
    "max heart rate": "max_heart_rate",
    "exercise angina": "exercise_angina",
    "Residence_type": "residence_type"
})

df["exercise_angina"] = df["exercise_angina"].astype(str).str.strip().str.lower()
df["exercise_angina"] = df["exercise_angina"].map({
    "yes": 1, "y": 1,
    "no": 0, "n": 0
})

df["residence_type"] = df["residence_type"].astype(str).str.strip().str.lower()
df["residence_type"] = df["residence_type"].map({
    "urban": 1,
    "rural": 0
})

df["smoking_status"] = df["smoking_status"].astype(str).str.strip().str.lower()
df["smoking_status"] = df["smoking_status"].astype("category").cat.codes

df["chest_pain_type"] = df["chest_pain_type"].astype(str).str.strip().str.lower()
df["chest_pain_type"] = df["chest_pain_type"].astype("category").cat.codes

triage_to_ctas = {
    "red": 1,
    "orange": 2,
    "yellow": 3,
    "green": 4,
    "blue": 5,   
}
df["ctas_level"] = df["triage"].map(triage_to_ctas)

missing_ctas = df["ctas_level"].isna().sum()
print("Rows with triage not mapped to CTAS:", missing_ctas)

df = df.dropna(subset=["ctas_level"])

for col in df.columns:
    if col != "triage":  
        df[col] = pd.to_numeric(df[col], errors="coerce")

numeric_cols = df.select_dtypes(include=["float64", "int64", "Int64"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

print("Cleaned shape:", df.shape)
print("\nCTAS distribution:")
print(df["ctas_level"].value_counts().sort_index())

df.to_csv("patient_priority_clean.csv", index=False)
print("\nSaved cleaned data to patient_priority_clean.csv")

Original shape: (6962, 18)
Dropped rows with missing triage: 0
Rows with triage not mapped to CTAS: 410
Cleaned shape: (6552, 18)

CTAS distribution:
ctas_level
1.0     129
2.0     346
3.0    5637
4.0     440
Name: count, dtype: int64

Saved cleaned data to patient_priority_clean.csv


In [10]:
import pandas as pd
import numpy as np
df = pd.read_csv("patient_priority_clean.csv")

print("Original shape:", df.shape)
print("\nOriginal CTAS distribution:")
print(df["ctas_level"].value_counts().sort_index())

base_ctas4 = df[df["ctas_level"] == 4].copy()
print("\nNumber of CTAS 4 rows:", len(base_ctas4))

n_new = 500  

rng = np.random.default_rng(42)
synth = base_ctas4.sample(n=n_new, replace=True, random_state=42).reset_index(drop=True)


numeric_cols = synth.select_dtypes(include=["int64", "float64"]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c not in ["ctas_level"]]  # نستثني الهدف

for col in numeric_cols:
    std = synth[col].std()
    if std == 0 or np.isnan(std):
        continue
    noise = rng.normal(0, 0.05 * std, size=len(synth))
    synth[col] = synth[col] + noise

synth["ctas_level"] = 5
if "triage" in synth.columns:
    synth["triage"] = "blue"

df_final = pd.concat([df, synth], ignore_index=True)

print("\nFinal shape:", df_final.shape)
print("\nFinal CTAS distribution:")
print(df_final["ctas_level"].value_counts().sort_index())


df_final.to_csv("patient_priority_final.csv", index=False)
print("\n Saved final data to patient_priority_final.csv")

Original shape: (6552, 18)

Original CTAS distribution:
ctas_level
1.0     129
2.0     346
3.0    5637
4.0     440
Name: count, dtype: int64

Number of CTAS 4 rows: 440

Final shape: (7052, 18)

Final CTAS distribution:
ctas_level
1.0     129
2.0     346
3.0    5637
4.0     440
5.0     500
Name: count, dtype: int64

 Saved final data to patient_priority_final.csv
