<a href="https://colab.research.google.com/github/FrancoPalavicinoG/cellia/blob/main/notebooks/08_preprocessing_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load DF from Google Drive

Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import libraries

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

Load Dataframe

In [3]:
input_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

df = pd.read_excel(input_path + "bio_dataset.xlsx")

In [4]:
print("Dataset shape:", df.shape)

Dataset shape: (5214, 32)


Dataset example

In [5]:
display(df.head(10))

Unnamed: 0,SEQN,GENDER,AGE_YEARS,AGE_MONTHS,BODY_MASS_INDEX,HEIGHT,WEIGHT,TOTAL_CHOL,CHOLESTEROL,CREATININE,...,HEART_FAILURE_AGE,CORO_HEART_DISEASE_AGE,ANGINA_PECTORIS_AGE,HEART_ATTACK_AGE,STROKE_AGE,HEART_ATTACK_RELATIVES,SMOKING,ALCOHOL,ACTIVE_1,ACTIVE_2
0,73557,1,69,828,26.7,171.3,78.3,167,168.0,1.21,...,0,0,0,0,62,0,0.0,1.0,2.6,4.2
1,73558,1,54,648,28.6,176.8,89.5,170,167.0,0.79,...,0,0,0,0,0,0,1.0,0.8,3.6,3.8
2,73559,1,72,864,28.9,175.3,88.9,126,127.0,1.22,...,0,0,0,0,0,1,0.0,1.0,3.6,1.0
3,73561,0,73,876,19.7,162.4,52.0,201,207.0,0.73,...,0,0,0,0,0,1,0.468972,0.459701,3.307826,3.485977
4,73562,1,56,672,41.7,158.7,105.0,226,230.0,0.89,...,0,54,0,55,0,0,0.0,1.0,2.6,4.2
5,73564,0,61,732,35.7,161.8,93.4,168,167.0,0.92,...,0,0,0,0,0,0,0.6,1.0,2.8,5.2
6,73566,0,56,672,26.5,152.8,61.8,278,278.0,0.55,...,0,0,0,0,0,0,1.0,0.8,3.6,3.8
7,73567,1,65,780,22.0,172.4,65.3,173,170.0,0.97,...,0,0,0,0,0,1,1.0,0.8,3.6,3.8
8,73568,0,26,312,20.3,152.5,47.1,168,174.0,0.74,...,0,0,0,0,0,0,0.6,0.4,3.8,4.0
9,73571,1,76,912,34.4,172.5,102.4,167,157.0,1.19,...,0,0,0,72,0,0,0.8,1.0,2.0,2.0


### Heart condition indicator (HCI)

In [6]:
heart_condition_cols = [
    'HEART_FAILURE',
    'CORO_HEART_DISEASE',
    'ANGINA_PECTORIS',
    'HEART_ATTACK',
    'STROKE'
]

In [7]:
df['HEART_CONDITION'] = (df[heart_condition_cols].sum(axis=1) > 0).astype(int)

In [8]:
display(df.head(5))

Unnamed: 0,SEQN,GENDER,AGE_YEARS,AGE_MONTHS,BODY_MASS_INDEX,HEIGHT,WEIGHT,TOTAL_CHOL,CHOLESTEROL,CREATININE,...,CORO_HEART_DISEASE_AGE,ANGINA_PECTORIS_AGE,HEART_ATTACK_AGE,STROKE_AGE,HEART_ATTACK_RELATIVES,SMOKING,ALCOHOL,ACTIVE_1,ACTIVE_2,HEART_CONDITION
0,73557,1,69,828,26.7,171.3,78.3,167,168.0,1.21,...,0,0,0,62,0,0.0,1.0,2.6,4.2,1
1,73558,1,54,648,28.6,176.8,89.5,170,167.0,0.79,...,0,0,0,0,0,1.0,0.8,3.6,3.8,0
2,73559,1,72,864,28.9,175.3,88.9,126,127.0,1.22,...,0,0,0,0,1,0.0,1.0,3.6,1.0,0
3,73561,0,73,876,19.7,162.4,52.0,201,207.0,0.73,...,0,0,0,0,1,0.468972,0.459701,3.307826,3.485977,0
4,73562,1,56,672,41.7,158.7,105.0,226,230.0,0.89,...,54,0,55,0,0,0.0,1.0,2.6,4.2,1


In [9]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5214 entries, 0 to 5213
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   SEQN                    5214 non-null   int64  
 1   GENDER                  5214 non-null   int64  
 2   AGE_YEARS               5214 non-null   int64  
 3   AGE_MONTHS              5214 non-null   int64  
 4   BODY_MASS_INDEX         5214 non-null   float64
 5   HEIGHT                  5214 non-null   float64
 6   WEIGHT                  5214 non-null   float64
 7   TOTAL_CHOL              5214 non-null   int64  
 8   CHOLESTEROL             5214 non-null   float64
 9   CREATININE              5214 non-null   float64
 10  TRIGLYCERIDES_R         5214 non-null   float64
 11  LDL                     5214 non-null   float64
 12  TRIGLYCERIDE            5214 non-null   float64
 13  HDL                     5214 non-null   int64  
 14  GLUCOSE                 5214 non-null   

## Preprocessing

Drop irrelevant columns

In [10]:
cols_to_drop = [
    'AGE_MONTHS', 'SEQN',
    'TRIGLYCERIDES_R', 'HEART_FAILURE',
    'CORO_HEART_DISEASE','ANGINA_PECTORIS',
    'HEART_ATTACK', 'STROKE', 'TRIGLYCERIDE',
    'HEART_FAILURE_AGE', 'CORO_HEART_DISEASE_AGE',
    'ANGINA_PECTORIS_AGE', 'HEART_ATTACK_AGE', 'STROKE_AGE',
    'CHOLESTEROL'
]

In [11]:
df_preprocess = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

### Categorical Dataset

In [12]:
print(df_preprocess.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5214 entries, 0 to 5213
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   GENDER                  5214 non-null   int64  
 1   AGE_YEARS               5214 non-null   int64  
 2   BODY_MASS_INDEX         5214 non-null   float64
 3   HEIGHT                  5214 non-null   float64
 4   WEIGHT                  5214 non-null   float64
 5   TOTAL_CHOL              5214 non-null   int64  
 6   CREATININE              5214 non-null   float64
 7   LDL                     5214 non-null   float64
 8   HDL                     5214 non-null   int64  
 9   GLUCOSE                 5214 non-null   float64
 10  HYPERTENSION            5214 non-null   int64  
 11  HIGH_CHOLESTEROL        5214 non-null   int64  
 12  HEART_ATTACK_RELATIVES  5214 non-null   int64  
 13  SMOKING                 5214 non-null   float64
 14  ALCOHOL                 5214 non-null   

In [13]:
display(df_preprocess.head(5))

Unnamed: 0,GENDER,AGE_YEARS,BODY_MASS_INDEX,HEIGHT,WEIGHT,TOTAL_CHOL,CREATININE,LDL,HDL,GLUCOSE,HYPERTENSION,HIGH_CHOLESTEROL,HEART_ATTACK_RELATIVES,SMOKING,ALCOHOL,ACTIVE_1,ACTIVE_2,HEART_CONDITION
0,1,69,26.7,171.3,78.3,167,1.21,91.065171,65,125.945215,1,1,0,0.0,1.0,2.6,4.2,1
1,1,54,28.6,176.8,89.5,170,0.79,87.38731,50,124.69618,1,1,0,1.0,0.8,3.6,3.8,0
2,1,72,28.9,175.3,88.9,126,1.22,56.0,60,126.151344,1,1,1,0.0,1.0,3.6,1.0,0
3,0,73,19.7,162.4,52.0,201,0.73,101.0,85,123.728065,1,0,1,0.468972,0.459701,3.307826,3.485977,0
4,1,56,41.7,158.7,105.0,226,0.89,119.628631,38,159.236039,1,1,0,0.0,1.0,2.6,4.2,1


In [49]:
df_clean = df_preprocess.copy()

# === GENDER ===
# 0 = female, 1 = male
df_clean["GENDER"] = df_clean["GENDER"].astype(int)

# === AGE_YEARS ===
df_clean["AGE_YEARS"] = df_clean["AGE_YEARS"].astype(int)

# === BODY MASS INDEX CATEGORICAL ===
def bmi_category(bmi):
    if bmi < 18.5:
        return 0  # underweight
    elif bmi < 25:
        return 1  # normal
    elif bmi < 30:
        return 2  # overweight
    else:
        return 3  # obese

df_clean["BMI"] = df_clean["BODY_MASS_INDEX"].apply(bmi_category)
df_clean.drop(columns=["BODY_MASS_INDEX"], inplace=True)

# === HEIGHT & WEIGHT  ===
df_clean["HEIGHT"] = df_clean["HEIGHT"].astype(float)
df_clean["WEIGHT"] = df_clean["WEIGHT"].astype(float)

# === CHOLESTEROL ===
df_clean["CHOL_CATEGORY"] = pd.cut(
    df_clean["TOTAL_CHOL"],
    bins=[0, 200, 239, np.inf],
    labels=[1, 2, 3]
).astype(int)
df_clean.drop(columns=["TOTAL_CHOL"], inplace=True)

# === LDL CATEGORY ===
bins=[0, 100, 159, np.inf]
df_clean["LDL_CATEGORY"] = np.digitize(df_clean["LDL"], bins, right=False)
df_clean.drop(columns=["LDL"], inplace=True)

# === HDL CATEGORY ===
bins = [0, 40, 59, np.inf]
df_clean["HDL_CATEGORY"] = np.digitize(df_clean["HDL"], bins, right=False)
df_clean.drop(columns=["HDL"], inplace=True)

# === GLUCOSE CATEGORY ===
df_clean["GLUCOSE_CATEGORY"] = pd.cut(
    df_clean["GLUCOSE"],
    bins=[0, 100, 125, np.inf],
    labels=[1, 2, 3]
).astype(int)
df_clean.drop(columns=["GLUCOSE"], inplace=True)

# === binary vars ===
binary_vars = [
    "HYPERTENSION", "HIGH_CHOLESTEROL",
    "HEART_ATTACK_RELATIVES"
]
for var in binary_vars:
    df_clean[var] = df_clean[var].astype(int)

# === SMOKING & ALCOHOL ===
df_clean["SMOKING_BINARY"] = (df_clean["SMOKING"] > 0.5).astype(int)
df_clean["ALCOHOL_BINARY"] = (df_clean["ALCOHOL"] > 0.5).astype(int)

df_clean.drop(columns=["SMOKING", "ALCOHOL"], inplace=True)

# === ACTIVITY ===
df_clean["ACTIVE"] = ((df_clean["ACTIVE_1"] > 4) | (df_clean["ACTIVE_2"] > 2)).astype(int)
df_clean.drop(columns=["ACTIVE_1", "ACTIVE_2"], inplace=True)


# === CREATININE ===
def categorize_creatinine(row):
    if row["GENDER"] == 1:  # male
        if row["CREATININE"] < 0.74:
            return 1  # low
        elif row["CREATININE"] <= 1.35:
            return 2  # normal
        else:
            return 3  # high
    else:  # female
        if row["CREATININE"] < 0.59:
            return 1
        elif row["CREATININE"] <= 1.04:
            return 2
        else:
            return 3

df_clean["CREATININE_CATEGORY"] = df_clean.apply(categorize_creatinine, axis=1)
df_clean.drop(columns=["CREATININE"], inplace=True)

# === Target ===
df_clean["HEART_CONDITION"] = df_clean["HEART_CONDITION"].astype(int)

In [27]:
df_clean = df_clean.rename(columns={
    "AGE_YEARS": "AGE",
    "CHOL_CATEGORY": "CHOLESTEROL",
    "HDL_CATEGORY": "HDL",
    "LDL_CATEGORY": "LDL",
    "GLUCOSE_CATEGORY": "GLUCOSE",
    "SMOKING_BINARY": "SMOKING",
    "ALCOHOL_BINARY": "ALCOHOL",
    "CREATININE_CATEGORY": "CREATININE",
})

In [28]:
print(df_clean.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5214 entries, 0 to 5213
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   GENDER                  5214 non-null   int64  
 1   AGE                     5214 non-null   int64  
 2   HEIGHT                  5214 non-null   float64
 3   WEIGHT                  5214 non-null   float64
 4   HYPERTENSION            5214 non-null   int64  
 5   HIGH_CHOLESTEROL        5214 non-null   int64  
 6   HEART_ATTACK_RELATIVES  5214 non-null   int64  
 7   HEART_CONDITION         5214 non-null   int64  
 8   BMI                     5214 non-null   int64  
 9   CHOLESTEROL             5214 non-null   int64  
 10  LDL                     5214 non-null   int64  
 11  HDL                     5214 non-null   int64  
 12  GLUCOSE                 5214 non-null   int64  
 13  SMOKING                 5214 non-null   int64  
 14  ALCOHOL                 5214 non-null   

In [30]:
column_order = [
    "GENDER", "AGE", "HEIGHT", "WEIGHT", "BMI",
    "CHOLESTEROL", "LDL", "HDL", "GLUCOSE", "CREATININE",
    "HYPERTENSION", "HIGH_CHOLESTEROL", "SMOKING", "ALCOHOL", "ACTIVE",
    "HEART_ATTACK_RELATIVES", "HEART_CONDITION"
]

df_clean = df_clean[column_order]

In [31]:
display(df_clean.head(5))

Unnamed: 0,GENDER,AGE,HEIGHT,WEIGHT,BMI,CHOLESTEROL,LDL,HDL,GLUCOSE,CREATININE,HYPERTENSION,HIGH_CHOLESTEROL,SMOKING,ALCOHOL,ACTIVE,HEART_ATTACK_RELATIVES,HEART_CONDITION
0,1,69,171.3,78.3,2,1,1,3,3,2,1,1,0,1,1,0,1
1,1,54,176.8,89.5,2,1,1,2,2,2,1,1,1,1,1,0,0
2,1,72,175.3,88.9,2,1,1,3,3,2,1,1,0,1,0,1,0
3,0,73,162.4,52.0,1,2,2,3,2,2,1,0,0,0,1,1,0
4,1,56,158.7,105.0,3,2,2,1,3,2,1,1,0,1,1,0,1


#### Split data

Imports

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd

Types of features

In [34]:
numeric_features = [
    "AGE", "HEIGHT", "WEIGHT",
]

categorical_features = [
    "BMI", "CHOLESTEROL", "LDL",
    "HDL", "GLUCOSE", "CREATININE",
]

binary_features = [
    "GENDER", "HYPERTENSION", "HIGH_CHOLESTEROL",
    "SMOKING", "ALCOHOL", "ACTIVE", "HEART_ATTACK_RELATIVES",
]

target = ["HEART_CONDITION"]

final_columns = numeric_features + categorical_features + binary_features + target
df_final = df_clean[final_columns].copy()

Numeric variables scalling

In [35]:
scaler = StandardScaler()
df_final[numeric_features] = scaler.fit_transform(df_final[numeric_features])

OneHot codification for categories

In [36]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_cats = encoder.fit_transform(df_final[categorical_features])
encoded_df = pd.DataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(categorical_features)
)

Merge all

In [37]:
df_ready = pd.concat(
    [df_final[numeric_features + binary_features], encoded_df, df_final[target]],
    axis=1
)

In [38]:
print("✅ Dataset ready to split:")
print(df_ready.head())
print(f"\nFinal Shape: {df_ready.shape}")

✅ Dataset ready to split:
        AGE    HEIGHT    WEIGHT  GENDER  HYPERTENSION  HIGH_CHOLESTEROL  \
0  1.157711  0.412171 -0.145572       1             1                 1   
1  0.295956  0.952128  0.362307       1             1                 1   
2  1.330062  0.804867  0.335099       1             1                 1   
3  1.387512 -0.461577 -1.338178       0             1                 0   
4  0.410857 -0.824821  1.065174       1             1                 1   

   SMOKING  ALCOHOL  ACTIVE  HEART_ATTACK_RELATIVES  ...  HDL_1  HDL_2  HDL_3  \
0        0        1       1                       0  ...    0.0    0.0    1.0   
1        1        1       1                       0  ...    0.0    1.0    0.0   
2        0        1       0                       1  ...    0.0    0.0    1.0   
3        0        0       1                       1  ...    0.0    0.0    1.0   
4        0        1       1                       0  ...    1.0    0.0    0.0   

   GLUCOSE_1  GLUCOSE_2  GLUCOSE_3  

In [44]:
X = df_ready.drop(columns=['HEART_CONDITION'])
y = df_ready['HEART_CONDITION']

#### Split data 70:15:15

In [45]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

#### Save preprocessed datasets

Save datasets

In [46]:
output_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

In [47]:
X_train.to_csv(output_path + "X_train_cat.csv", index=False)
y_train.to_csv(output_path + "y_train_cat.csv", index=False)

X_val.to_csv(output_path + "X_val_cat.csv", index=False)
y_val.to_csv(output_path + "y_val_cat.csv", index=False)

X_test.to_csv(output_path + "X_test_cat.csv", index=False)
y_test.to_csv(output_path + "y_test_cat.csv", index=False)

In [48]:
print("✅ Files saved:")
print(f"   - {output_path}X_train_cat.csv")
print(f"   - {output_path}y_train_cat.csv")
print(f"   - {output_path}X_val_cat.csv")
print(f"   - {output_path}y_val_cat.csv")
print(f"   - {output_path}X_test_cat.csv")
print(f"   - {output_path}y_test_cat.csv")

✅ Files saved:
   - /content/drive/MyDrive/cellia_drive/Datasets/X_train_cat.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_train_cat.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/X_val_cat.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_val_cat.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/X_test_cat.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_test_cat.csv
