<a href="https://colab.research.google.com/github/FrancoPalavicinoG/cellia/blob/main/notebooks/11_preprocessing_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load DF from Google Drive

Mount Drive

In [20]:
# Mount Google Drive into the Colab filesystem; the dataset paths used later in
# this notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Import libraries

In [21]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

Load Dataframe

In [22]:
# Drive folder holding the project's datasets. The trailing slash is required:
# the file name is appended by plain string concatenation below.
input_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

# Read the raw dataset from its Excel workbook into a DataFrame.
df = pd.read_excel(input_path + "cardio_disease.xlsx")

In [23]:
# Report the (rows, columns) size of the loaded frame as a quick sanity check.
print("Dataset shape:", df.shape)

Dataset shape: (70000, 13)


Dataset example

In [24]:
# Render the first ten rows to eyeball column names and value ranges.
display(df.head(10))

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


### Feature Engineering

In [25]:
_DAYS_PER_YEAR = 365.25  # divisor used to express the 'age' column (stored in days) in years

# Raw input columns read by the feature groups below.
_BASE_COLUMNS = ('age', 'gender', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc')

# Columns summarised row-wise when the caller does not choose their own.
_DEFAULT_KEY_FEATURES = ('age', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc')


def _missing_columns(frame, columns):
    """Return the names in ``columns`` that are not columns of ``frame``."""
    return [name for name in columns if name not in frame.columns]


def _bin_to_code(values, bins, fallback):
    """Bucket ``values`` into integer codes ``0 .. len(bins) - 2`` with ``pd.cut``.

    Bins are right-closed and the first bin also contains its left edge.
    Values that fall outside the outermost edges (or are missing) come back
    from ``pd.cut`` as NaN and are replaced by ``fallback``, which must be one
    of the generated codes.
    """
    labels = list(range(len(bins) - 1))
    coded = pd.cut(values, bins=bins, labels=labels, include_lowest=True)
    return coded.fillna(fallback).astype(int)


def _add_age_features(frame, age_years):
    """Add the age-derived columns (normalised, exponential risk, squared, log) to ``frame``."""
    # Min-max style rescaling in days: 25 years maps to 0 and 70 years to 1.
    low_days = 25 * _DAYS_PER_YEAR
    high_days = 70 * _DAYS_PER_YEAR
    frame['age_normalized'] = (frame['age'] - low_days) / (high_days - low_days)

    # Flat 1.0 up to 45 years, then exp of the decades past 45 (exponent capped at 5).
    frame['age_risk_exponential'] = np.where(
        age_years > 45,
        np.exp(np.clip((age_years - 45) / 10, 0, 5)),
        1.0,
    )
    frame['age_squared'] = age_years ** 2
    frame['age_log'] = np.log1p(age_years)


def _add_blood_pressure_features(frame):
    """Add pulse pressure and mean arterial pressure derived from ``ap_hi`` / ``ap_lo``."""
    frame['pulse_pressure'] = frame['ap_hi'] - frame['ap_lo']
    # ap_lo plus one third of the pulse pressure.
    frame['mean_arterial_pressure'] = frame['ap_lo'] + (frame['pulse_pressure'] / 3)


def _add_metabolic_features(frame, age_years):
    """Add the cholesterol-per-age ratio and a 0-3 count of elevated markers."""
    # Age is floored at 1 year so the division can never hit zero.
    frame['metabolic_profile'] = frame['cholesterol'] / age_years.clip(lower=1)
    # One point each for cholesterol code > 1, glucose code > 1 and ap_hi > 140.
    frame['metabolic_syndrome_risk'] = (
        (frame['cholesterol'] > 1).astype(int)
        + (frame['gluc'] > 1).astype(int)
        + (frame['ap_hi'] > 140).astype(int)
    )


def _add_gender_interactions(frame, age_years):
    """Add gender-conditional interaction terms.

    Gender code 2 drives the "male" column and code 1 the "female" column;
    that mapping is taken from the feature names only
    (NOTE(review): confirm against the dataset's code book).
    """
    frame['male_age_interaction'] = (frame['gender'] == 2).astype(int) * age_years
    frame['female_chol_interaction'] = (frame['gender'] == 1).astype(int) * frame['cholesterol']
    frame['gender_specific_risk'] = np.where(
        frame['gender'] == 1,
        frame['cholesterol'] * 0.008,
        age_years * 0.1 + frame['cholesterol'] * 0.005,
    )


def _add_risk_scores(frame, age_years):
    """Add the hand-weighted linear risk scores.

    Requires ``pulse_pressure`` to be present in ``frame`` already. The
    weights are ad-hoc constants defined here; ``framingham_score`` is a
    simple linear heuristic (NOTE(review): it does not appear to be the
    published Framingham equations, despite the name).
    """
    frame['framingham_score'] = (
        age_years * 0.04
        + (frame['ap_hi'] - 120) * 0.02
        + frame['cholesterol'] * 15
    )
    frame['traditional_risk_score'] = (
        age_years * 0.04
        + frame['gender'] * 10
        + (frame['cholesterol'] - 1) * 20
        + frame['ap_hi'] * 0.1
        + frame['gluc'] * 20
    )
    frame['cardiac_risk_score'] = frame['pulse_pressure'] * 0.2 + frame['ap_hi'] * 0.1
    frame['combined_risk_score'] = (
        frame['traditional_risk_score'] * 0.4
        + frame['cardiac_risk_score'] * 0.6
    )


def _add_row_statistics(frame, key_features):
    """Add per-row mean/std/median/max/min/range computed across ``key_features``."""
    missing = _missing_columns(frame, key_features)
    if missing:
        raise KeyError(f"key_features not found in the frame: {missing}")

    # Snapshot taken before the statistics columns are added.
    subset = frame[key_features]
    frame['feature_mean'] = subset.mean(axis=1)
    frame['feature_std'] = subset.std(axis=1)
    frame['feature_median'] = subset.median(axis=1)
    frame['feature_max'] = subset.max(axis=1)
    frame['feature_min'] = subset.min(axis=1)
    frame['feature_range'] = frame['feature_max'] - frame['feature_min']


def _add_category_encodings(frame, age_years):
    """Add integer bucket codes for age, cholesterol and systolic pressure."""
    # Out-of-range or missing values fall back to the code given as last argument.
    frame['age_group_encoded'] = _bin_to_code(age_years, [0, 45, 55, 65, 100], 3)
    frame['chol_category_encoded'] = _bin_to_code(frame['cholesterol'], [0, 1.5, 2.5, 3.5, 10], 0)
    frame['bp_category_encoded'] = _bin_to_code(frame['ap_hi'], [0, 120, 140, 160, 180, 300], 2)


def create_advanced_feature_engineering(df, key_features=None):
    """
    Create advanced features for the entire dataset.

    The input frame is copied, then 24 derived columns are appended in a fixed
    order: age transforms, blood-pressure derivations, metabolic markers,
    gender interactions, linear risk scores, row-wise statistics and integer
    category encodings. All existing columns are carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'age' (in days), 'gender', 'ap_hi', 'ap_lo',
        'cholesterol' and 'gluc'.
    key_features : sequence of str or str, optional
        Columns summarised by the ``feature_*`` row statistics. Defaults to
        ['age', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc']. Columns created
        earlier in this function (e.g. 'pulse_pressure') may also be named.
        NOTE(review): with the default list, 'age' is in days while the other
        columns are far smaller in magnitude, so the statistics are dominated
        by age; pass a custom list if that is not wanted.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a required input column, or a requested key feature, is absent.
        The message lists every missing name.
    ValueError
        If ``key_features`` is given but empty.
    """
    missing = _missing_columns(df, _BASE_COLUMNS)
    if missing:
        raise KeyError(f"Missing required input column(s): {missing}")

    if key_features is None:
        selected = list(_DEFAULT_KEY_FEATURES)
    elif isinstance(key_features, str):
        # A bare string would otherwise be split into single characters.
        selected = [key_features]
    else:
        selected = list(key_features)
    if not selected:
        raise ValueError("key_features must name at least one column")

    # Create a copy to avoid modifying the caller's frame.
    engineered_df = df.copy()

    # 'age' is stored in days; most features below work in years.
    age_years = engineered_df['age'] / _DAYS_PER_YEAR

    # Order matters: it fixes the column order of the result, and the risk
    # scores read the pulse pressure created by the blood-pressure step.
    _add_age_features(engineered_df, age_years)
    _add_blood_pressure_features(engineered_df)
    _add_metabolic_features(engineered_df, age_years)
    _add_gender_interactions(engineered_df, age_years)
    _add_risk_scores(engineered_df, age_years)
    _add_row_statistics(engineered_df, selected)
    _add_category_encodings(engineered_df, age_years)

    return engineered_df

In [26]:
# Build the enriched frame; `df` is left untouched because the function works on a copy.
engineered_df = create_advanced_feature_engineering(df)

# Columns that exist only in the engineered frame, in the order they were added.
new_features = list(filter(lambda name: name not in df.columns, engineered_df.columns))

# Before/after report.
print("Original shape:", df.shape)
print("Engineered shape:", engineered_df.shape)
print("\nNew features created:")
print(new_features)
print(f"\nTotal new features: {len(new_features)}")

Original shape: (70000, 13)
Engineered shape: (70000, 37)

New features created:
['age_normalized', 'age_risk_exponential', 'age_squared', 'age_log', 'pulse_pressure', 'mean_arterial_pressure', 'metabolic_profile', 'metabolic_syndrome_risk', 'male_age_interaction', 'female_chol_interaction', 'gender_specific_risk', 'framingham_score', 'traditional_risk_score', 'cardiac_risk_score', 'combined_risk_score', 'feature_mean', 'feature_std', 'feature_median', 'feature_max', 'feature_min', 'feature_range', 'age_group_encoded', 'chol_category_encoded', 'bp_category_encoded']

Total new features: 24


In [27]:
# Render the first five engineered rows to spot-check the new columns.
display(engineered_df.head(5))

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,...,combined_risk_score,feature_mean,feature_std,feature_median,feature_max,feature_min,feature_range,age_group_encoded,chol_category_encoded,bp_category_encoded
0,0,18393,2,168,62.0,110,80,1,1,0,...,31.405717,3717.0,8204.274892,80.0,18393,1,18392,1,0,0
1,1,20228,1,156,85.0,140,90,3,1,0,...,48.8861,4092.4,9020.268915,90.0,20228,1,20227,2,2,1
2,2,18857,1,165,64.0,130,70,3,1,0,...,49.026042,3812.2,8410.4684,70.0,18857,1,18856,1,2,1
3,3,17623,2,169,82.0,150,100,1,1,0,...,37.771986,3575.0,7853.335374,100.0,17623,1,17622,1,0,2
4,4,17474,1,156,56.0,100,60,1,1,0,...,27.565459,3527.2,7796.611104,60.0,17474,1,17473,1,0,0


In [28]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
engineered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 37 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       70000 non-null  int64  
 1   age                      70000 non-null  int64  
 2   gender                   70000 non-null  int64  
 3   height                   70000 non-null  int64  
 4   weight                   70000 non-null  float64
 5   ap_hi                    70000 non-null  int64  
 6   ap_lo                    70000 non-null  int64  
 7   cholesterol              70000 non-null  int64  
 8   gluc                     70000 non-null  int64  
 9   smoke                    70000 non-null  int64  
 10  alco                     70000 non-null  int64  
 11  active                   70000 non-null  int64  
 12  cardio                   70000 non-null  int64  
 13  age_normalized           70000 non-null  float64
 14  age_risk_exponential  

## Split data

Imports

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd

In [30]:
# Target vector: the `cardio` label column.
y = engineered_df['cardio']

# Feature matrix: drop the identifier and the label, keep numeric columns only,
# and replace any remaining missing value with 0.
X = (
    engineered_df
    .drop(columns=['id', 'cardio'], errors='ignore')
    .select_dtypes(include=[np.number])
    .fillna(0)
)

#### Split data 70:15:15 (train : validation : test, stratified on `cardio`)

In [31]:
# Stage 1: hold out 30% of the rows. stratify=y keeps the proportion of each
# `cardio` class the same in both parts; random_state fixes the shuffle.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Stage 2: cut the held-out 30% in half, giving validation and test sets of
# 15% of the full dataset each, again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

In [32]:
# NOTE(review): feature scaling is disabled in this cell, so the splits written
# to disk later in this notebook are unscaled — presumably scaling is applied
# by the notebooks that consume them; confirm, then either restore this step
# or delete the cell.
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)
# X_test_scaled = scaler.transform(X_test)

#### Save preprocessed datasets

Save datasets

In [33]:
# Destination folder for the exported splits (same Drive folder the raw dataset
# is read from). Keep the trailing slash: file names are appended by string
# concatenation.
output_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

In [34]:
# Each split paired with the file it is written to, in train / val / test order.
_split_exports = [
    (X_train, "X_train_v2.csv"),
    (y_train, "y_train_v2.csv"),
    (X_val, "X_val_v2.csv"),
    (y_val, "y_val_v2.csv"),
    (X_test, "X_test_v2.csv"),
    (y_test, "y_test_v2.csv"),
]

# index=False: the shuffled row labels are not written to the CSV files.
for _split, _filename in _split_exports:
    _split.to_csv(output_path + _filename, index=False)

In [35]:
# Show the (rows, columns) size of the training features as a final check.
X_train.shape

(49000, 35)

In [36]:
# Confirmation listing of the exported files. The names are hard-coded here,
# so they must be kept in sync with the to_csv calls that write them.
print("✅ Files saved:")
print(f"   - {output_path}X_train_v2.csv")
print(f"   - {output_path}y_train_v2.csv")
print(f"   - {output_path}X_val_v2.csv")
print(f"   - {output_path}y_val_v2.csv")
print(f"   - {output_path}X_test_v2.csv")
print(f"   - {output_path}y_test_v2.csv")

✅ Files saved:
   - /content/drive/MyDrive/cellia_drive/Datasets/X_train_v2.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_train_v2.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/X_val_v2.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_val_v2.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/X_test_v2.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_test_v2.csv
