# IMPORT LIB

In [12]:
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PowerTransformer, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

# IMPORT DATASET

In [13]:
# Read the raw train and test CSVs into DataFrames (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)

### Add binned features to the training set

In [14]:
# Bin edges and labels for every engineered categorical feature.
# Layout: new column -> (source column, bin edges, bin labels).
# pd.cut is called with its default right-closed intervals, so an edge value
# belongs to the bin on its left (e.g. age 19 -> "Teen").
train_bin_specs = {
    # AGE
    "age_bin": (
        "age",
        [0, 19, 29, 39, 49, 64, np.inf],
        ["Teen", "Young Adult", "Adult", "Middle-aged", "Senior", "Elderly"],
    ),
    # PHYSICAL ACTIVITY (lower edge -1 so that 0 minutes lands in "Sedentary")
    "activity_bin": (
        "physical_activity_minutes_per_week",
        [-1, 59, 149, 299, 449, np.inf],
        ["Sedentary", "Low", "Moderate", "Active", "Very Active"],
    ),
    # DIET SCORE (lower edge -0.1 so that a score of 0 lands in "Poor")
    "diet_bin": (
        "diet_score",
        [-0.1, 3, 5, 7, 10],
        ["Poor", "Fair", "Good", "Excellent"],
    ),
    # SLEEP HOURS
    "sleep_bin": (
        "sleep_hours_per_day",
        [0, 5, 6, 8, np.inf],
        ["Very Low", "Low", "Optimal", "High"],
    ),
    # BMI
    "bmi_bin": (
        "bmi",
        [0, 18.5, 24.9, 29.9, 34.9, np.inf],
        ["Underweight", "Normal", "Overweight", "Obese I", "Obese II+"],
    ),
    # SYSTOLIC BP
    "sys_bp_bin": (
        "systolic_bp",
        [0, 119, 129, 139, np.inf],
        ["Normal", "Elevated", "HTN Stage 1", "HTN Stage 2"],
    ),
    # DIASTOLIC BP
    "dia_bp_bin": (
        "diastolic_bp",
        [0, 79, 89, 99, np.inf],
        ["Normal", "Elevated", "HTN Stage 1", "HTN Stage 2"],
    ),
    # HEART RATE
    "heart_rate_bin": (
        "heart_rate",
        [0, 59, 100, np.inf],
        ["Low", "Normal", "High"],
    ),
    # TOTAL CHOLESTEROL
    "chol_total_bin": (
        "cholesterol_total",
        [0, 199, 239, np.inf],
        ["Desirable", "Borderline", "High"],
    ),
    # TRIGLYCERIDES
    "triglycerides_bin": (
        "triglycerides",
        [0, 149, 199, 499, np.inf],
        ["Normal", "Borderline", "High", "Very High"],
    ),
}

# Add one categorical column per spec, in the order listed above.
for bin_column, (source_column, edges, names) in train_bin_specs.items():
    df_train[bin_column] = pd.cut(df_train[source_column], bins=edges, labels=names)

# Quick sanity check
df_train.filter(like="_bin").head()

Unnamed: 0,age_bin,activity_bin,diet_bin,sleep_bin,bmi_bin,sys_bp_bin,dia_bp_bin,heart_rate_bin,chol_total_bin,triglycerides_bin
0,Adult,Sedentary,Excellent,Optimal,Obese I,Normal,Normal,Normal,Desirable,Normal
1,Senior,Low,Good,Optimal,Normal,Elevated,Normal,Normal,Desirable,Normal
2,Adult,Moderate,Excellent,Optimal,Normal,Normal,Elevated,Normal,Desirable,Normal
3,Senior,Low,Fair,Optimal,Overweight,Elevated,Normal,Normal,Desirable,Normal
4,Senior,Sedentary,Good,Optimal,Overweight,Normal,Normal,Normal,Borderline,Normal


### Add binned features to the test set

In [18]:
# Same bin edges and labels as used for the training frame, applied to df_test.
# Layout: new column -> (source column, bin edges, bin labels).
# pd.cut is called with its default right-closed intervals, so an edge value
# belongs to the bin on its left (e.g. age 19 -> "Teen").
test_bin_specs = {
    # AGE
    "age_bin": (
        "age",
        [0, 19, 29, 39, 49, 64, np.inf],
        ["Teen", "Young Adult", "Adult", "Middle-aged", "Senior", "Elderly"],
    ),
    # PHYSICAL ACTIVITY (lower edge -1 so that 0 minutes lands in "Sedentary")
    "activity_bin": (
        "physical_activity_minutes_per_week",
        [-1, 59, 149, 299, 449, np.inf],
        ["Sedentary", "Low", "Moderate", "Active", "Very Active"],
    ),
    # DIET SCORE (lower edge -0.1 so that a score of 0 lands in "Poor")
    "diet_bin": (
        "diet_score",
        [-0.1, 3, 5, 7, 10],
        ["Poor", "Fair", "Good", "Excellent"],
    ),
    # SLEEP HOURS
    "sleep_bin": (
        "sleep_hours_per_day",
        [0, 5, 6, 8, np.inf],
        ["Very Low", "Low", "Optimal", "High"],
    ),
    # BMI
    "bmi_bin": (
        "bmi",
        [0, 18.5, 24.9, 29.9, 34.9, np.inf],
        ["Underweight", "Normal", "Overweight", "Obese I", "Obese II+"],
    ),
    # SYSTOLIC BP
    "sys_bp_bin": (
        "systolic_bp",
        [0, 119, 129, 139, np.inf],
        ["Normal", "Elevated", "HTN Stage 1", "HTN Stage 2"],
    ),
    # DIASTOLIC BP
    "dia_bp_bin": (
        "diastolic_bp",
        [0, 79, 89, 99, np.inf],
        ["Normal", "Elevated", "HTN Stage 1", "HTN Stage 2"],
    ),
    # HEART RATE
    "heart_rate_bin": (
        "heart_rate",
        [0, 59, 100, np.inf],
        ["Low", "Normal", "High"],
    ),
    # TOTAL CHOLESTEROL
    "chol_total_bin": (
        "cholesterol_total",
        [0, 199, 239, np.inf],
        ["Desirable", "Borderline", "High"],
    ),
    # TRIGLYCERIDES
    "triglycerides_bin": (
        "triglycerides",
        [0, 149, 199, 499, np.inf],
        ["Normal", "Borderline", "High", "Very High"],
    ),
}

# Add one categorical column per spec, in the order listed above.
for bin_column, (source_column, edges, names) in test_bin_specs.items():
    df_test[bin_column] = pd.cut(df_test[source_column], bins=edges, labels=names)

# Quick sanity check
df_test.filter(like="_bin").head()

Unnamed: 0,age_bin,activity_bin,diet_bin,sleep_bin,bmi_bin,sys_bp_bin,dia_bp_bin,heart_rate_bin,chol_total_bin,triglycerides_bin
0,Middle-aged,Low,Fair,Optimal,Overweight,Elevated,Normal,Normal,Borderline,Normal
1,Adult,Low,Fair,Very Low,Overweight,Elevated,Normal,Low,Desirable,Normal
2,Middle-aged,Low,Excellent,Optimal,Overweight,Normal,Normal,Normal,Desirable,Borderline
3,Senior,Low,Excellent,Optimal,Overweight,Normal,Elevated,Normal,Borderline,Normal
4,Elderly,Sedentary,Excellent,Optimal,Normal,HTN Stage 1,Normal,Normal,Desirable,Normal


# DEFINING COLUMN LIST

In [19]:
# Separate numeric, categorical and binary columns

# Categorical columns: string columns plus the engineered *_bin categoricals
cat_cols = df_train.select_dtypes(include=["object", "category"]).columns

# Binary indicator columns (kept apart from the continuous numeric columns)
bin_col = ["family_history_diabetes", "hypertension_history", "cardiovascular_history"]

# Candidate columns to remove.
# NOTE: these two lists are defined for reference only. The step that would
# filter them out of num_cols is disabled, so every numeric column is kept.
# Features with little to no linear relationship (Correlation approx 0.0)
uncorrelated_features = [
    "alcohol_consumption_per_week",
    "sleep_hours_per_day",
    "screen_time_hours_per_day",
    "diastolic_bp"  # surprisingly weak correlation in this specific dataset
]

dropped_highly_correlated = [
    "ldl_cholesterol",
    "waist_to_hip_ratio"
]

# Target column and row identifier
target_col = "diagnosed_diabetes"
id = "id"  # NOTE: shadows the built-in id(); kept because a later cell reads it

# Numeric columns: every int64/float64 column except the target, the id and
# the binary indicators. errors="ignore" skips any label that is not present.
num_cols = df_train.select_dtypes(include=["int64", "float64"]).columns
existing_bin_cols = [c for c in bin_col if c in num_cols]
num_cols = num_cols.drop([target_col, id, *existing_bin_cols], errors="ignore")

print(num_cols)

col_df = [num_cols, cat_cols]
print(col_df)

Index(['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides'],
      dtype='object')
[Index(['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides'],
      dtype='object'), Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'smoking_status', 'employment_status', 'age_bin', 'activity_bin',
       'diet_bin', 'sleep_bin', 'bmi_bin', 'sys_bp_bin', 'dia_bp_bin',
       'heart_rate_bin', 'chol_total_bin', 'triglycerides_bin'],
      dt

# PREPROCESSING

In [20]:
class SafeLogTransformer(BaseEstimator, TransformerMixin):
    """Column-wise natural log with a per-column shift learned in ``fit``.

    For every column whose minimum (ignoring NaN) in the fitting data is
    <= 0, ``fit`` stores the shift ``1 - min`` so that the smallest fitted
    value maps to ``log(1) == 0``. Columns that are already strictly positive
    get a shift of 0. ``transform`` returns ``log(X + shift)``.

    Attributes
    ----------
    shifts_ : list of float
        One additive shift per input column. Only exists after ``fit``.
    n_features_in_ : int
        Number of columns seen by ``fit``.

    Notes
    -----
    The shifts come from the fitting data only. A value passed to
    ``transform`` that is far enough below the fitted column minimum can
    still make ``X + shift`` non-positive, in which case ``np.log`` yields
    ``-inf`` or NaN.
    """

    def fit(self, X, y=None):
        """Learn one shift per column from ``X`` and return ``self``.

        ``X`` is any 2-D array-like of numeric values; ``y`` is ignored.
        """
        X = np.asarray(X, dtype=float)
        n_features = X.shape[1]
        col_min = np.nanmin(X, axis=0)
        # Shift only the columns that contain zero or negative values.
        self.shifts_ = np.where(col_min <= 0, 1.0 - col_min, 0.0).tolist()
        self.n_features_in_ = n_features
        return self

    def transform(self, X):
        """Return ``log(X + shifts_)`` as a new float array; ``X`` is not modified.

        Raises
        ------
        ValueError
            If ``X`` does not have the number of columns seen in ``fit``.
        """
        # Force a float array: with an integer input, writing log values back
        # into the same array would silently truncate them to integers.
        X = np.asarray(X, dtype=float)
        if X.shape[1] != len(self.shifts_):
            raise ValueError(
                f"X has {X.shape[1]} features, but SafeLogTransformer "
                f"was fitted with {len(self.shifts_)} features."
            )
        return np.log(X + np.asarray(self.shifts_, dtype=float))

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, which equal the input names.

        The transformer neither renames nor adds/removes columns. When
        ``input_features`` is None, the names ``x0 .. x{n-1}`` are generated
        from the number of columns seen in ``fit``.
        """
        if input_features is None:
            input_features = [f"x{i}" for i in range(self.n_features_in_)]
        return np.asarray(input_features, dtype=object)

In [21]:
# Transformer instances shared by the pipelines below:
#   dt  - Yeo-Johnson power transform followed by standardization
#   onc - dense one-hot encoder that ignores categories unseen during fit
dt = PowerTransformer(method="yeo-johnson", standardize=True)
onc = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [22]:
# Numeric branch: shift-and-log first, then the power transform held in `dt`
# (the step is named "scaler" although it is a PowerTransformer).
num_steps = [("log", SafeLogTransformer()), ("scaler", dt)]
num_pipeline = Pipeline(num_steps)

# Categorical branch: one-hot encoding only.
cat_steps = [("onehot", onc)]
cat_pipeline = Pipeline(cat_steps)

In [23]:
# Route each column group to its own treatment:
#   num -> log + power transform, bin -> passed through unchanged,
#   cat -> one-hot encoded.
column_routes = [
    ("num", num_pipeline, num_cols),
    ("bin", "passthrough", bin_col),
    ("cat", cat_pipeline, cat_cols),
]
# verbose_feature_names_out=False keeps output names free of the route prefix.
preprocessor = ColumnTransformer(column_routes, verbose_feature_names_out=False)


In [24]:
# Separate the label column from the predictor columns
y = df_train[target_col]
X = df_train.drop(columns=[target_col])

In [25]:
# Hold out 20% of the rows for validation, stratified on the label so both
# parts keep the same class balance; the seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Rebuild both feature frames with explicit column labels, each keeping its
# own row index (safe even if not needed)
X_train, X_test = (
    pd.DataFrame(part, columns=X.columns, index=part.index)
    for part in (X_train, X_test)
)

In [26]:
# Print the shape of each split: train/validation features, then labels
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(560000, 35)
(140000, 35)
(560000,)
(140000,)


In [27]:
# Fit the preprocessor on the training split only, so the validation split
# does not influence what the transformers learn.
preprocessor.fit(X_train)

0,1,2
,transformers,"[('num', ...), ('bin', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [28]:
# Inspect the column names of the training split
X_train.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'age_bin', 'activity_bin', 'diet_bin',
       'sleep_bin', 'bmi_bin', 'sys_bp_bin', 'dia_bp_bin', 'heart_rate_bin',
       'chol_total_bin', 'triglycerides_bin'],
      dtype='object')

In [29]:
# Final features, assembled from four groups. The order below is the order
# of the resulting list.
continuous_features = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'bmi',
    'waist_to_hip_ratio',
    'systolic_bp',
    'diastolic_bp',
    'heart_rate',
    'cholesterol_total',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'triglycerides',
]
demographic_features = [
    'gender',
    'ethnicity',
    'education_level',
    'income_level',
    'smoking_status',
    'employment_status',
]
history_features = [
    'family_history_diabetes',
    'hypertension_history',
    'cardiovascular_history',
]
binned_features = [
    'age_bin',
    'activity_bin',
    'diet_bin',
    'sleep_bin',
    'bmi_bin',
    'sys_bp_bin',
    'dia_bp_bin',
    'heart_rate_bin',
    'chol_total_bin',
    'triglycerides_bin',
]
feature_cols = (
    continuous_features
    + demographic_features
    + history_features
    + binned_features
)
feature_cols

['age',
 'alcohol_consumption_per_week',
 'physical_activity_minutes_per_week',
 'diet_score',
 'sleep_hours_per_day',
 'screen_time_hours_per_day',
 'bmi',
 'waist_to_hip_ratio',
 'systolic_bp',
 'diastolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides',
 'gender',
 'ethnicity',
 'education_level',
 'income_level',
 'smoking_status',
 'employment_status',
 'family_history_diabetes',
 'hypertension_history',
 'cardiovascular_history',
 'age_bin',
 'activity_bin',
 'diet_bin',
 'sleep_bin',
 'bmi_bin',
 'sys_bp_bin',
 'dia_bp_bin',
 'heart_rate_bin',
 'chol_total_bin',
 'triglycerides_bin']

In [30]:
# Keep only the final feature columns, as independent copies, for the
# training split, the validation split and the Kaggle test frame
X_train_filtered, X_test_filtered, df_test_filtered = (
    frame[feature_cols].copy() for frame in (X_train, X_test, df_test)
)

In [31]:
# Apply the already-fitted preprocessor to each filtered frame, in the order
# train, validation, Kaggle test
X_train_processed, X_test_processed, df_testprocessed = map(
    preprocessor.transform,
    (X_train_filtered, X_test_filtered, df_test_filtered),
)

In [32]:
# Inspect the transformed validation features (a NumPy array)
X_test_processed

array([[ 1.90567564,  0.24906533, -0.38007471, ...,  0.        ,
         0.        ,  1.        ],
       [-0.7998998 , -1.24007969,  1.05174055, ...,  0.        ,
         0.        ,  1.        ],
       [-1.77442834, -1.24007969,  0.07020889, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.32952472,  1.46652157, -0.92905669, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.99973172,  0.24906533,  1.08175335, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.15409251,  0.24906533,  1.88964703, ...,  1.        ,
         0.        ,  0.        ]], shape=(140000, 83))

In [33]:
# Inspect the validation labels
y_test

7760      1.0
594954    0.0
480236    0.0
26944     1.0
616593    1.0
         ... 
113871    1.0
83546     1.0
95006     1.0
298122    1.0
368348    1.0
Name: diagnosed_diabetes, Length: 140000, dtype: float64

In [34]:
# Get the output column names generated by the fitted preprocessor; they are
# used below to label the columns of the transformed arrays
feature_names = preprocessor.get_feature_names_out()

In [35]:
# Wrap each processed array in a DataFrame labelled with the preprocessor's
# output names (each frame gets a fresh default row index)
X_train_df, X_test_df, df_test_kaggle_df = (
    pd.DataFrame(matrix, columns=feature_names)
    for matrix in (X_train_processed, X_test_processed, df_testprocessed)
)

In [36]:
# Preview the first rows of the processed training frame
X_train_df.head()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,dia_bp_bin_Normal,heart_rate_bin_High,heart_rate_bin_Low,heart_rate_bin_Normal,chol_total_bin_Borderline,chol_total_bin_Desirable,chol_total_bin_High,triglycerides_bin_Borderline,triglycerides_bin_High,triglycerides_bin_Normal
0,-0.363309,0.249065,0.215037,0.371196,-1.667056,0.133389,0.642166,1.861035,0.086625,-1.969552,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.999732,0.984844,-0.169888,-0.389397,1.21897,1.435719,0.850384,1.084197,0.086625,1.547603,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-1.596189,0.249065,0.095118,-1.555227,-1.001814,1.169165,0.119421,0.56287,0.711409,-1.676958,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.832294,-1.24008,0.044972,0.23326,0.554509,1.303238,-0.758435,-0.224919,-1.315602,-0.797849,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.748259,0.249065,-0.113865,2.331894,-0.890678,0.94206,0.988911,1.343798,0.44579,-0.944463,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [37]:
# Re-attach the label to the processed train and validation frames.
# The labels are assigned by position (as a plain array) because the
# processed frames carry a fresh default index, not the split's row index.
for processed_frame, label_series in ((X_train_df, y_train), (X_test_df, y_test)):
    processed_frame[target_col] = label_series.to_numpy()

In [38]:
# Add id to submission set.
# Assign by position (plain array) rather than by index alignment: the rows of
# df_test_kaggle_df are in the same order as df_test, and positional assignment
# matches how the target is attached to the train/validation frames. An
# index-aligned assignment would silently produce NaN ids if df_test ever had
# a non-default index.
df_test_kaggle_df['id'] = df_test[id].to_numpy()

In [39]:
# Display the submission frame (the rendering is truncated for large frames)
df_test_kaggle_df

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,heart_rate_bin_High,heart_rate_bin_Low,heart_rate_bin_Normal,chol_total_bin_Borderline,chol_total_bin_Desirable,chol_total_bin_High,triglycerides_bin_Borderline,triglycerides_bin_High,triglycerides_bin_Normal,id
0,-0.450247,1.466522,0.668842,-1.148226,-0.223337,0.133389,-0.125571,-0.489199,0.623219,-0.797849,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,700000
1,-1.329525,-1.240080,0.413701,-1.688707,-2.654068,1.435719,0.954300,0.562870,0.356543,-0.211225,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,700001
2,-0.450247,-1.240080,-0.227608,1.124630,-0.223337,0.521222,0.919676,2.118731,-0.370738,-0.651213,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,700002
3,0.410000,0.249065,0.283557,0.920156,0.332482,-0.473985,0.363754,1.343798,-0.186643,0.815226,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,700003
4,2.228956,0.249065,-1.525717,0.920156,0.665434,1.214035,-1.359637,-0.754380,1.319269,0.375416,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,700004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.748259,0.984844,1.819038,0.233260,0.332482,-0.789300,-1.076324,-1.287594,-0.743692,-0.064553,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,999995
299996,-0.017556,0.249065,-1.777073,-0.112411,0.887080,-0.736226,1.299749,1.861035,-0.370738,-0.797849,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,999996
299997,1.083134,-1.240080,2.410560,-0.527874,0.554509,1.214035,-0.265855,-2.366538,1.147251,-1.823292,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,999997
299998,-0.190028,0.984844,0.070209,-0.735354,-0.112114,-2.200988,0.642166,0.823898,0.445790,-1.237596,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,999998


In [40]:
# Write the three processed frames to CSV without the row index
export_targets = (
    (X_train_df, '../data/cleaned/processed_train_bin.csv'),
    (X_test_df, '../data/cleaned/processed_validation_bin.csv'),
    (df_test_kaggle_df, '../data/cleaned/processed_kaggle_test_bin.csv'),
)
for export_frame, csv_path in export_targets:
    export_frame.to_csv(csv_path, index=False)

# Report completion and the shape of every exported frame
print("Export Complete!")
print(f"Train File Shape: {X_train_df.shape}")
print(f"Validation File Shape: {X_test_df.shape}")
print(f"Kaggle Test File Shape: {df_test_kaggle_df.shape}")

Export Complete!
Train File Shape: (560000, 84)
Validation File Shape: (140000, 84)
Kaggle Test File Shape: (300000, 84)
