# IMPORT LIB

In [45]:
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PowerTransformer, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

# IMPORT DATASET

In [46]:
df_train = pd.read_csv("../data/raw/train.csv")
df_test = pd.read_csv("../data/raw/test.csv")

# DEFINING COLUMN LIST

In [47]:
def create_new_features(df):
    # 1. Lipid Ratios (Adding 1e-5 avoids division by zero)
    df['tg_hdl_ratio'] = df['triglycerides'] / (df['hdl_cholesterol'] + 1e-5)
    df['non_hdl_c'] = df['cholesterol_total'] - df['hdl_cholesterol']
    df['tc_hdl_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1e-5)

    # 2. Interactions
    df['bmi_age'] = df['bmi'] * df['age']
    # Higher is worse: High BMI divided by low activity
    df['bmi_activity'] = df['bmi'] / (df['physical_activity_minutes_per_week'] + 1) 
    df['bp_heart_product'] = df['systolic_bp'] * df['heart_rate']

    # 3. Comorbidity Count (Summing the binary flags)
    # Ensure these are numeric (0/1)
    history_cols = ['family_history_diabetes', 'hypertension_history', 'cardiovascular_history']
    df['risk_factor_count'] = df[history_cols].sum(axis=1)

    # 4. Log Transformations (for skewed data)
    df['log_triglycerides'] = np.log1p(df['triglycerides'])
    
    return df

# --- Apply to ALL Dataframes ---
df_train = create_new_features(df_train)
df_test = create_new_features(df_test)

# Check the new shape
print(f"New Train Shape: {df_train.shape}")
print(df_train[['tg_hdl_ratio', 'risk_factor_count']].head())

New Train Shape: (700000, 34)
   tg_hdl_ratio  risk_factor_count
0      1.758620                  0
1      2.480000                  0
2      1.830508                  0
3      2.277777                  1
4      2.530612                  1


In [48]:
# Seperate num and cat cols

# Num Cols
num_cols = df_train.select_dtypes(include=["int64", "float64"]).columns
# Cat Cols
cat_cols = df_train.select_dtypes(include=["object", "category"]).columns

# Binary Cols
bin_col = ["family_history_diabetes", "hypertension_history", "cardiovascular_history"]

# Cols to remove
# Features with little to no linear relationship (Correlation approx 0.0)
uncorrelated_features = [
    "alcohol_consumption_per_week",
    "sleep_hours_per_day",
    "screen_time_hours_per_day",
    "diastolic_bp"  # surprisingly weak correlation in this specific dataset
]

dropped_highly_correlated = [
    "ldl_cholesterol",
    "waist_to_hip_ratio"
]

# Target Col
target_col = "diagnosed_diabetes"

# Drop target Col from num col
if target_col in num_cols:
    num_cols = num_cols.drop(target_col)

# Id
id = "id"

# Drop Id col from num col
if id in num_cols:
    num_cols = num_cols.drop(id)

# Remove Binary Cols
existing_bin_cols = [c for c in bin_col if c in num_cols]
num_cols = num_cols.drop(existing_bin_cols)

# Remove unwanted features
num_cols = [col for col in num_cols if col not in uncorrelated_features + dropped_highly_correlated]

print(num_cols)

col_df = [num_cols, cat_cols]
print(col_df)

['age', 'physical_activity_minutes_per_week', 'diet_score', 'bmi', 'systolic_bp', 'hdl_cholesterol', 'triglycerides', 'tg_hdl_ratio', 'non_hdl_c', 'tc_hdl_ratio', 'bmi_age', 'bmi_activity', 'bp_heart_product', 'risk_factor_count', 'log_triglycerides']
[['age', 'physical_activity_minutes_per_week', 'diet_score', 'bmi', 'systolic_bp', 'hdl_cholesterol', 'triglycerides', 'tg_hdl_ratio', 'non_hdl_c', 'tc_hdl_ratio', 'bmi_age', 'bmi_activity', 'bp_heart_product', 'risk_factor_count', 'log_triglycerides'], Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'smoking_status', 'employment_status'],
      dtype='object')]


# PREPROCESSING

In [49]:
class SafeLogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.shifts_ = []

    def fit(self, X, y=None):
        X = np.array(X)
        self.shifts_ = []
        for i in range(X.shape[1]):
            min_val = np.nanmin(X[:, i])
            shift = 1 - min_val if min_val <= 0 else 0
            self.shifts_.append(shift)
        return self

    def transform(self, X):
        X = np.array(X, copy=True)
        for i in range(X.shape[1]):
            X[:, i] = np.log(X[:, i] + self.shifts_[i])
        return X

    # --- NEW METHOD ADDED HERE ---
    def get_feature_names_out(self, input_features=None):
        # Since this transformer doesn't change column names or count,
        # we simply return the input feature names as they are.
        return np.array(input_features)

In [50]:
# Create Transformation instance
dt = PowerTransformer()
onc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [51]:
# Numerical Features PipeLine
num_pipeline = Pipeline(
    steps = [
        ("log", SafeLogTransformer()), 
        ("scaler", dt)
    ]
)


# Categorical Features Pipeline 
cat_pipeline = Pipeline(
    steps = [
        ("onehot", onc)
    ]
)

In [52]:
# Combine in ColumnTransformer 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('bin', 'passthrough', bin_col),
        ('cat', cat_pipeline, cat_cols)
    ],
    verbose_feature_names_out=False
)


In [53]:
# X and y split 
X = df_train.drop(columns=target_col, axis=1)
y = df_train[target_col]

In [54]:
# Train Test Split 
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Reattach column names explicitly (safe even if not needed)
X_train = pd.DataFrame(X_train, columns=X.columns, index=X_train.index)
X_test  = pd.DataFrame(X_test,  columns=X.columns, index=X_test.index)

In [55]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape) 
print(y_test.shape)

(560000, 33)
(140000, 33)
(560000,)
(140000,)


In [56]:
# Fit prpreprocessor
preprocessor.fit(X_train)

0,1,2
,transformers,"[('num', ...), ('bin', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [57]:
# Final Features 
feature_cols = [
 'age',
 'physical_activity_minutes_per_week',
 'diet_score',
 'bmi',
 'systolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'triglycerides',
 'family_history_diabetes',
 'hypertension_history',
 'cardiovascular_history',
 'gender',
 'ethnicity',
 'education_level',
 'income_level',
 'smoking_status',
 'employment_status'
]
feature_cols

['age',
 'physical_activity_minutes_per_week',
 'diet_score',
 'bmi',
 'systolic_bp',
 'hdl_cholesterol',
 'triglycerides',
 'family_history_diabetes',
 'hypertension_history',
 'cardiovascular_history',
 'smoking_status',
 'tg_hdl_ratio',
 'non_hdl_c',
 'tc_hdl_ratio',
 'bmi_age',
 'bmi_activity',
 'bp_heart_product',
 'risk_factor_count',
 'log_triglycerides']

In [58]:
# Transform
X_train_filtered = X_train[feature_cols].copy()
X_test_filtered = X_test[feature_cols].copy()
df_test_filtered = df_test[feature_cols].copy() 

In [59]:
X_train_processed = preprocessor.transform(X_train_filtered)
X_test_processed = preprocessor.transform(X_test_filtered)
df_testprocessed = preprocessor.transform(df_test_filtered)

ValueError: columns are missing: {'ethnicity', 'gender', 'employment_status', 'education_level', 'income_level'}

In [60]:
X_test_processed

array([[ 1.90567564, -0.38007471, -1.01104766, ...,  0.        ,
         0.        ,  0.        ],
       [-0.7998998 ,  1.05174055, -1.07970667, ...,  0.        ,
         0.        ,  0.        ],
       [-1.77442834,  0.07020889,  0.02597225, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-1.32952472, -0.92905669,  1.39591047, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.99973172,  1.08175335,  0.30225691, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.15409251,  1.88964703,  2.26577262, ...,  0.        ,
         0.        ,  0.        ]], shape=(140000, 44))

In [None]:
y_test

In [None]:
# Get the feature names generated by the preprocessor
feature_names = preprocessor.get_feature_names_out()

In [None]:
# Convert the Processed Arrays back to DataFrames
X_train_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_processed, columns=feature_names)
df_test_kaggle_df = pd.DataFrame(df_testprocessed, columns=feature_names)

In [None]:
X_train_df.head()

In [None]:
# Add the Target Column back to Train and Validation sets
X_train_df[target_col] = y_train.values
X_test_df[target_col] = y_test.values 

In [None]:
# Add id to submission set
df_test_kaggle_df['id'] = df_test[id]

In [None]:
df_test_kaggle_df

In [26]:
X_train_df.to_csv('../data/cleaned/processed_train_added.csv', index=False)
X_test_df.to_csv('../data/cleaned/processed_validation_added.csv', index=False)
df_test_kaggle_df.to_csv('../data/cleaned/processed_kaggle_test_added.csv', index=False)

print("Export Complete!")
print(f"Train File Shape: {X_train_df.shape}")
print(f"Validation File Shape: {X_test_df.shape}")
print(f"Kaggle Test File Shape: {df_test_kaggle_df.shape}")

Export Complete!
Train File Shape: (560000, 45)
Validation File Shape: (140000, 45)
Kaggle Test File Shape: (300000, 45)
