# IMPORT LIBRARIES

In [29]:
import os

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PowerTransformer, FunctionTransformer

# IMPORT DATASET

In [30]:
# Read the raw train and test CSVs; the paths are relative to the notebook's working directory.
# df_train contains the target column; df_test is the set later exported for Kaggle.
df_train = pd.read_csv("../data/raw/train.csv")
df_test = pd.read_csv("../data/raw/test.csv")

# DEFINING COLUMN LIST

In [31]:
# Separate numerical and categorical columns by dtype
numeric_index = df_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df_train.select_dtypes(include=["object", "category"]).columns

# Binary indicator columns (numeric dtype, handled separately from the other numeric columns)
bin_col = ["family_history_diabetes", "hypertension_history", "cardiovascular_history"]

# Columns to remove: features with little to no linear relationship (correlation approx. 0.0).
# NOTE(review): the categorical names in this list are only filtered out of num_cols below;
# cat_cols is left unchanged and still contains them (see the printed output of this cell).
uncorrelated_features = [
    "alcohol_consumption_per_week",
    "screen_time_hours_per_day",
    "heart_rate",
    "cholesterol_total",
    "ldl_cholesterol",
    "gender",
    "ethnicity",
    "education_level",
    "income_level",
    "employment_status",
    "sleep_hours_per_day",
    "diastolic_bp"
]

# Columns to remove because they are highly correlated with another feature
dropped_highly_correlated = ["waist_to_hip_ratio"]

# Target column
target_col = "diagnosed_diabetes"

# Row identifier column.
# NOTE: this name shadows the builtin id(); it is kept because later cells read it.
id = "id"

# Binary columns that are actually present among the numeric columns
existing_bin_cols = [c for c in bin_col if c in numeric_index]

# Everything that must not be treated as a continuous numeric feature:
# the target, the id, the binary indicators and the unwanted features.
excluded = {
    target_col,
    id,
    *existing_bin_cols,
    *uncorrelated_features,
    *dropped_highly_correlated,
}
num_cols = [col for col in numeric_index if col not in excluded]

print(num_cols)

col_df = [num_cols, cat_cols]
print(col_df)

['age', 'physical_activity_minutes_per_week', 'diet_score', 'bmi', 'systolic_bp', 'hdl_cholesterol', 'triglycerides']
[['age', 'physical_activity_minutes_per_week', 'diet_score', 'bmi', 'systolic_bp', 'hdl_cholesterol', 'triglycerides'], Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'smoking_status', 'employment_status'],
      dtype='object')]


# PREPROCESSING

In [32]:
class SafeLogTransformer(BaseEstimator, TransformerMixin):
    """Apply a natural log column by column, shifting columns whose fitted minimum is <= 0.

    During ``fit`` the minimum of every column (ignoring NaN) is inspected.
    A column whose minimum is ``<= 0`` gets the shift ``1 - min`` so that its
    smallest fitted value maps to ``log(1) = 0``; every other column gets a
    shift of 0 and is logged as is.

    Attributes
    ----------
    shifts_ : ndarray of shape (n_features,)
        Per-column offsets added before taking the log. Set by ``fit``.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    """

    def __init__(self):
        # No hyper-parameters. Fitted state (trailing-underscore attributes) is
        # created in fit() only, so an unfitted instance does not look fitted.
        pass

    def fit(self, X, y=None):
        """Learn the per-column shift from ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training data; NaN values are ignored when taking the minimum.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        data = np.asarray(X, dtype=float)
        n_features = data.shape[1]
        col_min = np.nanmin(data, axis=0)
        # Shift only the columns that contain non-positive values.
        self.shifts_ = np.where(col_min <= 0, 1.0 - col_min, 0.0)
        self.n_features_in_ = n_features
        return self

    def transform(self, X):
        """Return ``log(X + shifts_)`` as a new float array; ``X`` is not modified.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric data with the same column layout as the data passed to ``fit``.

        Returns
        -------
        ndarray of shape (n_samples, n_features), dtype float
        """
        # Cast to float first: writing log values back into an integer array
        # would silently truncate them to whole numbers.
        data = np.asarray(X, dtype=float)
        # NOTE(review): values smaller than the minimum seen in fit can still be
        # non-positive after the shift and then produce NaN / -inf here.
        return np.log(data + self.shifts_)

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names, which equal the input feature names.

        The transformer neither renames, adds nor removes columns. When
        ``input_features`` is None, generic names ``x0 .. x{n-1}`` are generated
        from the number of columns seen in ``fit``.
        """
        if input_features is None:
            return np.asarray(
                [f"x{i}" for i in range(self.n_features_in_)], dtype=object
            )
        return np.asarray(input_features, dtype=object)

In [33]:
# Create Transformation instance
# PowerTransformer with default settings; it is used as the "scaler" step of the numeric pipeline.
dt = PowerTransformer()
# One-hot encoder that returns a dense array (sparse_output=False);
# handle_unknown='ignore' means categories not seen during fit do not raise at transform time.
onc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [34]:
# Numerical Features PipeLine
# Step 1 ("log"): shifted log transform. Step 2 ("scaler"): the PowerTransformer instance `dt`.
# NOTE(review): the step is named "scaler" but holds a PowerTransformer; renaming the step
# would change the parameter names exposed by the pipeline, so it is left as is.
num_pipeline = Pipeline(
    steps = [
        ("log", SafeLogTransformer()), 
        ("scaler", dt)
    ]
)


# Categorical Features Pipeline 
# Single step: one-hot encode every categorical column with the encoder instance `onc`.
cat_pipeline = Pipeline(
    steps = [
        ("onehot", onc)
    ]
)

In [35]:
# Combine in ColumnTransformer 
# - 'num': log + power transform on the continuous numeric columns
# - 'bin': binary indicator columns are passed through unchanged
# - 'cat': one-hot encoding of the categorical columns
# Columns not listed in any transformer are dropped (remainder='drop' in the fitted repr).
# verbose_feature_names_out=False keeps output names free of the 'num'/'bin'/'cat' prefix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('bin', 'passthrough', bin_col),
        ('cat', cat_pipeline, cat_cols)
    ],
    verbose_feature_names_out=False
)


In [36]:
# Separate the target from the feature matrix (df_train itself is left untouched)
y = df_train[target_col]
X = df_train.drop(columns=[target_col])

In [37]:
# Train Test Split 
# Hold out 20% of the rows (test_size=0.2) as a validation set. The target is passed as
# stratify=y, and random_state=42 fixes the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Reattach column names explicitly (safe even if not needed)
# NOTE(review): X is already a DataFrame, so the split results presumably are DataFrames too
# and this re-wrap is expected to be a no-op - confirm before removing.
X_train = pd.DataFrame(X_train, columns=X.columns, index=X_train.index)
X_test  = pd.DataFrame(X_test,  columns=X.columns, index=X_test.index)

In [38]:
# Report the shape of every split, in the order: X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(560000, 25)
(140000, 25)
(560000,)
(140000,)


In [39]:
# Fit the preprocessor on the training split only (X_test and df_test are not used here).
# As the last expression of the cell, the fitted estimator is also displayed.
preprocessor.fit(X_train)

0,1,2
,transformers,"[('num', ...), ('bin', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [51]:
# Final Features 
# Raw input columns kept for the preprocessor: 7 continuous numeric columns,
# 3 binary indicators and 6 categorical columns (16 in total).
feature_cols = [
    'age',
    'physical_activity_minutes_per_week',
    'diet_score',
    'bmi',
    'systolic_bp',
    'hdl_cholesterol',
    'triglycerides',
    'family_history_diabetes',
    'hypertension_history',
    'cardiovascular_history',
    'smoking_status',
    'ethnicity', 
    'education_level', 
    'income_level', 
    'employment_status', 
    'gender'
]
feature_cols

['age',
 'physical_activity_minutes_per_week',
 'diet_score',
 'bmi',
 'systolic_bp',
 'hdl_cholesterol',
 'triglycerides',
 'family_history_diabetes',
 'hypertension_history',
 'cardiovascular_history',
 'smoking_status',
 'ethnicity',
 'education_level',
 'income_level',
 'employment_status',
 'gender']

In [52]:
# Sanity check: names in feature_cols that X_train does not have (expected: empty set)
missing_cols = set(feature_cols).difference(X_train.columns)
missing_cols

set()

In [53]:
# Select the final feature columns from each dataset (no transformation happens in this cell).
# .copy() gives independent frames, so later changes do not touch X_train / X_test / df_test.
X_train_filtered = X_train[feature_cols].copy()
X_test_filtered = X_test[feature_cols].copy()
df_test_filtered = df_test[feature_cols].copy() 

In [54]:
# Apply the already-fitted preprocessor to the train split, the validation split and the Kaggle test set.
# Only transform() is called here, so nothing is re-fitted on validation or test data.
X_train_processed = preprocessor.transform(X_train_filtered)
X_test_processed = preprocessor.transform(X_test_filtered)
df_testprocessed = preprocessor.transform(df_test_filtered)

In [55]:
# Inspect the transformed validation array
X_test_processed

array([[ 1.90567564, -0.38007471, -1.01104766, ...,  0.        ,
         0.        ,  0.        ],
       [-0.7998998 ,  1.05174055, -1.07970667, ...,  0.        ,
         0.        ,  0.        ],
       [-1.77442834,  0.07020889,  0.02597225, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-1.32952472, -0.92905669,  1.39591047, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.99973172,  1.08175335,  0.30225691, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.15409251,  1.88964703,  2.26577262, ...,  0.        ,
         0.        ,  0.        ]], shape=(140000, 34))

In [56]:
# Inspect the validation target; it keeps the original row index of df_train
y_test

7760      1.0
594954    0.0
480236    0.0
26944     1.0
616593    1.0
         ... 
113871    1.0
83546     1.0
95006     1.0
298122    1.0
368348    1.0
Name: diagnosed_diabetes, Length: 140000, dtype: float64

In [57]:
# Get the feature names generated by the preprocessor.
# These are the output column names (numeric, binary, then one-hot columns) and are used
# below as DataFrame headers for the transformed arrays.
feature_names = preprocessor.get_feature_names_out()

In [58]:
# Convert the Processed Arrays back to DataFrames
# No index is passed, so each frame gets a fresh 0..n-1 index;
# the original row index of the splits is not carried over.
X_train_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_processed, columns=feature_names)
df_test_kaggle_df = pd.DataFrame(df_testprocessed, columns=feature_names)

In [59]:
# Preview the first rows of the processed training frame
X_train_df.head()

Unnamed: 0,age,physical_activity_minutes_per_week,diet_score,bmi,systolic_bp,hdl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,...,income_level_Lower-Middle,income_level_Middle,income_level_Upper-Middle,smoking_status_Current,smoking_status_Former,smoking_status_Never,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed
0,-0.363309,0.215037,0.371196,0.642166,0.086625,0.626492,0.617588,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.999732,-0.169888,-0.389397,0.850384,0.086625,1.233406,1.843836,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,-1.596189,0.095118,-1.555227,0.119421,0.711409,-1.188345,0.140157,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.832294,0.044972,0.23326,-0.758435,-1.315602,-1.188345,-0.061848,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.748259,-0.113865,2.331894,0.988911,0.44579,0.019817,0.459576,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [60]:
# Add the Target Column back to Train and Validation sets
# .values attaches the target by position: the processed frames have a fresh 0..n-1 index,
# while y_train / y_test keep the original row index, so index alignment must be avoided.
# NOTE: this mutates the frames in place; from here on X_train_df / X_test_df also hold the target.
X_train_df[target_col] = y_train.values
X_test_df[target_col] = y_test.values 

In [61]:
# Add id to submission set.
# The rows of df_test_kaggle_df are in the same order as df_test, so the ids are copied
# by position (.values) instead of by index alignment - the same way the target is
# attached to the train/validation frames. This stays correct even if the two
# indexes ever differ.
df_test_kaggle_df['id'] = df_test[id].values

In [62]:
# Inspect the processed Kaggle test frame, including the re-attached id column
df_test_kaggle_df

Unnamed: 0,age,physical_activity_minutes_per_week,diet_score,bmi,systolic_bp,hdl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,...,income_level_Middle,income_level_Upper-Middle,smoking_status_Current,smoking_status_Former,smoking_status_Never,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed,id
0,-0.450247,0.668842,-1.148226,-0.125571,0.623219,0.141088,-0.471546,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,700000
1,-1.329525,0.413701,-1.688707,0.954300,0.356543,-0.827104,0.891471,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,700001
2,-0.450247,-0.227608,1.124630,0.919676,-0.370738,-1.308422,2.360843,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,700002
3,0.410000,0.283557,0.920156,0.363754,-0.186643,0.626492,0.220444,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,700003
4,2.228956,-1.525717,0.920156,-1.359637,1.319269,0.626492,0.419899,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,700004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.748259,1.819038,0.233260,-1.076324,-0.743692,0.019817,0.140157,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,999995
299996,-0.017556,-1.777073,-0.112411,1.299749,-0.370738,-1.308422,-0.430227,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,999996
299997,1.083134,2.410560,-0.527874,-0.265855,1.147251,-2.494544,0.499184,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,999997
299998,-0.190028,0.070209,-0.735354,0.642166,0.445790,-0.464671,0.617588,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,999998


In [63]:
# Make sure the output directory exists so the exports below do not fail with an
# OSError after the whole preprocessing run has already completed.
os.makedirs('../data/cleaned', exist_ok=True)

# Write the processed train, validation and Kaggle test frames.
# index=False keeps the positional row index out of the files.
X_train_df.to_csv('../data/cleaned/processed_train_remove.csv', index=False)
X_test_df.to_csv('../data/cleaned/processed_validation_remove.csv', index=False)
df_test_kaggle_df.to_csv('../data/cleaned/processed_kaggle_test_remove.csv', index=False)

print("Export Complete!")
print(f"Train File Shape: {X_train_df.shape}")
print(f"Validation File Shape: {X_test_df.shape}")
print(f"Kaggle Test File Shape: {df_test_kaggle_df.shape}")

Export Complete!
Train File Shape: (560000, 35)
Validation File Shape: (140000, 35)
Kaggle Test File Shape: (300000, 35)
