In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from concurrent.futures import ThreadPoolExecutor
import os
from tqdm import tqdm
from sklearn.impute import KNNImputer
from sklearn.base import clone
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score, cohen_kappa_score
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Constants
# SEED is not referenced by any cell visible in this section; presumably the
# random seed for later modelling steps — TODO confirm.
SEED = 42

In [3]:
# Loading the datasets
# Both paths are relative to the notebook's working directory.
train_file_path = "train_values_impute.csv"
test_file_path = "test_values_impute.csv"

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

# NOTE: `df` is a second name bound to the same object as `train_df`, not a copy;
# any in-place change made through one name is visible through the other.
df = train_df

# DATA EXPLORATION

# Feature Engineering

In [4]:
# Work on copies so the in-place feature engineering below leaves the loaded
# frames (train_df / test_df) untouched.
train_ft, test_ft = train_df.copy(), test_df.copy()

In [5]:
def create_new_max_min_col(df):
    """Add row-wise max/min summary columns for the GS and SR column pairs.

    Mutates ``df`` in place and returns ``None``. Columns are appended in the
    order ``GS_max``, ``GS_min``, ``SR_min``, ``SR_max``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'FGC-FGC_GSND', 'FGC-FGC_GSD', 'FGC-FGC_SRL' and
        'FGC-FGC_SRR'.
    """
    gs_pair = ['FGC-FGC_GSND', 'FGC-FGC_GSD']
    sr_pair = ['FGC-FGC_SRL', 'FGC-FGC_SRR']

    # Select each pair once and derive both summaries from it.
    gs_values = df[gs_pair]
    df['GS_max'] = gs_values.max(axis=1)
    df['GS_min'] = gs_values.min(axis=1)

    sr_values = df[sr_pair]
    df["SR_min"] = sr_values.min(axis=1)
    df["SR_max"] = sr_values.max(axis=1)

In [6]:
# Adds GS_max / GS_min / SR_min / SR_max to train_ft in place (the function returns None).
create_new_max_min_col(train_ft)

In [7]:
# Adds GS_max / GS_min / SR_min / SR_max to test_ft in place (the function returns None).
create_new_max_min_col(test_ft)

In [8]:
# Inclusive upper age bounds of the bins (0, 5], (5, 10], (10, 15], (15, 18], (18, 22].
# Rows whose 'Basic_Demos-Age' is missing, <= 0 or > 22 fall into no bin.
thresholds = [5, 10, 15, 18, 22]

# Module-level lookups: bin index (0..len(thresholds)-1) -> mean of one
# measurement over the training rows whose age falls in that bin.
cu_map = {}
pu_map = {}
tl_map = {}
gs_max_map = {}
gs_min_map = {}
bmr_map = {}
dee_map = {}
sr_min_map = {}
sr_max_map = {}
ffmi_map = {}

# Source column -> the map that receives its per-bin mean.
_bin_mean_targets = {
    'FGC-FGC_CU': cu_map,
    'FGC-FGC_PU': pu_map,
    'FGC-FGC_TL': tl_map,
    'GS_max': gs_max_map,
    'GS_min': gs_min_map,
    'BIA-BIA_BMR': bmr_map,
    'BIA-BIA_DEE': dee_map,
    'SR_min': sr_min_map,
    'SR_max': sr_max_map,
    'BIA-BIA_FFMI': ffmi_map,
}

age = train_ft['Basic_Demos-Age']
prev = 0
for i, curr in enumerate(thresholds):
    # Select the bin's rows once; every column mean below reuses this selection
    # instead of rebuilding the same boolean mask per column.
    rows_in_bin = train_ft[(age > prev) & (age <= curr)]
    for column, target_map in _bin_mean_targets.items():
        # An empty bin yields NaN here, exactly as a direct .mean() would.
        target_map[i] = rows_in_bin[column].mean()

    prev = curr

In [9]:
# Dump every per-age-bin mean map for a visual sanity check.
# The labels are kept exactly as originally written (only the first three carry
# a trailing colon) so the printed text is unchanged.
for label, mapping in (
    ("cu_map: ", cu_map),
    ("pu_map: ", pu_map),
    ("tl_map: ", tl_map),
    ("gs_max_map", gs_max_map),
    ("gs_min_map", gs_min_map),
    ("bmr_map", bmr_map),
    ("dee_map", dee_map),
    ("sr_min_map", sr_min_map),
    ("sr_max_map", sr_max_map),
    ("ffmi_map", ffmi_map),
):
    print(label, mapping)

cu_map:  {0: 1.4894075826009996, 1: 7.476433412668704, 2: 16.73709242262834, 3: 21.407771986194827, 4: 25.30853654685808}
pu_map:  {0: 1.6124985934130887, 1: 4.5169419747697965, 2: 6.784395686334561, 3: 8.478444895075162, 4: 6.821968965779475}
tl_map:  {0: 8.120492018715137, 1: 8.869554898368646, 2: 9.838626506051705, 3: 9.917901720651995, 4: 9.692428580093807}
gs_max_map {0: 3.9099297688947945, 1: 10.55672195840937, 2: 23.292464861164753, 3: 32.78846762533555, 4: 38.36685722870873}
gs_min_map {0: 2.736800716173679, 1: 10.002960767347604, 2: 21.30362495880886, 3: 29.72703462536007, 4: 35.603563385045724}
bmr_map {0: 923.5225434249359, 1: 1044.0911206823437, 2: 1349.4340136900846, 3: 1570.3551673344439, 4: 1600.771724721695}
dee_map {0: 1482.2122961876526, 1: 1727.4929346670194, 2: 2287.120935391911, 3: 2659.7318780880137, 4: 2689.29223414726}
sr_min_map {0: 9.096963629930148, 1: 8.625937676320195, 2: 8.155678517119915, 3: 8.436815249729667, 4: 8.8197600575827}
sr_max_map {0: 9.87557177

In [10]:
def bin_data(train, test, columns, n_bins=10):
    """Replace the given columns with quantile-bin indices, using shared bin edges.

    Bin edges are quantiles of the train and test values pooled together, so
    both frames are cut with identical edges. Both frames are modified in place
    and also returned.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every name in ``columns``.
    columns : iterable of str
        Columns to overwrite with their bin index (as float).
    n_bins : int, default 10
        Requested number of quantile bins. A column ends up with fewer bins
        when duplicate quantile edges are dropped.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``train`` and ``test`` objects that were passed in.
    """
    pooled = pd.concat([train, test], axis=0)

    # Step 1: derive the quantile edges of every column from the pooled values.
    edges_by_column = {
        name: pd.qcut(pooled[name], n_bins, retbins=True, labels=False, duplicates="drop")[1]
        for name in columns
    }

    # Step 2: cut both frames with those edges; labels are 0 .. (number of bins - 1).
    for name, cut_points in edges_by_column.items():
        bin_ids = range(len(cut_points) - 1)
        for frame in (train, test):
            frame[name] = pd.cut(
                frame[name], bins=cut_points, labels=bin_ids, include_lowest=True
            ).astype(float)

    return train, test

In [11]:
def feature_engineering(df):
    """Add age-group-normalised features and the ECW/ICW ratio to ``df`` in place.

    Every ``*_norm`` column is the raw measurement divided by the value the
    matching module-level map (``cu_map``, ``pu_map``, ``tl_map``,
    ``gs_max_map``, ``gs_min_map``, ``sr_max_map``, ``sr_min_map``,
    ``bmr_map``, ``dee_map``, ``ffmi_map``) holds for the row's ``age_group``.
    Rows whose ``age_group`` is not a key of a map get NaN for that feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'age_group', 'FGC-FGC_CU', 'FGC-FGC_PU', 'FGC-FGC_TL',
        'GS_max', 'GS_min', 'SR_max', 'SR_min', 'BIA-BIA_BMR', 'BIA-BIA_DEE',
        'BIA-BIA_FFMI', 'BIA-BIA_ECW' and 'BIA-BIA_ICW'.
        NOTE(review): 'age_group' is not created in the cells shown here; it is
        assumed to hold the same 0-based bin indices the maps are keyed by —
        confirm against the upstream preprocessing.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    age_group = df['age_group']

    df["CU_norm"] = df['FGC-FGC_CU'] / age_group.map(cu_map)
    df["PU_norm"] = df['FGC-FGC_PU'] / age_group.map(pu_map)
    df["TL_norm"] = df['FGC-FGC_TL'] / age_group.map(tl_map)

    df['GS_max_norm'] = df['GS_max'] / age_group.map(gs_max_map)
    df['GS_min_norm'] = df['GS_min'] / age_group.map(gs_min_map)

    # Fix: the SR columns were previously divided by the GS maps (copy-paste
    # slip), leaving sr_max_map / sr_min_map computed but never used.
    df['SR_max_norm'] = df['SR_max'] / age_group.map(sr_max_map)
    df['SR_min_norm'] = df['SR_min'] / age_group.map(sr_min_map)

    df["BMR_norm"] = df["BIA-BIA_BMR"] / age_group.map(bmr_map)
    df["DEE_norm"] = df["BIA-BIA_DEE"] / age_group.map(dee_map)

    df["FFMI_norm"] = df["BIA-BIA_FFMI"] / age_group.map(ffmi_map)

    # Plain column ratio; not age-normalised.
    df["ECW_ICW_ratio"] = df["BIA-BIA_ECW"] / df["BIA-BIA_ICW"]

In [12]:
# Columns that bin_data() overwrites with quantile-bin indices:
# the age-normalised features plus four raw 'Physical-*' measurements.
columns_to_bin = [
    "CU_norm", "PU_norm", "TL_norm",
    "GS_min_norm", "GS_max_norm",
    "SR_min_norm", "SR_max_norm",
    "BMR_norm", "DEE_norm", "FFMI_norm",
    "Physical-HeartRate", "Physical-Waist_Circumference",
    "Physical-Height", "Physical-Weight",
]

In [13]:
# Columns dropped from both feature frames once the derived features exist.
# 'BIA-BIA_BMI' already removed, so no need to add here
columns_to_remove = [
    # Raw FGC measurements.
    'FGC-FGC_CU', 'FGC-FGC_GSND', 'FGC-FGC_GSD', 'FGC-FGC_PU',
    'FGC-FGC_SRL', 'FGC-FGC_SRR', 'FGC-FGC_TL',
    # BIA columns.
    'BIA-BIA_FFM', 'BIA-BIA_FMI', 'BIA-BIA_Frame_num', 'BIA-BIA_LDM',
]

In [14]:
# Adds the *_norm columns and ECW_ICW_ratio to train_ft in place.
feature_engineering(train_ft)

In [15]:
# Adds the *_norm columns and ECW_ICW_ratio to test_ft in place; the lookup maps
# it divides by were computed from train_ft only.
feature_engineering(test_ft)

In [16]:
# Overwrite every column in columns_to_bin with its quantile-bin index; bin_data
# derives the edges from train and test pooled together and mutates both frames.
train_ft, test_ft = bin_data(train_ft, test_ft, columns_to_bin, n_bins=10)

In [17]:
# Drop the columns listed in columns_to_remove. Rebinding the name makes this
# cell non-idempotent: running it a second time raises KeyError because the
# columns are already gone.
train_ft = train_ft.drop(columns=columns_to_remove)

In [18]:
# Same column drop for the test features (raises KeyError if re-run, since the
# columns are then already gone).
test_ft = test_ft.drop(columns=columns_to_remove)

In [19]:
# Sanity check: (rows, columns) of the engineered training frame.
train_ft.shape

(3960, 79)

In [20]:
# Persist the engineered frames next to the notebook, without the index column.
for frame, out_path in ((train_ft, "train_ft.csv"), (test_ft, "test_ft.csv")):
    frame.to_csv(out_path, index=False)