In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from concurrent.futures import ThreadPoolExecutor
import os
from tqdm import tqdm
from sklearn.impute import KNNImputer
from sklearn.base import clone
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score, cohen_kappa_score
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [23]:
# Constants
# Random seed; among the visible cells it is passed to perform_pca as random_state.
SEED = 42

In [35]:
# Loading the datasets
# Paths are relative to the notebook's working directory.
train_file_path, test_file_path = "train.csv", "test.csv"
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)
# `df` is a second name bound to the very same object as train_df (no copy is made).
df = train_df

# DATA EXPLORATION

# Feature Engineering

In [111]:
# Work on independent copies so the frames read from CSV are not modified
# by the in-place feature helpers that follow.
train_ft, test_ft = train_df.copy(), test_df.copy()

In [112]:
def create_new_max_min_col(df):
    """Add row-wise max/min columns for two pairs of FGC measurements, in place.

    For the pair ('FGC-FGC_GSND', 'FGC-FGC_GSD') this writes 'GS_max' and
    'GS_min'; for the pair ('FGC-FGC_SRL', 'FGC-FGC_SRR') it writes 'SR_min'
    and 'SR_max'. The frame is mutated and nothing is returned.

    Args:
        df: DataFrame containing the four source columns named above.
    """
    # Take each pair once, then derive both extremes from it
    gs_pair = df[['FGC-FGC_GSND', 'FGC-FGC_GSD']]
    df['GS_max'] = gs_pair.max(axis=1)
    df['GS_min'] = gs_pair.min(axis=1)

    sr_pair = df[['FGC-FGC_SRL', 'FGC-FGC_SRR']]
    df["SR_min"] = sr_pair.min(axis=1)
    df["SR_max"] = sr_pair.max(axis=1)

In [113]:
# Adds GS_max / GS_min / SR_min / SR_max to the training feature frame (mutated in place).
create_new_max_min_col(train_ft)

In [114]:
# Adds GS_max / GS_min / SR_min / SR_max to the test feature frame (mutated in place).
create_new_max_min_col(test_ft)

In [115]:
# Per-age-bin column means, computed on the training frame only.
# Every map is keyed by the bin index i; bin i holds the rows whose
# 'Basic_Demos-Age' lies in (previous threshold, thresholds[i]], with the
# first bin starting strictly above 0.
# NOTE(review): `thresholds` is not defined in any visible cell — confirm it is
# created upstream before this cell runs.
cu_map = {}
pu_map = {}
tl_map = {}
gs_max_map = {}
gs_min_map = {}
bmr_map = {}
dee_map = {}
sr_min_map = {}
sr_max_map = {}
ffmi_map = {}

# Source column -> map that receives that column's mean for each age bin
age_mean_targets = [
    ('FGC-FGC_CU', cu_map),
    ('FGC-FGC_PU', pu_map),
    ('FGC-FGC_TL', tl_map),
    ('GS_max', gs_max_map),
    ('GS_min', gs_min_map),
    ('BIA-BIA_BMR', bmr_map),
    ('BIA-BIA_DEE', dee_map),
    ('SR_min', sr_min_map),
    ('SR_max', sr_max_map),
    ('BIA-BIA_FFMI', ffmi_map),
]

ages = train_ft['Basic_Demos-Age']

prev = 0
for i in range(len(thresholds)):
    curr = thresholds[i]
    # Select the bin's rows once; every column mean below reuses the same selection
    in_bin = train_ft[(ages > prev) & (ages <= curr)]
    for source_col, target_map in age_mean_targets:
        # .mean() skips NaN; a bin with no observed values yields NaN
        target_map[i] = in_bin[source_col].mean()

    prev = curr

In [116]:
# Dump every per-age-bin mean map. The label strings are reproduced exactly:
# only the first three end in ": ", the rest are the bare map name.
for label, mapping in (
    ("cu_map: ", cu_map),
    ("pu_map: ", pu_map),
    ("tl_map: ", tl_map),
    ("gs_max_map", gs_max_map),
    ("gs_min_map", gs_min_map),
    ("bmr_map", bmr_map),
    ("dee_map", dee_map),
    ("sr_min_map", sr_min_map),
    ("sr_max_map", sr_max_map),
    ("ffmi_map", ffmi_map),
):
    print(label, mapping)

cu_map:  {0: 1.4891304347826086, 1: 7.3164362519201225, 2: 17.046195652173914, 3: 20.4, 4: 23.142857142857142}
pu_map:  {0: 0.7065217391304348, 1: 4.097842835130971, 2: 7.517808219178082, 3: 10.437158469945356, 4: 15.285714285714286}
tl_map:  {0: 7.9021739130434785, 1: 8.714543361473522, 2: 10.135054347826086, 3: 10.193548387096774, 4: 9.428571428571429}
gs_max_map {0: nan, 1: 16.049668874172184, 2: 23.792010943912448, 3: 32.56924731182796, 4: 40.642857142857146}
gs_min_map {0: nan, 1: 14.086026490066224, 2: 21.14311901504788, 3: 28.671505376344083, 4: 34.800000000000004}
bmr_map {0: 934.2936000000002, 1: 1044.4710360036004, 2: 1351.50534318555, 3: 1562.4434319526626, 4: 1615.6395}
dee_map {0: 1471.1115, 1: 1733.4984878487849, 2: 2303.2957471264367, 3: 2621.1362721893493, 4: 2722.585}
sr_min_map {0: 9.256043956043955, 1: 8.503477588871716, 2: 7.748975409836065, 3: 8.251612903225807, 4: 8.25}
sr_max_map {0: 10.187912087912089, 1: 9.44308346213292, 2: 8.690437158469946, 3: 9.350537634408

In [117]:
def bin_data(train, test, columns, n_bins=10):
    """Replace numeric columns with quantile-bin indices shared by train and test.

    Bin edges are quantiles of the pooled train+test values, so both frames
    are discretised with identical boundaries. Duplicate quantile edges are
    dropped, which means a column can end up with fewer than `n_bins` bins.

    Args:
        train: Training DataFrame; the listed columns are overwritten in place.
        test: Test DataFrame; the listed columns are overwritten in place.
        columns: Names of the columns to discretise.
        n_bins: Requested number of quantile bins per column.

    Returns:
        The same `train` and `test` objects that were passed in, with each
        listed column holding the bin index as a float (NaN where the input
        was NaN).
    """
    pooled = pd.concat([train, test], axis=0)

    # First pass: derive the edges for every column from the pooled data
    bin_edges = {
        col: pd.qcut(pooled[col], n_bins, retbins=True, labels=False, duplicates="drop")[1]
        for col in columns
    }

    def _to_bin_index(values, edges):
        # One integer label per interval; include_lowest keeps the minimum in bin 0
        bin_ids = range(len(edges) - 1)
        return pd.cut(values, bins=edges, labels=bin_ids, include_lowest=True).astype(float)

    # Second pass: discretise both frames with the shared edges
    for col, edges in bin_edges.items():
        train[col] = _to_bin_index(train[col], edges)
        test[col] = _to_bin_index(test[col], edges)

    return train, test

In [118]:
def feature_engineering(df):
    """Add age-normalised measurement columns and the ECW/ICW ratio, in place.

    Each `*_norm` column is the raw value divided by the entry of the matching
    module-level map (cu_map, pu_map, ...) looked up through the row's
    'age_group'. Rows whose age group is missing from a map, or whose map
    entry is NaN, get NaN.

    Fix: 'SR_max_norm' and 'SR_min_norm' are now divided by sr_max_map /
    sr_min_map. They were previously divided by the grip-strength maps
    (gs_max_map / gs_min_map), which left the SR maps unused.

    NOTE(review): no visible cell creates the 'age_group' column — confirm it
    is added upstream before this function is called.

    Args:
        df: DataFrame with 'age_group', the raw measurement columns listed in
            the table below, and 'BIA-BIA_ECW' / 'BIA-BIA_ICW'. Mutated in place.

    Returns:
        None.
    """
    age_group = df['age_group']

    # (new column, raw column, per-age-group mean map); order fixes column order
    normalisations = [
        ("CU_norm", 'FGC-FGC_CU', cu_map),
        ("PU_norm", 'FGC-FGC_PU', pu_map),
        ("TL_norm", 'FGC-FGC_TL', tl_map),
        ('GS_max_norm', 'GS_max', gs_max_map),
        ('GS_min_norm', 'GS_min', gs_min_map),
        ('SR_max_norm', 'SR_max', sr_max_map),
        ('SR_min_norm', 'SR_min', sr_min_map),
        ("BMR_norm", "BIA-BIA_BMR", bmr_map),
        ("DEE_norm", "BIA-BIA_DEE", dee_map),
        ("FFMI_norm", "BIA-BIA_FFMI", ffmi_map),
    ]
    for new_col, raw_col, group_means in normalisations:
        df[new_col] = df[raw_col] / age_group.map(group_means)

    df["ECW_ICW_ratio"] = df["BIA-BIA_ECW"] / df["BIA-BIA_ICW"]

In [119]:
# Columns handed to bin_data(): the age-normalised features plus four raw
# Physical-* measurements.
columns_to_bin = [
    "CU_norm",
    "PU_norm",
    "TL_norm",
    "GS_min_norm",
    "GS_max_norm",
    "SR_min_norm",
    "SR_max_norm",
    "BMR_norm",
    "DEE_norm",
    "FFMI_norm",
    "Physical-HeartRate",
    "Physical-Waist_Circumference",
    "Physical-Height",
    "Physical-Weight",
]

In [None]:
# 'BIA-BIA_BMI' already removed, so no need to add here
columns_to_remove = [
    # FGC-FGC_* columns (values and their *_Zone companions)
    'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
    'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone',
    'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
    'FGC-FGC_PU', 'FGC-FGC_PU_Zone',
    'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR_Zone', 'FGC-FGC_SRR',
    'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    # BIA-BIA_* columns
    'BIA-BIA_FFM',
    'BIA-BIA_FMI',
    'BIA-BIA_Frame_num',
    'BIA-BIA_LDM',
]

In [121]:
# Adds the *_norm columns and ECW_ICW_ratio to the training feature frame (mutated in place).
feature_engineering(train_ft)

In [122]:
# Adds the *_norm columns and ECW_ICW_ratio to the test feature frame (mutated in place).
feature_engineering(test_ft)

In [123]:
# Overwrite every column in columns_to_bin with its quantile-bin index;
# bin edges come from train and test pooled together inside bin_data.
train_ft, test_ft = bin_data(train_ft, test_ft, columns_to_bin, n_bins=10)

In [124]:
# Discard the columns listed in columns_to_remove from the training features.
train_ft = train_ft.drop(columns=columns_to_remove)

In [128]:
# Discard the columns listed in columns_to_remove from the test features.
test_ft = test_ft.drop(columns=columns_to_remove)

# ACTIGRAPHY

In [125]:
def time_features(df):
    """Summarise one recording as a flat list of 74 statistics.

    Layout of the returned list:
      [0]  mean of 'non-wear_flag'
      [1]  sum of 'enmo' values that are >= 0.05
      [2:] for each signal in (enmo, anglez, light, battery_voltage) and each
           selection in (all rows, night, day): mean, std, max, min,
           mean of first differences, std of first differences.

    Night is hour >= 21 or hour <= 5; day is hour 6 through 20.

    Side effect: an 'hours' column is added to `df`.

    Args:
        df: DataFrame with 'time_of_day', 'non-wear_flag', 'enmo', 'anglez',
            'light' and 'battery_voltage' columns.

    Returns:
        list of 2 + 4 * 3 * 6 = 74 values.
    """
    # assumes time_of_day is nanoseconds since midnight — TODO confirm
    ns_per_hour = 3_600 * 1_000_000_000
    df["hours"] = df["time_of_day"] // ns_per_hour
    hours = df["hours"]
    enmo = df["enmo"]

    # The 0.05 cut-off filters out low level noise before summing
    features = [df["non-wear_flag"].mean(), enmo[enmo >= 0.05].sum()]

    # Row selections: everything, night-time only, day-time only
    everything = np.ones(len(df), dtype=bool)
    is_night = (hours >= 21) | (hours <= 5)
    is_day = (hours <= 20) & (hours >= 6)

    for signal in ("enmo", "anglez", "light", "battery_voltage"):
        for selector in (everything, is_night, is_day):
            values = df.loc[selector, signal]
            # First differences between consecutive selected rows
            deltas = values.diff()
            features += [
                values.mean(),
                values.std(),
                values.max(),
                values.min(),
                deltas.mean(),
                deltas.std(),
            ]

    return features

def process_file(filename, dirname):
    """Read one recording and reduce it to its summary statistics.

    Reads `<dirname>/<filename>/part-0.parquet`, discards the 'step' column
    and passes the rest to time_features.

    Args:
        filename: Directory entry containing '='; the text after the first
            '=' (up to any further '=') is returned as the identifier.
        dirname: Directory that contains `filename`.

    Returns:
        Tuple of (feature list from time_features, identifier string).
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    recording = pd.read_parquet(parquet_path).drop(columns='step')
    return time_features(recording), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry found in `dirname`.

    Every directory entry is handed to process_file on a thread pool, with a
    tqdm progress bar over the results.

    Args:
        dirname: Directory whose entries are each processed by process_file.

    Returns:
        DataFrame with columns stat_0 .. stat_{k-1} (k = number of statistics
        returned for the first entry) plus an 'id' column, one row per entry
        in os.listdir order.
    """
    entries = os.listdir(dirname)

    def _summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        # executor.map preserves input order, so rows line up with `entries`
        results = list(tqdm(pool.map(_summarise, entries), total=len(entries)))

    # Split [(stats, id), ...] into parallel tuples
    stats, indexes = zip(*results)

    stat_names = [f"stat_{k}" for k in range(len(stats[0]))]
    summary = pd.DataFrame(stats, columns=stat_names)
    summary['id'] = indexes

    return summary

In [126]:
# One row of summary statistics per entry under series_train.parquet, plus an 'id' column.
train_ts = load_time_series("series_train.parquet")


100%|██████████| 996/996 [00:45<00:00, 22.07it/s]


In [127]:
# One row of summary statistics per entry under series_test.parquet, plus an 'id' column.
test_ts = load_time_series("series_test.parquet")

100%|██████████| 2/2 [00:00<00:00, 14.38it/s]


In [129]:
# Principal Component Analysis
def perform_pca(train, test, n_components=None, random_state=42):
    """Fit PCA on `train`, project both sets, and return labelled frames.

    The explained-variance ratios of the fitted components and their sum are
    printed as a side effect.

    Args:
        train: Data the PCA is fitted on and then projected.
        test: Data projected with the PCA fitted on `train`.
        n_components: Forwarded to PCA.
        random_state: Forwarded to PCA.

    Returns:
        Tuple (train_frame, test_frame, fitted_pca). Both frames have columns
        PC_1 .. PC_k and a default integer index.
    """
    def _as_pc_frame(projected):
        # Name the columns PC_1 .. PC_k, k being the projected width
        names = [f'PC_{k}' for k in range(1, projected.shape[1] + 1)]
        return pd.DataFrame(projected, columns=names)

    pca = PCA(n_components=n_components, random_state=random_state)
    train_scores = pca.fit_transform(train)
    test_scores = pca.transform(test)

    ratios = pca.explained_variance_ratio_
    print(f"Explained variance ratio of the components:\n {ratios}")
    print(np.sum(ratios))

    return _as_pc_frame(train_scores), _as_pc_frame(test_scores), pca

In [130]:
# Processing the time series data and merging with the main dataset
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

# Standardise with statistics fitted on the training series only
scaler = StandardScaler()
df_train = pd.DataFrame(scaler.fit_transform(df_train), columns=df_train.columns)
df_test = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns)

# Impute missing statistics with the training column means.
# This replaces `df[c].fillna(m, inplace=True)`: calling an inplace method on
# a column selection is chained assignment, which raises a FutureWarning on
# pandas 2.2 and is a no-op under Copy-on-Write, so NaNs would reach PCA.
train_col_means = df_train.mean()
df_train = df_train.fillna(train_col_means)
df_test = df_test.fillna(train_col_means)

print(df_train.shape)

df_train_pca, df_test_pca, pca = perform_pca(df_train, df_test, n_components=15, random_state=SEED)

# Re-attach the ids; the PCA frames keep the row order of train_ts / test_ts
df_train_pca['id'] = train_ts['id']
df_test_pca['id'] = test_ts['id']

# Left merge keeps every tabular row; rows with no matching series get NaN components
train = pd.merge(train_ft, df_train_pca, how="left", on='id')
test = pd.merge(test_ft, df_test_pca, how="left", on='id')
train.shape

(996, 74)
Explained variance ratio of the components:
 [0.2297803  0.11595334 0.07236788 0.06201337 0.05346817 0.04800129
 0.0408818  0.03782128 0.03380817 0.02967952 0.02575016 0.0241788
 0.02183349 0.02151956 0.01905743]
0.8361145330748827


(3960, 81)

In [None]:
# Remove the 'age_group_label' column from the merged training frame.
# NOTE(review): only `train` is changed here; `test` is left as it is — confirm that is intended.
train = train.drop(columns=['age_group_label'])

In [1]:
# Write the merged frames to CSV without the index column.
# Needs `train` and `test` from the merge cell; the NameError recorded under
# this cell is what happens when it runs in a kernel where they do not exist.
train.to_csv("train_values_merge.csv", index=False)
test.to_csv("test_values_merge.csv", index=False)

NameError: name 'train' is not defined