# LightGBM-DeepAutoEncoder Code
This notebook trains the LightGBM model that was used for the submission. It was originally run on Google Colaboratory and later transferred into this notebook, so some parts may be missing.

In [None]:
# Name of the label column; rows without it are dropped before training.
TARGET_COL = "sii"

# Columns of train.csv used as model inputs. The list is extended later in the
# notebook with engineered features and time-series features, and its order
# defines the column order of the training matrix.
FEATURE_COLS = [
    # Basic_Demos-* columns
    "Basic_Demos-Enroll_Season",
    "Basic_Demos-Age",
    "Basic_Demos-Sex",
    # CGAS-* columns
    "CGAS-Season",
    "CGAS-CGAS_Score",
    # Physical-* columns
    "Physical-Season",
    "Physical-BMI",
    "Physical-Height",
    "Physical-Weight",
    "Physical-Waist_Circumference",
    "Physical-Diastolic_BP",
    "Physical-HeartRate",
    "Physical-Systolic_BP",
    # Fitness_Endurance-* columns
    "Fitness_Endurance-Season",
    "Fitness_Endurance-Max_Stage",
    "Fitness_Endurance-Time_Mins",
    "Fitness_Endurance-Time_Sec",
    # FGC-* columns
    "FGC-Season",
    "FGC-FGC_CU",
    "FGC-FGC_CU_Zone",
    "FGC-FGC_GSND",
    "FGC-FGC_GSND_Zone",
    "FGC-FGC_GSD",
    "FGC-FGC_GSD_Zone",
    "FGC-FGC_PU",
    "FGC-FGC_PU_Zone",
    "FGC-FGC_SRL",
    "FGC-FGC_SRL_Zone",
    "FGC-FGC_SRR",
    "FGC-FGC_SRR_Zone",
    "FGC-FGC_TL",
    "FGC-FGC_TL_Zone",
    # BIA-* columns
    "BIA-Season",
    "BIA-BIA_Activity_Level_num",
    "BIA-BIA_BMC",
    "BIA-BIA_BMI",
    "BIA-BIA_BMR",
    "BIA-BIA_DEE",
    "BIA-BIA_ECW",
    "BIA-BIA_FFM",
    "BIA-BIA_FFMI",
    "BIA-BIA_FMI",
    "BIA-BIA_Fat",
    "BIA-BIA_Frame_num",
    "BIA-BIA_ICW",
    "BIA-BIA_LDM",
    "BIA-BIA_LST",
    "BIA-BIA_SMM",
    "BIA-BIA_TBW",
    # PAQ_A-* / PAQ_C-* columns
    "PAQ_A-Season",
    "PAQ_A-PAQ_A_Total",
    "PAQ_C-Season",
    "PAQ_C-PAQ_C_Total",
    # SDS-* columns
    "SDS-Season",
    "SDS-SDS_Total_Raw",
    "SDS-SDS_Total_T",
    # PreInt_EduHx-* columns
    "PreInt_EduHx-Season",
    "PreInt_EduHx-computerinternet_hoursday",
]

In [None]:
def feature_engineering(df):
    """Add ratio, season and interaction features to the tabular data.

    Args:
        df: DataFrame holding the raw columns referenced below
            (``Basic_Demos-*``, ``Physical-*``, ``BIA-*``, ``FGC-*``,
            ``SDS-*``, ``PreInt_EduHx-*`` and every ``*Season`` column).
            It is not modified.

    Returns:
        Tuple ``(df, added_columns)``. ``df`` is a new frame produced by a
        column-on-column merge (so the caller's index is not preserved) with
        the engineered columns appended; ``added_columns`` lists the names of
        those columns in creation order. The ``Normal_*`` reference columns
        are present in the frame but deliberately not listed.
    """
    added_columns = []

    # Reference values per (age, sex). These are hard-coded linear ramps over
    # ages 5..22, identical for both sexes.
    # NOTE(review): not taken from a clinical table as far as this file shows.
    normal_values = pd.DataFrame({
        'Basic_Demos-Age': list(range(5, 23)) * 2,
        'Basic_Demos-Sex': [0] * 18 + [1] * 18,
        'Normal_BMI': [15 + i * 0.5 for i in range(18)] * 2,
        'Normal_BMR': [1100 + i * 50 for i in range(18)] * 2,
        'Normal_HeartRate': [80 for _ in range(36)],
        'Normal_Systolic_BP': [100 + i for i in range(18)] * 2,
        'Normal_Diastolic_BP': [65 + i for i in range(18)] * 2,
    })
    df = df.merge(normal_values, on=['Basic_Demos-Age', 'Basic_Demos-Sex'], how='left')

    def add_feature(name, values):
        """Store ``values`` as column ``name`` and record it as engineered."""
        df[name] = values
        added_columns.append(name)

    def safe_ratio(numerator, denominator):
        """Divide two columns, mapping a zero denominator to NaN (not inf)."""
        return df[numerator] / df[denominator].replace(0, np.nan)

    # Measured value relative to the reference value for the same age/sex.
    for name, measured, reference in [
        ('cal_BMI_Inflation', 'Physical-BMI', 'Normal_BMI'),
        ('cal_BMR_Inflation', 'BIA-BIA_BMR', 'Normal_BMR'),
        ('cal_HeartRate_Inflation', 'Physical-HeartRate', 'Normal_HeartRate'),
        ('cal_Systolic_BP_Inflation', 'Physical-Systolic_BP', 'Normal_Systolic_BP'),
        ('cal_Diastolic_BP_Inflation', 'Physical-Diastolic_BP', 'Normal_Diastolic_BP'),
    ]:
        add_feature(name, df[measured] / df[reference])

    # Season columns: numeric start month plus one indicator per season.
    # A missing season yields NaN for the month and 0.0 for every indicator.
    season_start_month = {'Spring': 3, 'Summer': 6, 'Fall': 9, 'Winter': 12}
    season_cols = [col for col in df.columns if 'Season' in col and 'PCIAT' not in col]
    for col in season_cols:
        add_feature(col + '_StartMonth', df[col].map(season_start_month))
        for season in season_start_month:
            add_feature(f'{col}_{season}', (df[col] == season).astype(float))

    # Months from the enrolment season forward to each other season, wrapped
    # into 0..11. Vectorised equivalent of the former row-wise apply; NaN in
    # either month propagates to NaN.
    enroll_col = 'Basic_Demos-Enroll_Season'
    if enroll_col in season_cols:
        enroll_month = df[enroll_col + '_StartMonth']
        for col in season_cols:
            if col == enroll_col:
                continue
            add_feature(f'{col}_MonthDifference',
                        (df[col + '_StartMonth'] - enroll_month) % 12)

    # Interaction and body-composition features.
    add_feature('cal_BMI_Age', df['Physical-BMI'] * df['Basic_Demos-Age'])
    add_feature('cal_Internet_Hours_Age',
                df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age'])
    add_feature('cal_BMI_Internet_Hours',
                df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday'])
    add_feature('cal_BFP_BMI', safe_ratio('BIA-BIA_Fat', 'BIA-BIA_BMI'))
    add_feature('cal_FFMI_BFP', safe_ratio('BIA-BIA_FFMI', 'BIA-BIA_Fat'))
    add_feature('cal_FMI_BFP', safe_ratio('BIA-BIA_FMI', 'BIA-BIA_Fat'))
    add_feature('cal_LST_TBW', safe_ratio('BIA-BIA_LST', 'BIA-BIA_TBW'))
    add_feature('cal_BFP_BMR', df['BIA-BIA_Fat'] * df['BIA-BIA_BMR'])
    add_feature('cal_BFP_DEE', df['BIA-BIA_Fat'] * df['BIA-BIA_DEE'])
    add_feature('cal_BMR_Weight', safe_ratio('BIA-BIA_BMR', 'Physical-Weight'))
    add_feature('cal_DEE_Weight', safe_ratio('BIA-BIA_DEE', 'Physical-Weight'))
    add_feature('cal_SMM_Height', safe_ratio('BIA-BIA_SMM', 'Physical-Height'))
    add_feature('cal_Muscle_to_Fat', safe_ratio('BIA-BIA_SMM', 'BIA-BIA_FMI'))
    add_feature('cal_Hydration_Status', safe_ratio('BIA-BIA_TBW', 'Physical-Weight'))
    add_feature('cal_ICW_TBW', safe_ratio('BIA-BIA_ICW', 'BIA-BIA_TBW'))

    add_feature('cal_Sleep_Disturbance_Index',
                (df['SDS-SDS_Total_Raw'] * df['SDS-SDS_Total_T']) / 100)
    add_feature('cal_Strength_Flexibility_Score', (
        df['FGC-FGC_GSD'] + df['FGC-FGC_GSND'] + df['FGC-FGC_SRL'] + df['FGC-FGC_SRR']
    ) / 4)
    # Zero BMI is guarded like every other ratio so it gives NaN, not inf.
    add_feature('cal_Hydration_BMI', safe_ratio('BIA-BIA_TBW', 'Physical-BMI'))
    add_feature('cal_Metabolic_Risk', (
        df['BIA-BIA_Fat'] + df['Physical-BMI'] + df['Physical-Systolic_BP'] + df['Physical-Diastolic_BP']
    ) / 4)

    return df, added_columns


In [None]:
# Time Series
def _linear_trend(values):
    """Return the least-squares slope of ``values`` against their position.

    Non-finite samples are ignored. NaN is returned when fewer than two
    finite samples remain, instead of letting ``np.polyfit`` fail.
    """
    y = np.asarray(values, dtype=float)
    finite = np.isfinite(y)
    if finite.sum() < 2:
        return np.nan
    x = np.arange(len(y), dtype=float)
    return np.polyfit(x[finite], y[finite], 1)[0]


def extract_advanced_features(data):
    """Summarise one accelerometer recording as a single row of features.

    Args:
        data: DataFrame with the columns ``X``, ``Y``, ``Z``, ``enmo``,
            ``anglez``, ``light``, ``battery_voltage``, ``non-wear_flag``,
            ``time_of_day``, ``weekday`` and ``relative_date_PCIAT``.
            ``time_of_day`` is passed to ``pd.to_datetime`` to obtain the
            hour (presumably nanoseconds since midnight; confirm against the
            data description). The input is not modified.

    Returns:
        One-row DataFrame whose columns are the feature names, always in the
        same order.

    Notes:
        The wear statistics (``wear_time_ratio``, ``wear_consistency``,
        ``longest_wear_streak``) are computed on the full recording. All
        other features use only rows where ``non-wear_flag == 0``. The wear
        statistics used to be computed after that filter, which made them
        constants.
    """
    # Wear statistics must see the non-wear rows, so take them before filtering.
    non_wear = data['non-wear_flag']
    wear_features = {
        'wear_time_ratio': (non_wear == 0).mean(),
        # Number of distinct flag values present in the recording.
        'wear_consistency': len(non_wear.value_counts()),
        # Each group starts at a non-wear row; its sum counts the worn rows
        # that follow before the next non-wear row.
        'longest_wear_streak': non_wear.eq(0).astype(int).groupby(
            non_wear.ne(0).cumsum()).sum().max(),
    }

    # Explicit copy so the column assignment below never touches the caller's
    # frame and does not trigger SettingWithCopyWarning.
    data = data[non_wear == 0].copy()

    data['magnitude'] = np.sqrt(data['X']**2 + data['Y']**2 + data['Z']**2)
    magnitude = data['magnitude']
    hour = pd.to_datetime(data['time_of_day']).dt.hour

    mag_mean = magnitude.mean()
    mag_std = magnitude.std()
    magnitude_threshold = mag_mean + mag_std
    angle_change = data['anglez'].diff().abs()

    features = {}

    # Time masks for different periods of the day
    time_masks = {
        'morning': (hour >= 6) & (hour < 12),
        'afternoon': (hour >= 12) & (hour < 18),
        'evening': (hour >= 18) & (hour < 22),
        'night': (hour >= 22) | (hour < 6)
    }

    # 1. Activity Pattern Features
    for period, mask in time_masks.items():
        features.update({
            f'{period}_activity_mean': data.loc[mask, 'magnitude'].mean(),
            f'{period}_activity_std': data.loc[mask, 'magnitude'].std(),
            f'{period}_enmo_mean': data.loc[mask, 'enmo'].mean()
        })

    # 2. Sleep Quality Features (the 'night' mask is used as the sleep window)
    sleep_hours = time_masks['night']
    sleep_light_mean = data.loc[sleep_hours, 'light'].mean()
    features.update({
        'sleep_movement_mean': data.loc[sleep_hours, 'magnitude'].mean(),
        'sleep_movement_std': data.loc[sleep_hours, 'magnitude'].std(),
        'sleep_disruption_count': int(
            (sleep_hours & (magnitude > mag_mean + 2 * mag_std)).sum()),
        'light_exposure_during_sleep': sleep_light_mean,
        'sleep_position_changes': int((sleep_hours & (angle_change > 45)).sum()),
        'good_sleep_cycle': int(sleep_light_mean < 50)
    })

    # 3. Activity Intensity Features
    features.update({
        'sedentary_time_ratio': (magnitude < magnitude_threshold * 0.5).mean(),
        'moderate_activity_ratio': ((magnitude >= magnitude_threshold * 0.5) &
            (magnitude < magnitude_threshold * 1.5)).mean(),
        'vigorous_activity_ratio': (magnitude >= magnitude_threshold * 1.5).mean(),
        'activity_peaks_per_day': (magnitude > magnitude.quantile(0.95)).sum()
            / data['relative_date_PCIAT'].nunique()
    })

    # 4. Circadian Rhythm Features
    hourly_activity = data.groupby(hour)['magnitude'].mean()
    features.update({
        'circadian_regularity': hourly_activity.std() / hourly_activity.mean(),
        'peak_activity_hour': hourly_activity.idxmax(),
        'trough_activity_hour': hourly_activity.idxmin(),
        'activity_range': hourly_activity.max() - hourly_activity.min()
    })

    # 5-11. Additional Feature Groups
    weekend_mask = data['weekday'].isin([6, 7])
    light = data['light']
    battery = data['battery_voltage']

    # Movement Patterns
    features.update({
        'movement_entropy': stats.entropy(
            pd.qcut(magnitude, q=10, duplicates='drop').value_counts()),
        'direction_changes': (angle_change > 30).mean(),
        'sustained_activity_periods': (
            magnitude.rolling(12).mean() > magnitude_threshold).mean(),
    })

    # Weekend vs Weekday
    features.update({
        'weekend_activity_ratio': data.loc[weekend_mask, 'magnitude'].mean() /
            data.loc[~weekend_mask, 'magnitude'].mean(),
        'weekend_sleep_difference': data.loc[weekend_mask & sleep_hours, 'magnitude'].mean() -
            data.loc[~weekend_mask & sleep_hours, 'magnitude'].mean(),
    })

    # Non-wear Time (from the unfiltered recording, see above)
    features.update(wear_features)

    # Device Usage
    light_mean = light.mean()
    features.update({
        'screen_time_proxy': (light > light.quantile(0.75)).mean(),
        'dark_environment_ratio': (light < light.quantile(0.25)).mean(),
        'light_variation': light.std() / light_mean if light_mean != 0 else 0,
    })

    # Battery Usage
    features.update({
        'battery_drain_rate': -_linear_trend(battery),
        'battery_variability': battery.std(),
        'low_battery_time': (battery < battery.quantile(0.1)).mean(),
    })

    # Time-based (each observation is counted as 5 seconds)
    features.update({
        'days_monitored': data['relative_date_PCIAT'].nunique(),
        'total_active_hours': (magnitude > magnitude_threshold * 0.5).sum() * 5 / 3600,
        'activity_regularity': data.groupby('weekday')['magnitude'].mean().std()
    })

    # Variability Features for multiple columns
    for col in ['X', 'Y', 'Z', 'enmo', 'anglez']:
        features.update({
            f'{col}_skewness': data[col].skew(),
            f'{col}_kurtosis': data[col].kurtosis(),
            f'{col}_trend': _linear_trend(data[col])
        })

    return pd.DataFrame([features])

def process_file(filename, dirname):
    """Load one series directory and return its features and identifier.

    Args:
        filename: Directory name of the form ``<key>=<id>`` inside ``dirname``;
            the part after the first ``=`` is returned as the identifier.
        dirname: Directory that contains the per-series sub-directories.

    Returns:
        Tuple ``(feature_vector, series_id)`` where ``feature_vector`` is the
        1-D array of values produced by ``extract_advanced_features``.

    Raises:
        IndexError: If ``filename`` contains no ``=``.
    """
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    feature_vector = extract_advanced_features(df).values[0]
    # NOTE(review): an earlier version also computed df.describe() and
    # concatenated it to the features, but never returned the result. That
    # dead (and costly) computation was removed. Confirm the describe()
    # statistics were not meant to be part of the output.
    series_id = filename.split('=')[1]
    return feature_vector, series_id

def load_time_series(dirname) -> pd.DataFrame:
    """Build the per-series feature table for every entry in ``dirname``.

    Args:
        dirname: Directory whose entries are handed to ``process_file``.

    Returns:
        DataFrame with one row per entry, feature columns named
        ``stat_0 .. stat_{k-1}`` and an ``id`` column. Rows follow the sorted
        entry names. For an empty directory a frame with only an empty ``id``
        column is returned.
    """
    # os.listdir order is arbitrary. Sorting keeps the row order, and with it
    # any seeded downstream shuffling, reproducible between runs.
    ids = sorted(os.listdir(dirname))
    if not ids:
        return pd.DataFrame({'id': []})

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    # Named feature_rows rather than stats to avoid shadowing the `stats`
    # module name used elsewhere in this file.
    feature_rows, indexes = zip(*results)

    df = pd.DataFrame(list(feature_rows), columns=[f"stat_{i}" for i in range(len(feature_rows[0]))])
    df['id'] = indexes
    return df

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


class Autoencoder(nn.Module):
    """Shallow autoencoder: one linear+ReLU encoder, one linear+Sigmoid decoder.

    Args:
        input_dim: Number of input (and reconstructed) features.
        encoding_dim: Size of the encoded representation.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        encoder_layers = [nn.Linear(input_dim, encoding_dim), nn.ReLU()]
        decoder_layers = [nn.Linear(encoding_dim, input_dim), nn.Sigmoid()]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the batch ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return latent, reconstruction


class DeepAutoencoder(nn.Module):
    """Three-stage autoencoder with batch normalisation and dropout.

    The encoder narrows ``input_dim -> 3*encoding_dim -> 2*encoding_dim ->
    encoding_dim``; the decoder mirrors it and ends in a Sigmoid.

    Args:
        input_dim: Number of input (and reconstructed) features.
        encoding_dim: Size of the encoded representation.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        wide = encoding_dim * 3
        narrow = encoding_dim * 2

        def dense(in_features, out_features, dropout=True):
            # Linear -> BatchNorm -> ReLU, optionally followed by Dropout(0.2).
            layers = [
                nn.Linear(in_features, out_features),
                nn.BatchNorm1d(out_features),
                nn.ReLU(),
            ]
            if dropout:
                layers.append(nn.Dropout(0.2))
            return layers

        self.encoder = nn.Sequential(
            *dense(input_dim, wide),
            *dense(wide, narrow),
            *dense(narrow, encoding_dim, dropout=False),
        )
        self.decoder = nn.Sequential(
            *dense(encoding_dim, narrow),
            *dense(narrow, wide),
            nn.Linear(wide, input_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the batch ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return latent, reconstruction


def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32, learning_rate=0.001,
                        use_deep=True, save_model_path=None, seed=42):
    """Train an autoencoder on ``df`` and return the encoded representation.

    Args:
        df: Numeric feature table. It is not modified; missing values are
            filled with column means on a copy, and columns whose standard
            deviation is not positive are dropped.
        encoding_dim: Size of the encoded representation.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        learning_rate: Initial Adam learning rate (halved on plateau).
        use_deep: Use ``DeepAutoencoder`` if true, otherwise ``Autoencoder``.
        save_model_path: If given, the state dict is saved there whenever the
            epoch loss improves. The returned encoding still comes from the
            final-epoch weights.
        seed: Seed for torch and numpy.

    Returns:
        DataFrame with columns ``Enc_1 .. Enc_k`` and a default integer index,
        with rows in the same order as ``df``.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Work on a new frame so the caller's data is left untouched.
    df = df.fillna(df.mean())
    df = df.loc[:, df.std() > 0]

    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_tensor = torch.tensor(df_scaled, dtype=torch.float32).to(device)

    dataset = TensorDataset(data_tensor, data_tensor)
    # BatchNorm1d cannot run a training step on a single-sample batch, so the
    # trailing batch is dropped only when it would contain exactly one row.
    n_samples = len(dataset)
    drop_last = bool(use_deep and n_samples > batch_size and n_samples % batch_size == 1)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

    input_dim = df_scaled.shape[1]
    if use_deep:
        model = DeepAutoencoder(input_dim=input_dim, encoding_dim=encoding_dim).to(device)
    else:
        model = Autoencoder(input_dim=input_dim, encoding_dim=encoding_dim).to(device)

    # NOTE(review): both autoencoder classes end in nn.Sigmoid, so outputs lie
    # in (0, 1), while the targets are standardised values that can be
    # negative or above 1. The reconstruction error therefore cannot vanish on
    # those entries. Left unchanged to keep the learned encodings comparable;
    # confirm whether this is intended.
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    best_loss = float('inf')
    losses = []
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}", leave=False)

        for batch_features, _ in progress_bar:
            batch_features = batch_features.to(device)

            _, decoded = model(batch_features)
            loss = criterion(decoded, batch_features)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        # Mean of the per-batch losses for this epoch.
        epoch_loss /= len(dataloader)
        losses.append(epoch_loss)

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            if save_model_path:
                torch.save(model.state_dict(), save_model_path)
                print(f"Best model saved at Epoch {epoch + 1} with Loss: {best_loss:.4f}")

        scheduler.step(epoch_loss)
        print(f"Epoch {epoch + 1}/{epochs}, Avg Loss: {epoch_loss:.4f}")

    plt.plot(range(1, epochs + 1), losses)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.show()

    # Encode the full data set with dropout disabled and BatchNorm in
    # inference mode.
    model.eval()
    with torch.no_grad():
        encoded_data = model.encoder(data_tensor).cpu().numpy()

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])

    return df_encoded


In [None]:
def convert_object_to_category(df):
    """Return a copy of ``df`` with every object-dtype column cast to category."""
    object_columns = df.select_dtypes(include=["object"]).columns
    dtype_map = dict.fromkeys(object_columns, "category")
    return df.astype(dtype_map)

In [None]:
class OptunaRounder:
    """Optuna objective that searches cut-points turning scores into classes.

    An instance is passed to ``study.optimize``. Each trial proposes
    ``len(labels) - 1`` thresholds named ``t0, t1, ...`` and is scored with
    the quadratic weighted kappa between ``y_true`` and the binned ``y_pred``.
    Binning yields class indices ``0 .. len(thresholds)``.
    """

    def __init__(self, y_true, y_pred):
        """Store the targets and raw predictions as arrays.

        Args:
            y_true: True labels.
            y_pred: Continuous predictions to be binned.
        """
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        self.labels = np.unique(self.y_true)

    def __call__(self, trial):
        """Score one Optuna trial; returns 0 if scoring fails."""
        thresholds = []
        for i in range(len(self.labels) - 1):
            # Each threshold is sampled at or above the previous one, so the
            # sequence is non-decreasing by construction.
            low = thresholds[-1] if thresholds else min(self.labels)
            high = max(self.labels)
            thresholds.append(trial.suggest_float(f't{i}', low, high))

        thresholds = sorted(thresholds)
        try:
            opt_y_pred = self.adjust(self.y_pred, thresholds)
            return cohen_kappa_score(self.y_true, opt_y_pred, weights='quadratic')
        except Exception as e:
            # Best effort: a failing trial is reported and given the neutral
            # score 0 instead of aborting the whole study.
            print(f"Error in OptunaRounder call: {e}")
            return 0

    def adjust(self, y_pred, thresholds):
        """Bin ``y_pred`` with ``thresholds``; same as ``adjust_static``."""
        return self.adjust_static(y_pred, thresholds)

    @staticmethod
    def adjust_static(y_pred, thresholds):
        """Map continuous predictions to class indices.

        Args:
            y_pred: Array-like of continuous predictions.
            thresholds: Ascending cut-points as any iterable of numbers
                (list, tuple, ndarray, dict values, ...).

        Returns:
            Integer ndarray with ``thresholds[k-1] <= y < thresholds[k]``
            mapped to ``k``.
        """
        # Build the bin edges with concatenate: `[-inf] + thresholds + [inf]`
        # would broadcast element-wise if `thresholds` were an ndarray.
        cut_points = np.asarray(list(thresholds), dtype=float)
        bins = np.concatenate(([-np.inf], cut_points, [np.inf]))
        return np.digitize(np.asarray(y_pred), bins) - 1


In [None]:
# Load the tabular training data and build the per-series feature table.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
ts_train = load_time_series(f"{DATA_DIR}/series_train.parquet")

# Compress the time-series features with the deep autoencoder.
# Pass save_model_path="deep_autoencoder_model.pth" to also save the model.
train_ts_encoded = perform_autoencoder(
    df=ts_train.set_index("id"),
    use_deep=True,
    encoding_dim=60,
    batch_size=32,
    epochs=100,
    seed=SEED,
)

# The encoded rows come back with a default integer index in the same order
# as ts_train, so the ids can be attached by position and used as the index.
train_ts_encoded = train_ts_encoded.assign(id=ts_train["id"]).set_index("id")

In [None]:
# Engineer the tabular features and register them as model inputs.
df_train, added_cols = feature_engineering(df_train)
FEATURE_COLS = [*FEATURE_COLS, *added_cols]

# Attach the encoded and the raw time-series features, then index by id.
df_train = (
    df_train
    .merge(train_ts_encoded, how="left", on='id')
    .merge(ts_train, how="left", on='id')
    .set_index('id')
)

# Every time-series column except the join key becomes a model input.
ts_encoded_cols = list(train_ts_encoded.columns)
time_series_cols = [name for name in ts_train.columns if name != 'id']
FEATURE_COLS = [*FEATURE_COLS, *ts_encoded_cols, *time_series_cols]

df_train = convert_object_to_category(df_train)

In [None]:
# Keep only labelled rows, then split into target and feature matrix.
df_train = df_train[df_train[TARGET_COL].notna()]
y = df_train.loc[:, TARGET_COL]
X = df_train.loc[:, FEATURE_COLS]

In [None]:
# Label mean and population variance, used by the custom QWK objective.
a, b = y.mean(), y.var(ddof=0)

# Label range, used to clip raw predictions.
y_min, y_max = y.min(), y.max()
print(f"y_min:{y_min}, y_max:{y_max}")

def quadratic_weighted_kappa(preds, data):
    """LightGBM ``feval``: quadratic weighted kappa of rounded predictions.

    Predictions are clipped to the module-level ``[y_min, y_max]`` range and
    rounded before scoring.

    Returns:
        ``('qwk', score, True)``; the final ``True`` marks higher as better.
    """
    clipped = preds.clip(y_min, y_max)
    score = cohen_kappa_score(data.get_label(), clipped.round(), weights="quadratic")
    return 'qwk', score, True

def qwk_obj(preds, dtrain):
    """Custom LightGBM objective: gradient of the ratio ``f / g``.

    With clipped predictions ``p``, labels ``y`` and the module-level label
    mean ``a`` and variance ``b``:
    ``f = 0.5 * sum((p - y)^2)`` and ``g = 0.5 * sum((p - a)^2 + b)``.

    Returns:
        ``(grad, hess)`` where ``grad`` is ``d(f/g)/dp`` scaled by the number
        of samples and ``hess`` is a constant vector of ones.
    """
    target = dtrain.get_label()
    clipped = np.clip(np.array(preds), y_min, y_max)

    residual = clipped - target
    centered = clipped - a
    numerator = 1/2 * np.sum(residual ** 2)
    denominator = 1/2 * np.sum(centered ** 2 + b)

    n_samples = len(target)
    # Quotient rule: (f' g - f g') / g^2, written as f'/g - f g'/g^2.
    gradient = (residual / denominator - numerator * centered / denominator ** 2) * n_samples
    hessian = np.ones(n_samples)

    return gradient, hessian


In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_with_kfold(trial, X, y, n_splits=10, seed=42):
    """Evaluate one LightGBM hyper-parameter trial with stratified K-fold CV.

    For every fold a model is trained with the custom QWK objective, then a
    nested Optuna study tunes the rounding thresholds on that fold's
    validation predictions.

    Args:
        trial: Optuna trial supplying the LightGBM hyper-parameters.
        X: Feature DataFrame.
        y: Target Series aligned with ``X``.
        n_splits: Number of folds.
        seed: Seed for LightGBM, the fold split and the threshold studies.

    Returns:
        Tuple ``(mean_qwk, fold_thresholds)``: the mean per-fold quadratic
        weighted kappa and one sorted threshold list per fold, in fold order.

    Notes:
        Thresholds are tuned and scored on the same validation fold, so the
        returned score is optimistic.
    """
    params = {
        "objective": qwk_obj,
        "metric": "None",
        "verbosity": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 50),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 0.9),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 0.9),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "seed": seed
    }

    if gpu_available:
        params.update({
            "device": "gpu",
            "gpu_platform_id": 0,
            "gpu_device_id": 0
        })

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # Same stratification rule (including the fallback) as the final CV loop,
    # so the per-fold thresholds returned here line up with its folds.
    try:
        y_stratified = pd.qcut(y, q=n_splits, labels=False, duplicates="drop")
    except ValueError:
        y_stratified = y

    fold_scores = []
    fold_thresholds = []

    for train_idx, valid_idx in kf.split(X, y_stratified):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        model = lgb.train(
            params=params,
            train_set=lgb_train,
            num_boost_round=10000,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train", "valid"],
            feval=quadratic_weighted_kappa,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=True),
                lgb.log_evaluation(100)
            ]
        )

        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

        rounder = OptunaRounder(y_valid, y_pred)
        # Seed the sampler so the threshold search honours `seed` as well.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=seed),
        )
        study.optimize(rounder, n_trials=50)
        best_thresholds = sorted(study.best_params.values())
        fold_thresholds.append(best_thresholds)

        y_pred_adjusted = rounder.adjust(y_pred, best_thresholds)
        fold_score = cohen_kappa_score(y_valid, y_pred_adjusted, weights="quadratic")
        fold_scores.append(fold_score)

    return np.mean(fold_scores), fold_thresholds


def optimize_with_kfold(X, y, n_trials=50, n_splits=10, seed=42):
    """Search LightGBM hyper-parameters with Optuna using K-fold QWK.

    Args:
        X: Feature DataFrame.
        y: Target Series aligned with ``X``.
        n_trials: Number of Optuna trials.
        n_splits: Number of CV folds per trial.
        seed: Seed forwarded to the CV objective and used for the sampler.

    Returns:
        Tuple ``(best_params, study, best_fold_thresholds)``: the parameters
        of the best trial, the finished study, and that trial's per-fold
        rounding thresholds.
    """
    # Thresholds are kept per trial number so the best trial's can be returned.
    trial_fold_thresholds = {}

    def objective_wrapper(trial):
        mean_score, fold_thresholds = objective_with_kfold(trial, X, y, n_splits, seed)
        trial_fold_thresholds[trial.number] = fold_thresholds
        return mean_score

    # Seed the sampler so the hyper-parameter search is reproducible for a
    # given `seed`; it was previously unseeded despite accepting `seed`.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective_wrapper, n_trials=n_trials)

    best_trial_number = study.best_trial.number
    best_fold_thresholds = trial_fold_thresholds[best_trial_number]

    print("Best trial:")
    print(f"  Value: {study.best_trial.value}")
    print(f"  Params: {study.best_trial.params}")
    print(f"Best Trial Number: {best_trial_number}")
    print(f"Best Fold Thresholds: {best_fold_thresholds}")

    return study.best_trial.params, study, best_fold_thresholds


In [None]:
# Run the hyper-parameter search and keep the per-fold rounding thresholds.
N_TRIALS = 50
wandb.config.update({"optimize_n_trials": N_TRIALS})
best_params, study, fold_thresholds = optimize_with_kfold(
    X, y, n_trials=N_TRIALS, n_splits=10, seed=SEED
)

# Persist the thresholds next to the other run artefacts.
file_path = os.path.join(save_path, f"fold_thresholds_{SEED}.pkl")
with open(file_path, "wb") as f:
    pickle.dump(fold_thresholds, f)

# Final training parameters: fixed settings plus the tuned values.
optuna_params = {"objective": qwk_obj, "metric": "None", "verbosity": -1}
optuna_params.update({
    name: best_params[name]
    for name in (
        "learning_rate",
        "num_leaves",
        "min_data_in_leaf",
        "feature_fraction",
        "bagging_fraction",
        "bagging_freq",
    )
})
optuna_params["seed"] = SEED
if gpu_available:
    optuna_params.update(device="gpu", gpu_platform_id=0, gpu_device_id=0)


In [None]:
# Plot the objective value of every trial in the order they were run.
best_values = [finished.value for finished in study.trials]
plt.plot(best_values)
plt.title("Optuna Trials Performance")
plt.xlabel("Trial")
plt.ylabel("Best Value")
plt.show()

In [None]:
# Out-of-fold class predictions, filled fold by fold below.
preds_oof = np.zeros(len(df_train))

# The split must reproduce the one used inside objective_with_kfold (same
# n_splits, seed and stratification) because fold_thresholds is indexed by
# fold number.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
try:
    y_stratified = pd.qcut(y, q=kf.get_n_splits(), labels=False, duplicates="drop")
except ValueError:
    # Fall back to stratifying on the raw target when binning fails.
    y_stratified = y

fold_scores = []
models = []

for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y_stratified)):
    X_train = X.iloc[idx_train]
    y_train = y.iloc[idx_train]
    X_valid = X.iloc[idx_valid]
    y_valid = y.iloc[idx_valid]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)

    callbacks = [
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(100)
    ]

    # Train with the tuned parameters; the QWK feval drives early stopping.
    model = lgb.train(
        params=optuna_params,
        train_set=lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        feval=quadratic_weighted_kappa,
        callbacks=callbacks
    )

    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    # Thresholds tuned for this fold number during the hyper-parameter search.
    # NOTE(review): they were tuned on the validation predictions of the same
    # fold, so the scores below are optimistic.
    best_thresholds = fold_thresholds[fold]
    y_pred_adjusted = OptunaRounder.adjust_static(y_pred, best_thresholds)
    fold_score = cohen_kappa_score(y_valid, y_pred_adjusted, weights="quadratic")
    fold_scores.append(fold_score)
    wandb.log({f"fold_{fold + 1}_qwk_score": fold_score})

    models.append(model)
    preds_oof[idx_valid] = y_pred_adjusted

# QWK over all out-of-fold predictions at once.
cv_score = cohen_kappa_score(y, preds_oof, weights="quadratic")

# Unweighted mean of the per-fold QWK scores.
average_fold_score = np.mean(fold_scores)
