In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasetsdatasets/train (1).csv
/kaggle/input/datasetsdatasets/test.csv


In [3]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import TimeSeriesSplit
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import warnings
from sklearn.preprocessing import PolynomialFeatures

# Load file paths
file_path_train = "/kaggle/input/datasetsdatasets/train (1).csv"
file_path_test = "/kaggle/input/datasetsdatasets/test.csv"

# Load training data
try:
    df_train = pd.read_csv(file_path_train)
except FileNotFoundError:
    raise FileNotFoundError(f"File not found at {file_path_train}")

# Load test data
try:
    df_test = pd.read_csv(file_path_test)
except FileNotFoundError:
    raise FileNotFoundError(f"File not found at {file_path_test}")

# Define features and targets
features = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N"]
targets = ["Y1", "Y2"]

# Verify columns exist in training data
required_columns = features + targets
missing_cols_train = [col for col in required_columns if col not in df_train.columns]
if missing_cols_train:
    raise ValueError(f"Missing columns in training dataset: {missing_cols_train}")

# Verify columns in test data (allow missing targets)
missing_cols_test = [col for col in features if col not in df_test.columns]
if missing_cols_test:
    raise ValueError(f"Missing columns in test dataset: {missing_cols_test}")

# Check data types
print("Training DataFrame dtypes:\n", df_train.dtypes)
print("Test DataFrame dtypes:\n", df_test.dtypes)
if not all(df_train[features].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
    raise ValueError(f"Non-numeric data in training features: {df_train[features].dtypes}")
if not all(df_test[features].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
    raise ValueError(f"Non-numeric data in test features: {df_test[features].dtypes}")
if all(col in df_train.columns for col in targets) and not all(df_train[targets].dtypes.apply(lambda x: np.issubdtype(x, np.number))):
    raise ValueError(f"Non-numeric data in training targets: {df_train[targets].dtypes}")

# Check for NaNs in original datasets
print("NaNs in original training DataFrame:", df_train.isna().sum().sum())
print("NaN counts per column in training DataFrame:\n", df_train.isna().sum())
print("NaNs in original test DataFrame:", df_test.isna().sum().sum())
print("NaN counts per column in test DataFrame:\n", df_test.isna().sum())

# Forward fill and backward fill NaNs in original datasets
df_train = df_train[features + targets].ffill().bfill()
df_test = df_test[features + (targets if all(col in df_test.columns for col in targets) else [])].ffill().bfill()
print("NaNs after ffill/bfill in training DataFrame:", df_train.isna().sum().sum())
print("NaNs after ffill/bfill in test DataFrame:", df_test.isna().sum().sum())

Training DataFrame dtypes:
 time      int64
A       float64
B       float64
C       float64
D       float64
E       float64
F       float64
G       float64
H       float64
I       float64
J       float64
K       float64
L       float64
M       float64
N       float64
Y1      float64
Y2      float64
dtype: object
Test DataFrame dtypes:
 id        int64
time      int64
A       float64
B       float64
C       float64
D       float64
E       float64
F       float64
G       float64
H       float64
I       float64
J       float64
K       float64
L       float64
M       float64
N       float64
dtype: object
NaNs in original training DataFrame: 0
NaN counts per column in training DataFrame:
 time    0
A       0
B       0
C       0
D       0
E       0
F       0
G       0
H       0
I       0
J       0
K       0
L       0
M       0
N       0
Y1      0
Y2      0
dtype: int64
NaNs in original test DataFrame: 0
NaN counts per column in test DataFrame:
 id      0
time    0
A       0
B       0
C      

In [4]:
# 1. Lag and Shift Features
def create_lag_features(dframe, feature_vars, lags=[1, 3, 7]):
    df_lagged = dframe.copy()
    new_columns = {}
    for var in feature_vars:
        for lag in lags:
            new_columns[f'{var}_lag{lag}'] = df_lagged[var].shift(lag)
    new_columns_df = pd.DataFrame(new_columns, index=df_lagged.index)
    df_lagged = pd.concat([df_lagged, new_columns_df], axis=1)
    lag_columns = list(new_columns.keys())
    print("NaNs in lagged columns before ffill:", df_lagged[lag_columns].isna().sum().sum())
    df_lagged[lag_columns] = df_lagged[lag_columns].ffill().bfill()
    print("NaNs in lagged columns after ffill/bfill:", df_lagged[lag_columns].isna().sum().sum())
    return df_lagged

# 2. Rolling Window Statistics
def create_rolling_features(dframe, feature_vars, windows=[1, 3, 7], ema_span=7):
    df_metrics = dframe.copy()
    new_columns = {}
    for var in feature_vars:
        for window in windows:
            new_columns[f'{var}_mean_{window}d'] = df_metrics[var].rolling(window=window, min_periods=1).mean()
            new_columns[f'{var}_var_{window}d'] = df_metrics[var].rolling(window=window, min_periods=1).var().fillna(0)
            new_columns[f'{var}_std_{window}d'] = df_metrics[var].rolling(window=window, min_periods=1).std()
            new_columns[f'{var}_min_{window}d'] = df_metrics[var].rolling(window=window, min_periods=1).min()
            new_columns[f'{var}_max_{window}d'] = df_metrics[var].rolling(window=window, min_periods=1).max()
            new_columns[f'{var}_median_{window}d'] = df_metrics[var].rolling(window=window, min_periods=1).median()
        new_columns[f'{var}_ema_{ema_span}d'] = df_metrics[var].ewm(span=ema_span, adjust=False).mean()
    new_columns_df = pd.DataFrame(new_columns, index=df_metrics.index)
    df_metrics = pd.concat([df_metrics, new_columns_df], axis=1)
    rolling_columns = list(new_columns.keys())
    print("NaNs in rolling columns before ffill:", df_metrics[rolling_columns].isna().sum().sum())
    df_metrics[rolling_columns] = df_metrics[rolling_columns].ffill().bfill()
    print("NaNs in rolling columns after ffill/bfill:", df_metrics[rolling_columns].isna().sum().sum())
    return df_metrics

# 3. Seasonal and Periodic features 
def create_periodic_features(dframe, windows = [1, 3, 7]):
    df_metrics = dframe.copy()
    t = np.arange(len(dframe))
    new_columns = {}
    
    for window in windows:
        new_columns[f'sin_{window}d'] = np.sin(2 * np.pi * t / window)
        new_columns[f'cos_{window}d'] = np.cos(2 * np.pi * t / window)
    new_columns_df = pd.DataFrame(new_columns, index = dframe.index)
    df_metrics = pd.concat([df_metrics, new_columns_df], axis = 1)

    return df_metrics

# 4. Multivariate features 
def multivariate_features(dframe, feature_vars):

    if not isinstance(dframe, pd.DataFrame):
        raise ValueError("Input 'dframe' must be a pandas DataFrame")
    if not all(var in dframe.columns for var in feature_vars):
        raise ValueError("Not all 'feature_vars' are columns in 'dframe'")
    if dframe.empty:
        raise ValueError("Input 'dframe' is empty")
    
    df_metrics = dframe.copy()
    poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
    poly_features = poly.fit_transform(dframe[feature_vars])
    poly_feature_names = poly.get_feature_names_out(feature_vars)
    
    poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_metrics.index)
    poly_df = poly_df.drop(columns=feature_vars, errors='ignore')
    
    df_metrics = pd.concat([df_metrics, poly_df], axis=1)
    
    if not df_metrics.index.equals(dframe.index):
        raise ValueError("Index misalignment after concatenation")
    
    return df_metrics
    
# === Process training data === 
df_train_lag = create_lag_features(df_train, features)
df_train_rolling = create_rolling_features(df_train_lag, features)

rolling_columns = [col for col in df_train_rolling.columns if col not in df_train_lag.columns]

df_train_combined = pd.concat([df_train_lag, df_train_rolling[rolling_columns]], axis=1)

# === Process test data === 
df_test_lag = create_lag_features(df_test, features)
df_test_rolling = create_rolling_features(df_test_lag, features)

rolling_columns = [col for col in df_test_rolling.columns if col not in df_test_lag.columns]
    
df_test_combined = pd.concat([df_test_lag, df_test_rolling[rolling_columns]], axis=1)

# === Process Seasonal and Periodic features ==== 
df_train_combined = create_periodic_features(df_train_combined)
df_test_combined = create_periodic_features(df_test_combined)

df_train_combined = multivariate_features(df_train_combined, features)
df_test_combined = multivariate_features(df_test_combined, features)

NaNs in lagged columns before ffill: 154
NaNs in lagged columns after ffill/bfill: 0
NaNs in rolling columns before ffill: 1120028
NaNs in rolling columns after ffill/bfill: 1120000
NaNs in lagged columns before ffill: 154
NaNs in lagged columns after ffill/bfill: 0
NaNs in rolling columns before ffill: 223972
NaNs in rolling columns after ffill/bfill: 223944


In [5]:
selected_features = ['A B K', 'G J', 'G H J', 'G', 'G H M', 'A B F', 'A I K', 'C G H', 
                      'A F K', 'K', 'G M', 'C G', 'G H', 'A B', 'A K', 'B D K', 'D', 
                      'E G M', 'A D F', 'A D K']

# Hyperpatameter optimization

In [6]:
import optuna

def objective(trial):
    # Define hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 60, 70),
        'max_depth': trial.suggest_int('max_depth', 8, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'random_state': 42
    }
    
    # Initialize model
    model = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', **params))
    
    # TimeSeriesSplit for cross-validation
    tscv = TimeSeriesSplit(n_splits=5, test_size=10)
    r2_scores = []

    # Preparing data
    X_train = df_train_combined[selected_features] 
    y_train = df_train_combined[targets]
    
    # Perform cross-validation
    for train_idx, test_idx in tscv.split(X_train):
        X_train_tmp, X_test_tmp = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_train_tmp, y_test_tmp = y_train.iloc[train_idx], y_train.iloc[test_idx]
        
        # Fit model
        model.fit(X_train_tmp, y_train_tmp)
        
        # Predict and calculate R²
        y_pred = model.predict(X_test_tmp)
        r2 = r2_score(y_test_tmp, y_pred, multioutput='uniform_average')
        r2_scores.append(r2)
    
    # Return average R² across folds (maximize)
    return np.mean(r2_scores)

# Create Optuna study and optimize
study = optuna.create_study(direction='maximize')  # Maximize R²
study.optimize(objective, n_trials=50)  # Adjust n_trials as needed

# Print best parameters and value
print("Best Parameters:", study.best_params)
print("Best R² Score:", study.best_value)

# Train final model with best parameters
best_params = study.best_params
final_model = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', **best_params))

[I 2025-09-21 12:27:19,280] A new study created in memory with name: no-name-72e45767-faa8-424e-a732-4e0aa957257c
[I 2025-09-21 12:27:49,614] Trial 0 finished with value: 0.2898385333323804 and parameters: {'n_estimators': 59, 'max_depth': 10, 'learning_rate': 0.029711869646755013}. Best is trial 0 with value: 0.2898385333323804.
[I 2025-09-21 12:28:17,040] Trial 1 finished with value: 0.23932836745444983 and parameters: {'n_estimators': 80, 'max_depth': 9, 'learning_rate': 0.029957112844895974}. Best is trial 0 with value: 0.2898385333323804.
[I 2025-09-21 12:28:37,791] Trial 2 finished with value: 0.2604203954814598 and parameters: {'n_estimators': 60, 'max_depth': 9, 'learning_rate': 0.035209607408044116}. Best is trial 0 with value: 0.2898385333323804.
[I 2025-09-21 12:29:18,930] Trial 3 finished with value: 0.274188378174761 and parameters: {'n_estimators': 82, 'max_depth': 10, 'learning_rate': 0.007753762634550341}. Best is trial 0 with value: 0.2898385333323804.
[I 2025-09-21 12

Best Parameters: {'n_estimators': 79, 'max_depth': 9, 'learning_rate': 0.013125447012695931}
Best R² Score: 0.33362248002164074
