# Main notebook for training with AutoGluon


In [17]:
import pandas as pd
import os
from autogluon.tabular import TabularPredictor
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# Export RAY_IGNORE_UNHANDLED_ERRORS=1 into this process's environment.
# NOTE(review): presumably quiets Ray's unhandled-error reporting during
# AutoGluon training — confirm against the Ray documentation.
os.environ.update({"RAY_IGNORE_UNHANDLED_ERRORS": "1"})

## Load the data

In [19]:
# Tag describing this feature set; it is embedded in every output file name
# and in the model directory further down.
identifier = "cbrt_hourcos_isweekend_pruned_3ma60_inter_-13_tempcos-cor"

# Absolute locations of the raw training and submission CSV files.
train_path = "/Users/akseljoonas/Documents/mlfortnight/data/raw/train_fixed.csv"
sub_path = "/Users/akseljoonas/Documents/mlfortnight/data/raw/test_fixed.csv"

# Load the training file first, then the submission file.
train_data, sub_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, sub_path)
)

## Feature engineering

In [20]:
# HOUR COS
# Pull the hour out of each timestamp and encode it as a cosine with a
# 24-hour period, for both the training and the submission frame.
for frame in (train_data, sub_data):
    frame["hour_of_day"] = pd.to_datetime(frame["measurement_time"]).dt.hour
    hour_angle = 2 * np.pi * frame["hour_of_day"] / 24
    frame["hour_of_day_cos"] = np.cos(hour_angle)

In [21]:
# IS_WEEKEND
# Boolean flag: True when the timestamp's weekday number is 5 or 6
# (pandas numbers Monday as 0, so these are Saturday and Sunday).
for frame in (train_data, sub_data):
    weekday_number = pd.to_datetime(frame["measurement_time"]).dt.weekday
    frame["is_weekend"] = weekday_number >= 5


In [22]:
# MOVING AVERAGES

columns_to_average = [
    "source_1_temperature",
    "mean_room_temperature",
    "outside_temperature",
]

# Trailing mean over the current row and up to 11 rows before it; with
# min_periods=1 the first rows average whatever is available instead of NaN.
# NOTE(review): assumes rows are already in chronological order — confirm.
for frame in (train_data, sub_data):
    for column in columns_to_average:
        trailing_window = frame[column].rolling(window=12, min_periods=1)
        frame[f"{column}_moving_avg"] = trailing_window.mean()

In [23]:
# Cosine-encode three source temperatures with a period of 24, adding a
# "<column>_cos" column to both frames (columns are created in the order
# source_3, source_2, source_4).
for source_column in (
    "source_3_temperature",
    "source_2_temperature",
    "source_4_temperature",
):
    for frame in (train_data, sub_data):
        frame[f"{source_column}_cos"] = np.cos(
            2 * np.pi * frame[source_column] / 24
        )

In [24]:
# PRUNING

# Row identifier, raw timestamp and the intermediate hour column.
base_drop = ["hour_of_day", "ID", "measurement_time"]
# Raw input columns excluded from the feature set.
prune_drop = [
    "clouds",
    "wind_direction",
    "wind_speed",
    "sun_radiation_perpendicular",
]

columns_to_drop = [*base_drop, *prune_drop]

# Remove the same columns, in place, from both frames.
for frame in (train_data, sub_data):
    frame.drop(columns=columns_to_drop, inplace=True)

In [25]:
# INTERACTION FEATURES



def _pairwise_products(
    poly, frame: pd.DataFrame, features: list[str], *, fit: bool
) -> pd.DataFrame:
    """Return only the pairwise-product columns of ``features`` for ``frame``.

    Args:
        poly: The ``PolynomialFeatures`` transformer to use.
        frame (pd.DataFrame): Dataset holding the ``features`` columns.
        features (list[str]): Columns to combine pairwise.
        fit (bool): Fit the transformer on ``frame`` before transforming.

    Returns:
        pd.DataFrame: One column per feature pair, named ``"<a> <b>"``,
        indexed like ``frame``.
    """
    selected = frame[features]
    values = poly.fit_transform(selected) if fit else poly.transform(selected)
    expanded = pd.DataFrame(
        values,
        columns=poly.get_feature_names_out(features),
        # Reuse the caller's index: a fresh RangeIndex would misalign rows
        # (and pad with NaN) when concatenated onto a frame indexed otherwise.
        index=frame.index,
    )
    # The transformer echoes the input columns ahead of the products;
    # keep the products only.
    return expanded.drop(columns=features)


def add_interaction_features(
    data: pd.DataFrame, submission_data: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Adds pairwise interaction features to the datasets.

    For every pair of the selected columns a product column named
    ``"<a> <b>"`` is appended. The transformer is fitted on ``data`` and
    then applied to ``submission_data``. Neither input frame is modified;
    new frames are returned.

    Args:
        data (pd.DataFrame): The main dataset.
        submission_data (pd.DataFrame): The submission dataset.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The datasets with added interaction features.

    Raises:
        KeyError: If a selected feature column is missing from either dataset.
    """
    # List of selected features for interaction
    selected_features = [
        "mean_room_temperature",
        "outside_temperature",
        "source_1_temperature",
        "source_2_temperature",
        "source_3_temperature",
        "source_4_temperature",
        "hour_of_day_cos",
        "source_1_temperature_moving_avg",
        "mean_room_temperature_moving_avg",
        "outside_temperature_moving_avg",
    ]
    # degree=2 + interaction_only=True: products of two distinct features
    # only (no squares); include_bias=False: no constant column.
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

    data = pd.concat(
        [data, _pairwise_products(poly, data, selected_features, fit=True)],
        axis=1,
    )
    submission_data = pd.concat(
        [
            submission_data,
            _pairwise_products(poly, submission_data, selected_features, fit=False),
        ],
        axis=1,
    )

    return data, submission_data


# Append the pairwise interaction columns to both frames (the transformer is
# fitted on the training frame and then applied to the submission frame).
train_data, sub_data = add_interaction_features(train_data, sub_data)

In [26]:
# CUBE ROOT TRANSFORMATION OF SELECTED FEATURES

include_columns = [
    "source_1_temperature",
    "source_2_temperature",
    "source_3_temperature",
    "mean_room_temperature",
    "source_4_temperature",
    "sun_radiation_west",
    "mean_room_temperature source_1_temperature",
    "source_2_temperature source_4_temperature",
    "source_3_temperature source_4_temperature",
    "source_2_temperature source_3_temperature",
    "mean_room_temperature source_3_temperature",
]

# Old name -> new name; every transformed column gets a "cbrt_" prefix.
cbrt_names = {col: f"cbrt_{col}" for col in include_columns}


def _cube_root_selected(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with the selected columns cube-rooted and renamed.

    Columns not listed in ``include_columns`` pass through unchanged, and
    listed columns that are absent from ``frame`` are ignored.
    """
    # np.cbrt is the real cube root: unlike ``x ** (1 / 3)`` it stays defined
    # for negative values instead of turning them into NaN.
    return frame.apply(
        lambda col: np.cbrt(col) if col.name in cbrt_names else col
    ).rename(columns=cbrt_names)


# Apply cube root transformation and rename columns
train_data = _cube_root_selected(train_data)
sub_data = _cube_root_selected(sub_data)

In [27]:
# FINAL PRUNING BASED ON CORRELATION MATRIX

# Columns removed from both frames before training.
columns_to_drop = [
    "outside_temperature_moving_avg",
    "mean_room_temperature outside_temperature_moving_avg",
    "mean_room_temperature_moving_avg outside_temperature_moving_avg",
    "mean_room_temperature outside_temperature",
    "outside_temperature",
    "source_2_temperature outside_temperature_moving_avg",
    "outside_temperature mean_room_temperature_moving_avg",
    "hour_of_day_cos outside_temperature_moving_avg",
    "source_4_temperature outside_temperature_moving_avg",
    "source_3_temperature outside_temperature_moving_avg",
    "hour_of_day_cos mean_room_temperature_moving_avg",
    "outside_temperature outside_temperature_moving_avg",
    "source_2_temperature source_1_temperature_moving_avg",
    "source_3_temperature mean_room_temperature_moving_avg",
    "mean_room_temperature source_1_temperature_moving_avg",
    "hour_of_day_cos",
    "source_2_temperature hour_of_day_cos",
    "hour_of_day_cos source_1_temperature_moving_avg",
    "mean_room_temperature hour_of_day_cos",
    "outside_temperature source_3_temperature",
]

# In-place drop; raises KeyError if any listed column is missing.
for frame in (train_data, sub_data):
    frame.drop(columns=columns_to_drop, inplace=True)

In [None]:
# COMPUTE CORRELATION MATRIX
correlation_matrix = train_data.corr()

# Mask the diagonal (self-correlations) with NaN, flatten to one entry per
# ordered column pair, and rank the pairs by absolute correlation.
off_diagonal = ~np.eye(correlation_matrix.shape[0], dtype=bool)
pairwise_correlations = correlation_matrix.where(off_diagonal).stack()
sorted_correlations = pairwise_correlations.abs().sort_values(ascending=False)

# Display the sorted correlation pairs
print("Cross-correlation matrix (sorted):")
print(sorted_correlations)


In [29]:
# SAVE THE DATA
# Write both engineered frames to the working directory, without the index.
for frame, csv_name in (
    (train_data, f"{identifier}.csv"),
    (sub_data, f"{identifier}_submit.csv"),
):
    frame.to_csv(csv_name, index=False)

## Train the model with AutoGluon

In [30]:
# Name of the column to predict and the kind of task handed to AutoGluon.
label, problem_type = "target", "regression"

In [31]:
# Models are stored under ./AutogluonModels/<identifier>_<training file stem>.
train_file_stem = train_path.split("/")[-1].split(".")[0]
model_dir = f"./AutogluonModels/{identifier}_{train_file_stem}"

predictor = TabularPredictor(
    label=label,
    problem_type=problem_type,
    eval_metric="mean_absolute_error",
    path=model_dir,
)

In [None]:
# Training options: auto_stack and a time_limit of 600 (seconds, per the
# AutoGluon fit documentation).
fit_options = {
    # "presets": "best",
    "train_data": train_data,
    "auto_stack": True,
    "time_limit": 600,
}
predictor = predictor.fit(**fit_options)


## Predict and save the results

In [None]:
# Show the predictor's `model_best` attribute.
best_model = predictor.model_best
print(best_model)

In [34]:
# Predict on the engineered submission frame and wrap the result in a frame.
preds = pd.DataFrame(predictor.predict(sub_data))

# "ID" was dropped from sub_data during pruning, so it is re-read from disk.
# NOTE(review): IDs come from test.csv while the features came from
# test_fixed.csv — this relies on both files having the same rows in the
# same order; confirm.
submission_ids = pd.read_csv(
    "/Users/akseljoonas/Documents/mlfortnight/data/raw/test.csv"
)["ID"]
preds["ID"] = submission_ids

In [35]:
# Write the submission as two columns (ID, target), without the index.
submission_name = f"lol_{identifier}_{train_path.split('/')[-1]}"
preds[["ID", "target"]].to_csv(submission_name, index=False)

## Feature importance and leaderboard of the models

In [None]:
# Leaderboard scored on train_data, the same frame the models were fitted
# on, so these scores are not a held-out estimate.
predictor.leaderboard(data=train_data)

In [None]:
# Feature-importance table computed against the training frame, saved next
# to the notebook and then displayed as the cell output.
feature_importance = predictor.feature_importance(train_data)
importance_name = f"importance_{identifier}_{train_path.split('/')[-1]}"
feature_importance.to_csv(importance_name)
feature_importance