In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score


In [15]:
# Read the training and validation CSVs from the GCS bucket shared with the logreg notebook.
train_csv_uri = "gs://aura_datasets_training_validation/AURA_aug_sep_60k.csv"
val_csv_uri = "gs://aura_datasets_training_validation/AURA_validation_sep_12k.csv"
df_train, df_val = (pd.read_csv(uri) for uri in (train_csv_uri, val_csv_uri))

# Columns fed to the model, and the column it is trained to predict.
feature_cols = ["noise_db", "light_lux", "crowd_count"]
target_col = "discomfort_level"

# Split each frame into a feature matrix (X_*) and a target vector (y_*).
X_train, y_train = df_train[feature_cols], df_train[target_col]
X_val, y_val = df_val[feature_cols], df_val[target_col]

# Quick sanity check on the row/column counts of both splits.
print('Train shape:', X_train.shape, 'Val shape:', X_val.shape)

Train shape: (60000, 3) Val shape: (12000, 3)


## Preprocessing + Pipeline

In [None]:
# Standardise the feature columns; any column not listed in feature_cols is dropped.
scaling_steps = [("scaler", StandardScaler())]
numeric_transformer = Pipeline(steps=scaling_steps)
preprocess = ColumnTransformer(
    transformers=[("num", numeric_transformer, feature_cols)],
    remainder="drop",
)

# Seeded gradient-boosting regressor as the final estimator. The step is named
# "clf" and the grid-search parameter keys ("clf__...") rely on that name.
gb = GradientBoostingRegressor(random_state=42)
model_steps = [("preprocess", preprocess), ("clf", gb)]
pipe = Pipeline(steps=model_steps)

## Quick grid search and training

In [None]:
# Hyper-parameter grid for the "clf" step: 2 x 2 x 2 = 8 candidate settings.
param_grid = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [3, 5],
    "clf__learning_rate": [0.05, 0.1],
}

# 3-fold cross-validated search scored by negative MSE; n_jobs=-1 runs the
# candidate fits in parallel. fit() returns the fitted search object itself.
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Keep the best pipeline found by the search for the evaluation cells below.
best_pipe = grid_search.best_estimator_
print('Best params:', grid_search.best_params_)

In [None]:
# Score the tuned pipeline on the validation split (MSE and R^2).
y_pred = best_pipe.predict(X_val)
mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)
print(f'Validation MSE: {mse:.6f}  R2: {r2:.6f}')

In [None]:
# Preview predictions next to the true target on a copy of the validation frame.
# NOTE(review): discomfort_to_label is neither defined nor imported in the visible
# cells — confirm where it comes from, otherwise this cell raises NameError on a
# fresh kernel.
val_df = df_val.assign(discomfort_pred=y_pred)
val_df['comfort_pred'] = val_df['discomfort_pred'].apply(discomfort_to_label)

# Inputs, true target, predicted target and the label derived from the prediction.
preview_cols = ["noise_db", "light_lux", "crowd_count", "discomfort_level", "discomfort_pred", "comfort_pred"]
val_df[preview_cols].head()

## Train and upload model to GCS (module call)

In [None]:
# Run the training+upload routine defined in package_aura.gradient_boosting_gcs.
# The import is done in this cell because the notebook's import cell does not bring
# train_gradient_boosting_model into scope; without it a Restart & Run All stops
# here with a NameError.
import os

from package_aura.gradient_boosting_gcs import train_gradient_boosting_model

# MODEL_BUCKET must be set before this cell runs. Check it up front so a missing
# value is reported immediately with a clear message instead of surfacing from
# inside the routine.
if not os.environ.get("MODEL_BUCKET"):
    raise RuntimeError(
        "MODEL_BUCKET environment variable must be set before running the "
        "training+upload routine."
    )

train_gradient_boosting_model()

## Example predict via GCS-loaded model

In [None]:
# Predict one hand-written reading twice: with the in-notebook pipeline and with
# gradient_boosting_predict, so the two results can be compared by eye.
# NOTE(review): gradient_boosting_predict is not defined or imported in the visible
# cells — presumably it lives in package_aura.gradient_boosting_gcs; confirm.
sample = {"noise_db": 85.0, "light_lux": 700.0, "crowd_count": 10.0}

# The pipeline expects a DataFrame with the feature columns, so wrap the single record.
sample_frame = pd.DataFrame([sample])
local_pred = float(best_pipe.predict(sample_frame)[0])
print('Local predict (best_pipe):', local_pred)
print('GCS predict:', gradient_boosting_predict(**sample))