In [7]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go

# Read the occupancy history, the later occupancy period and the weather observations.
occupancy = pd.read_csv('occupancy.csv', parse_dates=['DS'])
occupancy_future = pd.read_csv('occupancy_future.csv', parse_dates=['DS'])
weather = pd.read_csv(
    'weather_parkplatz-bad-kohlgrub-schwebebahn-1.csv',
    parse_dates=['Timestamp'],
)

# Clean the weather frame in a single pass:
#   - Timestamp is coerced to datetime (guards against parse_dates leaving it as text),
#   - missing Condition values take the previous row's value,
#   - missing Sunshine values become 0,
#   - missing Temperature values become the column mean.
weather = weather.assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']),
    Condition=lambda frame: frame['Condition'].ffill(),
    Sunshine=lambda frame: frame['Sunshine'].fillna(0),
    Temperature=lambda frame: frame['Temperature'].fillna(frame['Temperature'].mean()),
)

# Extract time-based features from the timestamp
def create_time_features(df, timestamp_col):
    """Add ``hour``, ``day_of_week`` and ``is_weekend`` columns derived from a datetime column.

    The frame is modified in place and is also returned, so both
    ``create_time_features(df, col)`` and ``df = create_time_features(df, col)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``timestamp_col``; it receives the three new columns.
    timestamp_col : str
        Name of a datetime-like column (it must support the ``.dt`` accessor).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    stamps = df[timestamp_col].dt
    df['hour'] = stamps.hour
    df['day_of_week'] = stamps.dayofweek
    # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    # A vectorised comparison replaces the per-row Python lambda.
    df['is_weekend'] = df['day_of_week'].ge(5).astype('int64')
    return df

# Add hour / weekday / weekend columns to every frame.
# NOTE(review): the weather frame receives the same three column names as the
# occupancy frames, so the merges below keep both copies under pandas' default
# `_x` / `_y` suffixes — confirm the duplicated features are intended.
occupancy = create_time_features(occupancy, 'DS')
occupancy_future = create_time_features(occupancy_future, 'DS')
weather = create_time_features(weather, 'Timestamp')

# Make every merge key timezone-naive so the frames can be joined on them.
# NOTE(review): tz_localize(None) drops the offset without converting; if the
# sources use different timezones the keys would be misaligned — TODO confirm.
for frame, key in ((occupancy, 'DS'), (occupancy_future, 'DS'), (weather, 'Timestamp')):
    frame[key] = frame[key].dt.tz_localize(None)

# Attach to each occupancy row the weather observation closest in time.
nearest_weather = dict(right=weather, left_on='DS', right_on='Timestamp', direction='nearest')
occupancy_merged = pd.merge_asof(occupancy, **nearest_weather)
occupancy_future_merged = pd.merge_asof(occupancy_future, **nearest_weather)

# The raw timestamps are not used as model inputs.
merge_keys = ['DS', 'Timestamp']
occupancy_merged = occupancy_merged.drop(columns=merge_keys)
occupancy_future_merged = occupancy_future_merged.drop(columns=merge_keys)

# Split features from the target column 'Y' for both periods.
X, y = occupancy_merged.drop(columns='Y'), occupancy_merged['Y']
X_test, y_test = occupancy_future_merged.drop(columns='Y'), occupancy_future_merged['Y']

# Train-test split for validation (80/20).
# NOTE(review): train_test_split shuffles rows by default, so validation rows are
# drawn at random from the same period as the training rows; a chronological
# split may give a validation score closer to the test score — TODO confirm intent.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator handed to the search. It is defined here so the cell no longer
# depends on an `xgb_model` variable left over in the kernel.
# n_jobs=1 keeps each fit single-threaded because the search below already
# parallelises across candidates with n_jobs=-1.
# NOTE(review): the settings of the estimator used previously are not visible in
# this cell; a default squared-error regressor is assumed — TODO confirm.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)

# Expanded hyperparameter search space
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 6, 9, 12, 15],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'n_estimators': [100, 200, 300, 400, 500],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2, 2.5]
}

# Randomized search over the space above with 3-fold cross-validation.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params,
    n_iter=50,  # Number of sampled parameter combinations; raise for a more thorough search
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search.fit(X_train, y_train)

# Best model from search (refit on all of X_train by RandomizedSearchCV)
best_xgb = random_search.best_estimator_

# Predict on the validation and test sets
y_val_pred = best_xgb.predict(X_val)
y_test_pred = best_xgb.predict(X_test)

# Calculate RMSE on validation and test sets
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"Validation RMSE: {val_rmse}")
print(f"Test RMSE: {test_rmse}")

# Interactive Plotly chart comparing actual and predicted occupancy over time.
future_timestamps = occupancy_future['DS']
fig = go.Figure()

# One line per series, drawn in the same order as the legend entries.
for series, label in ((y_test, 'True Occupancy'), (y_test_pred, 'Predicted Occupancy')):
    fig.add_trace(go.Scatter(x=future_timestamps, y=series, mode='lines', name=label))

# Titles, a range slider under the x axis for zooming, and a shared hover label.
fig.update_layout(
    title="True vs Predicted Occupancy",
    xaxis=dict(title="Date", rangeslider=dict(visible=True)),
    yaxis=dict(title="Occupancy"),
    hovermode="x unified",
)

fig.show()

Fitting 3 folds for each of 50 candidates, totalling 150 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Validation RMSE: 4.784705524562342
Test RMSE: 10.370653711235391


In [8]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go

# Read the occupancy history, the later occupancy period and the weather observations.
occupancy = pd.read_csv('occupancy.csv', parse_dates=['DS'])
occupancy_future = pd.read_csv('occupancy_future.csv', parse_dates=['DS'])
weather = pd.read_csv(
    'weather_parkplatz-bad-kohlgrub-schwebebahn-1.csv',
    parse_dates=['Timestamp'],
)

# Clean the weather frame in a single pass:
#   - Timestamp is coerced to datetime (guards against parse_dates leaving it as text),
#   - missing Condition values take the previous row's value,
#   - missing Sunshine values become 0,
#   - missing Temperature values become the column mean.
weather = weather.assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']),
    Condition=lambda frame: frame['Condition'].ffill(),
    Sunshine=lambda frame: frame['Sunshine'].fillna(0),
    Temperature=lambda frame: frame['Temperature'].fillna(frame['Temperature'].mean()),
)

# Extract time-based features from the timestamp
def create_time_features(df, timestamp_col):
    """Add ``hour``, ``day_of_week`` and ``is_weekend`` columns derived from a datetime column.

    The frame is modified in place and is also returned, so both
    ``create_time_features(df, col)`` and ``df = create_time_features(df, col)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``timestamp_col``; it receives the three new columns.
    timestamp_col : str
        Name of a datetime-like column (it must support the ``.dt`` accessor).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    stamps = df[timestamp_col].dt
    df['hour'] = stamps.hour
    df['day_of_week'] = stamps.dayofweek
    # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    # A vectorised comparison replaces the per-row Python lambda.
    df['is_weekend'] = df['day_of_week'].ge(5).astype('int64')
    return df

# Add hour / weekday / weekend columns to every frame.
# NOTE(review): the weather frame receives the same three column names as the
# occupancy frames, so the merges below keep both copies under pandas' default
# `_x` / `_y` suffixes — confirm the duplicated features are intended.
occupancy = create_time_features(occupancy, 'DS')
occupancy_future = create_time_features(occupancy_future, 'DS')
weather = create_time_features(weather, 'Timestamp')

# Make every merge key timezone-naive so the frames can be joined on them.
# NOTE(review): tz_localize(None) drops the offset without converting; if the
# sources use different timezones the keys would be misaligned — TODO confirm.
for frame, key in ((occupancy, 'DS'), (occupancy_future, 'DS'), (weather, 'Timestamp')):
    frame[key] = frame[key].dt.tz_localize(None)

# Attach to each occupancy row the weather observation closest in time.
nearest_weather = dict(right=weather, left_on='DS', right_on='Timestamp', direction='nearest')
occupancy_merged = pd.merge_asof(occupancy, **nearest_weather)
occupancy_future_merged = pd.merge_asof(occupancy_future, **nearest_weather)

# The raw timestamps are not used as model inputs.
merge_keys = ['DS', 'Timestamp']
occupancy_merged = occupancy_merged.drop(columns=merge_keys)
occupancy_future_merged = occupancy_future_merged.drop(columns=merge_keys)

# Split features from the target column 'Y' for both periods.
X, y = occupancy_merged.drop(columns='Y'), occupancy_merged['Y']
X_test, y_test = occupancy_future_merged.drop(columns='Y'), occupancy_future_merged['Y']

# Train-test split for validation (80/20).
# NOTE(review): train_test_split shuffles rows by default, so validation rows are
# drawn at random from the same period as the training rows; a chronological
# split may give a validation score closer to the test score — TODO confirm intent.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator handed to the search. It is defined here so the cell no longer
# depends on an `xgb_model` variable left over in the kernel.
# n_jobs=1 keeps each fit single-threaded because the search below already
# parallelises across candidates with n_jobs=-1.
# NOTE(review): the settings of the estimator used previously are not visible in
# this cell; a default squared-error regressor is assumed — TODO confirm.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)

# Expanded hyperparameter search space
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 6, 9, 12, 15],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'n_estimators': [100, 200, 300, 400, 500],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2, 2.5]
}

# Randomized search over the space above with 3-fold cross-validation.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params,
    n_iter=50,  # Number of sampled parameter combinations; raise for a more thorough search
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search.fit(X_train, y_train)

# Best model from search (refit on all of X_train by RandomizedSearchCV)
best_xgb = random_search.best_estimator_

# Predict on the validation and test sets
y_val_pred = best_xgb.predict(X_val)
y_test_pred = best_xgb.predict(X_test)

# Calculate RMSE on validation and test sets
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"Validation RMSE: {val_rmse}")
print(f"Test RMSE: {test_rmse}")

# Option 1: Interpolation (Linear)
# Series.interpolate only fills NaN entries, so this leaves NaN-free predictions
# unchanged; it is kept for comparison but is not plotted.
y_test_pred_interpolated = pd.Series(y_test_pred).interpolate(method='linear').values

# Option 2: Smoothing (Moving Average) — this is the series plotted below.
# A centred window leaves NaN at both ends of the series; those edge values are
# filled from the nearest computed average. `.bfill()` / `.ffill()` replace the
# deprecated `fillna(method=...)` form.
window_size = 5
y_test_pred_smooth = (
    pd.Series(y_test_pred)
    .rolling(window=window_size, center=True)
    .mean()
    .bfill()
    .ffill()
    .values
)

# Plot the results with Plotly
fig = go.Figure()

# True occupancy
fig.add_trace(go.Scatter(x=occupancy_future['DS'], y=y_test, mode='lines', name='True Occupancy'))

# Predicted occupancy (before smoothing)
fig.add_trace(go.Scatter(x=occupancy_future['DS'], y=y_test_pred, mode='lines', name='Predicted Occupancy (Original)', line=dict(dash='dash')))

# Smoothed predicted occupancy (moving average computed above)
fig.add_trace(go.Scatter(x=occupancy_future['DS'], y=y_test_pred_smooth, mode='lines', name='Predicted Occupancy (Smoothed)'))

# Update layout for better interaction
fig.update_layout(
    title="True vs Predicted Occupancy (Smoothed)",
    xaxis_title="Date",
    yaxis_title="Occupancy",
    xaxis_rangeslider_visible=True,  # Range slider under the x axis for easier zoom
    hovermode="x unified"
)

fig.show()

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Validation RMSE: 4.784705524562342
Test RMSE: 10.370653711235391



Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.

