In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import joblib
import matplotlib.pyplot as plt


# -----------------------
# Load dataset
# -----------------------
url = "https://raw.githubusercontent.com/CeylonSmartCitizen/SigSegV_Datathon/main/data/raw/bookings_train.csv"
df = pd.read_csv(url)
print("Initial dataset shape:", df.shape)


# Parse the two full timestamps; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
for _stamp_col in ('check_in_time', 'check_out_time'):
    df[_stamp_col] = pd.to_datetime(df[_stamp_col], errors='coerce')

# The appointment slot is parsed with an hour:minute-only format, so no
# calendar date is read from this column.
df['appointment_time'] = pd.to_datetime(
    df['appointment_time'], format='%H:%M', errors='coerce'
)


# Target variable: minutes elapsed between check-in and check-out.
_elapsed = df['check_out_time'] - df['check_in_time']
df['processing_time_minutes'] = _elapsed.dt.total_seconds() / 60


# Rows whose target could not be computed (either timestamp missing or
# unparseable) are discarded.
df = df.dropna(subset=['processing_time_minutes'])
print("Cleaned dataset shape:", df.shape)


# -----------------------
# Feature Engineering
# -----------------------
# `appointment_time` is parsed with format='%H:%M', so it holds only a time of
# day; pandas attaches the placeholder date 1900-01-01 to every value. Reading
# the weekday from it gives the same value for every row, and subtracting it
# from a real check-in timestamp gives a "wait" dominated by the calendar
# offset. Calendar information is therefore taken from `check_in_time`.
# NOTE(review): this assumes a visitor checks in on the calendar day of the
# appointment -- confirm against the dataset documentation.
appointment_clock = df['appointment_time'] - df['appointment_time'].dt.normalize()
appointment_datetime = df['check_in_time'].dt.normalize() + appointment_clock

# Time-based features
df['hour'] = df['appointment_time'].dt.hour
df['dayofweek'] = df['check_in_time'].dt.dayofweek
# Saturday (5) and Sunday (6) count as weekend; missing values compare False -> 0.
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
# Peak slots are appointment hours 9-12 and 14-17 (both ranges inclusive).
df['is_peak'] = (
    df['hour'].between(9, 12) | df['hour'].between(14, 17)
).astype(int)


# Business logic features
# The +1 keeps the denominator non-zero when num_documents is 0.
df['queue_density'] = df['queue_number'] / (df['num_documents'] + 1)
# Minutes between the scheduled slot and the actual check-in: positive when the
# visitor checked in after the slot, negative when before it.
df['wait_time'] = (df['check_in_time'] - appointment_datetime).dt.total_seconds() / 60


# Feature matrix
X = df[['hour', 'dayofweek', 'is_weekend', 'is_peak',
        'num_documents', 'queue_number', 'queue_density', 'wait_time']]


# One-hot encode task_id
X = pd.concat([X, pd.get_dummies(df['task_id'], prefix='task')], axis=1)


y = df['processing_time_minutes']


# -----------------------
# Train-Test Split
# -----------------------
# 80/20 split of the feature matrix and target; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=100,
    test_size=0.2,
)


# -----------------------
# Model: HistGradientBoosting (fast + accurate)
# -----------------------
# Hyperparameters are collected in one mapping so they can be read at a glance.
_hgb_params = {
    "max_depth": 10,
    "learning_rate": 0.05,
    "max_iter": 300,
    "random_state": 100,
}
# fit() returns the estimator itself, so `model` is the fitted regressor.
model = HistGradientBoostingRegressor(**_hgb_params).fit(X_train, y_train)


# -----------------------
# Evaluation
# -----------------------
# Predict on the held-out split and report mean squared error and R^2.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)


print(f"Test MSE: {mse:.2f}")
print(f"Test R^2: {r2:.3f}")


# Cross-validation score
# Note: this runs 5-fold CV over the complete X / y, not only the training split.
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=5)
print(f"CV R^2 Mean: {cv_scores.mean():.3f}, Std: {cv_scores.std():.3f}")


# -----------------------
# Permutation Feature Importance (Top 10)
# -----------------------
# Importance is measured on the held-out split with 10 shuffles per feature.
result = permutation_importance(
    model, X_test, y_test, n_repeats=10, random_state=100, n_jobs=-1
)


importances = result.importances_mean
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
features = X.columns


# Plot at most 10 bars. Clamping to the number of available features keeps
# plt.bar from receiving x-positions and heights of different lengths when the
# feature matrix has fewer than 10 columns.
top_k = min(10, len(features))
top_indices = indices[:top_k]

plt.figure(figsize=(10, 6))
plt.bar(range(top_k), importances[top_indices], align="center")
plt.xticks(
    range(top_k),
    [features[i] for i in top_indices],
    rotation=45,
    ha="right",
)
plt.title(f"Top {top_k} Feature Importances (Permutation Importance)")
plt.tight_layout()
plt.show()


# -----------------------
# Save Model
# -----------------------
# Serialise the fitted estimator to the working directory with joblib.
_model_file = "task1_completion_time_model.pkl"
joblib.dump(model, _model_file)
print("Model saved as task1_completion_time_model.pkl")


# Download in Colab (only if running in Colab)
# The google.colab package is importable only inside Colab; anywhere else the
# import raises ImportError and the file simply stays on local disk.
try:
    from google.colab import files

    files.download(_model_file)
    print("Model downloaded successfully!")
except ImportError:
    print("Not running in Colab - model saved locally")
