In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib 

# ------------------ Load dataset ------------------
# Training data is read straight from the raw CSV hosted on GitHub.
url = "https://raw.githubusercontent.com/CeylonSmartCitizen/SigSegV_Datathon/refs/heads/main/data/raw/staffing_train.csv"
df = pd.read_csv(url)

# ------------------ Feature engineering ------------------
# Parse the date column once, then derive all calendar features in one pass.
df["date"] = pd.to_datetime(df["date"])
df = df.assign(
    day_of_week=df["date"].dt.dayofweek,
    month=df["date"].dt.month,
    year=df["date"].dt.year,
    day_of_year=df["date"].dt.dayofyear,
    quarter=df["date"].dt.quarter,
    # dayofweek is 0 (Monday) .. 6 (Sunday), so >= 5 flags Saturday/Sunday.
    is_weekend=lambda frame: (frame["day_of_week"] >= 5).astype(int),
)

# Encode section_id as integer codes; `le` keeps the fitted mapping.
le = LabelEncoder().fit(df["section_id"])
df["section_encoded"] = le.transform(df["section_id"])

# Lag feature: the target value 7 rows earlier within the same section.
# NOTE(review): this equals "same weekday last week" only if every section
# has exactly one row per calendar day with no gaps -- confirm against the data.
df = df.sort_values(by=["section_id", "date"])
df["lag_7"] = (
    df.groupby("section_id")["employees_on_duty"]
    .shift(7)
    # The first 7 rows of each section have no history; fall back to the
    # median of the target over all rows.
    .fillna(df["employees_on_duty"].median())
)

# Features and target
feature_columns = [
    "day_of_week",
    "month",
    "year",
    "day_of_year",
    "quarter",
    "is_weekend",
    "section_encoded",
    "lag_7",
]
X = df[feature_columns]
y = df["employees_on_duty"]

# ------------------ Train-test split ------------------
# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible.
# NOTE(review): train_test_split shuffles by default, so held-out rows are
# interleaved in time with training rows; a chronological split may give a
# more honest estimate for this date-ordered data -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ------------------ Random Forest model ------------------
# Hyperparameters are gathered in one mapping so they are easy to review.
rf_params = {
    "n_estimators": 300,
    "max_depth": 15,
    "random_state": 42,
    "n_jobs": -1,  # -1 asks scikit-learn to use all available CPU cores
}
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# ------------------ Evaluation ------------------
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the MSE. `** 0.5` is used instead of np.sqrt
# because numpy is not imported in this code, so np.sqrt raised NameError
# unless an earlier notebook cell happened to have imported it.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
# r2_score is the coefficient of determination (R²).
r2 = r2_score(y_test, y_pred)

print("📊 Model Evaluation (Improved Task 2)")
print(f"MAE  (Mean Absolute Error): {mae:.2f}")
print(f"RMSE (Root Mean Squared Error): {rmse:.2f}")
print(f"R²   (Explained Variance): {r2:.4f}")

# ------------------ Save model ------------------
# Forward slashes work on Windows as well as POSIX. The previous
# backslash-separated raw string was treated as a single literal file name on
# non-Windows systems, so the model did not land in ../models/ and the
# printed message did not match the real location.
# NOTE(review): the ../models directory must already exist, and the fitted
# LabelEncoder `le` is not persisted alongside the model -- confirm whether
# inference code needs it.
model_path = "../models/task2_staffing_model.pkl"
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")



📊 Model Evaluation (Improved Task 2)
MAE  (Mean Absolute Error): 1.56
RMSE (Root Mean Squared Error): 2.17
R²   (Explained Variance): 0.3441
Model saved to ../models/task2_staffing_model.pkl
