# In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Fetch the raw bookings training set directly from the project repository.
url = (
    "https://raw.githubusercontent.com/CeylonSmartCitizen/SigSegV_Datathon"
    "/main/data/raw/bookings_train.csv"
)
df = pd.read_csv(url)

# Report the size of the data before any cleaning is applied.
print("Initial dataset shape:", df.shape)

# Parse the check-in/check-out timestamps; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
for col in ('check_in_time', 'check_out_time'):
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Target variable: elapsed time between check-in and check-out, in minutes.
elapsed = df['check_out_time'] - df['check_in_time']
df['processing_time_minutes'] = elapsed.dt.total_seconds() / 60

# Discard rows whose target is NaN (a timestamp was missing or could not be
# parsed) and report how many rows were removed.
before_drop = len(df)
df = df.dropna(subset=['processing_time_minutes'])
after_drop = len(df)
print(f"Dropped {before_drop - after_drop} rows due to missing target values.")
print("Cleaned dataset shape:", df.shape)

# Feature engineering: replace the 'HH:MM' appointment time with its hour.
# NOTE: unlike the check-in/check-out parsing, there is no errors='coerce'
# here, so a value that does not match '%H:%M' raises instead of becoming NaT.
appointment_hour = pd.to_datetime(df['appointment_time'], format='%H:%M').dt.hour
df['appointment_time'] = appointment_hour

# One-hot encode task_id into indicator columns prefixed with 'task'.
task_dummies = pd.get_dummies(df['task_id'], prefix='task')

# Feature matrix: the three base columns followed by the task indicators.
base_features = ['appointment_time', 'num_documents', 'queue_number']
X = pd.concat([df[base_features], task_dummies], axis=1)

# Regression target built earlier from the check-in/check-out timestamps.
y = df['processing_time_minutes']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
split = train_test_split(X, y, test_size=0.2, random_state=100)
X_train, X_test, y_train, y_test = split

# Fit a random forest regressor with tree depth capped at 10.
# fit() returns the estimator itself, so `model` is the fitted forest.
rf_params = {"max_depth": 10, "random_state": 100}
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report mean squared error (in squared minutes) and R^2 on the test split.
print(f"Test MSE: {mse:.2f}")
print(f"Test R^2: {r2:.3f}")

# Save the fitted model.
# The destination is relative to the current working directory (run from the
# repository root to land in <repo>/models/). The previous hard-coded
# absolute path on a local F: drive only existed on one machine and did not
# match the location reported in the message below.
model_path = Path("models") / "task1_completion_time_model.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists; otherwise the dump would fail after training has finished.
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, str(model_path))

# as_posix() keeps the reported path in forward-slash form on every platform.
print(f"Model saved to {model_path.as_posix()}")
