In [4]:
# ---------------------------------------------------------
# 1. IMPORTS
# ---------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import pickle

In [6]:
# ---------------------------------------------------------
# 2. LOAD YOUR DATASETS
# ---------------------------------------------------------
from pathlib import Path

# Navigate to project root (parent of Model/ directory)
project_root = Path.cwd().parent if Path.cwd().name == "Model" else Path.cwd()
data_dir = project_root / "data"
print("Project root:", project_root)
print("Loading datasets from:", data_dir)

# Example:
df_synth = pd.read_excel(data_dir / "Trip Summary.xlsx")          # synthetic generated driver behavior
df_acc = pd.read_excel(data_dir / "Traffic Accident Statistics.xlsx")                  # accident real data
df_env = pd.read_excel(data_dir / "Riyadh Roadway Environment.xlsx")            # number of signs, lanes, lights, etc.

Project root: c:\Users\HP\RoadRank-Absher-hackathon
Loading datasets from: c:\Users\HP\RoadRank-Absher-hackathon\data


In [7]:
# ---------------------------------------------------------
# 3. MERGE DATASETS (inspect and adapt based on actual columns)
# ---------------------------------------------------------
# First, let's inspect the columns in each dataframe
print("df_synth columns:", df_synth.columns.tolist())
print("df_synth shape:", df_synth.shape)
print()
print("df_acc columns:", df_acc.columns.tolist())
print("df_acc shape:", df_acc.shape)
print()
print("df_env columns:", df_env.columns.tolist())
print("df_env shape:", df_env.shape)
print()

# For now, merge on the common column 'driver_id' if available
# Adjust the merge keys based on the actual columns available
if 'driver_id' in df_synth.columns and 'driver_id' in df_env.columns:
    df = df_synth.merge(df_env, on="driver_id", how="left", suffixes=('_synth', '_env'))
else:
    # If no common key, just use the first dataframe
    print("Warning: No common merge key found. Using df_synth only.")
    df = df_synth.copy()

if 'driver_id' in df.columns and 'driver_id' in df_acc.columns:
    df = df.merge(df_acc, on="driver_id", how="left", suffixes=('_main', '_acc'))

print("Final shape:", df.shape)

df_synth columns: ['trip_id', 'driver_id', 'timestamp', 'safe_driving_score', 'driver_category', 'driver_category_ar', 'avg_speed', 'max_speed', 'harsh_brakes_count', 'harsh_accels_count', 'lane_changes_count', 'speeding_percentage', 'avg_congestion', 'avg_visibility', 'road_type', 'actual_driver_type', 'time_of_day', 'weather', 'recommendation', 'recommendation_ar']
df_synth shape: (200, 20)

df_acc columns: ['السنه', 'الشهر', 'حادث تلفيات', 'حادث اصابات', 'حادث وفيات', ' مجموع عدد الحوادث', 'السبت', 'الا حد', 'الاثنين', 'الثلاثاء', 'الا ربعاء', 'الخميس', 'الجمعة', 'مجموع عدد الحوادث لجميع الأيام', 'نهارا', 'ليلا', 'مجموع عدد الحوادث حسب الوقت', 'داخل المدينة', 'خارج المدينة', 'مجموع عدد الحوادث حسب مكان الحادث', 'صغيرة', 'جيب', 'حافلة', 'ونيت', 'نقل', 'وايت', 'اخرى', 'مجموع عدد السيارات المشتركة في الحوادث حسب الطراز', -18, '18+', '30+', '40+', '50+', 'المجموع حسب العمر', 'سعودي ', 'اجنبي', 'المجموع حسب الجنسية', 'متزوج', 'اعزب', 'المجموع حسب الحالة الاجتماعية', 'متعلم ', 'امى', 'الم

In [20]:
# ---------------------------------------------------------
# 4. SELECT FEATURES + TARGET
# ---------------------------------------------------------

# Print all columns to identify the target
print("Available columns:")
print(df.columns.tolist())
print()

# Use a numeric column as target (e.g., safe_driving_score if available, otherwise the first numeric column)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", numeric_cols)

# Choose target column (adjust based on actual column names)
if 'safe_driving_score' in df.columns:
    TARGET = 'safe_driving_score'
elif len(numeric_cols) > 0:
    TARGET = numeric_cols[0]
    print(f"Using '{TARGET}' as target (first numeric column)")
else:
    print("No numeric columns found!")
    TARGET = None

if TARGET is not None:
    # Drop rows where target is missing
    df = df.dropna(subset=[TARGET])
    
    # Example of removing irrelevant columns
    drop_cols = ["trip_id", "timestamp", "trip_summary", "index"]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])
    
    # Split features / labels
    X = df.drop(columns=[TARGET])
    y = df[TARGET]
    
    print(f"Features shape: {X.shape}")
    print(f"Target shape: {y.shape}")
else:
    print("Cannot proceed without a target column")

Available columns:
['driver_id', 'safe_driving_score', 'driver_category', 'driver_category_ar', 'avg_speed', 'max_speed', 'harsh_brakes_count', 'harsh_accels_count', 'lane_changes_count', 'speeding_percentage', 'avg_congestion', 'avg_visibility', 'road_type', 'actual_driver_type', 'time_of_day', 'weather', 'recommendation', 'recommendation_ar']

Numeric columns: ['safe_driving_score', 'avg_speed', 'max_speed', 'harsh_brakes_count', 'harsh_accels_count', 'lane_changes_count', 'speeding_percentage', 'avg_congestion', 'avg_visibility']
Features shape: (200, 17)
Target shape: (200,)


In [27]:
# ---------------------------------------------------------
# 5. HANDLE CATEGORICAL FEATURES
# ---------------------------------------------------------
categorical_cols = X.select_dtypes(include=["object"]).columns
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    encoders[col] = le

# Ensure all columns are numeric (int or float)
X = X.astype({col: 'float64' for col in X.columns if X[col].dtype == 'int64'})

print(f"Categorical columns encoded: {list(categorical_cols)}")
print(f"Final X dtypes:\n{X.dtypes}")

Categorical columns encoded: []
Final X dtypes:
driver_id              float64
driver_category        float64
driver_category_ar     float64
avg_speed              float64
max_speed              float64
harsh_brakes_count     float64
harsh_accels_count     float64
lane_changes_count     float64
speeding_percentage    float64
avg_congestion         float64
avg_visibility         float64
road_type              float64
actual_driver_type     float64
time_of_day            float64
weather                float64
recommendation         float64
recommendation_ar      float64
dtype: object


In [28]:
# ---------------------------------------------------------
# 6. TRAIN / TEST SPLIT
# ---------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [29]:
# ---------------------------------------------------------
# 7. TRAIN XGBOOST MODEL
# ---------------------------------------------------------
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
    verbosity=1
)

model.fit(X_train, y_train)
print("Model training completed successfully!")

Model training completed successfully!


In [30]:
# ---------------------------------------------------------
# 8. EVALUATE MODEL
# ---------------------------------------------------------
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("------------------------------------------------")
print("MODEL PERFORMANCE")
print("RMSE:", rmse)
print("MAE:", mae)
print("R² Score:", r2)
print("------------------------------------------------")

------------------------------------------------
MODEL PERFORMANCE
RMSE: 5.275619891446808
MAE: 2.712159679790144
R² Score: 0.9743347355773767
------------------------------------------------


In [31]:
# ---------------------------------------------------------
# 9. SAVE MODEL + ENCODERS
# ---------------------------------------------------------
# Save the model directly using joblib for better compatibility
import joblib

# Save the model
model_path = "xgboost_model.joblib"
joblib.dump(model, model_path)

# Save encoders
encoders_path = "encoders.joblib"
joblib.dump(encoders, encoders_path)

print("Model & encoders saved successfully.")
print(f"- Model saved to: {model_path}")
print(f"- Encoders saved to: {encoders_path}")

Model & encoders saved successfully.
- Model saved to: xgboost_model.joblib
- Encoders saved to: encoders.joblib
