In [21]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [22]:
file_path = 'Injured and Dead in Accidents.csv'
df = pd.read_csv(file_path)
df['gregorian_date'] = pd.to_datetime(df['gregorian_date'])
df.drop(columns=['ID'], inplace=True)  # not needed
df.drop(columns=['region_name'], inplace=True)  # not needed

df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1152 entries, 0 to 1151
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   hijri_year                1152 non-null   int64         
 1   gregorian_date            1152 non-null   datetime64[ns]
 2   gregorian_year            1152 non-null   int64         
 3   gregorian_month           1152 non-null   int64         
 4   gregorian_day             1152 non-null   int64         
 5   region_number             1152 non-null   int64         
 6   male_count                1152 non-null   int64         
 7   female_count              1152 non-null   int64         
 8   age_under_18              1152 non-null   int64         
 9   age_18_to_30              1152 non-null   int64         
 10  age_30_to_40              1152 non-null   int64         
 11  age_40_to_50              1152 non-null   int64         
 12  age_over_50         

Unnamed: 0,hijri_year,gregorian_date,gregorian_year,gregorian_month,gregorian_day,region_number,male_count,female_count,age_under_18,age_18_to_30,age_30_to_40,age_40_to_50,age_over_50,saudi_national_count,non_saudi_national_count,death_flag,injured_flag,total_deaths,total_injuries
0,1437,2015-10-14,2015,10,14,1,279,31,65,94,125,15,11,172,138,0,1,0,310
1,1437,2015-11-13,2015,11,13,1,245,93,61,89,82,63,43,204,134,0,1,0,338
2,1437,2015-12-12,2015,12,12,1,244,89,59,88,85,59,42,187,146,0,1,0,333
3,1437,2016-01-11,2016,1,11,1,303,36,61,89,81,64,44,177,162,0,1,0,339
4,1437,2016-02-10,2016,2,10,1,294,29,56,84,79,62,42,158,165,0,1,0,323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,1439,2018-04-17,2018,4,17,16,0,0,0,0,0,0,0,0,0,1,0,0,0
1148,1439,2018-05-16,2018,5,16,16,5,3,1,3,1,0,3,3,5,1,0,8,0
1149,1439,2018-06-15,2018,6,15,16,1,1,1,0,1,0,0,2,0,1,0,2,0
1150,1439,2018-07-14,2018,7,14,16,4,0,0,3,0,0,1,4,0,1,0,4,0


In [23]:
import pandas as pd

# ✅ Ensure dates are datetime
df['gregorian_date'] = pd.to_datetime(df['gregorian_date'])

# ✅ Sort by date and region_number for lag creation
df = df.sort_values(['region_number', 'gregorian_date']).reset_index(drop=True)

# ============================================================
# 🔷 1️⃣ Keep only safe features
# ============================================================
df_model = df[['gregorian_year', 'gregorian_month', 'region_number', 'total_deaths', 'total_injuries']].copy()

# ============================================================
# 🔷 2️⃣ Create lag features by region_number
# ============================================================
df_model['deaths_lag1'] = df_model.groupby('region_number')['total_deaths'].shift(1)
df_model['injuries_lag1'] = df_model.groupby('region_number')['total_injuries'].shift(1)

# ============================================================
# 🔷 3️⃣ Drop rows with NA lags (first month of each region)
# ============================================================
df_model = df_model.dropna().reset_index(drop=True)

# ============================================================
# 🔷 4️⃣ Final dataset preview
# ============================================================
df_model.head()


Unnamed: 0,gregorian_year,gregorian_month,region_number,total_deaths,total_injuries,deaths_lag1,injuries_lag1
0,2015,10,1,97,0,0.0,310.0
1,2015,11,1,0,338,97.0,0.0
2,2015,11,1,102,0,0.0,338.0
3,2015,12,1,0,333,102.0,0.0
4,2015,12,1,103,0,0.0,333.0


In [24]:
df_model.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gregorian_year   1136 non-null   int64  
 1   gregorian_month  1136 non-null   int64  
 2   region_number    1136 non-null   int64  
 3   total_deaths     1136 non-null   int64  
 4   total_injuries   1136 non-null   int64  
 5   deaths_lag1      1136 non-null   float64
 6   injuries_lag1    1136 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 62.3 KB


In [10]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# ============================================================
# 🔷 1️⃣ Select target column
# ============================================================
target = 'total_deaths'  # Change to 'total_injuries' in next run

# ============================================================
# 🔷 2️⃣ Define features
#“Although injuries_lag1 and deaths were appended into a single dataset, they originated from separate government reports without overlapping individual event records. Therefore, injuries_lag1 was retained as a valid temporal predictor with no data leakage concerns.”

# ============================================================
features = ['gregorian_year', 'gregorian_month', 'region_number', 'deaths_lag1','injuries_lag1']

# ============================================================
# 🔷 3️⃣ Prepare data
# ============================================================
X = df_model[features]
y = df_model[target]

# Time-based split (no shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# ============================================================
# 🔷 4️⃣ Initialize CatBoost model
# ============================================================
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    cat_features=['region_number'],
    verbose=100
)

# ============================================================
# 🔷 5️⃣ Train model
# ============================================================
train_pool = Pool(X_train, y_train, cat_features=['region_number'])
test_pool = Pool(X_test, y_test, cat_features=['region_number'])

model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

# ============================================================
# 🔷 6️⃣ Evaluate model
# ============================================================
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Results for {target}:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")

# ✅ Optional: Feature importances
importances = model.get_feature_importance(prettified=True)
print(importances)


0:	learn: 28.9690527	test: 25.2094652	best: 25.2094652 (0)	total: 31.9ms	remaining: 31.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 13.02522022
bestIteration = 47

Shrink model to first 48 iterations.
Results for total_deaths:
MAE: 8.28
RMSE: 13.03
R²: 0.76
        Feature Id  Importances
0    injuries_lag1    90.327513
1    region_number     5.860289
2   gregorian_year     1.813041
3  gregorian_month     1.301437
4      deaths_lag1     0.697719


In [25]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# ============================================================
# 🔷 1️⃣ Select target column
# ============================================================
target = 'total_injuries'  

# ============================================================
# 🔷 2️⃣ Define features
#“Although injuries_lag1 and deaths were appended into a single dataset, they originated from separate government reports without overlapping individual event records. Therefore, injuries_lag1 was retained as a valid temporal predictor with no data leakage concerns.”

# ============================================================
features = ['gregorian_year', 'gregorian_month', 'region_number', 'deaths_lag1','injuries_lag1']

# ============================================================
# 🔷 3️⃣ Prepare data
# ============================================================
X = df_model[features]
y = df_model[target]

# Time-based split (no shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# ============================================================
# 🔷 4️⃣ Initialize CatBoost model
# ============================================================
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    cat_features=['region_number'],
    verbose=100
)

# ============================================================
# 🔷 5️⃣ Train model
# ============================================================
train_pool = Pool(X_train, y_train, cat_features=['region_number'])
test_pool = Pool(X_test, y_test, cat_features=['region_number'])

model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

# ============================================================
# 🔷 6️⃣ Evaluate model
# ============================================================
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Results for {target}:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")

# ✅ Optional: Feature importances
importances = model.get_feature_importance(prettified=True)
print(importances)


0:	learn: 110.9350148	test: 155.2862628	best: 155.2862628 (0)	total: 34.6ms	remaining: 34.5s
100:	learn: 30.8896112	test: 75.7058308	best: 75.7058308 (100)	total: 2.56s	remaining: 22.8s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 75.10881861
bestIteration = 124

Shrink model to first 125 iterations.
Results for total_injuries:
MAE: 47.27
RMSE: 75.11
R²: 0.77
        Feature Id  Importances
0      deaths_lag1    83.109548
1    injuries_lag1     7.244524
2    region_number     6.839412
3  gregorian_month     1.599566
4   gregorian_year     1.206950
