In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Read the pre-built train and test splits from their parquet files
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('final_data_train.parquet', 'final_data_test.parquet')
)

In [35]:
# Sanity check: (rows, columns) of the training frame, target column included
train_df.shape

(209396, 49)

In [36]:
# Separate the regression target (y) from the feature matrix (X) for each split.
# drop() returns a new frame, so train_df / test_df are left untouched.
y_train = train_df['Log_TotalExpense']
X_train = train_df.drop('Log_TotalExpense', axis=1)

y_test = test_df['Log_TotalExpense']
X_test = test_df.drop('Log_TotalExpense', axis=1)

In [37]:
# Sanity check: test feature matrix shape (target column already dropped)
X_test.shape

(52350, 48)

In [38]:
# Sanity check: train feature matrix shape; column count should match X_test
X_train.shape

(209396, 48)

In [39]:
# Sanity check: target is 1-D with one entry per training row
y_train.shape

(209396,)

In [40]:
# Ensure all features are numeric.
#
# The label -> integer mapping is learned from the TRAINING split only and then
# applied unchanged to the test split. Calling astype('category').cat.codes on
# each split separately derives the codes from that split's own sorted label set,
# so the same label can receive different integers in train and test whenever the
# two label sets differ, and the model would then score misaligned features.
non_numeric_cols = X_train.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_cols:
    # Categorical dtype (category list + ordering) as seen in the training data
    train_cat_dtype = X_train[col].astype('category').dtype
    X_train[col] = X_train[col].astype(train_cat_dtype).cat.codes
    # Labels that never occurred in training, and missing values, become -1
    X_test[col] = X_test[col].astype(train_cat_dtype).cat.codes

# Fit the gradient-boosted regressor on the training split
# (fit() returns the estimator itself, so the call can be chained)
model = XGBRegressor(
    n_estimators=500,
    max_depth=10,
    learning_rate=0.05,
    colsample_bytree=0.8,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out split and score it.
# Both metrics are in the units of the Log_TotalExpense target, not raw expense.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

R² Score: 0.7059
Mean Absolute Error (MAE): 0.2446


In [41]:
from sklearn.metrics import mean_squared_error

In [42]:
# MSE after back-transforming targets and predictions with expm1
# (presumably the target was built with log1p -- confirm against the data prep)
mse = mean_squared_error(
    y_true=np.expm1(y_test),
    y_pred=np.expm1(y_pred),
)
print(f"Mean Squared Error (MSE): {mse:.4f}")

Mean Squared Error (MSE): 81368881.3643


In [43]:
# MAPE on the back-transformed (expm1) scale.
# NOTE: divides by the back-transformed actuals, so any actual value of exactly 0
# would produce inf/NaN here.
actual_expense = np.expm1(y_test)
predicted_expense = np.expm1(y_pred)
relative_error = np.abs((actual_expense - predicted_expense) / actual_expense)
mape = np.mean(relative_error) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}%")

Mean Absolute Percentage Error (MAPE): 25.7585%
