# Replacing LinearRegression with XGBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler    # categoricals are one-hot encoded with pd.get_dummies (not LabelEncoder); StandardScaler then standardizes the encoded features
from xgboost import XGBRegressor


# Load the labelled training table and the unlabelled test table.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Input columns fed to the model, one per line for easy diffing.
features = [
    'road_type',
    'num_lanes',
    'curvature',
    'speed_limit',
    'lighting',
    'weather',
    'road_signs_present',
    'public_road',
    'time_of_day',
    'holiday',
    'school_season',
]

# Independent copies of the feature columns, so later encoding/scaling
# never writes back into the raw frames.
x_train = train.loc[:, features].copy()
x_test = test.loc[:, features].copy()

# Regression target.
y_train = train['accident_risk']

# One-hot encoding
# Converts categorical (text/string) columns into numeric "dummy" columns (0s and 1s).
# drop_first=True drops the first level of each categorical column, which
# becomes the implicit reference level (all dummies for that column are 0).
x_train = pd.get_dummies(x_train, drop_first=True)

# Encode test WITHOUT drop_first: which level is "first" depends on the
# categories present in that frame, so dropping independently on test could
# remove a different level than on train and silently mis-encode rows.
# The reindex below keeps exactly the columns chosen on train instead.
x_test = pd.get_dummies(x_test)

# Aligning columns (important step)
# Ensures train and test have the exact same feature columns, in the same order:
#   - columns only present in train (category unseen in test) are added and filled with 0,
#   - columns only present in test (including train's reference levels) are discarded.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)



# Standardize every feature column using statistics learned from the training set only;
# the test set is transformed with those same statistics.
# NOTE(review): the scaler is fit on all training rows before the validation split below,
# so validation rows contribute to the scaling statistics — confirm this is acceptable.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Hold out 20% of the training rows for validation.
# random_state is fixed (same seed as the model) so the split, and therefore the
# reported validation score, is reproducible from run to run.
X_tr, X_val, y_tr, y_val = train_test_split(
    x_train_scaled, y_train, test_size=0.2, random_state=42
)

# XGBoost model
# Gradient-boosted decision trees; used here in place of the earlier
# LinearRegression model so that non-linear patterns can be captured.
#
# Hyperparameters:
#   n_estimators=300        -> number of boosting rounds (trees).
#   learning_rate=0.05      -> shrinkage applied to each tree's contribution
#                              (smaller values learn more slowly and usually need more trees).
#   max_depth=8             -> maximum depth of each decision tree (controls model complexity).
#   subsample=0.8           -> uses 80% of the training rows per tree (adds randomness to reduce overfitting).
#   colsample_bytree=0.8    -> uses 80% of the features per tree (also helps generalization).
#   random_state=42         -> reproducibility.
#   n_jobs=-1               -> uses all CPU cores for faster training.
#
# The settings must be passed directly to this single constructor call:
# wrapping them in a nested `model=XGBRegressor(...)` keyword would leave the
# estimator that is actually fit on its default hyperparameters.
model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)


# Train on the training split and predict the held-out validation split.
model.fit(X_tr, y_tr)
y_val_pred = model.predict(X_val)

# Evaluating accuracy: R² on the validation split, reported as a percentage.
# Scale to percent BEFORE rounding; rounding first and multiplying afterwards
# can reintroduce floating-point noise (e.g. 55.11000000000001) in the output.
score = np.round(r2_score(y_val, y_val_pred) * 100, 2)
print(f"VALIDATION R² SCORE: {score} %")

# Predict accident risk for the (scaled) test features.
y_pred = model.predict(x_test_scaled)

# Assemble the submission table: the test ids next to the predictions,
# rounded to three decimal places.
rounded_risk = np.round(y_pred, 3)
submission = test[['id']].assign(accident_risk=rounded_risk)

# Write the file without the DataFrame index and confirm on stdout.
submission.to_csv("sample_submission.csv", index=False)
print("✅ sample_submission.csv created successfully!")

# Preview the first few predictions.
print(submission.head())


VALIDATION R² SCORE: 86.44 %
✅ sample_submission.csv created successfully!
       id  accident_risk
0  517754          0.291
1  517755          0.117
2  517756          0.183
3  517757          0.385
4  517758          0.282
