In [7]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Input files are expected in the current working directory.
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# Predictor columns, listed in the order they are passed to the model.
features = [
    'road_type',
    'num_lanes',
    'curvature',
    'speed_limit',
    'lighting',
    'weather',
    'road_signs_present',
    'public_road',
    'time_of_day',
    'holiday',
    'school_season',
]

# Work on independent copies so that later in-place column edits on
# x_train / x_test never write back into the raw `train` / `test` frames.
# The target is selected with a list, so y_train is a one-column DataFrame
# rather than a Series.
x_train = train.loc[:, features].copy()
y_train = train.loc[:, ['accident_risk']].copy()
x_test = test.loc[:, features].copy()


# Non-numeric feature columns that are replaced by integer codes.
categorical_cols = [
    'road_type',
    'lighting',
    'weather',
    'road_signs_present',
    'public_road',
    'time_of_day',
    'holiday',
    'school_season',
]

# One encoder per column, fitted on the training values only so that the
# category -> integer mapping is defined by the training data.
encoders = {name: LabelEncoder().fit(x_train[name]) for name in categorical_cols}

# Apply the same fitted mapping to both splits. The test split is only ever
# transformed (never re-fitted), so a category that did not occur in the
# training column makes `transform` raise instead of silently getting a code.
for col, le in encoders.items():
    x_train[col] = le.transform(x_train[col])
    x_test[col] = le.transform(x_test[col])


# Fit a linear regression on the encoded training features
# (`fit` returns the estimator itself, so the call can be chained).
reg = LinearRegression().fit(x_train, y_train)

# In-sample predictions: the model is scored on the same rows it was fitted on,
# so the numbers below describe goodness of fit, not held-out performance.
y_train_pred = reg.predict(x_train)

# Regression metrics on the training set.
r2 = r2_score(y_train, y_train_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

print(f"Training R²: {r2:.4f} ({r2*100:.2f}%)")
print(f"Training RMSE: {rmse:.4f}")


# Score the test rows with the fitted model.
y_test_pred = reg.predict(x_test)

# Flatten the predictions to one value per row and round to 3 decimals
# before pairing them with the test ids.
rounded_risk = np.round(y_test_pred.ravel(), 3)

# Two-column Kaggle submission: row id plus the predicted accident risk.
submission = pd.DataFrame({'id': test['id'], 'accident_risk': rounded_risk})

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv("sample_submission_2.csv", index=False)
print("✅ Submission file 'sample_submission_2.csv' created successfully!")


Training R²: 0.7084 (70.84%)
Training RMSE: 0.0899
✅ Submission file 'sample_submission_2.csv' created successfully!
