<a href="https://www.kaggle.com/code/bandito20/road-torch?scriptVersionId=267544331" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import torch

In [2]:
df = pd.read_csv('/kaggle/input/playground-series-s5e10/train.csv', index_col='id')
df.head()

Unnamed: 0_level_0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 517754 entries, 0 to 517753
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   road_type               517754 non-null  object 
 1   num_lanes               517754 non-null  int64  
 2   curvature               517754 non-null  float64
 3   speed_limit             517754 non-null  int64  
 4   lighting                517754 non-null  object 
 5   weather                 517754 non-null  object 
 6   road_signs_present      517754 non-null  bool   
 7   public_road             517754 non-null  bool   
 8   time_of_day             517754 non-null  object 
 9   holiday                 517754 non-null  bool   
 10  school_season           517754 non-null  bool   
 11  num_reported_accidents  517754 non-null  int64  
 12  accident_risk           517754 non-null  float64
dtypes: bool(4), float64(2), int64(3), object(4)
memory usage: 41.5+ MB


In [None]:
import matplotlib.pyplot as plt

plt.hist(df['accident_risk'], bins=40, edgecolor='black')

In [None]:
df.isna().sum()

In [3]:
numerical_features  = df.select_dtypes(include=['number']).columns.difference(['accident_risk'])
numerical_features 

Index(['curvature', 'num_lanes', 'num_reported_accidents', 'speed_limit'], dtype='object')

In [4]:
categorical_features  = df.select_dtypes(exclude=['number']).columns
print(categorical_features)
print(len(categorical_features))

Index(['road_type', 'lighting', 'weather', 'road_signs_present', 'public_road',
       'time_of_day', 'holiday', 'school_season'],
      dtype='object')
8


In [5]:
target = 'accident_risk'

In [6]:
# One-hot encode categorical features
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 517754 entries, 0 to 517753
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   num_lanes                517754 non-null  int64  
 1   curvature                517754 non-null  float64
 2   speed_limit              517754 non-null  int64  
 3   num_reported_accidents   517754 non-null  int64  
 4   accident_risk            517754 non-null  float64
 5   road_type_rural          517754 non-null  bool   
 6   road_type_urban          517754 non-null  bool   
 7   lighting_dim             517754 non-null  bool   
 8   lighting_night           517754 non-null  bool   
 9   weather_foggy            517754 non-null  bool   
 10  weather_rainy            517754 non-null  bool   
 11  road_signs_present_True  517754 non-null  bool   
 12  public_road_True         517754 non-null  bool   
 13  time_of_day_evening      517754 non-null  bool   
 14  time_of_d

In [7]:
bool_cols = df.select_dtypes(include=['bool']).columns
bool_cols

Index(['road_type_rural', 'road_type_urban', 'lighting_dim', 'lighting_night',
       'weather_foggy', 'weather_rainy', 'road_signs_present_True',
       'public_road_True', 'time_of_day_evening', 'time_of_day_morning',
       'holiday_True', 'school_season_True'],
      dtype='object')

In [8]:
for col in bool_cols:
    df[col] = df[col].astype(int)

In [11]:
df.head()

Unnamed: 0_level_0,num_lanes,curvature,speed_limit,num_reported_accidents,accident_risk,road_type_rural,road_type_urban,lighting_dim,lighting_night,weather_foggy,weather_rainy,road_signs_present_True,public_road_True,time_of_day_evening,time_of_day_morning,holiday_True,school_season_True
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,2,0.06,35,1,0.13,0,1,0,0,0,1,0,1,0,0,0,1
1,4,0.99,35,0,0.35,0,1,0,0,0,0,1,0,1,0,1,1
2,4,0.63,70,2,0.3,1,0,1,0,0,0,0,1,0,1,1,0
3,4,0.07,35,1,0.21,0,0,1,0,0,1,1,1,0,1,0,0
4,1,0.58,60,1,0.56,1,0,0,0,1,0,0,0,1,0,1,0


In [9]:
# Separate features and target
X = df.drop(columns=[target])
y = df[target]

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Convert to PyTorch Tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1) # Unsqueeze for a column vector
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

In [21]:
import torch.nn as nn

class AccidentRiskPredictor(nn.Module):
    def __init__(self, input_size):
        super(AccidentRiskPredictor, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128) # Matches output of layer1
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64) # Matches output of layer2
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.layer3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32) # Matches output of layer3
        self.relu3 = nn.ReLU()
        self.output_layer = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.layer1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.layer2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.layer3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x 


input_size = X_train_tensor.shape[1]
model = AccidentRiskPredictor(input_size)
print(model)

AccidentRiskPredictor(
  (layer1): Linear(in_features=16, out_features=128, bias=True)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (dropout1): Dropout(p=0.3, inplace=False)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.3, inplace=False)
  (layer3): Linear(in_features=64, out_features=32, bias=True)
  (bn3): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU()
  (output_layer): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [52]:
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

learning_rate = 0.001
epochs = 50
batch_size = 64
patience = 5
weight_decay = 1e-4

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=True)



In [46]:
# Check if a GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [53]:
# Instantiate the model
input_size = X_train_tensor.shape[1]
model = AccidentRiskPredictor(input_size).to(device) # Move the model to the GPU

In [54]:
for epoch in range(epochs):
    if early_stop:
        print("Early stopping triggered.")
        break
        
    model.train()
    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        # forward
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        # backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for batch_X_val, batch_y_val in val_loader:
            batch_X_val = batch_X_val.to(device)
            batch_y_val = batch_y_val.to(device)
            outputs_val = model(batch_X_val)
            val_loss += criterion(outputs_val, batch_y_val).item()

        avg_val_loss = val_loss / len(val_loader)

        scheduler.step(avg_val_loss)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_imporve = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                early_stop = True
    print(f'Epoch [{epoch+1}/{epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}')

print('Training finished.')

Epoch [1/50], Training Loss: 0.7805, Validation Loss: 0.7898
Epoch [2/50], Training Loss: 0.7784, Validation Loss: 0.7904
Epoch [3/50], Training Loss: 0.7781, Validation Loss: 0.7905
Epoch [4/50], Training Loss: 0.7791, Validation Loss: 0.7907
Epoch [5/50], Training Loss: 0.7725, Validation Loss: 0.7905
Epoch [6/50], Training Loss: 0.7857, Validation Loss: 0.7901
Early stopping triggered.
Training finished.


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluation on the test set
model.eval() # Set model to evaluation mode
with torch.no_grad():
    # Move test data to the GPU
    X_test_tensor = X_test_tensor.to(device)
    
    # Make predictions
    test_predictions = model(X_test_tensor)

    # Move predictions and true values back to CPU for evaluation
    test_predictions_np = test_predictions.squeeze().cpu().numpy()
    y_test_np = y_test_tensor.squeeze().cpu().numpy()
    
    # Calculate metrics
    mse = mean_squared_error(y_test_np, test_predictions_np)
    r2 = r2_score(y_test_np, test_predictions_np)
    
    print(f'Root Mean Squared Error on Test Set: {mse:.4f}')
    print(f'R-squared on Test Set: {r2:.4f}')

# Generate submission file (assuming your test set has an 'id' column)
# test_df = pd.read_csv('your_test_data.csv')
# X_kaggle_test = ... (preprocess the Kaggle test data in the same way)
# X_kaggle_test_tensor = torch.tensor(X_kaggle_test.values, dtype=torch.float32)

# with torch.no_grad():
#     kaggle_predictions = model(X_kaggle_test_tensor).squeeze().numpy()
#
# submission_df = pd.DataFrame({'id': test_df['id'], 'accident_risk': kaggle_predictions})
# submission_df.to_csv('submission.csv', index=False)

In [60]:
# Generate submission file (assuming your test set has an 'id' column)
test_df = pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv')
categorical_features  = test_df.select_dtypes(exclude=['number']).columns
X_kaggle_test = pd.get_dummies(test_df, columns=categorical_features, drop_first=True)
bool_cols = X_kaggle_test.select_dtypes(include=['bool']).columns
for col in bool_cols:
    X_kaggle_test[col] = X_kaggle_test[col].astype(int)

X_kaggle_test = X_kaggle_test.drop('id', axis=1)
X_kaggle_test.head()

Unnamed: 0,num_lanes,curvature,speed_limit,num_reported_accidents,road_type_rural,road_type_urban,lighting_dim,lighting_night,weather_foggy,weather_rainy,road_signs_present_True,public_road_True,time_of_day_evening,time_of_day_morning,holiday_True,school_season_True
0,2,0.34,45,1,0,0,0,1,0,0,1,1,0,0,1,1
1,3,0.04,45,0,0,1,1,0,1,0,1,0,0,0,1,0
2,2,0.59,35,1,0,1,1,0,0,0,1,0,0,0,1,1
3,4,0.95,35,2,1,0,0,0,0,1,0,0,0,0,0,0
4,2,0.86,35,3,0,0,0,0,0,0,1,0,1,0,0,1


In [67]:
X_kaggle_test_tensor = torch.tensor(X_kaggle_test.values, dtype=torch.float32).to(device)

with torch.no_grad():
    kaggle_predictions = model(X_kaggle_test_tensor).cpu().squeeze().numpy()

submission_df = pd.DataFrame({'id': test_df['id'], 'accident_risk': kaggle_predictions})
submission_df.to_csv('submission_t1.csv', index=False)