## Table of Contents

<style>
.spotify-table {
  width: 90%;
  margin: 30px auto;
  border-collapse: collapse;
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial;
  font-size: 15px;
  background: linear-gradient(180deg, #0b1612 0%, #07100c 100%);
  color: #e9f8ef;
  border-radius: 14px;
  overflow: hidden;
  box-shadow: 0 6px 20px rgba(0,0,0,0.5);
}

.spotify-table thead {
  background-color: #1DB954;
  color: #fff;
  font-weight: 700;
  text-align: left;
}

.spotify-table th, .spotify-table td {
  padding: 14px 18px;
  border-bottom: 1px solid rgba(255,255,255,0.08);
}

.spotify-table tr:hover {
  background-color: rgba(29,185,84,0.08);
}

.spotify-table td:first-child {
  font-weight: 600;
  color: #1DB954;
}
</style>

<table class="spotify-table">
  <thead>
    <tr>
      <th></th>
      <th>Step</th>
      <th>Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>1</td>
      <td>Problem Statement</td>
      <td>Define the problem and understand objectives of the analysis or competition.</td>
    </tr>
    <tr>
      <td>2</td>
      <td>Data Handling</td>
      <td>Load data, clean missing values, remove duplicates, and check dataset shape.</td>
    </tr>
    <tr>
      <td>3</td>
      <td>Data Visualization</td>
      <td>Explore distributions, outliers, correlations, and categorical breakdowns.</td>
    </tr>
    <tr>
      <td>4</td>
      <td>Feature Engineering</td>
      <td>Create new variables, encode categorical data, normalize/scale features.</td>
    </tr>
    <tr>
      <td>5</td>
      <td>Advanced Feature Selection</td>
      <td>Apply feature importance, PCA, or correlation filtering to refine dataset.</td>
    </tr>
    <tr>
      <td>6</td>
      <td>Split Data & Create Model</td>
      <td>Train/validation/test split and apply ML models (baseline → advanced).</td>
    </tr>
    <tr>
      <td>7</td>
      <td>Submission</td>
      <td>Generate predictions, save results to CSV, and submit.</td>
    </tr>
  </tbody>
</table>


<style>
.rmse-box {
  max-width: 720px;
  margin: 28px auto;
  padding: 26px 32px;
  background: linear-gradient(180deg, #0d0d0d, #1b1b1b);
  color: #f5f5f5;
  border-radius: 14px;
  box-shadow: 0 8px 26px rgba(0,0,0,0.65);
  border: 1px solid rgba(255,255,255,0.1);
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial;
  line-height: 1.8;
}

.rmse-box h3 {
  color: #1DB954; /* Spotify green */
  margin-bottom: 14px;
  font-size: 22px;
}

.rmse-box .formula {
  text-align: center;
  margin: 20px 0;
  padding: 12px;
  background: rgba(255,255,255,0.08);
  border-radius: 10px;
  font-size: 18px;
  font-weight: 500;
  font-family: "Courier New", monospace;
}

.rmse-box p {
  font-size: 15px;
  margin: 6px 0;
}
</style>

<div class="rmse-box">
  <h3>📐 Evaluation Metric: Root Mean Squared Error (RMSE)</h3>

  <div class="formula">
    RMSE = √( (1/n) Σ<sub>i=1</sub><sup>n</sup> ( y<sub>i</sub> − ŷ<sub>i</sub> )² )
  </div>

  <p><strong>where:</strong></p>
  <p>y<sub>i</sub> = true accident risk</p>
  <p>ŷ<sub>i</sub> = predicted accident risk</p>
</div>


In [None]:
#system handling
import os
import time
import warnings
warnings.filterwarnings('ignore')

#data handling
import numpy as np # linear algebra
import pandas as pd # data processing, 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


#model handling
import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split,KFold, StratifiedKFold,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import StandardScaler, LabelEncoder



# Gather every file under the Kaggle input directory (in os.walk order),
# then echo the full paths one per line.
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_file_path in input_file_paths:
    print(input_file_path)

print('done')

## 2-Data Handling

In [None]:
# Load the three competition files: training set, test set and the
# sample-submission template (in that order).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e10/train.csv',
        '/kaggle/input/playground-series-s5e10/test.csv',
        '/kaggle/input/playground-series-s5e10/sample_submission.csv',
    )
)

In [None]:
#print(f'Shape of Train data = {train.shape}')
#print(f'Shape of Test data = {test.shape}')
#print(f'Shape of Submission data = {sub.shape}')

### Train data handling

In [None]:
#print("STATISTICAL ANALYSIS: Training Dataset")
# Basic info
#print("Dataset Information:")
#print(f"  Total Records: {len(train):,}")
#print(f"  Total Features: {len(train.columns)}")
#print(f"  Duplicates: {train.duplicated().sum()}")


In [None]:
# Names of the numeric columns in the training set, without the row identifier.
numerical_cols_train = [
    name
    for name in train.select_dtypes(include=[np.number]).columns
    if name != 'id'
]

# Print summary statistics of the target when it is one of the numeric columns.
if 'accident_risk' in numerical_cols_train:
    target_train = train['accident_risk']
    print("\nTarget Variable Statistics (accident_risk):")
    for stat_label, stat_value in (
        ("Mean", target_train.mean()),
        ("Std", target_train.std()),
        ("Min", target_train.min()),
        ("Max", target_train.max()),
        ("Median", target_train.median()),
    ):
        print(f"  {stat_label}: {stat_value:.4f}")

In [None]:
# Object-dtype (string) columns of the training set and their cardinalities.
categorical_cols_train = list(train.select_dtypes(include=['object']).columns)
if len(categorical_cols_train) > 0:
    print(f"\nCategorical Features: {len(categorical_cols_train)}")
    # DataFrame.nunique() yields one distinct-value count per selected column.
    for col, unique_count in train[categorical_cols_train].nunique().items():
        print(f"  {col}: {unique_count} unique values")

### Test data handling

In [None]:
# Names of the numeric columns in the test set, without the row identifier.
numerical_cols_test = [
    name
    for name in test.select_dtypes(include=[np.number]).columns
    if name != 'id'
]

# The statistics below are printed only if the test frame carries a numeric
# 'accident_risk' column; otherwise this branch is skipped.
if 'accident_risk' in numerical_cols_test:
    target_test = test['accident_risk']
    print("\nTarget Variable Statistics (accident_risk):")
    for stat_label, stat_value in (
        ("Mean", target_test.mean()),
        ("Std", target_test.std()),
        ("Min", target_test.min()),
        ("Max", target_test.max()),
        ("Median", target_test.median()),
    ):
        print(f"  {stat_label}: {stat_value:.4f}")

In [None]:
# Object-dtype (string) columns of the test set and their cardinalities.
categorical_cols_test = list(test.select_dtypes(include=['object']).columns)
if len(categorical_cols_test) > 0:
    print(f"\nCategorical Features: {len(categorical_cols_test)}")
    # DataFrame.nunique() yields one distinct-value count per selected column.
    for col, unique_count in test[categorical_cols_test].nunique().items():
        print(f"  {col}: {unique_count} unique values")

# Target Analysis

In [None]:
# Target visualization: histogram, normal Q-Q plot and a Shapiro-Wilk test
# of the training target.
target_col = "accident_risk"

if target_col not in train.columns:
    print(f"Target column '{target_col}' not found in dataset.")
else:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Density-normalised histogram of the target (no KDE curve is drawn).
    axes[0].hist(train[target_col], bins=50, density=True, alpha=0.7,
                 color='steelblue', edgecolor='black')
    axes[0].set_xlabel('Accident Risk', fontsize=12)
    axes[0].set_ylabel('Density', fontsize=12)
    axes[0].set_title('Distribution of Accident Risk (Training Data)',
                      fontsize=14, fontweight='bold')
    axes[0].grid(True, alpha=0.3)

    # Q-Q plot against the normal distribution for a visual normality check.
    stats.probplot(train[target_col], dist="norm", plot=axes[1])
    axes[1].set_title('Q-Q Plot: Normality Assessment',
                      fontsize=14, fontweight='bold')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('target_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Shapiro-Wilk test on a subsample of at most 5000 rows.
    # The subsample is drawn with a fixed seed so the reported statistic and
    # p-value are identical on every Restart & Run All.
    shapiro_sample = train[target_col].sample(min(5000, len(train)), random_state=42)
    shapiro_stat, shapiro_p = stats.shapiro(shapiro_sample)
    print("\nShapiro-Wilk Test for Normality:")
    print(f"  Statistic: {shapiro_stat:.4f}")
    print(f"  P-value: {shapiro_p:.4f}")
    if shapiro_p > 0.05:
        print("  Interpretation: Data is approximately Normal distribution")
    else:
        print("  Interpretation: Data is NOT Normal distribution")


## 4-Feature Engineering

In [None]:
# Column groups used throughout preprocessing and modelling.
# These stay plain lists because later cells concatenate them with `+`.

# String-valued columns that get label-encoded.
CATEGORICAL_FEATURES = [
    'road_type',
    'lighting',
    'weather',
    'time_of_day',
]

# Flag columns that are cast to integers.
BOOLEAN_FEATURES = [
    'road_signs_present',
    'public_road',
    'holiday',
    'school_season',
]

# Raw numeric columns that feed the engineered features.
NUMERICAL_FEATURES = [
    'num_lanes',
    'curvature',
    'speed_limit',
    'num_reported_accidents',
]

# Name of the regression target and of the row-identifier column.
TARGET = 'accident_risk'
ID_COL = 'id'

In [None]:
def engineer_features(df, *, tight_lane_max=2, sharp_curve_threshold=0.6,
                      high_speed_threshold=80):
    """
    Create interaction, ratio, nonlinear and binary features from the four
    numeric road columns.

    The input frame is not modified; a copy with 21 additional columns is
    returned. Columns are appended in a fixed order, so frames produced from
    train and test data line up column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'num_lanes', 'curvature', 'speed_limit' and
        'num_reported_accidents'.
    tight_lane_max : number, default 2
        Rows with ``num_lanes <= tight_lane_max`` get ``tight_lane = 1``.
    sharp_curve_threshold : number, default 0.6
        Rows with ``curvature > sharp_curve_threshold`` get ``sharp_curve = 1``.
    high_speed_threshold : number, default 80
        Rows with ``speed_limit > high_speed_threshold`` get
        ``high_speed_zone = 1``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns appended.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    required_cols = ('num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents')
    missing_cols = [name for name in required_cols if name not in df.columns]
    if missing_cols:
        raise KeyError(f"engineer_features: missing required columns {missing_cols}")

    df_eng = df.copy()

    # Shared intermediates, computed once instead of once per feature.
    curvature = df_eng['curvature']
    speed = df_eng['speed_limit']
    accidents = df_eng['num_reported_accidents']
    lanes = df_eng['num_lanes']
    acc_log = np.log1p(accidents)
    lanes_plus_one = lanes + 1  # +1 keeps the ratio denominators away from zero lanes

    # --- Base physical interactions ---
    df_eng['curv_speed'] = curvature * speed
    df_eng['curv_sq'] = curvature ** 2
    df_eng['speed_sq'] = speed ** 2
    df_eng['curv_x_acc'] = curvature * acc_log
    df_eng['speed_x_acc'] = speed * acc_log
    df_eng['curv_speed_acc'] = df_eng['curv_speed'] * acc_log

    # --- Ratio features ---
    df_eng['acc_per_lane'] = accidents / lanes_plus_one
    df_eng['curv_per_lane'] = curvature / lanes_plus_one
    df_eng['risk_density'] = df_eng['curv_speed'] / lanes_plus_one

    # --- Nonlinear transforms ---
    df_eng['curv_log'] = np.log1p(curvature)
    df_eng['speed_log'] = np.log1p(speed)
    df_eng['acc_log'] = acc_log
    df_eng['inv_speed'] = 1 / (speed + 1)

    # --- Statistical combinations ---
    df_eng['risk_index'] = (df_eng['curv_speed'] * df_eng['acc_per_lane']) / (speed + 1)
    df_eng['stability_score'] = (lanes / (1 + curvature)) * speed

    # --- Binary conditions (thresholds are configurable, defaults match the originals) ---
    df_eng['tight_lane'] = (lanes <= tight_lane_max).astype(int)
    df_eng['sharp_curve'] = (curvature > sharp_curve_threshold).astype(int)
    df_eng['high_speed_zone'] = (speed > high_speed_threshold).astype(int)
    # Flag rows that are both a sharp curve and a high-speed zone.
    df_eng['critical_zone'] = (
        (df_eng['sharp_curve'] == 1) & (df_eng['high_speed_zone'] == 1)
    ).astype(int)

    # --- Polynomial mixes for smoother nonlinearity ---
    df_eng['poly_mix1'] = np.sqrt(curvature * speed)
    df_eng['poly_mix2'] = (accidents ** 0.3) * speed

    return df_eng

# Preprocessing: work on copies so the raw `train` / `test` frames stay intact.
train_processed = train.copy()
test_processed = test.copy()

# Cast all boolean flag columns to integers in both splits.
for frame in (train_processed, test_processed):
    frame[BOOLEAN_FEATURES] = frame[BOOLEAN_FEATURES].astype(int)

# Label-encode each categorical column into a new `<col>_enc` column.
# The encoder is fitted on the training split only and reused for the test
# split; fitted encoders are kept in `label_encoders`, keyed by column name.
label_encoders = {}
for col in CATEGORICAL_FEATURES:
    le = LabelEncoder().fit(train_processed[col])
    train_processed[f'{col}_enc'] = le.transform(train_processed[col])
    test_processed[f'{col}_enc'] = le.transform(test_processed[col])
    label_encoders[col] = le

# Derive the engineered columns for both splits.
train_engineered = engineer_features(train_processed)
test_engineered = engineer_features(test_processed)

# Report how many columns went in and how many came out.
n_original_features = (
    len(CATEGORICAL_FEATURES) + len(BOOLEAN_FEATURES) + len(NUMERICAL_FEATURES)
)
n_engineered_columns = train_engineered.shape[1]
n_created = n_engineered_columns - train_processed.shape[1]

print("Feature engineering complete")
print(f"Original features: {n_original_features}")
print(f"Engineered features: {n_engineered_columns}")
print(f"New features created: {n_created}")

In [None]:
# Build the model inputs. The identifier, the target and the raw string
# categoricals are excluded; every other column of the engineered training
# frame is used, in its existing column order.
exclude_cols = [ID_COL, TARGET] + CATEGORICAL_FEATURES
excluded_lookup = set(exclude_cols)
feature_cols = [
    name for name in train_engineered.columns if name not in excluded_lookup
]

# Plain NumPy arrays: features for both splits and the training target.
X_train = train_engineered[feature_cols].to_numpy()
X_test = test_engineered[feature_cols].to_numpy()
y_train = train_engineered[TARGET].to_numpy()

for matrix_label, matrix in (("Training", X_train), ("Test", X_test)):
    print(f"{matrix_label} matrix: {matrix.shape}")

In [None]:
# Column sets before and after feature engineering, and what was added.
before_cols = set(train_processed.columns)
after_cols = set(train_engineered.columns)
new_features = sorted(after_cols - before_cols)

# Bar chart: number of columns before vs after engineering.
fig_counts, ax_counts = plt.subplots(figsize=(6, 4))
ax_counts.bar(
    ['Before', 'After'],
    [len(before_cols), len(after_cols)],
    color=['skyblue', 'lightgreen'],
)
ax_counts.set_title('Feature Count Before vs After Engineering')
ax_counts.set_ylabel('Number of Features')
ax_counts.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Table figure: one row per newly created feature.
comparison_df = pd.DataFrame({'New Features': new_features})

# Figure height grows with the number of rows so the table stays readable.
fig_table, ax_table = plt.subplots(figsize=(6, len(new_features) * 0.4))
ax_table.axis('off')
ax_table.set_title("Newly Created Features", fontsize=14, pad=10)
table = ax_table.table(
    cellText=comparison_df.values,
    colLabels=comparison_df.columns,
    cellLoc='center',
    loc='center',
)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.2)
plt.show()


<style>
.feature-box {
  max-width: 820px;
  margin: 28px auto;
  padding: 28px 34px;
  background: linear-gradient(180deg, #141E30, #243B55); /* Blue-Purple Gradient */
  color: #f5f5f5;
  border-radius: 16px;
  box-shadow: 0 8px 26px rgba(0,0,0,0.6);
  border: 1px solid rgba(255,255,255,0.12);
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial;
  line-height: 1.8;
}

.feature-box h3 {
  font-size: 22px;
  margin-bottom: 14px;
  font-weight: 700;
  color: #00d4ff; /* Cyan Accent */
}

.feature-box h4 {
  font-size: 18px;
  margin: 18px 0 10px;
  font-weight: 600;
  color: #ffd700; /* Gold Accent */
}

.feature-box ul {
  margin: 0;
  padding-left: 24px;
}

.feature-box li {
  margin-bottom: 8px;
  font-size: 15px;
}
</style>

<div class="feature-box">
  <h3>✨ Feature Engineering Summary</h3>

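  <h4>🔗 Interaction Features</h4>
  <ul>
    <li>Products of the core numeric features, e.g., <code>curvature × speed_limit → curv_speed</code>, plus the squared terms <code>curv_sq</code> and <code>speed_sq</code>.</li>
    <li>Interactions with the log-scaled accident count (<code>log1p(num_reported_accidents)</code>): <code>curv_x_acc</code>, <code>speed_x_acc</code>, <code>curv_speed_acc</code>.</li>
  </ul>

  <h4>➗ Ratio Features</h4>
  <ul>
    <li>Per-lane ratios using <code>num_lanes + 1</code> as the denominator: <code>acc_per_lane</code>, <code>curv_per_lane</code>, <code>risk_density</code>.</li>
  </ul>

  <h4>📈 Nonlinear Transforms</h4>
  <ul>
    <li>Log transforms <code>curv_log</code>, <code>speed_log</code>, <code>acc_log</code> and the inverse <code>inv_speed = 1 / (speed_limit + 1)</code>.</li>
    <li>Power mixes <code>poly_mix1 = √(curvature × speed_limit)</code> and <code>poly_mix2 = num_reported_accidents<sup>0.3</sup> × speed_limit</code>.</li>
  </ul>

  <h4>📊 Composite Scores</h4>
  <ul>
    <li><code>risk_index</code> and <code>stability_score</code>, which combine curvature, speed limit, lane count and reported accidents.</li>
  </ul>

  <h4>🚩 Binary Flags</h4>
  <ul>
    <li><strong>tight_lane</strong> (<code>num_lanes ≤ 2</code>), <strong>sharp_curve</strong> (<code>curvature &gt; 0.6</code>), <strong>high_speed_zone</strong> (<code>speed_limit &gt; 80</code>).</li>
    <li><strong>critical_zone</strong>: a sharp curve that is also a high-speed zone.</li>
  </ul>

  <h4>🔤 Categorical Encoding</h4>
  <ul>
    <li>Cast the boolean columns to integers.</li>
    <li>Transformed all categorical features into numeric labels using <strong>Label Encoding</strong> (stored as <code>&lt;column&gt;_enc</code>).</li>
  </ul>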
</div>


## 6-Split Data & Create Model (XGBoost)

In [None]:
# XGBoost parameters for the native `xgb.train` API, running on the GPU.
# Changes versus the previous parameter set:
#   * "n_estimators" removed  - it is a scikit-learn wrapper argument that
#     `xgb.train` does not use; the number of trees is set by
#     `num_boost_round` together with early stopping.
#   * "tree_method": "gpu_hist" / "predictor": "gpu_predictor" replaced by
#     "tree_method": "hist" + "device": "cuda", the non-deprecated way to
#     request GPU training and prediction.
#   * "random_state" removed  - it duplicated "seed" with the same value.
xgb_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.003,
    "max_depth": 8,
    "min_child_weight": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    "colsample_bynode": 0.8,
    "reg_alpha": 0.5,
    "reg_lambda": 1.5,
    "gamma": 0.1,
    "tree_method": "hist",
    "device": "cuda",
    "seed": 42,
}

# Hold out 15% of the training rows as a validation set for early stopping.
# (`train_test_split` is already imported in the notebook's import cell.)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

# Wrap the arrays in DMatrix objects for the native training API.
dtrain = xgb.DMatrix(X_tr, label=y_tr)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test)

# ✅ Train the single model
print("\n🚀 Training single XGBoost model on GPU...\n")

model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=10_000,           # upper bound; early stopping decides the actual count
    evals=[(dtrain, "Train"), (dval, "Valid")],  # the last entry ("Valid") drives early stopping
    early_stopping_rounds=200,
    verbose_eval=200
)

# Predict with the trees up to (and including) the best validation iteration,
# so the rounds trained after the optimum do not influence the predictions.
best_iteration_range = (0, model.best_iteration + 1)
val_preds = model.predict(dval, iteration_range=best_iteration_range)
test_preds = model.predict(dtest, iteration_range=best_iteration_range)

# ✅ Evaluation on the hold-out split
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
r2 = r2_score(y_val, val_preds)

print(f"\n✅ Model Training Complete!")
print(f"Best iteration: {model.best_iteration}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation R²: {r2:.4f}")

# ✅ Save model for reuse
# NOTE(review): recent XGBoost releases recommend a ".json" or ".ubj"
# extension for saved models - confirm before renaming this artifact.
model.save_model("xgb_single_gpu.model")


In [None]:
# ✅ Build the Kaggle submission: one clipped prediction per test id.

# Keep predictions inside the [0, 1] interval.
test_preds_clipped = np.clip(test_preds, 0, 1)

submission = test[['id']].assign(accident_risk=test_preds_clipped)

# ✅ Sanity checks before writing the file
predicted_risk = submission['accident_risk']
assert len(submission) == len(test), "Shape mismatch between test and submission!"
assert predicted_risk.notna().all(), "There are missing predictions!"
assert predicted_risk.ge(0).all(), "Some predictions are below 0!"
assert predicted_risk.le(1).all(), "Some predictions exceed 1!"

# ✅ Write the CSV without the index column
submission.to_csv('/kaggle/working/submission.csv', index=False)

# ✅ Short report on the written predictions
print("✅ Submission Created Successfully")
print("=" * 60)
print(f"Shape: {submission.shape}")
for stat_label, stat_value in (
    ("Mean", predicted_risk.mean()),
    ("Std", predicted_risk.std()),
    ("Min", predicted_risk.min()),
    ("Max", predicted_risk.max()),
):
    print(f"Prediction {stat_label}: {stat_value:.4f}")
print("\nFirst 10 predictions:")
print(submission.head(10))

