In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
import gc, os

In [2]:
# Loading Dataset
DATA_FN = "US_Accidents_March23.parquet"  # Using Parquet file instead of CSV
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_JOBS = -1
RANDOM_SEARCH_ITER = 30
CV_FOLDS = 5

print("Loading dataset...")
df = pd.read_parquet(DATA_FN, engine='pyarrow')
print("Rows:", len(df), "Columns:", len(df.columns))
print("Columns preview:", df.columns.tolist())

Loading dataset...
Rows: 7728394 Columns: 46
Columns preview: ['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']


In [4]:

# DATA CLEANING & PREPARATION PIPELINE

import gc, os

# Exclusion list ---
EXCLUDE_COLS = [
    "ID", "Description", "Country", "Zipcode", 
    "Street", "Airport_Code", "City", "County"
]

# Boolean-like columns ---
bool_cols = [
    "Amenity", "Bump", "Crossing", "Give_Way", "Junction", 
    "No_Exit", "Railway", "Roundabout", "Station", "Stop", 
    "Traffic_Calming", "Traffic_Signal", "Turning_Loop"
]

#  Convert boolean-like columns to True/False ---
for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(bool)

# Handle datetime columns ---
for c in ["Start_Time", "End_Time", "Weather_Timestamp"]:
    if c in df.columns:
        df[c] = pd.to_datetime(df[c], errors='coerce')

# -Fill missing numeric columns with median ---
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

#  Fill missing categorical columns with 'Unknown' ---
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    df[col] = df[col].fillna('Unknown')

# --- Fill missing datetime columns with median of Start_Time ---
for c in ["End_Time", "Weather_Timestamp"]:
    if c in df.columns:
        df[c] = df[c].fillna(df["Start_Time"].median())

# --- Drop duplicates based on unique ID ---
if "ID" in df.columns:
    df = df.drop_duplicates(subset=["ID"], keep="first")

gc.collect()

# -Drop excluded columns if they exist -
df = df.drop(columns=[col for col in EXCLUDE_COLS if col in df.columns], errors='ignore')

# Drop rows where any categorical feature == 'Unknown' ---
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
df = df[~df[cat_cols].isin(['Unknown']).any(axis=1)]

print("✅ Cleaning complete.")
print(f"Dataset shape after cleaning: {df.shape}")

# --- Missingness Analysis ---
OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

missing_pct = df.isna().mean().sort_values(ascending=False) * 100
missing_pct.to_csv(os.path.join(OUTPUT_DIR, "missingness_percent.csv"))
print("\nTop missing columns:\n", missing_pct.head(15))


✅ Cleaning complete.
Dataset shape after cleaning: (7494396, 38)

Top missing columns:
 Start_Time           9.53399
Source               0.00000
Station              0.00000
Bump                 0.00000
Crossing             0.00000
Give_Way             0.00000
Junction             0.00000
No_Exit              0.00000
Railway              0.00000
Roundabout           0.00000
Stop                 0.00000
Weather_Condition    0.00000
Traffic_Calming      0.00000
Traffic_Signal       0.00000
Turning_Loop         0.00000
dtype: float64


In [6]:
# LASSO REGULARIZATION MODELING + FEATURE IMPORTANCE PLOT

TARGET = "Severity"

# --- Separate features and target ---
X = df.drop(columns=[TARGET])
y = df[TARGET]

target = "Severity"
EXCLUDE_COLS = [
    "ID", "Description", "Country", "Zipcode",
    "Street", "Airport_Code", "City", "County"
]

# Drop excluded columns
df_model = df.drop(columns=[c for c in EXCLUDE_COLS if c in df.columns])

# Drop rows with "Unknown" categories
cat_cols = df_model.select_dtypes(include=['object']).columns
for col in cat_cols:
    df_model = df_model[df_model[col] != "Unknown"]

# Separate X and y
X = df_model.drop(columns=[target])
y = df_model[target].astype(int)

# ===================== 2. Split train/test =====================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ===================== 3. Preprocessing pipeline =====================
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
bool_cols = X.select_dtypes(include=['bool']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ('bool', 'passthrough', bool_cols)
    ]
)

# ===================== 4. Lasso Regularization (Logistic Regression with L1) =====================
lasso_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        penalty='l1',
        solver='liblinear',
        C=0.1,        # Regularization strength (smaller = stronger regularization)
        random_state=42,
        max_iter=1000
    ))
])

# Fit model
lasso_clf.fit(X_train, y_train)

# ===================== 5. Evaluate =====================
y_pred = lasso_clf.predict(X_test)

print("✅ Classification Report:")
print(classification_report(y_test, y_pred))
print("\n✅ Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ===================== 6. Get feature importance =====================
# Extract feature names from one-hot and numeric
feature_names = (
    list(num_cols) +
    list(lasso_clf.named_steps['preprocessor']
         .named_transformers_['cat']
         .get_feature_names_out(cat_cols)) +
    list(bool_cols)
)

coefficients = lasso_clf.named_steps['classifier'].coef_[0]
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
})
importance_df["Abs_Coefficient"] = importance_df["Coefficient"].abs()
importance_df = importance_df.sort_values("Abs_Coefficient", ascending=False)

print("\nTop 15 most influential features (LASSO):")
print(importance_df.head(15))


MemoryError: Unable to allocate 686. MiB for an array with shape (12, 7494396) and data type float64