In [13]:
# Print class distribution
unique, counts = np.unique(y_train, return_counts=True)

print("\n===== CLASS DISTRIBUTION IN y_train =====")
for cls, cnt in zip(unique, counts):
    print(f"Severity {cls+1} (internal {cls}): {cnt:,}")


===== CLASS DISTRIBUTION IN y_train =====
Severity 1 (internal 0): 53,891
Severity 2 (internal 1): 4,347,102
Severity 3 (internal 2): 1,039,406
Severity 4 (internal 3): 147,783


FIRST COMPLETED SEQUENCE- STRONG ACCURACY, NOT CAPTURING SEVERITY 1 AND 4 PROPERLY.

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# ======================================
# 1. LOAD DATA
# ======================================
df = pd.read_parquet("US_Accidents_March23.parquet")

# Keep only memory-efficient severity predictors
cols = [
    'Severity','Distance(mi)','Temperature(F)','Visibility(mi)','Humidity(%)','Pressure(in)',
    'Wind_Speed(mph)','Weather_Condition','Traffic_Signal','Junction','Crossing','Stop',
    'Start_Time','State','City'
]
df = df[cols]

# ======================================
# 2. DATETIME → NUMERIC
# ======================================
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
df.dropna(subset=['Start_Time'], inplace=True)
df['Hour'] = df['Start_Time'].dt.hour.astype('int8')
df['Month'] = df['Start_Time'].dt.month.astype('int8')
df['Weekday'] = df['Start_Time'].dt.weekday.astype('int8')
df.drop(columns=['Start_Time'], inplace=True)

# ======================================
# 3. WEATHER SIMPLIFICATION
# ======================================
df['Weather_Condition'] = df['Weather_Condition'].fillna('')
df['Rain'] = df['Weather_Condition'].str.contains('Rain|Storm|Thunder', case=False, regex=True).astype('int8')
df['Snow'] = df['Weather_Condition'].str.contains('Snow|Ice|Blizzard', case=False, regex=True).astype('int8')
df['Fog']  = df['Weather_Condition'].str.contains('Fog', case=False, regex=True).astype('int8')
df.drop(columns=['Weather_Condition'], inplace=True)

# ======================================
# 4. LOCATION TARGET ENCODING
# ======================================
te = TargetEncoder(cols=['State', 'City'])
df[['State','City']] = te.fit_transform(df[['State','City']], df['Severity'])

# ======================================
# 5. BINARY INFRA FEATURES
# ======================================
for col in ['Traffic_Signal','Junction','Crossing','Stop']:
    df[col] = df[col].astype('int8')

# ======================================
# 6. REDUCE NUMERIC PRECISION
# ======================================
num_cols = df.select_dtypes(include=['float64','int64']).columns
df[num_cols] = df[num_cols].astype('float32')

# ======================================
# 7. TARGET FIX: Severity 1–4 → 0–3
# ======================================
df['Severity_Internal'] = df['Severity'] - 1
df.drop(columns=['Severity'], inplace=True)

X = df.drop(['Severity_Internal'], axis=1)
y = df['Severity_Internal'].astype('int8')

# ======================================
# 8. IMPUTATION FOR NUMERIC & CATEGORICAL
# ======================================
num_cols = X.select_dtypes(include=['int64','float64','float32']).columns
cat_cols = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), cat_cols)
    ],
    remainder='drop'
)

X = preprocessor.fit_transform(X)
X = X.astype('float32')  # ensure numeric type for SMOTE/XGBoost

# ======================================
# 9. TRAIN-TEST SPLIT
# ======================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Print class distribution for reference
unique, counts = np.unique(y_train, return_counts=True)
print("\n===== CLASS DISTRIBUTION IN y_train =====")
for cls, cnt in zip(unique, counts):
    print(f"Severity {cls+1} (internal {cls}): {cnt:,}")

# ======================================
# 10. APPLY SMOTE ON TRUE MINORITY CLASSES
# ======================================
sm = SMOTE(
    sampling_strategy={
        0: 200_000,   # Severity 1
        3: 400_000    # Severity 4
    },
    random_state=42
)

X_train, y_train = sm.fit_resample(X_train, y_train)

# ======================================
# 11. MEMORY-OPTIMIZED XGBOOST
# ======================================
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    tree_method='hist',
    max_depth=5,
    n_estimators=350,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=2,
    eval_metric='mlogloss',
    random_state=42
)

xgb.fit(X_train, y_train)

# ======================================
# 12. PREDICT & MAP BACK 0–3 → 1–4
# ======================================
y_pred_internal = xgb.predict(X_test)
y_pred = y_pred_internal + 1
y_true = y_test + 1

print("\n===== XGBOOST (MEMORY OPTIMIZED, FIXED SEVERITY) =====")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))



===== CLASS DISTRIBUTION IN y_train =====
Severity 1 (internal 0): 53,891
Severity 2 (internal 1): 4,347,102
Severity 3 (internal 2): 1,039,406
Severity 4 (internal 3): 147,783

===== XGBOOST (MEMORY OPTIMIZED, FIXED SEVERITY) =====
              precision    recall  f1-score   support

           1       0.29      0.05      0.09     13473
           2       0.84      0.94      0.89   1086776
           3       0.64      0.41      0.50    259851
           4       0.33      0.08      0.13     36946

    accuracy                           0.81   1397046
   macro avg       0.52      0.37      0.40   1397046
weighted avg       0.78      0.81      0.79   1397046

Accuracy: 0.8120197903290228


SECOND SEQUENCE, WITH IMPROVED CLASS WEIGHTS, STILL NOT BALANCING, POOR ACCURACY

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# ======================================
# 1. LOAD DATA
# ======================================
df = pd.read_parquet("US_Accidents_March23.parquet")

cols = [
    'Severity','Distance(mi)','Temperature(F)','Visibility(mi)','Humidity(%)','Pressure(in)',
    'Wind_Speed(mph)','Weather_Condition','Traffic_Signal','Junction','Crossing','Stop',
    'Start_Time','State','City'
]
df = df[cols]

# ======================================
# 2. DATETIME → NUMERIC
# ======================================
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
df.dropna(subset=['Start_Time'], inplace=True)
df['Hour'] = df['Start_Time'].dt.hour.astype('int8')
df['Month'] = df['Start_Time'].dt.month.astype('int8')
df['Weekday'] = df['Start_Time'].dt.weekday.astype('int8')
df.drop(columns=['Start_Time'], inplace=True)

# ======================================
# 3. WEATHER SIMPLIFICATION
# ======================================
df['Weather_Condition'] = df['Weather_Condition'].fillna('')
df['Rain'] = df['Weather_Condition'].str.contains('Rain|Storm|Thunder', case=False, regex=True).astype('int8')
df['Snow'] = df['Weather_Condition'].str.contains('Snow|Ice|Blizzard', case=False, regex=True).astype('int8')
df['Fog']  = df['Weather_Condition'].str.contains('Fog', case=False, regex=True).astype('int8')
df.drop(columns=['Weather_Condition'], inplace=True)

# ======================================
# 4. LOCATION TARGET ENCODING
# ======================================
te = TargetEncoder(cols=['State','City'])
df[['State','City']] = te.fit_transform(df[['State','City']], df['Severity'])

# ======================================
# 5. BINARY INFRA FEATURES
# ======================================
for col in ['Traffic_Signal','Junction','Crossing','Stop']:
    df[col] = df[col].astype('int8')

# ======================================
# 6. REDUCE NUMERIC PRECISION
# ======================================
num_cols = df.select_dtypes(include=['float64','int64']).columns
df[num_cols] = df[num_cols].astype('float32')

# ======================================
# 7. TARGET FIX: Severity 1–4 → 0–3
# ======================================
df['Severity_Internal'] = df['Severity'] - 1
df.drop(columns=['Severity'], inplace=True)

X = df.drop(['Severity_Internal'], axis=1)
y = df['Severity_Internal'].astype('int8')

# ======================================
# 8. IMPUTATION FOR NUMERIC & CATEGORICAL
# ======================================
num_cols = X.select_dtypes(include=['int64','float64','float32']).columns
cat_cols = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), cat_cols)
    ],
    remainder='drop'
)

X = preprocessor.fit_transform(X)
X = X.astype('float32')  # ensure numeric type

# ======================================
# 9. TRAIN-TEST SPLIT
# ======================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ======================================
# 10. CALCULATE CLASS WEIGHTS
# ======================================
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(classes, class_weights)}

# Convert to array of weights for XGBoost
sample_weights = np.array([class_weight_dict[cls] for cls in y_train])

# ======================================
# 11. APPLY SMOTE ON TRUE MINORITY CLASSES
# ======================================
sm = SMOTE(
    sampling_strategy={
        0: 150_000,  # Severity 1
        3: 200_000   # Severity 4
    },
    random_state=42
)

X_train, y_train = sm.fit_resample(X_train, y_train)
# Update sample weights after resampling
sample_weights = np.array([class_weight_dict[cls] for cls in y_train])

# ======================================
# 12. MEMORY-OPTIMIZED XGBOOST WITH CLASS WEIGHTS
# ======================================
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    tree_method='hist',
    max_depth=5,
    n_estimators=350,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=2,
    eval_metric='mlogloss',
    random_state=42
)

xgb.fit(X_train, y_train, sample_weight=sample_weights)

# ======================================
# 13. PREDICT & MAP BACK 0–3 → 1–4
# ======================================
y_pred_internal = xgb.predict(X_test)
y_pred = y_pred_internal + 1
y_true = y_test + 1

print("\n===== XGBOOST (WITH CLASS WEIGHTS & SMOTE) =====")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))



===== XGBOOST (WITH CLASS WEIGHTS & SMOTE) =====
              precision    recall  f1-score   support

           1       0.04      0.88      0.07     13473
           2       0.95      0.38      0.54   1086776
           3       0.48      0.54      0.51    259851
           4       0.09      0.84      0.16     36946

    accuracy                           0.42   1397046
   macro avg       0.39      0.66      0.32   1397046
weighted avg       0.83      0.42      0.52   1397046

Accuracy: 0.42218867524762965


IMPROVED WEIGHTS AND BALANCING, OVER SAMPLED SEVERITY 1 AND 4, IGNORED 2 AND 3, POOR ACCURACY

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from category_encoders import TargetEncoder
from sklearn.utils.class_weight import compute_sample_weight

# --------------------------------------
# 1. LOAD DATA
# --------------------------------------
df = pd.read_parquet("US_Accidents_March23.parquet")

# Keep only relevant columns for memory efficiency
cols = [
    'Severity','Distance(mi)','Temperature(F)','Visibility(mi)','Humidity(%)','Pressure(in)',
    'Wind_Speed(mph)','Weather_Condition','Traffic_Signal','Junction','Crossing','Stop',
    'Start_Time','State','City'
]
df = df[cols]

# --------------------------------------
# 2. DATETIME → NUMERIC
# --------------------------------------
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
df.dropna(subset=['Start_Time'], inplace=True)
df['Hour'] = df['Start_Time'].dt.hour.astype('int8')
df['Month'] = df['Start_Time'].dt.month.astype('int8')
df['Weekday'] = df['Start_Time'].dt.weekday.astype('int8')
df.drop(columns=['Start_Time'], inplace=True)

# --------------------------------------
# 3. SIMPLIFY WEATHER
# --------------------------------------
df['Weather_Condition'] = df['Weather_Condition'].fillna('')
df['Rain'] = df['Weather_Condition'].str.contains('Rain|Storm|Thunder', case=False, regex=True).astype('int8')
df['Snow'] = df['Weather_Condition'].str.contains('Snow|Ice|Blizzard', case=False, regex=True).astype('int8')
df['Fog']  = df['Weather_Condition'].str.contains('Fog', case=False, regex=True).astype('int8')
df.drop(columns=['Weather_Condition'], inplace=True)

# --------------------------------------
# 4. LOCATION TARGET ENCODING
# --------------------------------------
te = TargetEncoder(cols=['State', 'City'])
df[['State','City']] = te.fit_transform(df[['State','City']], df['Severity'])

# --------------------------------------
# 5. BINARY INFRA FEATURES
# --------------------------------------
for col in ['Traffic_Signal','Junction','Crossing','Stop']:
    df[col] = df[col].astype('int8')

# --------------------------------------
# 6. REDUCE NUMERIC PRECISION
# --------------------------------------
num_cols = df.select_dtypes(include=['float64','int64']).columns
df[num_cols] = df[num_cols].astype('float32')

# --------------------------------------
# 7. TARGET FIX: Convert Severity 1–4 → 0–3
# --------------------------------------
df['Severity_Internal'] = df['Severity'] - 1
df.drop(columns=['Severity'], inplace=True)

X = df.drop(['Severity_Internal'], axis=1)
y = df['Severity_Internal'].astype('int8')

# --------------------------------------
# 8. IMPUTE MISSING VALUES
# --------------------------------------
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer 

num_cols = X.select_dtypes(include=['int64','float32']).columns
cat_cols = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), cat_cols)
    ],
    remainder='drop'
)

X = preprocessor.fit_transform(X)
X = X.astype('float32')  # ensure float32 for XGBoost

# --------------------------------------
# 9. TRAIN-TEST SPLIT
# --------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------------------
# 10. COMPUTE CLASS WEIGHTS
# --------------------------------------
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# --------------------------------------
# 11. MEMORY-OPTIMIZED XGBOOST
# --------------------------------------
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    tree_method='hist',
    max_depth=5,
    n_estimators=350,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=2,
    eval_metric='mlogloss',
    random_state=42,
    use_label_encoder=False
)

xgb.fit(X_train, y_train, sample_weight=sample_weights)

# --------------------------------------
# 12. PREDICT & MAP BACK 0–3 → 1–4
# --------------------------------------
y_pred_internal = xgb.predict(X_test)
y_pred = y_pred_internal + 1
y_true = y_test + 1

print("===== XGBOOST (CLASS WEIGHTS ONLY) =====")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


===== XGBOOST (CLASS WEIGHTS ONLY) =====
              precision    recall  f1-score   support

           1       0.05      0.81      0.09     13473
           2       0.95      0.44      0.60   1086776
           3       0.46      0.63      0.53    259851
           4       0.10      0.82      0.17     36946

    accuracy                           0.49   1397046
   macro avg       0.39      0.67      0.35   1397046
weighted avg       0.82      0.49      0.57   1397046

Accuracy: 0.4901492148433194


WITH ADDITIONAL TEMPORAL FEATURES

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# ======================================
# 1. LOAD DATA
# ======================================
df = pd.read_parquet("US_Accidents_March23.parquet")

cols = [
    'Severity','Distance(mi)','Temperature(F)','Visibility(mi)','Humidity(%)','Pressure(in)',
    'Wind_Speed(mph)','Weather_Condition','Traffic_Signal','Junction','Crossing','Stop',
    'Start_Time','State','City'
]
df = df[cols]

# ======================================
# 2. DATETIME → NUMERIC
# ======================================
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
df.dropna(subset=['Start_Time'], inplace=True)

df['Hour'] = df['Start_Time'].dt.hour.astype('int8')
df['Month'] = df['Start_Time'].dt.month.astype('int8')
df['Weekday'] = df['Start_Time'].dt.weekday.astype('int8')
df['Weekend'] = (df['Weekday'] >= 5).astype('int8')
df['Night'] = (df['Hour'] >= 19).astype('int8')
df['Is_Rush_Hour'] = df['Hour'].isin([7,8,17,18]).astype('int8')

df.drop(columns=['Start_Time'], inplace=True)

# ======================================
# 3. WEATHER SIMPLIFICATION
# ======================================
df['Weather_Condition'] = df['Weather_Condition'].fillna('')
df['Rain'] = df['Weather_Condition'].str.contains('Rain|Storm|Thunder', case=False).astype('int8')
df['Snow'] = df['Weather_Condition'].str.contains('Snow|Ice|Blizzard', case=False).astype('int8')
df['Fog']  = df['Weather_Condition'].str.contains('Fog', case=False).astype('int8')

# NEW DERIVED WEATHER SIGNALS
df['Low_Visibility'] = (df['Visibility(mi)'] < 1).astype('int8')
df['High_Wind'] = (df['Wind_Speed(mph)'] > 30).astype('int8')
df['Temp_Below_Freezing'] = (df['Temperature(F)'] < 32).astype('int8')

df.drop(columns=['Weather_Condition'], inplace=True)

# ======================================
# 4. LOCATION TARGET ENCODING
# ======================================
te = TargetEncoder(cols=['State', 'City'])
df[['State','City']] = te.fit_transform(df[['State','City']], df['Severity'])

# ADD CITY DENSITY SIGNAL
city_counts = df['City'].value_counts().to_dict()
df['City_Frequency'] = df['City'].map(city_counts).astype('float32')

# ======================================
# 5. ROAD / INFRA FEATURES
# ======================================
for col in ['Traffic_Signal','Junction','Crossing','Stop']:
    df[col] = df[col].astype('int8')

# PROXY FOR HIGHWAY SPEED ROADS
df['Highway'] = df['City'].astype(str).str.contains(r'\b(I-|US-|SR-|Hwy)', case=False, regex=True).astype('int8')

# ======================================
# 6. DISTANCE NONLINEARITY
# ======================================
df['Short_Distance'] = (df['Distance(mi)'] < 0.2).astype('int8')
df['Long_Distance']  = (df['Distance(mi)'] > 2).astype('int8')

# ======================================
# 7. REDUCE NUMERIC PRECISION
# ======================================
num_cols = df.select_dtypes(include=['float64','int64']).columns
df[num_cols] = df[num_cols].astype('float32')

# ======================================
# 8. TARGET FIX: Severity 1–4 → 0–3
# ======================================
df['Severity_Internal'] = df['Severity'] - 1
df.drop(columns=['Severity'], inplace=True)

X = df.drop(['Severity_Internal'], axis=1)
y = df['Severity_Internal'].astype('int8')

# ======================================
# 9. IMPUTATION
# ======================================
num_cols = X.select_dtypes(include=['float32','float64','int64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), cat_cols)
    ],
    remainder='drop'
)

X = preprocessor.fit_transform(X).astype('float32')

# ======================================
# 10. TRAIN-TEST SPLIT
# ======================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

unique, counts = np.unique(y_train, return_counts=True)
print("\n===== CLASS DISTRIBUTION IN y_train =====")
for cls, cnt in zip(unique, counts):
    print(f"Severity {cls+1}: {cnt:,}")

# ======================================
# 11. SMOTE
# ======================================
sm = SMOTE(
    sampling_strategy={0: 200_000, 3: 400_000},
    random_state=42
)
X_train, y_train = sm.fit_resample(X_train, y_train)

# ======================================
# 12. XGBOOST
# ======================================
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    tree_method='hist',
    max_depth=6,                  # ← Adjusted for new features
    n_estimators=400,             # ← More trees
    learning_rate=0.045,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=2,
    eval_metric='mlogloss',
    random_state=42
)

xgb.fit(X_train, y_train)

# ======================================
# 13. PREDICT & MAP BACK
# ======================================
y_pred_internal = xgb.predict(X_test)
y_pred = y_pred_internal + 1
y_true = y_test + 1

print("\n===== XGBOOST WITH ENHANCED FEATURES =====")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))


  df['Highway'] = df['City'].astype(str).str.contains(r'\b(I-|US-|SR-|Hwy)', case=False, regex=True).astype('int8')



===== CLASS DISTRIBUTION IN y_train =====
Severity 1: 53,891
Severity 2: 4,347,102
Severity 3: 1,039,406
Severity 4: 147,783

===== XGBOOST WITH ENHANCED FEATURES =====
              precision    recall  f1-score   support

           1       0.33      0.05      0.09     13473
           2       0.84      0.94      0.89   1086776
           3       0.65      0.42      0.51    259851
           4       0.42      0.13      0.19     36946

    accuracy                           0.82   1397046
   macro avg       0.56      0.39      0.42   1397046
weighted avg       0.79      0.82      0.79   1397046

Accuracy: 0.8162580187051822


IMPLEMENTING CLASS WEIGHTS + FOCAL LOSS TO ADDED FEATURES

In [19]:
# ---------------------------
# ITERATIVE FOCAL RE-WEIGHTING + CLASS WEIGHTS for XGBoost
# ---------------------------
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import numpy as np

# Parameters
gamma = 2.0              # focal focusing parameter (common: 1.0-3.0)
n_rounds = 3             # number of re-weighting iterations (2-3 is usually enough)
xgb_params = {
    "objective": "multi:softprob",
    "num_class": 4,
    "eta": 0.08,
    "max_depth": 8,
    "min_child_weight": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    "gamma": 0.1,
    "lambda": 1.5,
    "tree_method": "hist", # memory efficient
}

# Ensure arrays (X_train from preprocessor may already be np.array)
X_train_arr = np.asarray(X_train, dtype=np.float32)
y_train_arr = np.asarray(y_train, dtype=np.int32)

# 1) compute class-level weights (balanced)
classes = np.unique(y_train_arr)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_arr)
# map into dict: class -> weight
cw = {int(c): float(w) for c, w in zip(classes, class_weights)}
print("Class weights:", cw)

# initialize sample weights as class weights
sample_weight = np.array([cw[int(lbl)] for lbl in y_train_arr], dtype=np.float32)

# iterative reweighting
for it in range(n_rounds):
    print(f"\n--- Focal reweighting iteration {it+1}/{n_rounds} ---")
    # train xgboost with current sample weights
    model = XGBClassifier(**xgb_params)
    model.fit(X_train_arr, y_train_arr, sample_weight=sample_weight, verbose=False)

    # predict probabilities on training set (softprob)
    proba = model.predict_proba(X_train_arr)  # shape (n_samples, n_classes)
    # probability for the true class p_t
    p_t = proba[np.arange(len(y_train_arr)), y_train_arr]

    # compute focal reweighting factor: (1 - p_t) ** gamma
    focal_factor = np.power(1.0 - np.clip(p_t, 1e-12, 1.0), gamma).astype(np.float32)

    # new sample weight = class_weight * focal_factor
    new_sample_weight = np.array([cw[int(lbl)] for lbl in y_train_arr], dtype=np.float32) * focal_factor

    # stabilize weights by scaling to have same sum as before (keeps total weight scale stable)
    if new_sample_weight.sum() > 0:
        new_sample_weight = new_sample_weight * (sample_weight.sum() / new_sample_weight.sum())

    sample_weight = new_sample_weight

    # report training-side diagnostics (class-wise average p_t and focal factor)
    avg_pt_by_class = {}
    avg_factor_by_class = {}
    for c in classes:
        mask = (y_train_arr == c)
        if mask.sum() == 0:
            avg_pt_by_class[c] = np.nan
            avg_factor_by_class[c] = np.nan
        else:
            avg_pt_by_class[c] = float(p_t[mask].mean())
            avg_factor_by_class[c] = float(focal_factor[mask].mean())
    print("Avg p_t by class (train):", avg_pt_by_class)
    print("Avg focal factor by class (train):", avg_factor_by_class)

# After iterations, final model is `model`
# Evaluate on test set (X_test may be np.array; ensure dtype)
X_test_arr = np.asarray(X_test, dtype=np.float32)
y_test_arr = np.asarray(y_test, dtype=np.int32)

y_pred_internal = model.predict(X_test_arr)
# map back to 1-4 externally if you want
y_pred = y_pred_internal + 1
y_true = y_test_arr + 1

from sklearn.metrics import classification_report, accuracy_score
print("\n===== XGBOOST with CLASS WEIGHTS + FOCAL REWEIGHTING =====")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))


Class weights: {0: 7.483135, 1: 0.3442815466487789, 2: 1.4398868199721764, 3: 3.7415675}

--- Focal reweighting iteration 1/3 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Avg p_t by class (train): {0: 0.6731884479522705, 1: 0.46296611428260803, 2: 0.5538685917854309, 3: 0.6795997619628906}
Avg focal factor by class (train): {0: 0.1793413907289505, 1: 0.3437821567058563, 2: 0.2633805274963379, 3: 0.16095396876335144}

--- Focal reweighting iteration 2/3 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Avg p_t by class (train): {0: 0.3731985092163086, 1: 0.3801787495613098, 2: 0.3061244487762451, 3: 0.30045434832572937}
Avg focal factor by class (train): {0: 0.424507737159729, 1: 0.39740464091300964, 2: 0.49012163281440735, 3: 0.50416100025177}

--- Focal reweighting iteration 3/3 ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Avg p_t by class (train): {0: 0.6766406893730164, 1: 0.4387747645378113, 2: 0.5851922035217285, 3: 0.7054749131202698}
Avg focal factor by class (train): {0: 0.18709827959537506, 1: 0.39136460423469543, 2: 0.2563002109527588, 3: 0.1570015847682953}

===== XGBOOST with CLASS WEIGHTS + FOCAL REWEIGHTING =====
              precision    recall  f1-score   support

           1       0.05      0.72      0.10     13473
           2       0.94      0.49      0.64   1086776
           3       0.45      0.69      0.54    259851
           4       0.11      0.74      0.19     36946

    accuracy                           0.53   1397046
   macro avg       0.39      0.66      0.37   1397046
weighted avg       0.82      0.53      0.61   1397046

Accuracy: 0.5336953829723574


In [20]:
# =========================
# FULL: Stable focal reweighting + calibration
# =========================
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import gc

# --- Parameters (tweakable) ---
gamma = 0.5            # focal focusing parameter (reduced to avoid over-focus)
n_rounds = 2           # iterative reweighting rounds (2 is usually enough)
max_calib_samples = 50000  # max samples to use for probability calibration
random_state = 42

# --- Ensure numpy arrays and dtypes (memory friendly) ---
X_train_arr = np.asarray(X_train, dtype=np.float32)
X_test_arr = np.asarray(X_test, dtype=np.float32)
y_train_arr = np.asarray(y_train, dtype=np.int32)  # internal labels 0..3
y_test_arr  = np.asarray(y_test, dtype=np.int32)

print("Shapes:", X_train_arr.shape, X_test_arr.shape, y_train_arr.shape)

# --- Compute class-level balanced weights (as baseline) ---
classes = np.unique(y_train_arr)
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_arr)
cw_dict = {int(c): float(w) for c, w in zip(classes, cw)}
print("Baseline class weights:", cw_dict)

# Scale down the class weights a bit to avoid huge gradients (optional)
# e.g., bring max weight to at most 3.0
max_allowed = 3.0
scale = min(1.0, max_allowed / max(cw_dict.values()))
if scale < 1.0:
    cw_dict = {k: float(v * scale) for k, v in cw_dict.items()}
    print("Scaled class weights:", cw_dict)

# Initialize sample weights = class weights
sample_weight = np.array([cw_dict[int(lbl)] for lbl in y_train_arr], dtype=np.float32)

# XGBoost baseline parameters (memory-optimized)
xgb_params = dict(
    objective='multi:softprob',
    num_class=4,
    tree_method='hist',
    max_depth=6,
    n_estimators=350,
    learning_rate=0.05,
    subsample=0.75,
    colsample_bytree=0.7,
    reg_lambda=2.0,
    eval_metric='mlogloss',
    random_state=random_state,
    use_label_encoder=False
)

model = None
for it in range(n_rounds):
    print(f"\n--- Iteration {it+1}/{n_rounds} — training with current sample weights ---")
    # Train model with current sample weights
    model = XGBClassifier(**xgb_params)
    model.fit(X_train_arr, y_train_arr, sample_weight=sample_weight, verbose=False)

    # Predict probabilities on training set
    proba = model.predict_proba(X_train_arr)  # shape (n_train, n_classes)
    # p_t: probability assigned to the true class for each sample
    p_t = proba[np.arange(len(y_train_arr)), y_train_arr].astype(np.float32)

    # Focal factor: (1 - p_t) ** gamma
    focal_factor = np.power(1.0 - np.clip(p_t, 1e-12, 1.0), gamma).astype(np.float32)

    # New sample weights = class_weight * focal_factor
    new_sample_weight = np.array([cw_dict[int(lbl)] for lbl in y_train_arr], dtype=np.float32) * focal_factor

    # Stabilize total weight scale: keep same sum as previous iteration
    prev_sum = sample_weight.sum()
    new_sum = new_sample_weight.sum()
    if new_sum > 0:
        new_sample_weight = new_sample_weight * (prev_sum / new_sum)

    sample_weight = new_sample_weight

    # Diagnostics: average p_t and focal factor per class (training)
    avg_pt = {}
    avg_factor = {}
    for c in classes:
        mask = (y_train_arr == c)
        if mask.sum() == 0:
            avg_pt[c] = np.nan
            avg_factor[c] = np.nan
        else:
            avg_pt[c] = float(p_t[mask].mean())
            avg_factor[c] = float(focal_factor[mask].mean())
    print("Avg p_t by class (train):", avg_pt)
    print("Avg focal factor by class (train):", avg_factor)
    gc.collect()

# Final trained model after reweighting
final_model = model

# --------------------------
# Probability calibration
# --------------------------
# Use a small stratified subset of the *training* set for calibration to avoid heavy cost
calib_size = min(max_calib_samples, int(len(X_train_arr) * 0.1))
if calib_size < 2000:
    calib_size = min(2000, len(X_train_arr))

# stratified sample
X_cal, _, y_cal, _ = train_test_split(
    X_train_arr, y_train_arr, train_size=calib_size, stratify=y_train_arr, random_state=random_state
)
print(f"Using {len(X_cal)} samples to calibrate probabilities (sigmoid).")

# Wrap the prefit model with CalibratedClassifierCV (cv='prefit') and fit on small calibration set
calibrated = CalibratedClassifierCV(base_estimator=final_model, method='sigmoid', cv='prefit')
calibrated.fit(X_cal, y_cal)

# --------------------------
# Evaluate on test set
# --------------------------
y_pred_internal = calibrated.predict(X_test_arr)
y_pred = y_pred_internal + 1           # map back to 1-4 externally
y_true = y_test_arr + 1

print("\n===== XGBOOST (C3: class weights + focal reweighting + calibration) =====")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))


Shapes: (5986508, 9) (1397046, 9) (5986508,)
Baseline class weights: {0: 7.483135, 1: 0.3442815466487789, 2: 1.4398868199721764, 3: 3.7415675}
Scaled class weights: {0: 3.0, 1: 0.13802298634814644, 2: 0.5772527770669017, 3: 1.5}

--- Iteration 1/2 — training with current sample weights ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Avg p_t by class (train): {0: 0.6792340874671936, 1: 0.4683409333229065, 2: 0.5571920871734619, 3: 0.6835695505142212}
Avg focal factor by class (train): {0: 0.5087133049964905, 1: 0.7066897749900818, 2: 0.6350737810134888, 3: 0.5160141587257385}

--- Iteration 2/2 — training with current sample weights ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Avg p_t by class (train): {0: 0.6199984550476074, 1: 0.4510178565979004, 2: 0.5051166415214539, 3: 0.6096357107162476}
Avg focal factor by class (train): {0: 0.5823259949684143, 1: 0.7306800484657288, 2: 0.6906033754348755, 3: 0.599987804889679}
Using 50000 samples to calibrate probabilities (sigmoid).


TypeError: CalibratedClassifierCV.__init__() got an unexpected keyword argument 'base_estimator'

In [21]:
calibrated = CalibratedClassifierCV(estimator=final_model, method='sigmoid', cv='prefit')
calibrated.fit(X_cal, y_cal)

# --------------------------
# Evaluate on test set
# --------------------------
y_pred_internal = calibrated.predict(X_test_arr)
y_pred = y_pred_internal + 1
y_true = y_test_arr + 1

print("\n===== XGBOOST (C3: class weights + focal reweighting + calibration) =====")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))





===== XGBOOST (C3: class weights + focal reweighting + calibration) =====
              precision    recall  f1-score   support

           1       0.20      0.14      0.17     13473
           2       0.84      0.94      0.89   1086776
           3       0.66      0.40      0.50    259851
           4       0.35      0.21      0.26     36946

    accuracy                           0.81   1397046
   macro avg       0.51      0.42      0.45   1397046
weighted avg       0.79      0.81      0.79   1397046

Accuracy: 0.8088638455712983


HIERARCHICAL SEVEERITY PREDICTION

In [23]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# ======================================
# STAGE 1: LOW (1-2) vs HIGH (3-4)
# ======================================
y_stage1 = np.where(y <= 1, 0, 1)  # 0=Low, 1=High

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y_stage1, test_size=0.2, random_state=42, stratify=y_stage1
)

clf_stage1 = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    max_depth=5,
    n_estimators=250,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=2,
    eval_metric='logloss',
    random_state=42
)

clf_stage1.fit(X_train1, y_train1)
pred_stage1 = clf_stage1.predict(X_test1)

print("\n===== STAGE 1: LOW vs HIGH =====")
print(classification_report(y_test1, pred_stage1))

# ======================================
# SPLIT TEST DATA BASED ON STAGE 1 OUTPUT
# ======================================
idx_low  = np.where(pred_stage1 == 0)[0]
idx_high = np.where(pred_stage1 == 1)[0]

X_test_low  = X_test1[idx_low]
X_test_high = X_test1[idx_high]
y_test_low  = y_test.iloc[idx_low]      # original severity 0–3
y_test_high = y_test.iloc[idx_high] 

# ======================================
# STAGE 2A — LOW MODEL: SEVERITY 1 vs 2
# ======================================
mask_low = (y <= 1)
X_low = X[mask_low]
y_low = y[mask_low]

clf_low = XGBClassifier(
    objective='multi:softmax',
    num_class=2,
    tree_method='hist',
    max_depth=5,
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=2,
    eval_metric='mlogloss',
    random_state=42
)

clf_low.fit(X_low, y_low)
pred_low = clf_low.predict(X_test_low)

# ======================================
# STAGE 2B — HIGH MODEL: SEVERITY 3 vs 4
# ======================================
mask_high = (y >= 2)
X_high = X[mask_high]
y_high = y[mask_high] - 2  # convert 2→0, 3→1

clf_high = XGBClassifier(
    objective='multi:softmax',
    num_class=2,
    tree_method='hist',
    max_depth=5,
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=2,
    eval_metric='mlogloss',
    random_state=42
)

clf_high.fit(X_high, y_high)
pred_high = clf_high.predict(X_test_high) + 2  # map back

# ======================================
# MERGE FINAL PREDICTIONS
# ======================================
y_pred_final = np.zeros_like(y_test1)
y_pred_final[idx_low]  = pred_low
y_pred_final[idx_high] = pred_high

y_pred_final += 1      # convert internal 0–3 → real 1–4
y_true_final = y_test + 1

print("\n===== HIERARCHICAL SEVERITY CLASSIFIER =====")
print(classification_report(y_true_final, y_pred_final))
print("Accuracy:", accuracy_score(y_true_final, y_pred_final))



===== STAGE 1: LOW vs HIGH =====
              precision    recall  f1-score   support

           0       0.85      0.94      0.90   1100249
           1       0.65      0.39      0.49    296797

    accuracy                           0.83   1397046
   macro avg       0.75      0.67      0.69   1397046
weighted avg       0.81      0.83      0.81   1397046


===== HIERARCHICAL SEVERITY CLASSIFIER =====
              precision    recall  f1-score   support

           1       0.00      0.00      0.00     13473
           2       0.78      0.87      0.82   1086776
           3       0.19      0.12      0.15    259851
           4       0.03      0.00      0.01     36946

    accuracy                           0.70   1397046
   macro avg       0.25      0.25      0.24   1397046
weighted avg       0.64      0.70      0.67   1397046

Accuracy: 0.701827284140966


Ordinal XGBoost

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

# =====================================================
# 1. LOAD DATA (ADAPT THIS PART IF NECESSARY)
# =====================================================
# Ensure df already preprocessed and Severity ∈ {1,2,3,4}

X = df.drop(columns=['Severity_Internal'])
y = df['Severity_Internal']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train samples:", len(X_train))
print("Test samples:", len(X_test))
print("Class counts:\n", y_train.value_counts())

# =====================================================
# 2. COMPUTE CLASS WEIGHTS
# =====================================================
class_counts = y_train.value_counts().sort_index()
total = len(y_train)
K = len(class_counts)

class_weights = {cls: total / (K * count) for cls, count in class_counts.items()}

print("\nClass weights:", class_weights)

sample_weight_train = y_train.map(class_weights)

# =====================================================
# 3. TRAIN ORDINAL XGBOOST MODELS
# =====================================================
ordinal_models = []
classes = sorted(class_weights.keys())  # [1,2,3,4]

for k in range(K - 1):  # train models for thresholds 1|2, 2|3, 3|4
    print(f"\nTraining threshold model for y > {classes[k]} ...")

    y_binary = (y_train > classes[k]).astype(int)

    dtrain = xgb.DMatrix(
        X_train,
        label=y_binary,
        weight=sample_weight_train
    )

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": 0.05,
        "max_depth": 8,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 3,
        "tree_method": "hist"  # faster for large data
    }

    model = xgb.train(params, dtrain, num_boost_round=500)
    ordinal_models.append(model)

print("\nFinished training all ordinal models.")

# =====================================================
# 4. PREDICT ON TEST SET
# =====================================================
dtest = xgb.DMatrix(X_test)

# P(y > k)
P = np.column_stack([m.predict(dtest) for m in ordinal_models])

probs = np.zeros((len(P), K))
probs[:, 0] = 1 - P[:, 0]  # P(y = 1)

for k in range(1, K - 1):
    probs[:, k] = P[:, k - 1] - P[:, k]  # P(y = 2,3)

probs[:, K - 1] = P[:, K - 2]  # P(y = 4)

y_pred = np.argmax(probs, axis=1) + 1

# =====================================================
# 5. EVALUATE MODEL
# =====================================================
print("\n===== ORDINAL XGBOOST RESULTS =====")
print(classification_report(y_test, y_pred, digits=4))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Optional: Show predicted distribution
print("\nPredicted severity distribution:\n", pd.Series(y_pred).value_counts().sort_index())


Train samples: 5588182
Test samples: 1397046
Class counts:
 Severity_Internal
1.0    4347102
2.0    1039406
3.0     147783
0.0      53891
Name: count, dtype: int64

Class weights: {0.0: 25.92354010873801, 1.0: 0.32137398662373234, 2.0: 1.3440806576063635, 3.0: 9.453357287374056}

Training threshold model for y > 0.0 ...

Training threshold model for y > 1.0 ...

Training threshold model for y > 2.0 ...

Finished training all ordinal models.

===== ORDINAL XGBOOST RESULTS =====


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

         0.0     0.0000    0.0000    0.0000     13473
         1.0     0.7587    0.0883    0.1582   1086776
         2.0     0.0415    0.1091    0.0602    259851
         3.0     0.0057    0.0557    0.0104     36946
         4.0     0.0000    0.0000    0.0000         0

    accuracy                         0.0905   1397046
   macro avg     0.1612    0.0506    0.0457   1397046
weighted avg     0.5981    0.0905    0.1345   1397046

Accuracy: 0.09046015664480625

Predicted severity distribution:
 1    126475
2    682663
3    360536
4    227372
Name: count, dtype: int64


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [4]:
df.columns


Index(['Distance(mi)', 'Temperature(F)', 'Visibility(mi)', 'Humidity(%)',
       'Pressure(in)', 'Wind_Speed(mph)', 'Traffic_Signal', 'Junction',
       'Crossing', 'Stop', 'State', 'City', 'Hour', 'Month', 'Weekday',
       'Weekend', 'Night', 'Is_Rush_Hour', 'Rain', 'Snow', 'Fog',
       'Low_Visibility', 'High_Wind', 'Temp_Below_Freezing', 'City_Frequency',
       'Highway', 'Short_Distance', 'Long_Distance', 'Severity_Internal'],
      dtype='object')