Step-by-Step Preprocessing Plan


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv("Nigeria Crash Data.csv")


print("Dataset Shape:", df.shape)
print("\n")
print("\nColumns:", df.columns.tolist())
print("\n")
print("\nData Types:")
print(df.dtypes)
print("\n")
print("\nMissing Values:")
print(df.isnull().sum())
print("\n")

Dataset Shape: (518, 11)



Columns: ['Quarter', 'State', 'Total_Crashes', 'Num_Injured', 'Num_Killed', 'Total_Vehicles_Involved', 'SPV', 'DAD', 'PWR', 'FTQ', 'Other_Factors']



Data Types:
Quarter                    object
State                      object
Total_Crashes               int64
Num_Injured                 int64
Num_Killed                  int64
Total_Vehicles_Involved     int64
SPV                         int64
DAD                         int64
PWR                         int64
FTQ                         int64
Other_Factors               int64
dtype: object



Missing Values:
Quarter                    0
State                      0
Total_Crashes              0
Num_Injured                0
Num_Killed                 0
Total_Vehicles_Involved    0
SPV                        0
DAD                        0
PWR                        0
FTQ                        0
Other_Factors              0
dtype: int64




Data Cleaning & Validation

In [12]:

df['Other_Factors'] = df['Other_Factors'].apply(lambda x: max(0, x) if pd.notnull(x) else x)


df['Quarter_Num'] = df['Quarter'].str.extract(r'Q(\d+)').astype(int)
df['Year'] = df['Quarter'].str.extract(r'(\d{4})').astype(int)


df['Quarter_Date'] = pd.to_datetime(df['Year'].astype(str) + '-' +
                                   (df['Quarter_Num']*3).astype(str) + '-01')


df['State'] = df['State'].str.strip()


duplicates = df.duplicated(subset=['Quarter', 'State']).sum()
print(f"Duplicate records found: {duplicates}")

Duplicate records found: 0


Feature Engineering

In [14]:

df['Total_Casualties'] = df['Num_Injured'] + df['Num_Killed']
df['Casualty_Rate'] = df['Total_Casualties'] / df['Total_Vehicles_Involved'].replace(0, 1)
df['Fatality_Rate'] = df['Num_Killed'] / df['Total_Casualties'].replace(0, 1)
df['Vehicles_per_Crash'] = df['Total_Vehicles_Involved'] / df['Total_Crashes'].replace(0, 1)


factors = ['SPV', 'DAD', 'PWR', 'FTQ', 'Other_Factors']
df['Total_Factors'] = df[factors].sum(axis=1)
for factor in factors:
    df[f'{factor}_Prop'] = df[factor] / df['Total_Factors'].replace(0, 1)

df['Quarter_Sin'] = np.sin(2 * np.pi * df['Quarter_Num']/4)
df['Quarter_Cos'] = np.cos(2 * np.pi * df['Quarter_Num']/4)


df['Severe_Crash'] = (df['Num_Killed'] > 5).astype(int)
df['High_Casualty'] = (df['Total_Casualties'] > df['Total_Casualties'].median()).astype(int)

Handling Categorical Values

In [15]:

state_encoder = LabelEncoder()
df['State_Encoded'] = state_encoder.fit_transform(df['State'])


state_dummies = pd.get_dummies(df['State'], prefix='State', drop_first=True)
df = pd.concat([df, state_dummies], axis=1)


regions = {
    'North West': ['Jigawa', 'Kaduna', 'Kano', 'Katsina', 'Kebbi', 'Sokoto', 'Zamfara'],
    'North East': ['Adamawa', 'Bauchi', 'Borno', 'Gombe', 'Taraba', 'Yobe'],
    'North Central': ['Benue', 'Kogi', 'Kwara', 'Nasarawa', 'Niger', 'Plateau', 'FCT'],
    'South West': ['Ekiti', 'Lagos', 'Ogun', 'Ondo', 'Osun', 'Oyo'],
    'South East': ['Abia', 'Anambra', 'Ebonyi', 'Enugu', 'Imo'],
    'South South': ['Akwa Ibom', 'Bayelsa', 'Cross River', 'Delta', 'Edo', 'Rivers']
}


state_to_region = {}
for region, states in regions.items():
    for state in states:
        state_to_region[state] = region

df['Region'] = df['State'].map(state_to_region)
region_encoder = LabelEncoder()
df['Region_Encoded'] = region_encoder.fit_transform(df['Region'])

Create Training/Test Split Ready Dataset

In [16]:

feature_columns = [

    'Total_Crashes', 'Total_Vehicles_Involved',


    'Total_Casualties', 'Casualty_Rate', 'Fatality_Rate',


    'SPV_Prop', 'DAD_Prop', 'PWR_Prop', 'FTQ_Prop', 'Other_Factors_Prop',


    'Year', 'Quarter_Num', 'Quarter_Sin', 'Quarter_Cos',


    'State_Encoded', 'Region_Encoded'
]


region_dummies = pd.get_dummies(df['Region'], prefix='Region', drop_first=True)
df = pd.concat([df, region_dummies], axis=1)


target_options = {
    'Num_Killed': 'Regression - Number of fatalities',
    'Num_Injured': 'Regression - Number of injuries',
    'Total_Casualties': 'Regression - Total casualties',
    'Severe_Crash': 'Classification - Binary severe crash indicator',
    'High_Casualty': 'Classification - High casualty indicator'
}

print("Available target variables:")
for target, description in target_options.items():
    print(f"  - {target}: {description}")


X = df[feature_columns].copy()


X = X.fillna(X.median())


scaler = StandardScaler()
scaled_columns = [col for col in X.columns if col not in ['State_Encoded', 'Region_Encoded', 'Quarter_Num', 'Year']]
X_scaled = X.copy()
X_scaled[scaled_columns] = scaler.fit_transform(X[scaled_columns])

print(f"\nFeature matrix shape: {X_scaled.shape}")
print(f"Features available: {list(X_scaled.columns)}")

Available target variables:
  - Num_Killed: Regression - Number of fatalities
  - Num_Injured: Regression - Number of injuries
  - Total_Casualties: Regression - Total casualties
  - Severe_Crash: Classification - Binary severe crash indicator
  - High_Casualty: Classification - High casualty indicator

Feature matrix shape: (518, 16)
Features available: ['Total_Crashes', 'Total_Vehicles_Involved', 'Total_Casualties', 'Casualty_Rate', 'Fatality_Rate', 'SPV_Prop', 'DAD_Prop', 'PWR_Prop', 'FTQ_Prop', 'Other_Factors_Prop', 'Year', 'Quarter_Num', 'Quarter_Sin', 'Quarter_Cos', 'State_Encoded', 'Region_Encoded']


Saving File

In [19]:
df.to_csv("Nigeria Crash Data P.csv", index=False)