In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib
import re

class FeatureEngineer:
    def __init__(self):
        self.label_encoders = {}

    def create_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract time-based features from datetime"""
        df['hour'] = df['datetime'].dt.hour
        df['day_of_week'] = df['datetime'].dt.dayofweek 
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        df['is_rush_hour'] = ((df['hour'].between(7, 9)) | (df['hour'].between(17, 19))).astype(int)
        return df

    def create_weather_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map weather conditions to risk scores with broader mapping"""
        def map_weather(w):
            w = str(w).lower()
            if 'storm' in w or 'blizzard' in w or 'hurricane' in w:
                return 5
            elif 'rain' in w or 'snow' in w or 'fog' in w or 'mist' in w:
                return 4
            elif 'cloud' in w or 'overcast' in w or 'hazy' in w or 'drizzle' in w:
                return 2
            else:
                return 1  # Clear or sunny
        df['weather_score'] = df['Weather Conditions'].apply(map_weather)
        return df

    def create_road_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map road type and condition to risk scores"""
        type_map = {'National Highway': 5, 'State Highway': 4, 'Urban Road': 3, 'Village Road': 2, 'Rural Road': 2}
        cond_map = {'Dry': 1, 'Wet': 3, 'Damaged': 4, 'Under Construction': 5}
        df['road_type_score'] = df['Road Type'].map(type_map).fillna(2)
        df['road_cond_score'] = df['Road Condition'].map(cond_map).fillna(1)
        return df

    def create_driver_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create driver-related feature scores (realistic)"""
        # Age scoring: young (<21) and elderly (>=75) higher risk
        def age_score_func(age):
            age = float(age)
            if age < 21:
                return 4
            elif age < 45:
                return 3
            elif age < 65:
                return 2
            elif age < 75:
                return 1
            else:
                return 5
        df['age_score'] = df['Driver Age'].apply(age_score_func)

        lic_map = {'Valid': 1, 'Expired': 2, 'No License': 3}
        
        df['license_score'] = df['Driver License Status'].map(lic_map).fillna(3)


        df['alcohol_flag'] = ((df.get('Alcohol Involvement', None) == 'Yes')
                              | (df.get('alcohol_flag', 0) == 1)).astype(int)

        df['experience_score'] = df['driver_experience'].apply(self.get_experience_score)
        
        df['vehicle_condition_score'] = df['vehicle_condition'].apply(self.get_vehicle_condition_score)
        return df

    def create_driver_speed_feature(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map driver speed to realistic risk score"""
        def map_speed_to_score(x):
            try:
                speed = float(x)
                if speed <= 40:
                    return 1
                elif speed <= 60:
                    return 2
                elif speed <= 80:
                    return 3
                elif speed <= 100:
                    return 4
                elif speed <= 120:
                    return 5
                else:
                    return 5
            except:
                return 3
        colname = 'driver_speed_habit'
        if 'driver_speed_habit' in df.columns:
            df['driver_speed_score'] = df['driver_speed_habit'].apply(map_speed_to_score)
        elif 'driver_speed_habit(km/h)' in df.columns:
            df['driver_speed_score'] = df['driver_speed_habit(km/h)'].apply(map_speed_to_score)
        else:
            df['driver_speed_score'] = 2
        return df

    @staticmethod
    def get_experience_score(experience_str):
        """Parse driver experience and apply realistic accident risk scoring (1=lowest risk, 5=highest risk)"""
        try:
            experience_str = str(experience_str).lower().strip()
            match = re.search(r'\d+\.?\d*', experience_str)
            if not match:
                return 5  # Unknown experience: treat as highest risk
            value = float(match.group())
            years = value / 12 if 'month' in experience_str else value
            if years < 0.5:
                return 5
            elif years < 1:
                return 4
            elif years < 2:
                return 3
            elif years < 5:
                return 2
            else:
                return 1
        except Exception:
            return 5


    def get_vehicle_condition_score(self, condition):
        """Score vehicle condition (realistic)"""
        try:
            condition = str(condition).lower()
            if 'poor' in condition or 'bad' in condition or 'damaged' in condition:
                return 5
            elif 'average' in condition or 'medium' in condition or 'old' in condition:
                return 3
            elif 'good' in condition or 'excellent' in condition or 'new' in condition:
                return 1
            else:
                return 2
        except:
            return 2

    def encode_categoricals(self, df: pd.DataFrame, cols: list) -> pd.DataFrame:
        """Encode categorical features using LabelEncoder"""
        for col in cols:
            le = LabelEncoder()
            if col in df.columns:
                df[col + '_enc'] = le.fit_transform(df[col].astype(str))
                self.label_encoders[col] = le 
            else:
                print(f"Warning: Column '{col}' not found for encoding.")
        return df

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Robust main processing pipeline"""
        if not pd.api.types.is_datetime64_any_dtype(df['datetime']):
            df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
        df = self.create_time_features(df)
        df = self.create_weather_features(df)
        df = self.create_road_features(df)
        df = self.create_driver_features(df)
        df = self.create_driver_speed_feature(df)
        categorical_cols = [
            'State Name', 'City Name', 'Accident Severity', 
            'Vehicle Type Involved', 'Lighting Conditions', 
            'Traffic Control Presence'
        ]
        df = self.encode_categoricals(df, categorical_cols)
        severity_mapping = {'Minor': 0, 'Serious': 1,'Fatal': 2}
        df['target'] = df['Accident Severity'].map(severity_mapping)
        if df['target'].isna().any():
            print(f"⚠️  Warning: {df['target'].isna().sum()} rows have unmapped Accident Severity values")
            df['target'].fillna(0, inplace=True)
        return df


In [6]:

df = pd.read_csv(
    'C:/Users/rahul/OneDrive/Desktop/ai-traffic-prediction-backend new/ai-traffic-prediction-backend/data/processed/cleaned_data.csv',
    parse_dates=['datetime'],
    dayfirst=True
)

fe = FeatureEngineer()
df_feat = fe.process(df)

print("Feature Engineering Complete. First 5 rows of the processed DataFrame:")
df_feat.head()
df_feat.describe()
df_feat.to_csv('../data/processed/featured_data.csv', index=False)
joblib.dump(fe.label_encoders, '../models/feature_columns.joblib')

Feature Engineering Complete. First 5 rows of the processed DataFrame:


['../models/feature_columns.joblib']