# Data Preprocessing and Feature Engineering

This notebook covers:
1. Data cleaning
2. Feature engineering for time series
3. Creating lag features
4. Encoding categorical variables
5. Saving processed data

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded!")

In [None]:
# Read the raw crime CSV, then parse its Date column with pd.to_datetime
raw_data_path = '../data/raw/india_crime_data_2019_2024.csv'
df = pd.read_csv(raw_data_path)
df['Date'] = pd.to_datetime(df['Date'])
print(f"Original data shape: {df.shape}")

## 1. Data Cleaning

In [None]:
# Summary statistics of the incident counts, printed as a quick outlier check
print("Crime Incident Statistics:")
print(df['Incidents_Reported'].describe())

# Keep only rows with a non-negative incident count. The comparison evaluates
# to False for NaN, so rows with a missing count are dropped as well.
# .copy() turns the filtered result into an independent DataFrame: new columns
# are assigned to df further down, and assigning into a boolean-mask selection
# is exactly what pandas flags with SettingWithCopyWarning.
df = df[df['Incidents_Reported'] >= 0].copy()
print(f"\nData shape after cleaning: {df.shape}")

## 2. Feature Engineering

In [None]:
# Derive calendar features from the Date column via its .dt accessor
date_parts = df['Date'].dt
df['Quarter'] = date_parts.quarter
df['Day_of_Year'] = date_parts.dayofyear
# isocalendar() yields year/week/day columns; only the week number is kept
df['Week_of_Year'] = date_parts.isocalendar().week

# Season mapping
def get_season(month):
    """Return the season name for a month number.

    Months 3-5 map to 'Summer', 6-9 to 'Monsoon' and 10-11 to 'Autumn'.
    Every other value (12, 1, 2 and anything unrecognised) maps to 'Winter'.
    """
    season_table = (
        ('Summer', (3, 4, 5)),
        ('Monsoon', (6, 7, 8, 9)),
        ('Autumn', (10, 11)),
    )
    for season_name, season_months in season_table:
        if month in season_months:
            return season_name
    # Fallback bucket: whatever is not listed above is treated as Winter
    return 'Winter'

# Label every row with the season of its Month value
month_column = df['Month']
df['Season'] = month_column.apply(get_season)

# Festival months indicator (October-November Diwali, March-April Holi, etc.):
# 1 when Month is one of the listed months, 0 otherwise
festival_months = [3, 4, 10, 11]
in_festival_month = month_column.isin(festival_months)
df['Is_Festival_Month'] = in_festival_month.astype(int)

# Preview a few of the new columns
preview_columns = ['Date', 'Quarter', 'Season', 'Is_Festival_Month']
print("Time-based features created:")
print(df[preview_columns].head())

In [None]:
# Collapse the row-level data to one record per (City, Year, Month):
# incident and conviction counts are summed, the crime rate is averaged.
city_month_groups = df.groupby(['City', 'Year', 'Month'])
city_monthly = city_month_groups.agg(
    Incidents_Reported=('Incidents_Reported', 'sum'),
    Crime_Rate_Per_100K=('Crime_Rate_Per_100K', 'mean'),
    Cases_Convicted=('Cases_Convicted', 'sum'),
).reset_index()

# Represent each (Year, Month) pair as the first day of that month
month_start_parts = city_monthly[['Year', 'Month']].copy()
month_start_parts['day'] = 1
city_monthly['Date'] = pd.to_datetime(month_start_parts)

# Order each city's rows chronologically
city_monthly = city_monthly.sort_values(by=['City', 'Date'])

print(f"\nCity-monthly aggregated data shape: {city_monthly.shape}")
print(city_monthly.head(10))

In [None]:
# Create lag features for time series (past 1, 3, 6, 12 months by default)
def create_lag_features(group_df, lags=(1, 3, 6, 12)):
    """Return a copy of ``group_df`` extended with lagged columns.

    For every ``lag`` two columns are added: ``Incidents_Lag_<lag>`` and
    ``Crime_Rate_Lag_<lag>``, holding ``Incidents_Reported`` and
    ``Crime_Rate_Per_100K`` shifted down by ``lag`` rows. The shift is
    positional, so the rows must already be in chronological order, and the
    first ``lag`` rows of each new column are NaN.

    Args:
        group_df: DataFrame containing the ``Incidents_Reported`` and
            ``Crime_Rate_Per_100K`` columns.
        lags: Iterable of row offsets. The default is an immutable tuple so
            it cannot be altered between calls.

    Returns:
        A new DataFrame. The input is left unmodified, because pandas does
        not support functions that mutate the group handed to them by
        ``groupby(...).apply``.
    """
    lagged = group_df.copy()
    for lag in lags:
        lagged[f'Incidents_Lag_{lag}'] = group_df['Incidents_Reported'].shift(lag)
        lagged[f'Crime_Rate_Lag_{lag}'] = group_df['Crime_Rate_Per_100K'].shift(lag)
    return lagged

# Build the lags one city at a time so a shift never pulls values across cities
rows_by_city = city_monthly.groupby('City', group_keys=False)
city_monthly = rows_by_city.apply(create_lag_features)

lag_preview_columns = ['City', 'Date', 'Incidents_Reported', 'Incidents_Lag_1', 'Incidents_Lag_3']
print("\nLag features created:")
print(city_monthly[lag_preview_columns].head(15))

In [None]:
# Rolling statistics (3-month and 6-month moving averages)
def create_rolling_features(group_df):
    """Return a copy of ``group_df`` with rolling statistics of the incidents.

    Added columns, all computed over ``Incidents_Reported``:
        MA_3: mean over a trailing window of 3 rows.
        MA_6: mean over a trailing window of 6 rows.
        Rolling_Std_3: standard deviation over a trailing window of 3 rows.

    ``min_periods=1`` lets the windows start producing values from the first
    row, using however many rows are available. The windows are positional,
    so the rows must already be in chronological order.

    Args:
        group_df: DataFrame containing an ``Incidents_Reported`` column.

    Returns:
        A new DataFrame. The input is left unmodified, because pandas does
        not support functions that mutate the group handed to them by
        ``groupby(...).apply``.
    """
    # NOTE(review): each window includes the current row, so these columns
    # contain the same-period Incidents_Reported value - confirm this is
    # intended before using them as predictors of that column.
    result = group_df.copy()
    incidents = group_df['Incidents_Reported']
    result['MA_3'] = incidents.rolling(window=3, min_periods=1).mean()
    result['MA_6'] = incidents.rolling(window=6, min_periods=1).mean()
    # The sample std of a single observation is undefined, so the first row
    # is NaN here even with min_periods=1.
    result['Rolling_Std_3'] = incidents.rolling(window=3, min_periods=1).std()
    return result

# Rolling windows are computed per city so they never span two cities
per_city_rows = city_monthly.groupby('City', group_keys=False)
city_monthly = per_city_rows.apply(create_rolling_features)

rolling_preview_columns = ['City', 'Date', 'Incidents_Reported', 'MA_3', 'MA_6']
print("\nRolling statistics created:")
print(city_monthly[rolling_preview_columns].head(15))

## 3. Encode Categorical Variables

In [None]:
# Label encode categorical features.
# A separate LabelEncoder is fitted per column so each mapping can be saved
# and reloaded on its own.
import joblib

le_city = LabelEncoder()
le_state = LabelEncoder()
le_crime_type = LabelEncoder()
le_crime_category = LabelEncoder()
le_season = LabelEncoder()

df['City_Encoded'] = le_city.fit_transform(df['City'])
df['State_Encoded'] = le_state.fit_transform(df['State'])
df['Crime_Type_Encoded'] = le_crime_type.fit_transform(df['Crime_Type'])
df['Crime_Category_Encoded'] = le_crime_category.fit_transform(df['Crime_Category'])
df['Season_Encoded'] = le_season.fit_transform(df['Season'])

# Save label encoders for later use. Every encoder that produced an
# *_Encoded column is persisted, including le_season: without it the
# Season_Encoded values could not be mapped back to season names or
# reproduced on new data.
joblib.dump(le_city, '../models/le_city.pkl')
joblib.dump(le_state, '../models/le_state.pkl')
joblib.dump(le_crime_type, '../models/le_crime_type.pkl')
joblib.dump(le_crime_category, '../models/le_crime_category.pkl')
joblib.dump(le_season, '../models/le_season.pkl')

print("Categorical variables encoded and label encoders saved!")

## 4. Save Processed Data

In [None]:
# Row-level dataset, including the columns added to df in this notebook
df.to_csv('../data/processed/crime_data_processed.csv', index=False)
print(f"Full processed data saved: {df.shape}")

# Per-city monthly series for the time series models
city_monthly.to_csv('../data/processed/city_monthly_timeseries.csv', index=False)
print(f"City-monthly time series data saved: {city_monthly.shape}")

# All cities combined: incidents summed and crime rate averaged per (Year, Month)
overall_monthly = (
    df.groupby(['Year', 'Month'])
    .agg(
        Incidents_Reported=('Incidents_Reported', 'sum'),
        Crime_Rate_Per_100K=('Crime_Rate_Per_100K', 'mean'),
    )
    .reset_index()
)

# Date is the first day of each (Year, Month); rows are then put in time order
overall_month_start = overall_monthly[['Year', 'Month']].copy()
overall_month_start['day'] = 1
overall_monthly['Date'] = pd.to_datetime(overall_month_start)
overall_monthly = overall_monthly.sort_values(by='Date')

overall_monthly.to_csv('../data/processed/overall_monthly_timeseries.csv', index=False)
print(f"Overall monthly time series data saved: {overall_monthly.shape}")

In [None]:
# Closing summary: a banner followed by the list of outputs of this notebook
banner = "=" * 60
summary_lines = [
    "\n" + banner,
    "DATA PREPROCESSING COMPLETE",
    banner,
    "\nProcessed files saved:",
    "1. crime_data_processed.csv - Full processed dataset",
    "2. city_monthly_timeseries.csv - City-wise monthly aggregation",
    "3. overall_monthly_timeseries.csv - Overall monthly aggregation",
    "4. Label encoders saved in ../models/",
    "\nReady for model development!",
]
# One print call per entry, so the output matches line for line
for summary_line in summary_lines:
    print(summary_line)