# Exploratory Data Analysis (EDA)

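This notebook performs a comprehensive exploratory analysis of the India crime dataset (2019-2024):
1. Temporal analysis (monthly trends, year-over-year growth, seasonal patterns)
2. Geographic analysis (cities and states)
3. Crime type analysis
4. Correlation analysis
5. Advanced insights (month-by-category heatmap, safest and most dangerous cities, conviction rates)
6. Key findings summary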

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence only library upgrade notices. A blanket filterwarnings('ignore')
# would also hide RuntimeWarning and UserWarning, which are often the first
# sign of a real data problem (e.g. invalid values produced by a division)
# during exploratory work.
for noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=noisy_category)

# Shared defaults for every matplotlib/seaborn figure in this notebook
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (14, 7)

print("Libraries loaded successfully!")

In [None]:
# Read the raw crime CSV and convert its Date column to datetimes
raw_csv_path = '../data/raw/india_crime_data_2019_2024.csv'
df = pd.read_csv(raw_csv_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Quick sanity report: size of the frame and the period it covers
earliest_date = df['Date'].min()
latest_date = df['Date'].max()
print(f"Dataset shape: {df.shape}")
print(f"Date range: {earliest_date} to {latest_date}")

## 1. Temporal Analysis

In [None]:
# Total incidents per (Year, Month), plotted as one continuous time series
incidents_by_month = df.groupby(['Year', 'Month'])['Incidents_Reported'].sum()
monthly_trends = incidents_by_month.reset_index()

# Build a first-of-month timestamp per row so the x-axis is a real date axis
month_start_parts = pd.DataFrame({
    'year': monthly_trends['Year'],
    'month': monthly_trends['Month'],
    'day': 1,
})
monthly_trends['Date'] = pd.to_datetime(month_start_parts)

fig = px.line(
    monthly_trends,
    x='Date',
    y='Incidents_Reported',
    title='Crime Trends Over Time (Monthly)',
    labels={'Incidents_Reported': 'Total Incidents', 'Date': 'Month-Year'},
)
fig.update_layout(height=500, template='plotly_white')
fig.show()

In [None]:
# Total incidents per year, drawn as a labelled bar chart
yearly_comparison = df.groupby('Year')['Incidents_Reported'].sum()

yearly_bars = go.Bar(
    x=yearly_comparison.index,
    y=yearly_comparison.values,
    marker_color='indianred',
    text=yearly_comparison.values,
    textposition='outside',
)
fig = go.Figure(data=[yearly_bars])
fig.update_layout(
    title='Year-over-Year Crime Incidents',
    xaxis_title='Year',
    yaxis_title='Total Incidents',
    template='plotly_white',
    height=500,
)
fig.show()

# Percentage change against the previous year. The first year has no
# predecessor, so its NaN entry is left out of the printout.
yoy_growth = yearly_comparison.pct_change() * 100
print("\nYear-over-Year Growth Rate:")
for year, growth in yoy_growth.dropna().items():
    print(f"{year}: {growth:+.2f}%")

In [None]:
# Seasonal profile: mean of Incidents_Reported for each calendar month
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_avg = df.groupby('Month')['Incidents_Reported'].mean().reset_index()
# Month numbers are 1-based, the name list is 0-based
monthly_avg['Month_Name'] = monthly_avg['Month'].apply(lambda month_no: month_names[month_no - 1])

month_numbers = monthly_avg['Month']
avg_incidents = monthly_avg['Incidents_Reported']

season_fig, season_ax = plt.subplots(figsize=(14, 6))
season_ax.plot(month_numbers, avg_incidents,
               marker='o', linewidth=2, markersize=8, color='darkblue')
season_ax.fill_between(month_numbers, avg_incidents, alpha=0.3)
season_ax.set_title('Average Crime Incidents by Month (Seasonal Pattern)', fontsize=14, fontweight='bold')
season_ax.set_xlabel('Month')
season_ax.set_ylabel('Average Incidents')
season_ax.set_xticks(month_numbers)
season_ax.set_xticklabels(monthly_avg['Month_Name'])
season_ax.grid(True, alpha=0.3)
season_fig.tight_layout()
plt.show()

# Rows holding the highest and lowest monthly averages (reused in the summary)
peak_row_label = avg_incidents.idxmax()
low_row_label = avg_incidents.idxmin()
peak_month = monthly_avg.loc[peak_row_label]
low_month = monthly_avg.loc[low_row_label]
print(f"\nPeak Crime Month: {peak_month['Month_Name']} (Avg: {peak_month['Incidents_Reported']:.0f} incidents)")
print(f"Lowest Crime Month: {low_month['Month_Name']} (Avg: {low_month['Incidents_Reported']:.0f} incidents)")

## 2. Geographic Analysis

In [None]:
# City-wise crime analysis: total incidents, mean crime rate and mean
# population per city, ordered by total incidents (highest first).
city_stats = df.groupby('City').agg({
    'Incidents_Reported': 'sum',
    'Crime_Rate_Per_100K': 'mean',
    'Population_Lakhs': 'mean'
}).reset_index().sort_values('Incidents_Reported', ascending=False)

# Each panel needs its own ranking. The cities with the most incidents are
# not necessarily the cities with the highest rate, so reusing the
# by-volume top 15 for the rate panel would contradict that panel's title.
top_cities = city_stats.head(15)
top_cities_by_rate = city_stats.nlargest(15, 'Crime_Rate_Per_100K')

fig = make_subplots(rows=1, cols=2,
                    subplot_titles=('Top 15 Cities by Total Crimes',
                                   'Top 15 Cities by Crime Rate (per 100K)'))

fig.add_trace(go.Bar(y=top_cities['City'], x=top_cities['Incidents_Reported'],
                     orientation='h', marker_color='teal', name='Total Incidents'),
              row=1, col=1)

fig.add_trace(go.Bar(y=top_cities_by_rate['City'], x=top_cities_by_rate['Crime_Rate_Per_100K'],
                     orientation='h', marker_color='orange', name='Crime Rate'),
              row=1, col=2)

fig.update_xaxes(title_text='Total Incidents', row=1, col=1)
fig.update_xaxes(title_text='Crime Rate per 100K', row=1, col=2)
# Horizontal bars are drawn bottom-up; reverse the category axes so the
# top-ranked city appears at the top of each panel.
fig.update_yaxes(autorange='reversed')
fig.update_layout(height=600, showlegend=False, template='plotly_white')
fig.show()

In [None]:
# Total incidents per state, largest first
state_totals = df.groupby('State')['Incidents_Reported'].sum()
state_stats = state_totals.sort_values(ascending=False)

fig = px.bar(
    x=state_stats.index,
    y=state_stats.values,
    title='Crime Distribution Across States',
    labels={'x': 'State', 'y': 'Total Incidents'},
    color=state_stats.values,
    color_continuous_scale='Reds',
)
fig.update_layout(height=500, template='plotly_white', xaxis_tickangle=-45)
fig.show()

print("\nTop 5 States by Crime Count:")
print(state_stats.iloc[:5])

## 3. Crime Type Analysis

In [None]:
# Share of total incidents held by each crime category, as a donut chart
category_totals = df.groupby('Crime_Category')['Incidents_Reported'].sum()
category_dist = category_totals.sort_values(ascending=False)

donut_options = dict(
    values=category_dist.values,
    names=category_dist.index,
    title='Crime Distribution by Category',
    hole=0.4,
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig = px.pie(**donut_options)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(height=600)
fig.show()

In [None]:
# The 15 crime types with the most reported incidents
crime_type_totals = df.groupby('Crime_Type')['Incidents_Reported'].sum()
crime_type_dist = crime_type_totals.sort_values(ascending=False).head(15)

bar_positions = list(range(len(crime_type_dist)))

type_fig, type_ax = plt.subplots(figsize=(14, 8))
type_ax.barh(bar_positions, crime_type_dist.values, color='steelblue')
type_ax.set_yticks(bar_positions)
type_ax.set_yticklabels(crime_type_dist.index)
type_ax.set_xlabel('Number of Incidents', fontsize=12)
type_ax.set_title('Top 15 Crime Types by Incident Count', fontsize=14, fontweight='bold')
type_ax.grid(axis='x', alpha=0.3)

# Write each bar's total just past the end of the bar
for position, total in zip(bar_positions, crime_type_dist.values):
    type_ax.text(total, position, f' {total:,}', va='center', fontweight='bold')

type_fig.tight_layout()
plt.show()

In [None]:
# Yearly totals for the five crime types with the most incidents overall
incidents_per_type = df.groupby('Crime_Type')['Incidents_Reported'].sum()
top_5_crimes = incidents_per_type.nlargest(5).index

is_top_5_type = df['Crime_Type'].isin(top_5_crimes)
crime_trends = (
    df[is_top_5_type]
    .groupby(['Year', 'Crime_Type'])['Incidents_Reported']
    .sum()
    .reset_index()
)

fig = px.line(
    crime_trends,
    x='Year',
    y='Incidents_Reported',
    color='Crime_Type',
    title='Trends of Top 5 Crime Types Over Years',
    markers=True,
    labels={'Incidents_Reported': 'Total Incidents'},
)
fig.update_layout(height=500, template='plotly_white')
fig.show()

## 4. Correlation Analysis

In [None]:
# Pairwise correlation between the incident, rate and case-outcome columns
numeric_cols = [
    'Incidents_Reported',
    'Crime_Rate_Per_100K',
    'Cases_Charge_Sheeted',
    'Cases_Convicted',
]
correlation_matrix = df[numeric_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    ax=corr_ax,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,  # anchor the diverging colormap at zero correlation
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
corr_ax.set_title('Correlation Matrix of Crime Metrics', fontsize=14, fontweight='bold', pad=20)
corr_fig.tight_layout()
plt.show()

## 5. Advanced Insights

In [None]:
# Incident totals arranged as a Month (rows) x Crime_Category (columns) grid
incidents_by_month_category = df.groupby(['Month', 'Crime_Category'])['Incidents_Reported'].sum()
heatmap_data = incidents_by_month_category.reset_index()
heatmap_pivot = heatmap_data.pivot(
    index='Month',
    columns='Crime_Category',
    values='Incidents_Reported',
)

heat_fig, heat_ax = plt.subplots(figsize=(14, 8))
sns.heatmap(
    heatmap_pivot,
    ax=heat_ax,
    cmap='YlOrRd',
    annot=False,
    fmt='.0f',
    cbar_kws={'label': 'Incidents'},
)
heat_ax.set_title('Crime Heatmap: Month vs Crime Category', fontsize=14, fontweight='bold')
heat_ax.set_xlabel('Crime Category', fontsize=12)
heat_ax.set_ylabel('Month', fontsize=12)
# Keep the month tick labels horizontal
plt.setp(heat_ax.get_yticklabels(), rotation=0)
heat_fig.tight_layout()
plt.show()

In [None]:
# Mean crime rate per city in ascending order: the head holds the lowest
# rates, the tail the highest
mean_rate_by_city = df.groupby('City')['Crime_Rate_Per_100K'].mean()
city_crime_rate = mean_rate_by_city.sort_values()

section_rule = "=" * 60
rate_report = [
    ("5 SAFEST CITIES (Lowest Crime Rate per 100K population)", city_crime_rate.head()),
    ("5 MOST DANGEROUS CITIES (Highest Crime Rate per 100K population)", city_crime_rate.tail()),
]
for heading, cities in rate_report:
    print("\n" + section_rule)
    print(heading)
    print(section_rule)
    print(cities)

In [None]:
# Conviction efficiency analysis: convictions as a percentage of reported
# incidents, aggregated per city.
city_conviction = df.groupby('City').agg({
    'Incidents_Reported': 'sum',
    'Cases_Convicted': 'sum'
})

# A city with zero reported incidents has no defined conviction rate. Mask
# the zero denominator so the rate becomes NaN (sorted last, skipped by
# mean()) instead of inf, which would rank first and distort any average.
reported_incidents = city_conviction['Incidents_Reported'].replace(0, np.nan)
city_conviction['Conviction_Rate'] = (city_conviction['Cases_Convicted'] / reported_incidents) * 100
city_conviction = city_conviction.sort_values('Conviction_Rate', ascending=False)

# Slice the leaders once and reuse the same slice for every chart argument
top_conviction = city_conviction.head(15)

fig = px.bar(x=top_conviction.index, y=top_conviction['Conviction_Rate'],
             title='Top 15 Cities by Conviction Rate',
             labels={'x': 'City', 'y': 'Conviction Rate (%)'},
             color=top_conviction['Conviction_Rate'],
             color_continuous_scale='Greens')
fig.update_layout(height=500, template='plotly_white', xaxis_tickangle=-45)
fig.show()

print("\nCities with Highest Conviction Rates:")
print(city_conviction.head(10)['Conviction_Rate'])

## 6. Key Findings Summary

In [None]:
# Gather the headline numbers from the earlier cells, then print the report.
# Relies on peak_month, low_month, yoy_growth, city_stats, city_crime_rate,
# state_stats, category_dist, crime_type_dist and city_conviction.
summary_rule = "=" * 70

avg_yoy_growth = yoy_growth.mean()
growth_sign = '+' if avg_yoy_growth > 0 else ''

busiest_city = city_stats.iloc[0]
best_conviction_city = city_conviction.iloc[0]
avg_conviction = city_conviction['Conviction_Rate'].mean()

print("\n" + summary_rule)
print("KEY FINDINGS FROM EXPLORATORY DATA ANALYSIS")
print(summary_rule)

print("\n1. TEMPORAL PATTERNS:")
print(f"   - Peak crime month: {peak_month['Month_Name']}")
print(f"   - Lowest crime month: {low_month['Month_Name']}")
print(f"   - Overall trend: {growth_sign}{avg_yoy_growth:.2f}% average YoY growth")

print("\n2. GEOGRAPHIC INSIGHTS:")
print(f"   - Highest crime city: {busiest_city['City']} ({busiest_city['Incidents_Reported']:.0f} incidents)")
print(f"   - Safest city (by rate): {city_crime_rate.index[0]} ({city_crime_rate.iloc[0]:.2f} per 100K)")
print(f"   - Most affected state: {state_stats.index[0]} ({state_stats.iloc[0]:,.0f} incidents)")

print("\n3. CRIME TYPES:")
print(f"   - Most common category: {category_dist.index[0]} ({category_dist.iloc[0]:,.0f} incidents)")
print(f"   - Most common crime type: {crime_type_dist.index[0]} ({crime_type_dist.iloc[0]:,.0f} incidents)")

print("\n4. JUSTICE METRICS:")
print(f"   - Average conviction rate: {avg_conviction:.2f}%")
print(f"   - Best performing city: {city_conviction.index[0]} ({best_conviction_city['Conviction_Rate']:.2f}%)")

print("\n" + summary_rule)