# Crime Data Acquisition and Initial Exploration

This notebook covers:
1. Loading the crime dataset
2. Initial data inspection
3. Data quality checks
4. Basic statistics
5. Conviction rate analysis
6. Saving summary statistics

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/NumPy deprecation and runtime
# warnings — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 1. Load Data

In [None]:
# Read the raw crime records into the notebook-wide DataFrame `df`
raw_csv = '../data/raw/india_crime_data_2019_2024.csv'
df = pd.read_csv(raw_csv)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

# The Date column is not parsed on load (no parse_dates), so min()/max()
# compare the values exactly as read from the file.
# NOTE(review): if they are strings the range is lexicographic, which is
# chronological only for ISO-style dates — confirm the file's date format.
date_values = df['Date']
first_date, last_date = date_values.min(), date_values.max()
print(f"Date range: {first_date} to {last_date}")

## 2. Initial Data Inspection

In [None]:
# Preview the first ten records to eyeball column names and value formats
df.head(n=10)

In [None]:
# Print a concise schema summary via DataFrame.info(): column names,
# non-null counts, dtypes and memory usage
df.info()

In [None]:
# Descriptive statistics via DataFrame.describe(); with default arguments
# pandas summarizes only the numeric columns of a mixed-dtype frame
df.describe()

## 3. Data Quality Checks

In [None]:
# Count missing (NaN/None) entries per column; `missing_values` keeps the
# full per-column tally, `columns_with_missing` only the affected columns.
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]

print("Missing Values:")
if columns_with_missing.empty:
    # Report the clean result directly instead of first printing an empty
    # "Series([], dtype: int64)" repr, which reads like an error.
    print("\n✓ No missing values found!")
else:
    print(columns_with_missing)

In [None]:
# Flag rows that exactly repeat an earlier row (all columns), then count them
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

In [None]:
# For each categorical column, report how many distinct values it holds
# and list them. Each entry is (label used in the heading, column name).
categorical_columns = [
    ('Cities', 'City'),
    ('States', 'State'),
    ('Crime Types', 'Crime_Type'),
    ('Crime Categories', 'Crime_Category'),
]

for position, (label, column) in enumerate(categorical_columns):
    # Every heading after the first is preceded by a blank line
    lead = "" if position == 0 else "\n"
    print(f"{lead}Unique {label}:", df[column].nunique())
    print(df[column].unique())

## 4. Basic Statistics

In [None]:
# Sum reported incidents per year, ordered by the Year index
incidents_by_year = df.groupby('Year')['Incidents_Reported'].sum()
yearly_crimes = incidents_by_year.sort_index()
print("Total Crimes by Year:")
print(yearly_crimes)

# Vertical bar chart of the yearly totals, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 5))
yearly_crimes.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Total Crime Incidents by Year', fontsize=14, fontweight='bold')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Incidents')
# Keep the year labels horizontal
plt.setp(ax.get_xticklabels(), rotation=0)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Rank cities by total reported incidents and keep the ten largest.
city_totals = df.groupby('City')['Incidents_Reported'].sum()
city_crimes = city_totals.sort_values(ascending=False).head(10)
print("\nTop 10 Cities by Crime Count:")
print(city_crimes)

# Horizontal bar chart of the ranking.
fig, ax = plt.subplots(figsize=(12, 6))
city_crimes.plot(kind='barh', color='coral', ax=ax)
# barh places the first row at the bottom of the axis; flip the axis so the
# highest-ranked city sits on top, matching the order of the printed table.
ax.invert_yaxis()
ax.set_title('Top 10 Cities by Total Crime Incidents', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Incidents')
ax.set_ylabel('City')
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Total reported incidents per crime category, largest first
category_totals = df.groupby('Crime_Category')['Incidents_Reported'].sum()
category_crimes = category_totals.sort_values(ascending=False)
print("\nCrimes by Category:")
print(category_crimes)

# Vertical bar chart of the category totals, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
category_crimes.plot(kind='bar', color='seagreen', ax=ax)
ax.set_title('Crime Distribution by Category', fontsize=14, fontweight='bold')
ax.set_xlabel('Crime Category')
ax.set_ylabel('Number of Incidents')
# Slant the category names and right-align them under their bars
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Rank crime types by total reported incidents and keep the ten largest.
crime_type_totals = df.groupby('Crime_Type')['Incidents_Reported'].sum()
crime_type_stats = crime_type_totals.sort_values(ascending=False).head(10)
print("\nTop 10 Crime Types:")
print(crime_type_stats)

# Horizontal bar chart of the ranking.
fig, ax = plt.subplots(figsize=(12, 6))
crime_type_stats.plot(kind='barh', color='indianred', ax=ax)
# barh places the first row at the bottom of the axis; flip the axis so the
# most frequent crime type sits on top, matching the printed table's order.
ax.invert_yaxis()
ax.set_title('Top 10 Crime Types by Incident Count', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Incidents')
ax.set_ylabel('Crime Type')
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

## 5. Conviction Rate Analysis

In [None]:
# Aggregate the case funnel (reported -> charge-sheeted -> convicted)
# across the whole dataset.
total_incidents = df['Incidents_Reported'].sum()
total_chargesheeted = df['Cases_Charge_Sheeted'].sum()
total_convicted = df['Cases_Convicted'].sum()

# Both rates are expressed as a percentage of *reported incidents*.
# NOTE(review): conviction rate is often quoted relative to charge-sheeted
# or tried cases instead — confirm which definition consumers expect.
if total_incidents > 0:
    chargesheet_rate = (total_chargesheeted / total_incidents) * 100
    conviction_rate = (total_convicted / total_incidents) * 100
    reported_rate = 100.0
else:
    # No incidents to divide by: the rates are undefined, so report NaN
    # explicitly rather than relying on a silenced divide-by-zero warning.
    chargesheet_rate = conviction_rate = reported_rate = float('nan')

print(f"Total Incidents: {total_incidents:,}")
print(f"Cases Charge-Sheeted: {total_chargesheeted:,} ({chargesheet_rate:.2f}%)")
print(f"Cases Convicted: {total_convicted:,} ({conviction_rate:.2f}%)")

# Visualization: one bar per funnel stage
fig, ax = plt.subplots(figsize=(8, 6))
categories = ['Reported', 'Charge-Sheeted', 'Convicted']
values = [total_incidents, total_chargesheeted, total_convicted]
percentages = [reported_rate, chargesheet_rate, conviction_rate]
colors = ['#ff7f0e', '#2ca02c', '#d62728']

ax.bar(categories, values, color=colors, alpha=0.7)
ax.set_title('Crime Case Progression', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of Cases')
ax.grid(axis='y', alpha=0.3)
# Leave headroom above the tallest bar for its two-line label
ax.margins(y=0.15)

# Label each bar with its count and its percentage of reported incidents
for position, (count, share) in enumerate(zip(values, percentages)):
    ax.text(position, count, f'{count:,}\n({share:.1f}%)',
            ha='center', va='bottom', fontweight='bold')

fig.tight_layout()
plt.show()

## 6. Save Summary Statistics

In [None]:
# Collect the headline figures into a single-row summary table.
# Relies on total_incidents, chargesheet_rate and conviction_rate computed
# in the conviction-rate cell, so that cell must have been run first.
SUMMARY_PATH = '../data/processed/data_summary.csv'

summary_stats = {
    'total_records': len(df),
    'date_range': f"{df['Date'].min()} to {df['Date'].max()}",
    'cities_count': df['City'].nunique(),
    'states_count': df['State'].nunique(),
    'crime_types': df['Crime_Type'].nunique(),
    'total_incidents': int(total_incidents),
    'chargesheet_rate': round(chargesheet_rate, 2),
    'conviction_rate': round(conviction_rate, 2)
}

summary_df = pd.DataFrame([summary_stats])

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (a no-op when it is already there).
Path(SUMMARY_PATH).parent.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(SUMMARY_PATH, index=False)

print(f"\n✓ Summary statistics saved to: {SUMMARY_PATH}")
print("\n" + "="*60)
print("DATA ACQUISITION COMPLETE")
print("="*60)