# Roysambu Crime Data - Exploratory Data Analysis

This notebook performs exploratory data analysis on crime data for Roysambu ward, Nairobi.

## Objectives
- Load and validate crime data
- Analyze temporal patterns
- Examine spatial distribution
- Investigate crime type patterns
- Generate summary statistics

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import geopandas as gpd
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set plotting style.
# The style sheet is applied first and the palette second: both touch the
# default color cycle, so the later call (the "husl" palette) is the one
# that takes effect for subsequent plots.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation message printed once the setup statements above have run.
print("Libraries imported successfully!")

In [None]:
# Load crime data
# Replace with actual data file path
data_path = '../data/raw/roysambu_crime_data.csv'

try:
    # Keep the try body to the single call that can raise FileNotFoundError.
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print("Data file not found. Creating sample data for demonstration.")

    # Create sample data for Roysambu ward.
    # A fixed seed keeps the synthetic demo data reproducible between runs.
    np.random.seed(42)
    n_records = 1000

    # Roysambu approximate bounds
    lat_min, lat_max = -1.2200, -1.2000
    lon_min, lon_max = 36.8900, 36.9100

    df = pd.DataFrame({
        'incident_id': range(1, n_records + 1),
        'latitude': np.random.uniform(lat_min, lat_max, n_records),
        'longitude': np.random.uniform(lon_min, lon_max, n_records),
        # Lowercase 'h' is the supported hourly alias; the uppercase 'H'
        # spelling is deprecated in recent pandas releases.
        'datetime': pd.date_range(start='2023-01-01', periods=n_records, freq='6h'),
        'crime_type': np.random.choice(['Theft', 'Robbery', 'Burglary', 'Assault', 'Vehicle Crime'], n_records),
        'description': 'Sample crime incident',
        'status': np.random.choice(['Open', 'Closed', 'Under Investigation'], n_records)
    })

    print(f"Sample data created! Shape: {df.shape}")
else:
    print(f"Data loaded successfully! Shape: {df.shape}")

# read_csv leaves timestamps as plain strings; parse them once here so that
# min/max and other date operations are chronological rather than
# lexicographic. This is a no-op for the already-typed sample data.
df['datetime'] = pd.to_datetime(df['datetime'])

# Display first few rows
df.head()

In [None]:
# Basic data information
# Parse the timestamps locally: if the column was read from CSV it may still
# hold strings, and min()/max() on strings is lexicographic, not chronological.
timestamps = pd.to_datetime(df['datetime'])

print("=== DATA SUMMARY ===")
print(f"Total records: {len(df):,}")
print(f"Date range: {timestamps.min()} to {timestamps.max()}")
print(f"Unique crime types: {df['crime_type'].nunique()}")
# Total count of missing cells across every column.
print(f"Missing values: {df.isnull().sum().sum()}")

print("\n=== COLUMN INFO ===")
# info() prints its report directly (column names, non-null counts, dtypes).
df.info()

In [None]:
# Crime type distribution: incident counts as bars (left) and the same
# counts as shares of the total in a pie chart (right).
crime_counts = df['crime_type'].value_counts()

fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: absolute number of incidents per crime type.
bar_ax.bar(crime_counts.index, crime_counts.values)
bar_ax.set_title('Crime Type Distribution')
bar_ax.set_xlabel('Crime Type')
bar_ax.set_ylabel('Number of Incidents')
bar_ax.tick_params(axis='x', labelrotation=45)

# Right panel: proportion of each crime type, labelled with one decimal.
pie_ax.pie(crime_counts.values, labels=crime_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Crime Type Proportion')

fig.tight_layout()
plt.show()

In [None]:
# Temporal analysis
# Derive calendar features from the timestamp. The summary cell further down
# reads the 'hour' and 'day_of_week' columns created here.
df['datetime'] = pd.to_datetime(df['datetime'])
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.day_name()
df['month'] = df['datetime'].dt.month

plt.figure(figsize=(15, 10))

# Crimes by hour
plt.subplot(2, 2, 1)
# Reindex over all 24 hours so that hours without any incident are plotted
# as 0; otherwise they are absent and the line is drawn straight across them.
hourly_crimes = df['hour'].value_counts().reindex(range(24), fill_value=0)
plt.plot(hourly_crimes.index, hourly_crimes.values, marker='o')
plt.title('Crimes by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Number of Crimes')
plt.grid(True)

# Crimes by day of week
plt.subplot(2, 2, 2)
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# fill_value=0 keeps weekdays with no incidents as a zero-height bar instead of NaN.
daily_crimes = df['day_of_week'].value_counts().reindex(day_order, fill_value=0)
plt.bar(daily_crimes.index, daily_crimes.values)
plt.title('Crimes by Day of Week')
plt.xlabel('Day')
plt.ylabel('Number of Crimes')
plt.xticks(rotation=45)

# Crimes by month
plt.subplot(2, 2, 3)
monthly_crimes = df['month'].value_counts().sort_index()
plt.bar(monthly_crimes.index, monthly_crimes.values)
plt.title('Crimes by Month')
plt.xlabel('Month')
plt.ylabel('Number of Crimes')

# Crime trends over time (incident count per calendar date)
plt.subplot(2, 2, 4)
daily_trend = df.groupby(df['datetime'].dt.date).size()
plt.plot(daily_trend.index, daily_trend.values)
plt.title('Crime Trends Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Crimes')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Create interactive map of crime locations
# Only rows with both coordinates can be placed on the map; folium raises on
# NaN locations, so drop them up front.
located = df.dropna(subset=['latitude', 'longitude'])

# Calculate center of Roysambu (mean of the plotted coordinates)
center_lat = located['latitude'].mean()
center_lon = located['longitude'].mean()

# Create base map
m = folium.Map(
    location=[center_lat, center_lon],
    zoom_start=13,
    tiles='OpenStreetMap'
)

# Add crime points (limit to 200 for performance).
# A fixed random_state makes the displayed subset reproducible across runs.
sample_data = located.sample(n=min(200, len(located)), random_state=42)

# Color mapping for crime types (also used by the spatial scatter plot below)
crime_colors = {
    'Theft': 'red',
    'Robbery': 'orange',
    'Burglary': 'yellow',
    'Assault': 'purple',
    'Vehicle Crime': 'blue'
}

for _, row in sample_data.iterrows():
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=5,
        popup=f"Crime: {row['crime_type']}<br>Date: {row['datetime']}",
        # Crime types missing from the mapping fall back to gray.
        color=crime_colors.get(row['crime_type'], 'gray'),
        fill=True,
        # folium's keyword is snake_case; the camelCase spelling is not the
        # documented option name.
        fill_opacity=0.6
    ).add_to(m)

# Add legend.
# Height is left automatic so the box grows with the number of entries
# instead of overflowing a fixed pixel height.
legend_html = '''
<div style="position: fixed; 
            bottom: 50px; left: 50px; width: 150px; height: auto; 
            background-color: white; border:2px solid grey; z-index:9999; 
            font-size:14px; padding: 10px">
<p><b>Crime Types</b></p>
'''
legend_html += ''.join(
    f'<p><i class="fa fa-circle" style="color:{color}"></i> {crime_type}</p>'
    for crime_type, color in crime_colors.items()
)
legend_html += '</div>'

m.get_root().html.add_child(folium.Element(legend_html))

# Display map
m

In [None]:
# Spatial distribution analysis: a 2x2 grid with a colored scatter of
# incidents, one histogram per coordinate, and a binned density view.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
scatter_ax, lat_ax, lon_ax, density_ax = axes.flat

# Scatter plot of crime locations, one color per crime type
# (types without an entry in crime_colors are drawn in gray).
colors = df['crime_type'].map(lambda ct: crime_colors.get(ct, 'gray')).tolist()
scatter_ax.scatter(df['longitude'], df['latitude'], c=colors, alpha=0.6, s=20)
scatter_ax.set_title('Spatial Distribution of Crimes')
scatter_ax.set_xlabel('Longitude')
scatter_ax.set_ylabel('Latitude')

# Latitude and longitude distributions share the same histogram settings.
histogram_panels = (
    (lat_ax, 'latitude', 'Latitude Distribution', 'Latitude'),
    (lon_ax, 'longitude', 'Longitude Distribution', 'Longitude'),
)
for panel, column, panel_title, axis_label in histogram_panels:
    panel.hist(df[column], bins=30, alpha=0.7)
    panel.set_title(panel_title)
    panel.set_xlabel(axis_label)
    panel.set_ylabel('Frequency')

# 2D histogram (heatmap); the last return value is the image the colorbar describes.
*_, density_image = density_ax.hist2d(df['longitude'], df['latitude'], bins=20, cmap='YlOrRd')
fig.colorbar(density_image, ax=density_ax, label='Crime Count')
density_ax.set_title('Crime Density Heatmap')
density_ax.set_xlabel('Longitude')
density_ax.set_ylabel('Latitude')

fig.tight_layout()
plt.show()

In [None]:
# Summary statistics
# Prerequisite: the 'hour' and 'day_of_week' columns derived in the
# temporal-analysis cell must already exist on df.


def _format_coordinate(value, positive_suffix, negative_suffix):
    """Format a signed decimal degree as an unsigned value plus hemisphere letter."""
    suffix = positive_suffix if value >= 0 else negative_suffix
    return f"{abs(value):.4f}°{suffix}"


total_incidents = len(df)
if total_incidents == 0:
    # Every percentage below divides by the record count; stop with a clear
    # message rather than an opaque error further down.
    raise ValueError("No crime records available to summarise")

# Parse locally so strftime works even if the column still holds strings.
timestamps = pd.to_datetime(df['datetime'])

print("=== ROYSAMBU CRIME ANALYSIS SUMMARY ===")
print(f"\nTotal crime incidents analyzed: {total_incidents:,}")
print(f"Analysis period: {timestamps.min().strftime('%Y-%m-%d')} to {timestamps.max().strftime('%Y-%m-%d')}")
# Negative latitudes/longitudes are reported as °S/°W instead of a signed
# value with a fixed °N/°E label.
print(f"Geographic coverage: {_format_coordinate(df['latitude'].min(), 'N', 'S')} to {_format_coordinate(df['latitude'].max(), 'N', 'S')}")
print(f"                     {_format_coordinate(df['longitude'].min(), 'E', 'W')} to {_format_coordinate(df['longitude'].max(), 'E', 'W')}")

print("\n=== CRIME TYPE BREAKDOWN ===")
for crime_type, count in df['crime_type'].value_counts().items():
    percentage = (count / total_incidents) * 100
    print(f"{crime_type}: {count:,} incidents ({percentage:.1f}%)")

print("\n=== TEMPORAL PATTERNS ===")
# mode() can return several values on ties; the first one is reported.
peak_hour = df['hour'].mode().iloc[0]
peak_day = df['day_of_week'].mode().iloc[0]
print(f"Peak crime hour: {peak_hour}:00")
print(f"Peak crime day: {peak_day}")

# Night is the window that wraps past midnight: hours 22-23 and 0-5.
night_crimes = df[(df['hour'] >= 22) | (df['hour'] <= 5)]
night_percentage = (len(night_crimes) / total_incidents) * 100
print(f"Night-time crimes (22:00-05:59): {len(night_crimes):,} ({night_percentage:.1f}%)")

weekend_crimes = df[df['day_of_week'].isin(['Saturday', 'Sunday'])]
weekend_percentage = (len(weekend_crimes) / total_incidents) * 100
print(f"Weekend crimes: {len(weekend_crimes):,} ({weekend_percentage:.1f}%)")

# NOTE: these recommendations are static text; they are not derived from the
# statistics computed above.
print("\n=== RECOMMENDATIONS ===")
print("1. Increase patrol frequency during peak hours")
print("2. Focus security resources on identified hotspot areas")
print("3. Implement targeted interventions for most common crime types")
print("4. Consider community policing programs in high-density areas")