# Key Insights Analysis

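This notebook explores key insights from the NYC 311 service-request dataset: request volume over time, the most common complaint types, differences between boroughs, and the distribution of resolution times.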

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the raw 311 export. The path is relative to the notebook's working
# directory; it is kept in one named constant so it is easy to repoint.
DATA_PATH = "data/raw/311.csv"
df = pd.read_csv(DATA_PATH)

# Parse both timestamp columns. errors='coerce' turns unparseable values into
# NaT instead of raising, so a few malformed rows do not abort the load.
for date_col in ('created_date', 'closed_date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Resolution time in hours. Rows with a missing created/closed date yield NaN.
df['resolution_hours'] = (df['closed_date'] - df['created_date']).dt.total_seconds() / 3600

# A request cannot be closed before it was created, so a negative duration is
# a data-quality problem rather than a real measurement. Mask those values to
# NaN here, once, so every downstream mean/median/quantile ignores them.
negative_duration = df['resolution_hours'] < 0
df['resolution_hours'] = df['resolution_hours'].mask(negative_duration)

print(f"Loaded {len(df):,} records")
print(f"Masked {int(negative_duration.sum()):,} records with closed_date before created_date")

## Trend of Total Requests Over Time

In [None]:
# Count requests per calendar day.
# dt.normalize() truncates each timestamp to midnight but keeps the
# datetime64 dtype, so the grouping stays vectorized (no per-row Python
# `date` objects) and the resulting column plots as a proper date axis.
# Rows whose created_date is NaT are dropped by groupby's default behavior.
daily_requests = (
    df.groupby(df['created_date'].dt.normalize())
      .size()
      .reset_index(name='requests')
      .sort_values('created_date')
)

# Explicit Axes interface: every call names the axes it modifies.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(daily_requests['created_date'], daily_requests['requests'], linewidth=1.5)
ax.set_title('Total Requests Trend Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Requests')
ax.tick_params(axis='x', rotation=45)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Top Complaint Types

In [None]:
# The ten most frequent complaint types; value_counts() already orders the
# result from most to least common, so the first ten rows are the top ten.
top_complaints = df['complaint_type'].value_counts().iloc[:10]

fig, ax = plt.subplots(figsize=(10, 6))
top_complaints.plot.barh(ax=ax)
ax.set_title('Top 10 Complaint Types', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Requests')
ax.set_ylabel('Complaint Type')
# Horizontal bars are drawn bottom-up; flip the axis so the most common
# complaint type appears at the top of the chart.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print("\nTop 10 Complaint Types:")
print(top_complaints)

## Borough Differences

In [None]:
# Per-borough summary: request volume plus mean and median resolution time.
resolution_by_borough = df.groupby('borough')['resolution_hours']
borough_stats = pd.DataFrame({
    'Total Requests': df['borough'].value_counts(),
    'Avg Resolution Hours': resolution_by_borough.mean(),
    'Median Resolution Hours': resolution_by_borough.median(),
}).sort_values('Total Requests', ascending=False)

print("Borough Comparison:")
print(borough_stats.to_string())

# Side-by-side bar charts. Each entry describes one panel:
# (column to plot, bar color, panel title, y-axis label).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
panel_specs = [
    ('Total Requests', 'steelblue', 'Total Requests by Borough', 'Number of Requests'),
    ('Median Resolution Hours', 'coral', 'Median Resolution Hours by Borough', 'Hours'),
]

for panel_ax, (column, bar_color, panel_title, y_label) in zip((ax1, ax2), panel_specs):
    borough_stats[column].plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_xlabel('Borough')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Resolution Time Distribution

In [None]:
# Keep only requests that have a closed_date, then drop any rows whose
# resolution time is still missing.
closed_df = df[df['closed_date'].notna()].copy()
closed_resolution = closed_df['resolution_hours'].dropna()

print(f"Closed requests: {len(closed_resolution):,}")
print(f"Mean resolution time: {closed_resolution.mean():.2f} hours")
print(f"Median resolution time: {closed_resolution.median():.2f} hours")
print(f"90th percentile: {closed_resolution.quantile(0.9):.2f} hours")

# Both charts show the distribution with the long right tail cut off at the
# 95th percentile. Compute the cutoff and the trimmed series once so the two
# plots are guaranteed to use the same data.
p95_cutoff = closed_resolution.quantile(0.95)
trimmed_resolution = closed_resolution[closed_resolution <= p95_cutoff]

# Histogram
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(trimmed_resolution, bins=50, edgecolor='black', alpha=0.7)
ax.set_title('Resolution Time Distribution (up to 95th percentile)', fontsize=14, fontweight='bold')
ax.set_xlabel('Resolution Hours')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Box plot (vertical is matplotlib's default orientation, so no `vert`
# argument is passed). The title states the trimming so the chart is not
# mistaken for the full distribution.
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(trimmed_resolution)
ax.set_title('Resolution Time Distribution (Box Plot, up to 95th percentile)', fontsize=14, fontweight='bold')
ax.set_ylabel('Resolution Hours')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()