# NYC 311 Data Cleaning Pipeline
This notebook implements comprehensive data cleaning for the NYC 311 Service Requests dataset including:
- Missing value handling
- Deduplication
- Column standardization
- Data type conversion
- Location validation
- Text normalization

In [120]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

## Load and Explore Data

In [104]:
# Load NYC 311 data
# Note: Using on_bad_lines='skip' to handle malformed rows in the CSV
df = pd.read_csv("../data/raw/nyc_311/nyc_311_raw.csv", 
                  on_bad_lines='skip',
                  engine='python')

print(f"Original dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
df.head()


Original dataset shape: (13262, 44)
Memory usage: 29.91 MB


Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Problem (formerly Complaint Type),Problem Detail (formerly Descriptor),Additional Details,Location Type,Incident Zip,...,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
0,67869470,02/06/2026 02:05:09 AM,,NYPD,New York City Police Department,Noise - Residential,Banging/Pounding,,Residential Building/House,11204.0,...,,,,,,,,40.616645,-73.992191,POINT (-73.992190975602 40.616645363723)
1,67858781,02/06/2026 02:04:35 AM,,NYPD,New York City Police Department,Noise - Residential,Banging/Pounding,,Residential Building/House,10040.0,...,,,,,,,,40.85856,-73.929669,POINT (-73.929669194282 40.858560221857)
2,67860301,02/06/2026 02:04:28 AM,,NYPD,New York City Police Department,Noise - Residential,Banging/Pounding,,Residential Building/House,11355.0,...,,,,,,,,40.752019,-73.821211,POINT (-73.821211164678 40.752018968731)
3,67858763,02/06/2026 02:02:03 AM,,NYPD,New York City Police Department,Illegal Parking,Blocked Hydrant,,Street/Sidewalk,11377.0,...,,,,,,,,40.738983,-73.899837,POINT (-73.899837120308 40.738983283506)
4,67861825,02/06/2026 02:01:27 AM,,NYPD,New York City Police Department,Noise - Street/Sidewalk,Loud Talking,,Street/Sidewalk,11693.0,...,,,,,,,,40.604511,-73.820936,POINT (-73.820936340811 40.604511386258)


## Standardize Column Names

In [105]:
# SECTION 2: Standardize Column Names (REQUIREMENT 1)
# Rule-based column name standardization
df.columns = (
    df.columns
    .str.lower()           # Convert to lowercase
    .str.strip()           # Remove leading/trailing whitespace
    .str.replace(" ", "_")  # Replace spaces with underscores
    .str.replace(r"[^\w_]", "", regex=True)  # Remove special characters
)

print(f"✓ Standardized {len(df.columns)} column names")
print(f"Sample columns: {list(df.columns[:5])}")

✓ Standardized 44 column names
Sample columns: ['unique_key', 'created_date', 'closed_date', 'agency', 'agency_name']


## Select relevant columns

In [106]:
# SECTION 3: Select Relevant Columns
# Your 10 essential columns
columns_to_keep = [
    "unique_key",
    "created_date",
    "problem_formerly_complaint_type",
    "problem_detail_formerly_descriptor",
    "borough",
    "agency",
    "location_type",
    "incident_zip",
    "latitude",
    "longitude",
]

# Keep only available columns
available_cols = [c for c in columns_to_keep if c in df.columns]
missing_cols = [c for c in columns_to_keep if c not in df.columns]

if missing_cols:
    print(f"⚠ Missing columns: {missing_cols}")

df = df[available_cols].copy()
print(f"✓ Selected {len(df.columns)} columns")
print(f"Dataset shape: {df.shape}")
df.head()

✓ Selected 10 columns
Dataset shape: (13262, 10)


Unnamed: 0,unique_key,created_date,problem_formerly_complaint_type,problem_detail_formerly_descriptor,borough,agency,location_type,incident_zip,latitude,longitude
0,67869470,02/06/2026 02:05:09 AM,Noise - Residential,Banging/Pounding,BROOKLYN,NYPD,Residential Building/House,11204.0,40.616645,-73.992191
1,67858781,02/06/2026 02:04:35 AM,Noise - Residential,Banging/Pounding,MANHATTAN,NYPD,Residential Building/House,10040.0,40.85856,-73.929669
2,67860301,02/06/2026 02:04:28 AM,Noise - Residential,Banging/Pounding,QUEENS,NYPD,Residential Building/House,11355.0,40.752019,-73.821211
3,67858763,02/06/2026 02:02:03 AM,Illegal Parking,Blocked Hydrant,QUEENS,NYPD,Street/Sidewalk,11377.0,40.738983,-73.899837
4,67861825,02/06/2026 02:01:27 AM,Noise - Street/Sidewalk,Loud Talking,QUEENS,NYPD,Street/Sidewalk,11693.0,40.604511,-73.820936


Summarized conversation history

## Handle Missing Values

In [107]:
# SECTION 5: Handle Missing Values 
#1: Drop rows with missing CRITICAL fields
critical_fields = ['unique_key', 'created_date', 'latitude', 'longitude']
rows_before = len(df)

df = df.dropna(subset=critical_fields)
rows_dropped_critical = rows_before - len(df)

print(f"✓ Dropped {rows_dropped_critical} rows with missing critical fields")
print(f"  Remaining rows: {len(df):,}")
print("  Critical fields preserved: unique_key, created_date, latitude, longitude")


✓ Dropped 144 rows with missing critical fields
  Remaining rows: 13,118
  Critical fields preserved: unique_key, created_date, latitude, longitude


## Remove Duplicates

In [108]:
# Remove exact duplicates based on unique_key (if available)
initial_rows = len(df)

# Remove exact duplicates based on unique_key
if 'unique_key' in df.columns:
    df = df.drop_duplicates(subset=['unique_key'], keep='first')
    exact_dupes = initial_rows - len(df)
    print(f"✓ Exact duplicates removed by unique_key: {exact_dupes}")
else:
    exact_dupes = 0
    print("⚠ Warning: unique_key column not found")

print(f"  Rows after exact deduplication: {len(df):,}")

if 'unique_key' in df.columns:
    unique_count = df['unique_key'].nunique()
    print(f"  Unique complaint keys: {unique_count:,}")

✓ Exact duplicates removed by unique_key: 0
  Rows after exact deduplication: 13,118
  Unique complaint keys: 13,118


## Data Type Conversion

In [109]:
# SECTION 7: Data Type Conversion 

# Convert date columns to datetime
if 'created_date' in df.columns:
    df['created_date'] = pd.to_datetime(df['created_date'], errors='coerce')
    print("✓ Converted created_date to datetime64")

# Convert numeric columns
numeric_cols = ['latitude', 'longitude', 'incident_zip']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
print(f"✓ Converted {len(numeric_cols)} numeric columns to float64")

# Convert categorical columns (memory efficiency)
categorical_cols = ['agency', 'problem_formerly_complaint_type', 'location_type', 'borough']
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].astype('category')
print(f"✓ Converted {len(categorical_cols)} columns to category dtype")

print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

✓ Converted created_date to datetime64
✓ Converted 3 numeric columns to float64
✓ Converted 4 columns to category dtype

Memory usage: 1.47 MB


## Location Validation and Cleaning

In [110]:
# SECTION 8: Location Cleaning & Geospatial Validation 
# Rule-based: NYC bounding box validation
print("Validating coordinates with NYC geographic bounds:")
print("  Latitude: 40.5° to 40.9°N")
print("  Longitude: -74.3° to -73.7°W")

invalid_coords = (
    (df['latitude'] < 40.5) | (df['latitude'] > 40.9) |
    (df['longitude'] < -74.3) | (df['longitude'] > -73.7)
)

rows_invalid_coords = invalid_coords.sum()
print(f"\n✓ Rows with invalid coordinates: {rows_invalid_coords}")

# Statistical: IQR-based outlier detection
from scipy import stats

def detect_outliers_iqr(data, column):
    """Detect outliers using Interquartile Range method"""
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (data[column] < lower_bound) | (data[column] > upper_bound)

outliers_lat = detect_outliers_iqr(df, 'latitude')
outliers_lon = detect_outliers_iqr(df, 'longitude')

print(f"Statistical outlier detection (IQR method):")
print(f"  Latitude outliers: {outliers_lat.sum()}")
print(f"  Longitude outliers: {outliers_lon.sum()}")

# Drop invalid coordinates
df = df[~invalid_coords].copy()
print(f"\n✓ Dataset after coordinate validation: {len(df):,} rows")

# Coordinate statistics
print(f"\nCoordinate Statistics (after validation):")
print(df[['latitude', 'longitude']].describe())

Validating coordinates with NYC geographic bounds:
  Latitude: 40.5° to 40.9°N
  Longitude: -74.3° to -73.7°W

✓ Rows with invalid coordinates: 62
Statistical outlier detection (IQR method):
  Latitude outliers: 0
  Longitude outliers: 722

✓ Dataset after coordinate validation: 13,056 rows

Coordinate Statistics (after validation):
           latitude     longitude
count  13056.000000  13056.000000
mean      40.736976    -73.921986
std        0.088244      0.073243
min       40.501312    -74.250187
25%       40.668506    -73.960295
50%       40.730369    -73.924705
75%       40.822104    -73.882108
max       40.899870    -73.701451


In [111]:
# SECTION 9: ZIP Code Cleaning
if 'incident_zip' in df.columns:
    # Extract 5-digit ZIP codes
    df['incident_zip'] = df['incident_zip'].astype(str).str.extract('(\d{5})', expand=False)
    
    # Count valid vs invalid
    valid_zips = df['incident_zip'].notna().sum()
    invalid_zips = df['incident_zip'].isna().sum()
    
    print(f"✓ ZIP code extraction (5-digit format):")
    print(f"  Valid ZIP codes: {valid_zips:,}")
    print(f"  Invalid/missing: {invalid_zips:,}")
    
    # Convert to numeric
    df['incident_zip'] = pd.to_numeric(df['incident_zip'], errors='coerce')

✓ ZIP code extraction (5-digit format):
  Valid ZIP codes: 13,036
  Invalid/missing: 20


In [112]:
# SECTION 10: Borough Normalization 
if 'borough' in df.columns:
    print("Normalizing borough names:")
    
    # Convert to uppercase and strip whitespace
    df['borough'] = df['borough'].str.upper().str.strip()
    
    # Rule-based mapping for variations
    borough_mapping = {
        'NY': 'MANHATTAN',
        'NEWYORK': 'MANHATTAN',
        'KINGS': 'BROOKLYN',
        'QUEENS': 'QUEENS',
        'BRONX': 'BRONX',
        'RICHMOND': 'STATEN ISLAND'
    }
    
    for old, new in borough_mapping.items():
        df['borough'] = df['borough'].replace(old, new)
    
    print(f"✓ Borough distribution (normalized):")
    print(df['borough'].value_counts())

Normalizing borough names:
✓ Borough distribution (normalized):
borough
BROOKLYN         4107
QUEENS           3050
BRONX            2950
MANHATTAN        2465
STATEN ISLAND     484
Name: count, dtype: int64


## Text Normalization

In [113]:
# SECTION 11: Text Normalization 

def normalize_text(text):
    """Normalize text: uppercase, trim, remove extra spaces"""
    if pd.isna(text):
        return 'UNKNOWN'
    text = str(text).strip().upper()
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    return text

# Normalize complaint type
if 'problem_formerly_complaint_type' in df.columns:
    df['problem_formerly_complaint_type'] = df['problem_formerly_complaint_type'].apply(normalize_text)
    print(f"✓ Complaint types normalized (uppercase, trimmed)")
    print(f"\nTop 15 Complaint Types:")
    print(df['problem_formerly_complaint_type'].value_counts().head(15))

# Normalize location type
if 'location_type' in df.columns:
    df['location_type'] = df['location_type'].apply(normalize_text)
    print(f"\n✓ Location types normalized")
    print(f"Unique location types: {df['location_type'].nunique()}")
    print(df['location_type'].value_counts())

✓ Complaint types normalized (uppercase, trimmed)

Top 15 Complaint Types:
problem_formerly_complaint_type
HEAT/HOT WATER              2422
ILLEGAL PARKING             2046
NOISE - RESIDENTIAL         1180
BLOCKED DRIVEWAY            1148
SNOW OR ICE                  795
UNSANITARY CONDITION         453
PLUMBING                     415
WATER SYSTEM                 354
PAINT/PLASTER                319
DOOR/WINDOW                  240
WATER LEAK                   238
NOISE                        211
GENERAL                      188
TRAFFIC SIGNAL CONDITION     174
ELECTRIC                     150
Name: count, dtype: int64

✓ Location types normalized
Unique location types: 55
location_type
RESIDENTIAL BUILDING             4732
STREET/SIDEWALK                  3594
UNKNOWN                          1355
RESIDENTIAL BUILDING/HOUSE       1266
SIDEWALK                          754
STREET                            700
STORE/COMMERCIAL                  101
BUSINESS                           65

## Advanced Complaint Type Normalization
### Rule-Based Category Mapping
Using domain knowledge and statistical analysis to group similar complaint types into standardized categories

In [114]:
# Analyze complaint type distribution before normalization
print("Top 20 Original Complaint Types:")
complaint_counts = df_dedup['problem_formerly_complaint_type'].value_counts()
print(complaint_counts.head(20))
print(f"\nTotal unique complaint types: {df_dedup['problem_formerly_complaint_type'].nunique()}")

# Rule-based normalization mapping
complaint_mapping = {
    'Noise - Residential': 'NOISE',
    'Noise - Commercial': 'NOISE',
    'Noise - Street/Sidewalk': 'NOISE',
    'Noise - Park': 'NOISE',
    'Noise - Vehicle': 'NOISE',
    'Noise': 'NOISE',
    
    'Illegal Parking': 'PARKING',
    'Blocked Driveway': 'PARKING',
    'Blocked Sidewalk': 'PARKING',
    'Blocked Hydrant': 'PARKING',
    'Posted Parking Sign Violation': 'PARKING',
    
    'Street Condition': 'STREET_CONDITION',
    'Pothole': 'STREET_CONDITION',
    'Street/Sidewalk Condition': 'STREET_CONDITION',
    'Pavement Condition': 'STREET_CONDITION',
    'Curb Condition': 'STREET_CONDITION',
    
    'Traffic Signal': 'TRAFFIC',
    'Traffic Control': 'TRAFFIC',
    'Traffic': 'TRAFFIC',
    
    'Street Light': 'STREET_LIGHT',
    'Street Lights': 'STREET_LIGHT',
    'Lighting': 'STREET_LIGHT',
    
    'Graffiti': 'GRAFFITI',
    'Graffiti - Public': 'GRAFFITI',
    'Graffiti - Private': 'GRAFFITI',
    
    'Sanitation': 'SANITATION',
    'Sanitation Worker or Vehicle Complaint': 'SANITATION',
    'Dirty Conditions': 'SANITATION',
    'Filthy Condition': 'SANITATION',
    'Inadequate Waste Containers': 'SANITATION',
    
    'Water System': 'WATER_UTILITY',
    'Water': 'WATER_UTILITY',
    'Water Quality': 'WATER_UTILITY',
    
    'Snow or Ice': 'WEATHER',
    'Ice/Snow': 'WEATHER',
}

# Normalize complaint types
df_dedup['complaint_type_normalized'] = df_dedup['problem_formerly_complaint_type'].str.upper().str.strip()

# Apply mapping
for original, normalized in complaint_mapping.items():
    df_dedup.loc[df_dedup['complaint_type_normalized'] == original.upper(), 'complaint_type_normalized'] = normalized

# Map remaining uncategorized to "OTHER"
frequent_types = df_dedup['complaint_type_normalized'].value_counts().head(10).index
df_dedup.loc[~df_dedup['complaint_type_normalized'].isin(frequent_types), 'complaint_type_normalized'] = 'OTHER'

print("\n" + "="*50)
print("Normalized Complaint Type Distribution:")
print("="*50)
normalized_counts = df_dedup['complaint_type_normalized'].value_counts()
print(normalized_counts)
print(f"\nTotal normalized categories: {df_dedup['complaint_type_normalized'].nunique()}")

Top 20 Original Complaint Types:
problem_formerly_complaint_type
Illegal Parking             1624
HEAT/HOT WATER              1497
Blocked Driveway             956
Noise - Residential          792
Snow or Ice                  752
UNSANITARY CONDITION         318
Water System                 284
PLUMBING                     277
PAINT/PLASTER                218
WATER LEAK                   212
Noise                        185
DOOR/WINDOW                  179
GENERAL                      154
Street Condition             144
Dirty Condition              133
Traffic Signal Condition     119
ELECTRIC                     115
FLOORING/STAIRS              110
Noise - Commercial           105
Abandoned Vehicle            101
Name: count, dtype: int64

Total unique complaint types: 118

Normalized Complaint Type Distribution:
complaint_type_normalized
PARKING                 2580
OTHER                   2508
HEAT/HOT WATER          1497
NOISE                   1256
WEATHER                  752
UN

## Outlier Detection: Statistical Methods
### Using IQR (Interquartile Range) and Z-Score for anomaly detection

In [115]:
from scipy import stats

# Detect spatial outliers using IQR method
def detect_outliers_iqr(data, column):
    """Detect outliers using Interquartile Range method"""
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (data[column] < lower_bound) | (data[column] > upper_bound)

# Detect spatial outliers using Z-score method
def detect_outliers_zscore(data, column, threshold=3):
    """Detect outliers using Z-score method"""
    z_scores = np.abs(stats.zscore(data[column].dropna()))
    return np.abs(stats.zscore(data[column])) > threshold

# Check for coordinate anomalies
print("Checking for spatial outliers...")
outliers_lat = detect_outliers_iqr(df_dedup, 'latitude')
outliers_lon = detect_outliers_iqr(df_dedup, 'longitude')

print(f"Latitude outliers (IQR method): {outliers_lat.sum()}")
print(f"Longitude outliers (IQR method): {outliers_lon.sum()}")

# Spatial validation: NYC bounding box
valid_lat = (df_dedup['latitude'] >= 40.5) & (df_dedup['latitude'] <= 40.9)
valid_lon = (df_dedup['longitude'] >= -74.3) & (df_dedup['longitude'] <= -73.7)
valid_coords = valid_lat & valid_lon

print(f"Rows with valid NYC coordinates: {valid_coords.sum()}/{len(df_dedup)}")
print(f"Rows with invalid coordinates (removed): {(~valid_coords).sum()}")

# Remove rows with invalid coordinates
df_dedup = df_dedup[valid_coords].reset_index(drop=True)

print(f"Dataset after coordinate validation: {len(df_dedup)} records")

# Statistical summary of coordinates
print("\nCoordinate Statistics (after cleaning):")
print(df_dedup[['latitude', 'longitude']].describe())

Checking for spatial outliers...
Latitude outliers (IQR method): 0
Longitude outliers (IQR method): 594
Rows with valid NYC coordinates: 10100/10100
Rows with invalid coordinates (removed): 0
Dataset after coordinate validation: 10100 records

Coordinate Statistics (after cleaning):
           latitude     longitude
count  10100.000000  10100.000000
mean      40.732182    -73.923104
std        0.087581      0.076093
min       40.501312    -74.250187
25%       40.666237    -73.962549
50%       40.725121    -73.925148
75%       40.815111    -73.881684
max       40.899870    -73.701451


## Near Duplicate Detection


In [116]:
# SECTION 12: Near-Duplicate Detection 

def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate distance in meters between two coordinates using Haversine formula
    
    Parameters:
    -----------
    lat1, lon1 : float - First coordinate (degrees)
    lat2, lon2 : float - Second coordinate (degrees)
    
    Returns:
    --------
    float : Distance in meters
    """
    R = 6371000  # Earth's radius in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

# Sort by spatial and temporal attributes for efficiency
df = df.sort_values(['latitude', 'longitude', 'problem_formerly_complaint_type', 'created_date']).reset_index(drop=True)

# Parameters for near-duplicate detection
DISTANCE_THRESHOLD_M = 50  # meters
TIME_THRESHOLD_H = 24      # hours

print("Near-duplicate detection criteria:")
print(f"  Spatial threshold (Haversine): {DISTANCE_THRESHOLD_M} meters")
print(f"  Temporal threshold: {TIME_THRESHOLD_H} hours")
print(f"  Semantic requirement: Same complaint type")

# Detect near-duplicates
near_dup_indices = []
rows_checked = 0

for i in range(len(df) - 1):
    current = df.iloc[i]
    next_row = df.iloc[i + 1]
    
    if pd.isna(current['latitude']) or pd.isna(current['longitude']):
        continue
    
    # Calculate distance
    distance = haversine_distance(
        current['latitude'], current['longitude'],
        next_row['latitude'], next_row['longitude']
    )
    
    # Calculate time difference
    time_diff = abs((next_row['created_date'] - current['created_date']).total_seconds() / 3600)
    
    # Mark as near-duplicate if ALL criteria met
    if (distance < DISTANCE_THRESHOLD_M and 
        time_diff < TIME_THRESHOLD_H and 
        current['problem_formerly_complaint_type'] == next_row['problem_formerly_complaint_type']):
        near_dup_indices.append(i + 1)  # Keep first, mark second for removal
    
    rows_checked += 1

# Remove near-duplicates
df_before_near_dup = len(df)
df = df[~df.index.isin(near_dup_indices)].reset_index(drop=True)
near_dupes = df_before_near_dup - len(df)

print(f"\n✓ Near-duplicate detection complete:")
print(f"  Rows checked: {rows_checked:,}")
print(f"  Near-duplicates identified: {near_dupes:,}")
print(f"  Rows removed: {near_dupes:,}")
print(f"  Rows retained: {len(df):,}")

Near-duplicate detection criteria:
  Spatial threshold (Haversine): 50 meters
  Temporal threshold: 24 hours
  Semantic requirement: Same complaint type

✓ Near-duplicate detection complete:
  Rows checked: 13,055
  Near-duplicates identified: 2,960
  Rows removed: 2,960
  Rows retained: 10,096


## Missing Value Handling: Documented Strategy
### Four-tier approach: Drop Critical → Impute → Fill → Default

In [121]:
# Analyze missing values BEFORE handling

#2: Fill categorical fields with semantic defaults
categorical_impute = {
    'problem_formerly_complaint_type': 'UNKNOWN',
    'agency': 'VARIOUS',
    'location_type': 'UNSPECIFIED',
    'borough': 'UNSPECIFIED'
}

print("Filling categorical missing values with defaults:")
for col, fill_value in categorical_impute.items():
    if col in df.columns and df[col].isnull().sum() > 0:
        fill_count = df[col].isnull().sum()
        df[col] = df[col].fillna(fill_value)
        print(f"  {col}: Filled {fill_count} missing values with '{fill_value}'")
    elif col in df.columns:
        print(f"  {col}: No missing values")

# SECTION 14: Missing Value Handling - TIER 3 (TEXT)
# 3: Fill text descriptions
if 'problem_detail_formerly_descriptor' in df.columns:
    missing_count = df['problem_detail_formerly_descriptor'].isnull().sum()
    df['problem_detail_formerly_descriptor'] = df['problem_detail_formerly_descriptor'].fillna('No description provided')
    print(f"✓ problem_detail_formerly_descriptor: Filled {missing_count} with default text")

# SECTION 15: Missing Value Handling - TIER 4 (STATISTICAL IMPUTATION)
# 4: Statistical imputation for incident_zip using borough median
if 'incident_zip' in df.columns:
    print("Statistical imputation strategy: Borough-level median")
    print("\nFilling missing ZIP codes by borough:")
    
    for borough in df['borough'].unique():
        if borough != 'UNSPECIFIED':
            # Calculate borough median ZIP
            borough_median_zip = df[df['borough'] == borough]['incident_zip'].median()
            
            # Create mask for missing values in this borough
            mask = (df['borough'] == borough) & (df['incident_zip'].isnull())
            filled_count = mask.sum()
            
            # Impute if median exists
            if not pd.isna(borough_median_zip) and filled_count > 0:
                df.loc[mask, 'incident_zip'] = int(borough_median_zip)
                print(f"  {borough}: Filled {filled_count} ZIPs with median {int(borough_median_zip)}")
            elif filled_count > 0:
                print(f"  {borough}: No valid ZIPs found for imputation")

# SECTION 16: Analyze Missing Values (After Cleaning)
missing_after = df.isnull().sum()
missing_after = missing_after[missing_after > 0].sort_values(ascending=False)

if len(missing_after) > 0:
    print("Remaining missing values:")
    print(missing_after)
else:
    print("✓ No missing values remaining!")

print(f"\nDataset shape: {df.shape}")
print(f"Total missing cells: {df.isnull().sum().sum()}")

Filling categorical missing values with defaults:
  problem_formerly_complaint_type: No missing values
  agency: No missing values
  location_type: No missing values
  borough: No missing values
✓ problem_detail_formerly_descriptor: Filled 0 with default text
Statistical imputation strategy: Borough-level median

Filling missing ZIP codes by borough:
✓ No missing values remaining!

Dataset shape: (10096, 10)
Total missing cells: 0


## Data Quality Metrics and Final Validation

In [118]:
# SECTION 17: Final Data Quality Assessment

# 1. Completeness
complete_rows = len(df[df.isnull().sum(axis=1) == 0])
completeness = (complete_rows / len(df)) * 100
print(f"\n1. COMPLETENESS:")
print(f"   ✓ Complete rows (0 missing values): {complete_rows:,} ({completeness:.1f}%)")
print(f"   Total missing cells: {df.isnull().sum().sum()}")

# 2. Validity
valid_coords = ((df['latitude'] >= 40.5) & (df['latitude'] <= 40.9) &
                (df['longitude'] >= -74.3) & (df['longitude'] <= -73.7)).sum()
validity_coords = (valid_coords / len(df)) * 100
print(f"\n2. VALIDITY (Geospatial):")
print(f"   ✓ Valid NYC coordinates: {valid_coords:,} ({validity_coords:.1f}%)")
print(f"   Coordinate range:")
print(f"     Latitude:  {df['latitude'].min():.4f}° to {df['latitude'].max():.4f}°")
print(f"     Longitude: {df['longitude'].min():.4f}° to {df['longitude'].max():.4f}°")

# 3. Uniqueness
if 'unique_key' in df.columns:
    unique_keys = df['unique_key'].nunique()
    uniqueness = (unique_keys / len(df)) * 100
    print(f"\n3. UNIQUENESS:")
    print(f"   ✓ Unique identifiers: {unique_keys:,} out of {len(df):,} ({uniqueness:.1f}%)")

# 4. Consistency
print(f"\n4. CONSISTENCY:")
print(f"   ✓ Standardized column names: All lowercase with underscores")
print(f"   ✓ Normalized complaint types: {df['problem_formerly_complaint_type'].nunique()} unique types")
print(f"   ✓ Standardized date format: datetime64[ns]")
print(f"   ✓ Normalized coordinates: float64 within NYC bounds")

# 5. Coverage
print(f"\n5. COVERAGE:")
print(f"   Time span: {df['created_date'].min().date()} to {df['created_date'].max().date()}")
print(f"   Boroughs: {df['borough'].nunique()} boroughs")
print(f"   Agencies: {df['agency'].nunique()} agencies")
print(f"   Complaint types: {df['problem_formerly_complaint_type'].nunique()} types")

# 6. Distribution Statistics
print(f"\n6. GEOGRAPHIC DISTRIBUTION (Top Boroughs):")
print(df['borough'].value_counts())

print(f"\n7. COMPLAINT TYPE DISTRIBUTION (Top 10):")
print(df['problem_formerly_complaint_type'].value_counts().head(10))

print(f"\n8. AGENCY DISTRIBUTION (Top 10):")
print(df['agency'].value_counts().head(10))

print(f"\n9. TEMPORAL STATISTICS:")
days_covered = (df['created_date'].max() - df['created_date'].min()).days
records_per_day = len(df) / (days_covered + 1) if days_covered > 0 else 0
print(f"   Date range: {days_covered} days")
print(f"   Records per day (avg): {records_per_day:.0f}")


1. COMPLETENESS:
   ✓ Complete rows (0 missing values): 10,096 (100.0%)
   Total missing cells: 0

2. VALIDITY (Geospatial):
   ✓ Valid NYC coordinates: 10,096 (100.0%)
   Coordinate range:
     Latitude:  40.5013° to 40.8999°
     Longitude: -74.2502° to -73.7015°

3. UNIQUENESS:
   ✓ Unique identifiers: 10,096 out of 10,096 (100.0%)

4. CONSISTENCY:
   ✓ Standardized column names: All lowercase with underscores
   ✓ Normalized complaint types: 116 unique types
   ✓ Standardized date format: datetime64[ns]
   ✓ Normalized coordinates: float64 within NYC bounds

5. COVERAGE:
   Time span: 2026-02-04 to 2026-02-06
   Boroughs: 5 boroughs
   Agencies: 13 agencies
   Complaint types: 116 types

6. GEOGRAPHIC DISTRIBUTION (Top Boroughs):
borough
BROOKLYN         3250
QUEENS           2408
BRONX            2117
MANHATTAN        1892
STATEN ISLAND     429
Name: count, dtype: int64

7. COMPLAINT TYPE DISTRIBUTION (Top 10):
problem_formerly_complaint_type
ILLEGAL PARKING         1624
HEAT/HOT

## Save Cleaned Data and Generate Documentation

In [119]:
output_path = "../data/raw/nyc_311_cleaned.csv"
df.to_csv(output_path, index=False)

print(f"✓ Cleaned data saved to: {output_path}")
print(f"\n========== CLEANING SUMMARY ==========")
print(f"Final row count: {len(df):,}")
print(f"Final column count: {len(df.columns)}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")



✓ Cleaned data saved to: ../data/raw/nyc_311_cleaned.csv

Final row count: 10,096
Final column count: 10
Memory usage: 2.80 MB
