In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Configure display settings
warnings.filterwarnings('ignore')

In [2]:
# File paths
crashes_file = r'data\Motor_Vehicle_Collisions_Crashes.csv'
persons_file = r'data\Motor_Vehicle_Collisions_Person.csv'

# Load sample from crashes dataset
crashes_sample = pd.read_csv(crashes_file, low_memory=False)
print(f"‚úì Crashes sample loaded: {crashes_sample.shape}")

print("\n=== DATASET STRUCTURE OVERVIEW ===")
print(f"Crashes Dataset: {crashes_sample.shape[1]} columns")

‚úì Crashes sample loaded: (2219657, 29)

=== DATASET STRUCTURE OVERVIEW ===
Crashes Dataset: 29 columns


## Dataset 1: Motor Vehicle Collisions - Crashes Analysis

In [3]:
# Display first few rows of each dataset
print("\n--- Crashes Sample ---")
print(crashes_sample.head().to_string())


--- Crashes Sample ---
   CRASH DATE CRASH TIME   BOROUGH ZIP CODE  LATITUDE  LONGITUDE                    LOCATION           ON STREET NAME CROSS STREET NAME OFF STREET NAME  NUMBER OF PERSONS INJURED  NUMBER OF PERSONS KILLED  NUMBER OF PEDESTRIANS INJURED  NUMBER OF PEDESTRIANS KILLED  NUMBER OF CYCLIST INJURED  NUMBER OF CYCLIST KILLED  NUMBER OF MOTORIST INJURED  NUMBER OF MOTORIST KILLED CONTRIBUTING FACTOR VEHICLE 1 CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3 CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5  COLLISION_ID                  VEHICLE TYPE CODE 1 VEHICLE TYPE CODE 2 VEHICLE TYPE CODE 3 VEHICLE TYPE CODE 4 VEHICLE TYPE CODE 5
0  09/11/2021       2:39       NaN      NaN       NaN        NaN                         NaN    WHITESTONE EXPRESSWAY         20 AVENUE             NaN                        2.0                       0.0                              0                             0                          0                         0   

In [4]:
# Clear any previous output and create missing stats analysis
missing_stats = pd.DataFrame({
    'Column': crashes_sample.columns,
    'Missing_Count': crashes_sample.isnull().sum(),
    'Missing_Percentage': (crashes_sample.isnull().sum() / len(crashes_sample) * 100).round(2),
    'Data_Type': crashes_sample.dtypes
})

# Sort by missing count (descending)
missing_stats = missing_stats.sort_values('Missing_Count', ascending=False)

# Missing Values Analysis for Crashes Dataset
print("üîç MISSING VALUES ANALYSIS - CRASHES DATASET")
print("="*50)
print("\nAll columns with their missing value statistics:")
print(missing_stats.to_string(index=False))

üîç MISSING VALUES ANALYSIS - CRASHES DATASET

All columns with their missing value statistics:
                       Column  Missing_Count  Missing_Percentage Data_Type
          VEHICLE TYPE CODE 5        2209932               99.56    object
CONTRIBUTING FACTOR VEHICLE 5        2209616               99.55    object
          VEHICLE TYPE CODE 4        2184322               98.41    object
CONTRIBUTING FACTOR VEHICLE 4        2182991               98.35    object
          VEHICLE TYPE CODE 3        2065294               93.05    object
CONTRIBUTING FACTOR VEHICLE 3        2059062               92.76    object
              OFF STREET NAME        1828634               82.38    object
            CROSS STREET NAME         848140               38.21    object
                     ZIP CODE         680402               30.65    object
                      BOROUGH         680127               30.64    object
               ON STREET NAME         483397               21.78    object
   

Location, Latitude, Longitude are coordinates and will provide significant insights, they are in sync of missing values, we should inspect them fruther below

### Location Data Completeness Analysis

In [5]:
# Location Data Completeness Analysis
# Define all location columns to analyze (Including LATITUDE, LONGITUDE, LOCATION)
location_cols = ['BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME']

# 1. Rows with at least ONE location field filled
at_least_one_mask = False
for col in location_cols:
    at_least_one_mask = at_least_one_mask | crashes_sample[col].notna()

at_least_one_count = at_least_one_mask.sum()

# 2. Rows with ALL location fields empty
all_empty_count = len(crashes_sample) - at_least_one_count

# Now print all results at once
print("üó∫Ô∏è LOCATION DATA COMPLETENESS ANALYSIS")
print("="*70)
print(f"\n1. Rows with AT LEAST ONE location field filled:")
print(f"   ‚Ä¢ Count: {at_least_one_count:,} rows ({at_least_one_count/len(crashes_sample)*100:.2f}%)")
print(f"\n2. Rows with ALL location fields empty:")
print(f"   ‚Ä¢ Count: {all_empty_count:,} rows ({all_empty_count/len(crashes_sample)*100:.2f}%)")

üó∫Ô∏è LOCATION DATA COMPLETENESS ANALYSIS

1. Rows with AT LEAST ONE location field filled:
   ‚Ä¢ Count: 2,189,634 rows (98.65%)

2. Rows with ALL location fields empty:
   ‚Ä¢ Count: 30,023 rows (1.35%)


since the 2. Rows with ALL location fields empty:
   ‚Ä¢ Count: 30,023 rows (1.35%)
   then the data provides no geographical meaning and can't be properly imputated
it is resonable to remove them

In [6]:
# Drop rows with all location fields empty
original_shape = crashes_sample.shape
crashes_sample = crashes_sample[at_least_one_mask].reset_index(drop=True)
print(f"\n‚úì After dropping rows with all location fields empty: {crashes_sample.shape}")
print(f"Percentage of rows retained: {len(crashes_sample)/ original_shape[0] * 100:.2f}%")


‚úì After dropping rows with all location fields empty: (2189634, 29)
Percentage of rows retained: 98.65%


In [7]:
# Get rows where all three fields exist
all_coords_exist = crashes_sample['LONGITUDE'].notna() & crashes_sample['LATITUDE'].notna() & crashes_sample['LOCATION'].notna()
all_coords_count = all_coords_exist.sum()

# Check rows where LATITUDE, LONGITUDE, and LOCATION either all missing or all filled
lat_lon_loc_mask = (crashes_sample['LATITUDE'].isna() & crashes_sample['LONGITUDE'].isna() & crashes_sample['LOCATION'].isna()) | \
                   (crashes_sample['LATITUDE'].notna() & crashes_sample['LONGITUDE'].notna() & crashes_sample['LOCATION'].notna())
lat_lon_loc_count = lat_lon_loc_mask.sum()

# Initialize variables
successful_extractions = 0
long_match = 0
lat_match = 0
both_match = 0
coords_subset = None

if all_coords_count > 0:
    # Work with subset that has all coordinates
    coords_subset = crashes_sample[all_coords_exist].copy()
    
    # Extract coordinates from LOCATION field using string splitting
    # Note: LOCATION format is (LATITUDE, LONGITUDE) not (LONGITUDE, LATITUDE)
    def extract_coords(location_str):
        try:
            # Remove spaces and parentheses, then split by comma
            trimmed = str(location_str).strip().strip('()')
            parts = trimmed.split(',')
            if len(parts) == 2:
                latitude = float(parts[0].strip())   # First value is LATITUDE
                longitude = float(parts[1].strip())  # Second value is LONGITUDE
                return longitude, latitude
            else:
                return None, None
        except:
            return None, None
    
    # Apply extraction function
    coords_subset[['LOC_LONG', 'LOC_LAT']] = coords_subset['LOCATION'].apply(
        lambda x: pd.Series(extract_coords(x))
    )
    
    # Count successful extractions
    successful_extractions = coords_subset['LOC_LONG'].notna().sum()
    
    if successful_extractions > 0:
        # Compare coordinates (check for exact matches)
        long_match = (coords_subset['LONGITUDE'] == coords_subset['LOC_LONG']).sum()
        lat_match = (coords_subset['LATITUDE'] == coords_subset['LOC_LAT']).sum()
        both_match = ((coords_subset['LONGITUDE'] == coords_subset['LOC_LONG']) & 
                     (coords_subset['LATITUDE'] == coords_subset['LOC_LAT'])).sum()
        

# Check consistency between LONGITUDE/LATITUDE and LOCATION coordinates
print("üîç LONGITUDE/LATITUDE vs LOCATION COORDINATE CONSISTENCY CHECK")
print("="*70)

# Print all results at the end
print(f"Rows with LONGITUDE, LATITUDE, and LOCATION all present: {all_coords_count:,}")
print(f"Rows where LAT/LON/LOCATION either all missing or all filled: {lat_lon_loc_count:,} ({lat_lon_loc_count/len(crashes_sample)*100:.2f}%)")
# Count Rows where LATITUDE == "0"
print(f"Rows where LATITUDE is 0: {(crashes_sample['LATITUDE'] == 0).sum():,}")

if all_coords_count > 0:
    print(f"\n\nSuccessfully extracted coordinates from LOCATION: {successful_extractions:,}")
    
    if successful_extractions > 0:
        print(f"\nüìä COORDINATE MATCHING RESULTS:")
        print(f"   ‚Ä¢ LONGITUDE matches: {long_match:,} / {successful_extractions:,} ({long_match/successful_extractions*100:.2f}%)")
        print(f"   ‚Ä¢ LATITUDE matches: {lat_match:,} / {successful_extractions:,} ({lat_match/successful_extractions*100:.2f}%)")
        print(f"   ‚Ä¢ BOTH coordinates match: {both_match:,} / {successful_extractions:,} ({both_match/successful_extractions*100:.2f}%)")
        
        if both_match == successful_extractions:
            print(f"\n‚úÖ PERFECT MATCH: All coordinates are 100% consistent!")
        else:
            print(f"\n‚ùå MISMATCHES FOUND: {successful_extractions - both_match:,} rows with inconsistent coordinates")
            
            # Show mismatch examples
            mismatch_mask = ((coords_subset['LONGITUDE'] != coords_subset['LOC_LONG']) | 
                           (coords_subset['LATITUDE'] != coords_subset['LOC_LAT']))
            mismatch_rows = coords_subset[mismatch_mask]
            
            print(f"\nüìã FIRST 10 MISMATCH EXAMPLES:")
            print("-" * 100)
            display_cols = ['COLLISION_ID', 'LONGITUDE', 'LATITUDE', 'LOC_LONG', 'LOC_LAT', 'LOCATION']
            print(mismatch_rows[display_cols].head(10).to_string(index=False))


üîç LONGITUDE/LATITUDE vs LOCATION COORDINATE CONSISTENCY CHECK
Rows with LONGITUDE, LATITUDE, and LOCATION all present: 1,979,311
Rows where LAT/LON/LOCATION either all missing or all filled: 2,189,634 (100.00%)
Rows where LATITUDE is 0: 6,495


Successfully extracted coordinates from LOCATION: 1,979,311

üìä COORDINATE MATCHING RESULTS:
   ‚Ä¢ LONGITUDE matches: 1,979,311 / 1,979,311 (100.00%)
   ‚Ä¢ LATITUDE matches: 1,979,311 / 1,979,311 (100.00%)
   ‚Ä¢ BOTH coordinates match: 1,979,311 / 1,979,311 (100.00%)

‚úÖ PERFECT MATCH: All coordinates are 100% consistent!


Results show that the location is redundent due to it is always equivelent to the longitude and latitude
and there are 6,495 where latitude and longitude are (0,0) 

In [8]:
# Location Data Completeness Analysis - WITHOUT Coordinates/Location

# Define location columns excluding LATITUDE, LONGITUDE, LOCATION
location_cols_no_coords = ['BOROUGH', 'ZIP CODE', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME']

# 1. Rows with at least ONE location field filled (excluding coordinates)
at_least_one_mask_no_coords = False
for col in location_cols_no_coords:
    at_least_one_mask_no_coords = at_least_one_mask_no_coords | crashes_sample[col].notna()

at_least_one_count_no_coords = at_least_one_mask_no_coords.sum()

# 2. Rows with ALL location fields empty (excluding coordinates)
all_empty_count_no_coords = len(crashes_sample) - at_least_one_count_no_coords

# 3. Rows with Any street name (On Street OR Cross Street OR Off Street)
any_street_cols = ['ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME']
any_street_mask = False
for col in any_street_cols:
    any_street_mask = any_street_mask | crashes_sample[col].notna()
any_street_count = any_street_mask.sum()



# NOW PRINT EVERYTHING AT ONCE
print("üó∫Ô∏è LOCATION DATA COMPLETENESS ANALYSIS - WITHOUT COORDINATES")
print("="*70)
print(f"\n1. Rows with AT LEAST ONE location field filled (no coordinates):")
print(f"   ‚Ä¢ Count: {at_least_one_count_no_coords:,} rows ({at_least_one_count_no_coords/len(crashes_sample)*100:.2f}%)")
print(f"\n2. Rows with ALL location fields empty (no coordinates):")
print(f"   ‚Ä¢ Count: {all_empty_count_no_coords:,} rows ({all_empty_count_no_coords/len(crashes_sample)*100:.2f}%)")
print(f"\nüìä COMPARISON:")
print(f"   ‚Ä¢ With coordinates: {at_least_one_count:,} rows ({at_least_one_count/len(crashes_sample)*100:.2f}%)")
print(f"   ‚Ä¢ Without coordinates: {at_least_one_count_no_coords:,} rows ({at_least_one_count_no_coords/len(crashes_sample)*100:.2f}%)")
print(f"   ‚Ä¢ Difference: {at_least_one_count - at_least_one_count_no_coords:,} rows")
print(f"\n3. Rows with at least ONE street name field filled (On, Cross, Off):")
print(f"   ‚Ä¢ Count: {any_street_count:,} rows ({any_street_count/len(crashes_sample)*100:.2f}%)")

üó∫Ô∏è LOCATION DATA COMPLETENESS ANALYSIS - WITHOUT COORDINATES

1. Rows with AT LEAST ONE location field filled (no coordinates):
   ‚Ä¢ Count: 2,128,179 rows (97.19%)

2. Rows with ALL location fields empty (no coordinates):
   ‚Ä¢ Count: 61,455 rows (2.81%)

üìä COMPARISON:
   ‚Ä¢ With coordinates: 2,189,634 rows (100.00%)
   ‚Ä¢ Without coordinates: 2,128,179 rows (97.19%)
   ‚Ä¢ Difference: 61,455 rows

3. Rows with at least ONE street name field filled (On, Cross, Off):
   ‚Ä¢ Count: 2,127,585 rows (97.17%)


since coordinates are the smallest unit it is hard to impute it by correlation, besides imputing it by central tendancy is not descriptive and will make a change in skewness(around 3% of values change)

In [9]:
# Clean invalid coordinates and check for location anomalies
print("üßπ CLEANING INVALID COORDINATES & DETECTING ANOMALIES")
print("="*70)

# Store original shape for comparison
original_shape = crashes_sample.shape
print(f"Original dataset shape: {original_shape}")

# 1. Remove rows with missing coordinates (LATITUDE, LONGITUDE, LOCATION all missing)
coords_missing_mask = crashes_sample['LATITUDE'].isna() & crashes_sample['LONGITUDE'].isna() & crashes_sample['LOCATION'].isna()
coords_missing_count = coords_missing_mask.sum()

crashes_sample = crashes_sample[~coords_missing_mask].reset_index(drop=True)
print(f"\n‚úì Removed {coords_missing_count:,} rows with all coordinates missing")
print(f"   Shape after removal: {crashes_sample.shape}")

# 2. Remove rows with coordinates = (0, 0) - invalid location
zero_coords_mask = (crashes_sample['LATITUDE'] == 0) & (crashes_sample['LONGITUDE'] == 0)
zero_coords_count = zero_coords_mask.sum()

crashes_sample = crashes_sample[~zero_coords_mask].reset_index(drop=True)
print(f"\n‚úì Removed {zero_coords_count:,} rows with coordinates = (0, 0)")
print(f"   Shape after removal: {crashes_sample.shape}")

üßπ CLEANING INVALID COORDINATES & DETECTING ANOMALIES
Original dataset shape: (2189634, 29)

‚úì Removed 210,323 rows with all coordinates missing
   Shape after removal: (1979311, 29)

‚úì Removed 6,495 rows with coordinates = (0, 0)
   Shape after removal: (1972816, 29)


In [10]:
# 3. Check for other coordinate anomalies in remaining data
print(f"\nüîç COORDINATE ANOMALY DETECTION")
print("-" * 50)

# Get rows with coordinates
has_coords = crashes_sample['LATITUDE'].notna() & crashes_sample['LONGITUDE'].notna()
coord_data = crashes_sample[has_coords].copy()

if len(coord_data) > 0:
    # NYC approximate boundaries
    NYC_LAT_MIN, NYC_LAT_MAX = 40.4774, 40.9176  # Roughly Staten Island to Bronx
    NYC_LON_MIN, NYC_LON_MAX = -74.2591, -73.7004  # Roughly NJ border to Queens
    
    # Check coordinates outside NYC bounds
    outside_nyc = (
        (coord_data['LATITUDE'] < NYC_LAT_MIN) | 
        (coord_data['LATITUDE'] > NYC_LAT_MAX) |
        (coord_data['LONGITUDE'] < NYC_LON_MIN) | 
        (coord_data['LONGITUDE'] > NYC_LON_MAX)
    )
    outside_nyc_count = outside_nyc.sum()
    
    # Check for extreme outliers (likely data entry errors)
    lat_outliers = (coord_data['LATITUDE'].abs() > 90) | (coord_data['LATITUDE'] == 0)
    lon_outliers = (coord_data['LONGITUDE'].abs() > 180) | (coord_data['LONGITUDE'] == 0)
    coord_outliers = lat_outliers | lon_outliers
    outlier_count = coord_outliers.sum()
    
    # Check for duplicate coordinates (exact same location)
    duplicate_coords = coord_data.duplicated(subset=['LATITUDE', 'LONGITUDE'], keep=False)
    duplicate_count = duplicate_coords.sum()
    unique_duplicate_locations = coord_data[duplicate_coords].drop_duplicates(subset=['LATITUDE', 'LONGITUDE']).shape[0]
    
    # Coordinate statistics
    print(f"üìä COORDINATE STATISTICS:")
    print(f"   ‚Ä¢ Total rows with coordinates: {len(coord_data):,}")
    print(f"   ‚Ä¢ Latitude range: {coord_data['LATITUDE'].min():.6f} to {coord_data['LATITUDE'].max():.6f}")
    print(f"   ‚Ä¢ Longitude range: {coord_data['LONGITUDE'].min():.6f} to {coord_data['LONGITUDE'].max():.6f}")
    
    print(f"\nüö® ANOMALY DETECTION RESULTS:")
    print(f"   ‚Ä¢ Coordinates outside NYC bounds: {outside_nyc_count:,} rows ({outside_nyc_count/len(coord_data)*100:.2f}%)")
    print(f"   ‚Ä¢ Invalid coordinate outliers: {outlier_count:,} rows")
    print(f"   ‚Ä¢ Exact duplicate coordinates: {duplicate_count:,} rows at {unique_duplicate_locations:,} unique locations")
    
    # Show examples of anomalies
    if outside_nyc_count > 0:
        print(f"\nüìã COORDINATES OUTSIDE NYC (first 10):")
        outside_examples = coord_data[outside_nyc][['COLLISION_ID', 'LATITUDE', 'LONGITUDE', 'BOROUGH', 'ON STREET NAME']]
        print(outside_examples.to_string(index=False))
    
    if outlier_count > 0:
        print(f"\nüìã COORDINATE OUTLIERS:")
        outlier_examples = coord_data[coord_outliers][['COLLISION_ID', 'LATITUDE', 'LONGITUDE', 'BOROUGH']]
        print(outlier_examples.to_string(index=False))
    
    # Show most common duplicate locations
    if duplicate_count > 0:
        print(f"\nüìã TOP 5 MOST COMMON DUPLICATE LOCATIONS:")
        top_duplicates = coord_data[duplicate_coords].groupby(['LATITUDE', 'LONGITUDE']).size().sort_values(ascending=False).head()
        for (lat, lon), count in top_duplicates.items():
            example_row = coord_data[(coord_data['LATITUDE'] == lat) & (coord_data['LONGITUDE'] == lon)].iloc[0]
            print(f"   ‚Ä¢ ({lat:.6f}, {lon:.6f}): {count:,} crashes - {example_row.get('ON STREET NAME', 'Unknown')} in {example_row.get('BOROUGH', 'Unknown')}")

# Summary of cleaning
total_removed = original_shape[0] - crashes_sample.shape[0]
print(f"\nüìà CLEANING SUMMARY:")
print(f"   ‚Ä¢ Original rows: {original_shape[0]:,}")
print(f"   ‚Ä¢ Removed missing coordinates: {coords_missing_count:,}")
print(f"   ‚Ä¢ Removed zero coordinates: {zero_coords_count:,}")
print(f"   ‚Ä¢ Total rows removed: {total_removed:,}")
print(f"   ‚Ä¢ Final dataset: {crashes_sample.shape[0]:,} rows ({crashes_sample.shape[0]/original_shape[0]*100:.2f}% retained)")

# Ask user what to do with anomalies
if len(coord_data) > 0 and (outside_nyc_count > 0 or outlier_count > 0):
    print(f"\nüí° RECOMMENDATION:")
    if outside_nyc_count > 0:
        print(f"   ‚Ä¢ Consider removing {outside_nyc_count:,} rows outside NYC bounds")
    if outlier_count > 0:
        print(f"   ‚Ä¢ Consider removing {outlier_count:,} rows with invalid coordinates")


üîç COORDINATE ANOMALY DETECTION
--------------------------------------------------
üìä COORDINATE STATISTICS:
   ‚Ä¢ Total rows with coordinates: 1,972,816
   ‚Ä¢ Latitude range: 30.784180 to 43.344444
   ‚Ä¢ Longitude range: -201.359990 to -32.768513

üö® ANOMALY DETECTION RESULTS:
   ‚Ä¢ Coordinates outside NYC bounds: 150 rows (0.01%)
   ‚Ä¢ Invalid coordinate outliers: 106 rows
   ‚Ä¢ Exact duplicate coordinates: 1,782,376 rows at 143,177 unique locations

üìã COORDINATES OUTSIDE NYC (first 10):
 COLLISION_ID  LATITUDE   LONGITUDE  BOROUGH                   ON STREET NAME
      3885895 40.758370 -201.237060      NaN QUEENSBORO BRIDGE UPPER ROADWAY 
      3975700 40.758370 -201.237060      NaN QUEENSBORO BRIDGE UPPER ROADWAY 
      3965268 40.758370 -201.237060      NaN QUEENSBORO BRIDGE UPPER ROADWAY 
      3958909 40.758370 -201.237060      NaN QUEENSBORO BRIDGE UPPER ROADWAY 
      3968897 40.665226  -32.768513      NaN NASSAU EXPRESSWAY               
      3927302 40.6652

this shows some rows off NYC which is invalid state, and by inspective them there are many locations with same location and diffirent boroughs making it hard to impute

In [11]:
# Remove outlier coordinates based on location

# Remove rows with coordinates outside NYC bounds
crashes_sample = crashes_sample[~outside_nyc].reset_index(drop=True)
print(f"\n‚úì Removed rows with coordinates outside NYC bounds. New shape: {crashes_sample.shape}")


‚úì Removed rows with coordinates outside NYC bounds. New shape: (1972666, 29)


In [12]:
# Test coordinate-based imputation for BOROUGH and ZIP CODE
print("üéØ TESTING COORDINATE-BASED IMPUTATION FOR BOROUGH & ZIP CODE")
print("="*70)

from sklearn.neighbors import NearestNeighbors
import numpy as np

# Get rows with coordinates
has_coords = crashes_sample['LATITUDE'].notna() & crashes_sample['LONGITUDE'].notna()
coord_data = crashes_sample[has_coords].copy()

# Separate data into known and unknown BOROUGH/ZIP CODE
borough_known = coord_data['BOROUGH'].notna()
zip_known = coord_data['ZIP CODE'].notna()

# Count missing values
borough_missing_count = (~borough_known).sum()
zip_missing_count = (~zip_known).sum()

print(f"üìä MISSING VALUE COUNTS:")
print(f"   ‚Ä¢ Missing BOROUGH: {borough_missing_count:,} rows")
print(f"   ‚Ä¢ Missing ZIP CODE: {zip_missing_count:,} rows")
print(f"   ‚Ä¢ Total rows with coordinates: {len(coord_data):,}")

if borough_missing_count > 0 or zip_missing_count > 0:
    # Prepare coordinate arrays
    coordinates = coord_data[['LATITUDE', 'LONGITUDE']].values.astype(float)
    
    # Test with k=5 nearest neighbors
    k_neighbors = 10
    nbrs = NearestNeighbors(n_neighbors=k_neighbors, metric='haversine')
    
    # Convert to radians for haversine distance (great circle distance)
    coordinates_rad = np.radians(coordinates)
    nbrs.fit(coordinates_rad)
    
    # Test BOROUGH imputation
    borough_results = []
    if borough_missing_count > 0:
        print(f"\nüèôÔ∏è TESTING BOROUGH IMPUTATION:")
        print("-" * 50)
        
        # Get sample of missing boroughs for testing (first 10)
        missing_borough_indices = coord_data[~borough_known].index[:10]
        
        for idx in missing_borough_indices:
            target_coords = coord_data.loc[idx, ['LATITUDE', 'LONGITUDE']].values.reshape(1, -1)
            target_coords_rad = np.radians(target_coords.astype(float))
            
            # Find nearest neighbors
            distances, indices = nbrs.kneighbors(target_coords_rad)
            neighbor_indices = coord_data.iloc[indices[0]].index
            
            # Get boroughs from neighbors (exclude NaN values)
            neighbor_boroughs = coord_data.loc[neighbor_indices, 'BOROUGH'].dropna()
            
            if len(neighbor_boroughs) > 0:
                # Count occurrences of each borough
                borough_counts = neighbor_boroughs.value_counts()
                most_common_borough = borough_counts.index[0]
                confidence = borough_counts.iloc[0] / len(neighbor_boroughs)
                
                # Convert distance from radians to km (approximate)
                distances_km = distances[0] * 6371  # Earth's radius in km
                
                borough_results.append({
                    'collision_id': coord_data.loc[idx, 'COLLISION_ID'],
                    'lat': coord_data.loc[idx, 'LATITUDE'],
                    'lon': coord_data.loc[idx, 'LONGITUDE'],
                    'predicted_borough': most_common_borough,
                    'confidence': confidence,
                    'avg_distance_km': distances_km.mean(),
                    'neighbor_boroughs': list(neighbor_boroughs.values),
                    'neighbor_distances_km': list(distances_km)
                })
        
        print(f"üìã BOROUGH IMPUTATION RESULTS (first 10 examples):")
        for i, result in enumerate(borough_results, 1):
            print(f"\n{i}. Collision ID: {result['collision_id']}")
            print(f"   Coordinates: ({result['lat']:.6f}, {result['lon']:.6f})")
            print(f"   Predicted Borough: {result['predicted_borough']} (confidence: {result['confidence']:.2f})")
            print(f"   Average distance to neighbors: {result['avg_distance_km']:.3f} km")
            print(f"   Neighbor boroughs: {result['neighbor_boroughs']}")
    
    # Test ZIP CODE imputation
    zip_results = []
    if zip_missing_count > 0:
        print(f"\nüìÆ TESTING ZIP CODE IMPUTATION:")
        print("-" * 50)
        
        # Get sample of missing zip codes for testing (first 10)
        missing_zip_indices = coord_data[~zip_known].index[:10]
        
        for idx in missing_zip_indices:
            target_coords = coord_data.loc[idx, ['LATITUDE', 'LONGITUDE']].values.reshape(1, -1)
            target_coords_rad = np.radians(target_coords.astype(float))
            
            # Find nearest neighbors
            distances, indices = nbrs.kneighbors(target_coords_rad)
            neighbor_indices = coord_data.iloc[indices[0]].index
            
            # Get zip codes from neighbors (exclude NaN values)
            neighbor_zips = coord_data.loc[neighbor_indices, 'ZIP CODE'].dropna()
            
            if len(neighbor_zips) > 0:
                # Count occurrences of each zip code
                zip_counts = neighbor_zips.value_counts()
                most_common_zip = zip_counts.index[0]
                confidence = zip_counts.iloc[0] / len(neighbor_zips)
                
                # Convert distance from radians to km
                distances_km = distances[0] * 6371
                
                zip_results.append({
                    'collision_id': coord_data.loc[idx, 'COLLISION_ID'],
                    'lat': coord_data.loc[idx, 'LATITUDE'],
                    'lon': coord_data.loc[idx, 'LONGITUDE'],
                    'predicted_zip': most_common_zip,
                    'confidence': confidence,
                    'avg_distance_km': distances_km.mean(),
                    'neighbor_zips': list(neighbor_zips.values),
                    'neighbor_distances_km': list(distances_km)
                })
        
        print(f"üìã ZIP CODE IMPUTATION RESULTS (first 10 examples):")
        for i, result in enumerate(zip_results, 1):
            print(f"\n{i}. Collision ID: {result['collision_id']}")
            print(f"   Coordinates: ({result['lat']:.6f}, {result['lon']:.6f})")
            print(f"   Predicted ZIP: {result['predicted_zip']} (confidence: {result['confidence']:.2f})")
            print(f"   Average distance to neighbors: {result['avg_distance_km']:.3f} km")
            print(f"   Neighbor ZIP codes: {result['neighbor_zips']}")
    
    # Summary statistics
    print(f"\nüìà IMPUTATION QUALITY ASSESSMENT:")
    print("-" * 50)
    
    if borough_results:
        avg_borough_confidence = sum(r['confidence'] for r in borough_results) / len(borough_results)
        avg_borough_distance = sum(r['avg_distance_km'] for r in borough_results) / len(borough_results)
        print(f"BOROUGH Imputation:")
        print(f"   ‚Ä¢ Average confidence: {avg_borough_confidence:.2f}")
        print(f"   ‚Ä¢ Average distance to neighbors: {avg_borough_distance:.3f} km")
        print(f"   ‚Ä¢ High confidence predictions (>0.6): {sum(1 for r in borough_results if r['confidence'] > 0.6)}/{len(borough_results)}")
    
    if zip_results:
        avg_zip_confidence = sum(r['confidence'] for r in zip_results) / len(zip_results)
        avg_zip_distance = sum(r['avg_distance_km'] for r in zip_results) / len(zip_results)
        print(f"\nZIP CODE Imputation:")
        print(f"   ‚Ä¢ Average confidence: {avg_zip_confidence:.2f}")
        print(f"   ‚Ä¢ Average distance to neighbors: {avg_zip_distance:.3f} km")
        print(f"   ‚Ä¢ High confidence predictions (>0.6): {sum(1 for r in zip_results if r['confidence'] > 0.6)}/{len(zip_results)}")
    
    print(f"\nüí° RECOMMENDATIONS:")
    if borough_results and avg_borough_confidence > 0.6:
        print(f"   ‚úÖ BOROUGH imputation looks promising (avg confidence: {avg_borough_confidence:.2f})")
    elif borough_results:
        print(f"   ‚ö†Ô∏è BOROUGH imputation has moderate confidence (avg: {avg_borough_confidence:.2f})")
    
    if zip_results and avg_zip_confidence > 0.6:
        print(f"   ‚úÖ ZIP CODE imputation looks promising (avg confidence: {avg_zip_confidence:.2f})")
    elif zip_results:
        print(f"   ‚ö†Ô∏è ZIP CODE imputation has moderate confidence (avg: {avg_zip_confidence:.2f})")
    
    print(f"\nüöÄ NEXT STEPS:")
    print(f"   ‚Ä¢ If results look good, we can impute all missing values")
    print(f"   ‚Ä¢ Consider adjusting k={k_neighbors} neighbors based on confidence scores")
    print(f"   ‚Ä¢ Set confidence threshold for imputation (e.g., only impute if confidence > 0.6)")
    
else:
    print("\n‚úÖ No missing BOROUGH or ZIP CODE values found!")
    print("All coordinates already have complete location data.")

üéØ TESTING COORDINATE-BASED IMPUTATION FOR BOROUGH & ZIP CODE
üìä MISSING VALUE COUNTS:
   ‚Ä¢ Missing BOROUGH: 476,174 rows
   ‚Ä¢ Missing ZIP CODE: 476,419 rows
   ‚Ä¢ Total rows with coordinates: 1,972,666

üèôÔ∏è TESTING BOROUGH IMPUTATION:
--------------------------------------------------
üìã BOROUGH IMPUTATION RESULTS (first 10 examples):

1. Collision ID: 4486304
   Coordinates: (40.804375, -73.937420)
   Predicted Borough: MANHATTAN (confidence: 1.00)
   Average distance to neighbors: 0.000 km
   Neighbor boroughs: ['MANHATTAN', 'MANHATTAN', 'MANHATTAN', 'MANHATTAN', 'MANHATTAN', 'MANHATTAN']

2. Collision ID: 4486581
   Coordinates: (40.784615, -73.953964)
   Predicted Borough: MANHATTAN (confidence: 1.00)
   Average distance to neighbors: 0.000 km
   Neighbor boroughs: ['MANHATTAN', 'MANHATTAN', 'MANHATTAN', 'MANHATTAN', 'MANHATTAN', 'MANHATTAN', 'MANHATTAN']

3. Collision ID: 4456659
   Coordinates: (40.720535, -73.888850)
   Predicted Borough: QUEENS (confidence: 1.00

this  show the potentiality of grouping close locations together and imputing the missing values, however it becomes infeasible using k nearest neighbours, using other datastucture such as ball tree can be more efficient

In [13]:
#LOCATION-BASED GROUPING IMPUTATION with Inconsistency Detection
from collections import Counter
import time

# Start timing for performance analysis
start_time = time.time()

# Get rows with coordinates
has_coords = crashes_sample['LATITUDE'].notna() & crashes_sample['LONGITUDE'].notna()
coord_data = crashes_sample[has_coords].copy()

# Store initial stats
initial_rows = len(coord_data)
initial_borough_missing = coord_data['BOROUGH'].isna().sum()
initial_zip_missing = coord_data['ZIP CODE'].isna().sum()

step1_time = time.time()

# Round coordinates to reasonable precision (5 decimal places = ~1 meter accuracy)
coord_precision = 5
coord_data['LAT_ROUNDED'] = coord_data['LATITUDE'].round(coord_precision)
coord_data['LON_ROUNDED'] = coord_data['LONGITUDE'].round(coord_precision)
coord_data['LOCATION_KEY'] = coord_data['LAT_ROUNDED'].astype(str) + ',' + coord_data['LON_ROUNDED'].astype(str)

step2_time = time.time()

# Analyze location groups using vectorized operations
location_groups = coord_data.groupby('LOCATION_KEY').agg({
    'COLLISION_ID': 'count',
    'BOROUGH': lambda x: x.dropna().unique().tolist() if len(x.dropna()) > 0 else [],
    'ZIP CODE': lambda x: x.dropna().unique().tolist() if len(x.dropna()) > 0 else [],
    'LATITUDE': 'first',
    'LONGITUDE': 'first'
}).rename(columns={'COLLISION_ID': 'CRASH_COUNT'})

step3_time = time.time()

# Find inconsistencies
location_groups['BOROUGH_COUNT'] = location_groups['BOROUGH'].apply(len)
location_groups['ZIP_COUNT'] = location_groups['ZIP CODE'].apply(len)

inconsistent_boroughs = location_groups[location_groups['BOROUGH_COUNT'] > 1]
inconsistent_zips = location_groups[location_groups['ZIP_COUNT'] > 1]

# Create canonical mappings
location_groups['CANONICAL_BOROUGH'] = location_groups['BOROUGH'].apply(
    lambda boroughs: Counter(boroughs).most_common(1)[0][0] if len(boroughs) > 0 else None
)
location_groups['CANONICAL_ZIP'] = location_groups['ZIP CODE'].apply(
    lambda zips: Counter(zips).most_common(1)[0][0] if len(zips) > 0 else None
)

step4_time = time.time()

# Create mapping dictionaries for fast lookup
borough_mapping = location_groups[location_groups['CANONICAL_BOROUGH'].notna()]['CANONICAL_BOROUGH'].to_dict()
zip_mapping = location_groups[location_groups['CANONICAL_ZIP'].notna()]['CANONICAL_ZIP'].to_dict()

# ULTRA-FAST VECTORIZED IMPUTATION (no loops!)
# Apply borough imputation using vectorized map operation
borough_missing_mask = coord_data['BOROUGH'].isna()
if borough_missing_mask.any():
    # Use pandas map for vectorized lookup - much faster than loops
    coord_data.loc[borough_missing_mask, 'BOROUGH'] = coord_data.loc[borough_missing_mask, 'LOCATION_KEY'].map(borough_mapping)

# Apply zip code imputation using vectorized map operation
zip_missing_mask = coord_data['ZIP CODE'].isna()  
if zip_missing_mask.any():
    # Use pandas map for vectorized lookup - much faster than loops
    coord_data.loc[zip_missing_mask, 'ZIP CODE'] = coord_data.loc[zip_missing_mask, 'LOCATION_KEY'].map(zip_mapping)

step5_time = time.time()

# Count successful imputations
borough_imputed = initial_borough_missing - coord_data['BOROUGH'].isna().sum()
zip_imputed = initial_zip_missing - coord_data['ZIP CODE'].isna().sum()

# Update main dataset using vectorized update
crashes_sample.update(coord_data[['BOROUGH', 'ZIP CODE']])

# Clean up temporary columns
coord_data = coord_data.drop(['LAT_ROUNDED', 'LON_ROUNDED', 'LOCATION_KEY'], axis=1)

final_time = time.time()

# ==================== ALL PRINTS AT THE END ====================
print("üéØ ULTRA-FAST LOCATION-BASED GROUPING IMPUTATION")
print("="*60)

print(f"üìä STARTING ANALYSIS:")
print(f"   ‚Ä¢ Total rows with coordinates: {initial_rows:,}")
print(f"   ‚Ä¢ Missing BOROUGH: {initial_borough_missing:,}")
print(f"   ‚Ä¢ Missing ZIP CODE: {initial_zip_missing:,}")

print(f"\nüéØ GROUPING LOCATIONS:")
print(f"   ‚Ä¢ Coordinate precision: {coord_precision} decimal places (~1-meter accuracy)")
print(f"   ‚Ä¢ Unique location groups created: {len(location_groups):,}")
print(f"   ‚Ä¢ Average crashes per location: {location_groups['CRASH_COUNT'].mean():.1f}")

print(f"\nüö® INCONSISTENCY DETECTION:")
print(f"   ‚Ä¢ Locations with multiple BOROUGHs: {len(inconsistent_boroughs):,}")
print(f"   ‚Ä¢ Locations with multiple ZIP CODEs: {len(inconsistent_zips):,}")

# Show examples of inconsistencies
if len(inconsistent_boroughs) > 0:
    print(f"\nüìã BOROUGH INCONSISTENCIES (first 5):")
    top_borough_issues = inconsistent_boroughs.nlargest(5, 'CRASH_COUNT')
    for idx, (location_key, row) in enumerate(top_borough_issues.iterrows(), 1):
        print(f"   {idx}. Location ({row['LATITUDE']:.6f}, {row['LONGITUDE']:.6f})")
        print(f"      ‚Ä¢ {row['CRASH_COUNT']} crashes with boroughs: {row['BOROUGH']}")

if len(inconsistent_zips) > 0:
    print(f"\nüìã ZIP CODE INCONSISTENCIES (first 5):")
    top_zip_issues = inconsistent_zips.nlargest(5, 'CRASH_COUNT')
    for idx, (location_key, row) in enumerate(top_zip_issues.iterrows(), 1):
        print(f"   {idx}. Location ({row['LATITUDE']:.6f}, {row['LONGITUDE']:.6f})")
        print(f"      ‚Ä¢ {row['CRASH_COUNT']} crashes with ZIP codes: {row['ZIP CODE']}")

print(f"\n‚ö° VECTORIZED IMPUTATION RESULTS:")
print(f"   ‚Ä¢ Location groups with known BOROUGH: {len(borough_mapping):,}")
print(f"   ‚Ä¢ Location groups with known ZIP CODE: {len(zip_mapping):,}")
print(f"   ‚Ä¢ BOROUGH imputation: {borough_imputed:,} / {initial_borough_missing:,} values ({borough_imputed/initial_borough_missing*100 if initial_borough_missing > 0 else 0:.1f}%)")
print(f"   ‚Ä¢ ZIP CODE imputation: {zip_imputed:,} / {initial_zip_missing:,} values ({zip_imputed/initial_zip_missing*100 if initial_zip_missing > 0 else 0:.1f}%)")

print(f"\n‚è±Ô∏è PERFORMANCE BREAKDOWN:")
print(f"   ‚Ä¢ Data preparation: {step2_time - step1_time:.3f} seconds")
print(f"   ‚Ä¢ Location grouping: {step3_time - step2_time:.3f} seconds") 
print(f"   ‚Ä¢ Inconsistency analysis: {step4_time - step3_time:.3f} seconds")
print(f"   ‚Ä¢ Vectorized imputation: {step5_time - step4_time:.3f} seconds")
print(f"   ‚Ä¢ Total processing time: {final_time - start_time:.3f} seconds")

print(f"\nüìà FINAL SUMMARY:")
print(f"   ‚Ä¢ Method: Vectorized location-based grouping (pandas .map())")
print(f"   ‚Ä¢ Precision: {coord_precision} decimal places")
print(f"   ‚Ä¢ Data quality issues detected: {len(inconsistent_boroughs)} borough conflicts, {len(inconsistent_zips)} ZIP conflicts") 
print(f"   ‚Ä¢ Processing speed: ~{(borough_imputed + zip_imputed)/(final_time - start_time):.0f} imputations per second")
print(f"   ‚Ä¢ Coverage: {borough_imputed + zip_imputed:,} total values imputed")

print(f"\n‚úÖ LOCATION-BASED IMPUTATION COMPLETE!")

üéØ ULTRA-FAST LOCATION-BASED GROUPING IMPUTATION
üìä STARTING ANALYSIS:
   ‚Ä¢ Total rows with coordinates: 1,972,666
   ‚Ä¢ Missing BOROUGH: 476,174
   ‚Ä¢ Missing ZIP CODE: 476,419

üéØ GROUPING LOCATIONS:
   ‚Ä¢ Coordinate precision: 5 decimal places (~1-meter accuracy)
   ‚Ä¢ Unique location groups created: 308,213
   ‚Ä¢ Average crashes per location: 6.4

üö® INCONSISTENCY DETECTION:
   ‚Ä¢ Locations with multiple BOROUGHs: 373
   ‚Ä¢ Locations with multiple ZIP CODEs: 764

üìã BOROUGH INCONSISTENCIES (first 5):
   1. Location (40.696033, -73.984535)
      ‚Ä¢ 1406 crashes with boroughs: ['BROOKLYN', 'QUEENS']
   2. Location (40.804700, -73.912430)
      ‚Ä¢ 597 crashes with boroughs: ['BRONX', 'BROOKLYN', 'MANHATTAN']
   3. Location (40.763110, -73.962524)
      ‚Ä¢ 558 crashes with boroughs: ['MANHATTAN', 'QUEENS']
   4. Location (40.770770, -73.917270)
      ‚Ä¢ 470 crashes with boroughs: ['QUEENS', 'STATEN ISLAND']
   5. Location (40.820305, -73.890830)
      ‚Ä¢ 467 cra

it might be resonable to impute streets as well, but after revising the collisions of boroughs, i suppose that imputing the street will be a complete mess and no hope of getting somthing valuable

In [14]:
# Replace any missing BOROUGH with 'Unknown' and missing zip code with "-1"
crashes_sample['BOROUGH'] = crashes_sample['BOROUGH'].fillna('Unknown')
crashes_sample['ZIP CODE'] = crashes_sample['ZIP CODE'].fillna('-1')


with this we can make visuallizations clear about the zip codes of rows we aren't aware of 

In [15]:
# Clear any previous output and create missing stats analysis
missing_stats = pd.DataFrame({
    'Column': crashes_sample.columns,
    'Missing_Count': crashes_sample.isnull().sum(),
    'Missing_Percentage': (crashes_sample.isnull().sum() / len(crashes_sample) * 100).round(2),
    'Data_Type': crashes_sample.dtypes
})

# Sort by missing count (descending)
missing_stats = missing_stats.sort_values('Missing_Count', ascending=False)

# Missing Values Analysis for Crashes Dataset
print("üîç MISSING VALUES ANALYSIS - CRASHES DATASET")
print("="*50)
print("\nAll columns with their missing value statistics:")
print(missing_stats.to_string(index=False))

üîç MISSING VALUES ANALYSIS - CRASHES DATASET

All columns with their missing value statistics:
                       Column  Missing_Count  Missing_Percentage Data_Type
          VEHICLE TYPE CODE 5        1963833               99.55    object
CONTRIBUTING FACTOR VEHICLE 5        1963555               99.54    object
          VEHICLE TYPE CODE 4        1941084               98.40    object
CONTRIBUTING FACTOR VEHICLE 4        1939895               98.34    object
          VEHICLE TYPE CODE 3        1836201               93.08    object
CONTRIBUTING FACTOR VEHICLE 3        1830583               92.80    object
              OFF STREET NAME        1611339               81.68    object
            CROSS STREET NAME         747114               37.87    object
               ON STREET NAME         423368               21.46    object
          VEHICLE TYPE CODE 2         406149               20.59    object
CONTRIBUTING FACTOR VEHICLE 2         322665               16.36    object
   

In [16]:
# SIMPLE STREET INFORMATION PATTERN ANALYSIS
print("üõ£Ô∏è STREET INFORMATION PATTERN ANALYSIS")
print("="*50)

# Check all possible combinations of street fields
on_street = crashes_sample['ON STREET NAME'].notna()
cross_street = crashes_sample['CROSS STREET NAME'].notna()
off_street = crashes_sample['OFF STREET NAME'].notna()

# Create pattern combinations
patterns = {
    'ON + CROSS': (on_street & cross_street & ~off_street).sum(),
    'OFF only': (~on_street & ~cross_street & off_street).sum(),
    'ON only': (on_street & ~cross_street & ~off_street).sum(),
    'No street info': (~on_street & ~cross_street & ~off_street).sum(),
    'CROSS only': (~on_street & cross_street & ~off_street).sum(),
    'All three': (on_street & cross_street & off_street).sum(),
    'ON + OFF': (on_street & ~cross_street & off_street).sum(),
    'CROSS + OFF': (~on_street & cross_street & off_street).sum()
}

total_rows = len(crashes_sample)

print(f"üîÑ STREET INFORMATION PATTERNS:")
# Display patterns sorted by frequency
sorted_patterns = sorted(patterns.items(), key=lambda x: x[1], reverse=True)
for pattern, count in sorted_patterns:
    percentage = count / total_rows * 100
    print(f"   ‚Ä¢ {pattern:<15}: {count:>8,} rows ({percentage:>5.1f}%)")

print(f"\n‚úÖ STREET PATTERN ANALYSIS COMPLETE!")

üõ£Ô∏è STREET INFORMATION PATTERN ANALYSIS
üîÑ STREET INFORMATION PATTERNS:
   ‚Ä¢ ON + CROSS     : 1,225,480 rows ( 62.1%)
   ‚Ä¢ OFF only       :  361,293 rows ( 18.3%)
   ‚Ä¢ ON only        :  323,784 rows ( 16.4%)
   ‚Ä¢ No street info :   62,037 rows (  3.1%)
   ‚Ä¢ CROSS only     :       38 rows (  0.0%)
   ‚Ä¢ All three      :       34 rows (  0.0%)
   ‚Ä¢ ON + OFF       :        0 rows (  0.0%)
   ‚Ä¢ CROSS + OFF    :        0 rows (  0.0%)

‚úÖ STREET PATTERN ANALYSIS COMPLETE!


In [17]:
pd.set_option('display.max_rows', None)
for factor in ['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
               'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
               'CONTRIBUTING FACTOR VEHICLE 5']:
    print(f"\nValue counts for {factor}:")
    print(crashes_sample[factor].value_counts(dropna=False))
    print(f"Size of list: {crashes_sample[factor].value_counts(dropna=False).size}")
pd.reset_option('display.max_rows')


Value counts for CONTRIBUTING FACTOR VEHICLE 1:
CONTRIBUTING FACTOR VEHICLE 1
Unspecified                                              656323
Driver Inattention/Distraction                           406870
Failure to Yield Right-of-Way                            121469
Following Too Closely                                    102471
Backing Unsafely                                          74151
Other Vehicular                                           61742
Passing or Lane Usage Improper                            57551
Passing Too Closely                                       52767
Turning Improperly                                        47500
Fatigued/Drowsy                                           37889
Unsafe Lane Changing                                      37528
Traffic Control Disregarded                               36538
Driver Inexperience                                       32135
Unsafe Speed                                              30854
Alcohol Involvement      

In [18]:
# COMPREHENSIVE CONTRIBUTING FACTOR DATA CLEANING
print("üßπ CONTRIBUTING FACTOR DATA STANDARDIZATION")
print("="*60)

# Define all contributing factor columns
factor_columns = ['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2', 
                 'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4', 
                 'CONTRIBUTING FACTOR VEHICLE 5']

# Create standardization mapping dictionary
standardization_map = {
    # Fix spelling/capitalization inconsistencies
    'Illnes': 'Illness',
    'Drugs (illegal)': 'Drugs (Illegal)',
    'Cell Phone (hand-Held)': 'Cell Phone (Hand-Held)',
    'Cell Phone (hand-held)': 'Cell Phone (Hand-Held)',
    'Cell Phone (hands-free)': 'Cell Phone (Hands-Free)',
    
    # Group related cell phone usage
    'Cell Phone (Hand-Held)': 'Cell Phone Use',
    'Cell Phone (Hands-Free)': 'Cell Phone Use', 
    'Texting': 'Cell Phone Use',
    'Using On Board Navigation Device': 'Electronic Device Distraction',
    'Other Electronic Device': 'Electronic Device Distraction',
    'Listening/Using Headphones': 'Electronic Device Distraction',
    
    # Group distraction-related factors
    'Driver Inattention/Distraction': 'Driver Distraction',
    'Outside Car Distraction': 'Driver Distraction',
    'Passenger Distraction': 'Driver Distraction',
    
    # Group impairment factors
    'Fatigued/Drowsy': 'Driver Impairment',
    'Fell Asleep': 'Driver Impairment',
    'Alcohol Involvement': 'Driver Impairment',
    'Drugs (Illegal)': 'Driver Impairment',
    'Prescription Medication': 'Driver Impairment',
    'Illness': 'Driver Impairment',
    'Lost Consciousness': 'Driver Impairment',
    'Physical Disability': 'Driver Impairment',
    
    # Group driving behavior issues
    'Aggressive Driving/Road Rage': 'Unsafe Driving Behavior',
    'Unsafe Speed': 'Unsafe Driving Behavior',
    'Following Too Closely': 'Unsafe Driving Behavior',
    'Passing Too Closely': 'Unsafe Driving Behavior',
    'Unsafe Lane Changing': 'Unsafe Driving Behavior',
    'Traffic Control Disregarded': 'Unsafe Driving Behavior',
    'Eating or Drinking': 'Unsafe Driving Behavior',
    
    # Group inexperience/skill issues
    'Driver Inexperience': 'Driver Skill Issues',
    'Backing Unsafely': 'Driver Skill Issues',
    'Turning Improperly': 'Driver Skill Issues',
    'Passing or Lane Usage Improper': 'Driver Skill Issues',
    'Failure to Yield Right-of-Way': 'Driver Skill Issues',
    'Failure to Keep Right': 'Driver Skill Issues',
    
    # Group vehicle defects
    'Brakes Defective': 'Vehicle Defects',
    'Steering Failure': 'Vehicle Defects',
    'Tire Failure/Inadequate': 'Vehicle Defects',
    'Accelerator Defective': 'Vehicle Defects',
    'Headlights Defective': 'Vehicle Defects',
    'Other Lighting Defects': 'Vehicle Defects',
    'Tow Hitch Defective': 'Vehicle Defects',
    'Windshield Inadequate': 'Vehicle Defects',
    'Tinted Windows': 'Vehicle Defects',
    
    # Group environmental factors
    'Pavement Slippery': 'Environmental Conditions',
    'Pavement Defective': 'Environmental Conditions',
    'View Obstructed/Limited': 'Environmental Conditions',
    'Glare': 'Environmental Conditions',
    'Obstruction/Debris': 'Environmental Conditions',
    'Animals Action': 'Environmental Conditions',
    'Lane Marking Improper/Inadequate': 'Environmental Conditions',
    'Traffic Control Device Improper/Non-Working': 'Environmental Conditions',
    'Shoulders Defective/Improper': 'Environmental Conditions',
    
    # Clean up unclear entries
    '1': 'Other/Unspecified',
    '80': 'Other/Unspecified',
    'Vehicle Vandalism': 'Other/Unspecified',
    'Driverless/Runaway Vehicle': 'Other/Unspecified',
    
    # Keep some specific categories as-is
    'Oversized Vehicle': 'Oversized Vehicle',
    'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion': 'Pedestrian/Bicyclist Error',
    'Reaction to Uninvolved Vehicle': 'Reaction to Other Vehicle',
    'Reaction to Other Uninvolved Vehicle': 'Reaction to Other Vehicle',
}

print("üîß APPLYING STANDARDIZATION...")

# Store original counts for comparison
original_counts = {}
for col in factor_columns:
    if col in crashes_sample.columns:
        original_counts[col] = crashes_sample[col].value_counts(dropna=False).shape[0]

# Apply standardization to each column
cleaned_counts = {}
for col in factor_columns:
    if col in crashes_sample.columns:
        print(f"   ‚Ä¢ Processing {col}...")
        
        # Apply mapping
        crashes_sample[col] = crashes_sample[col].map(standardization_map).fillna(crashes_sample[col])
        
        # Count unique values after cleaning
        cleaned_counts[col] = crashes_sample[col].value_counts(dropna=False).shape[0]

print(f"\nüìä STANDARDIZATION RESULTS:")
print("-" * 50)
for col in factor_columns:
    if col in crashes_sample.columns:
        original = original_counts.get(col, 0)
        cleaned = cleaned_counts.get(col, 0)
        reduction = original - cleaned
        reduction_pct = (reduction / original * 100) if original > 0 else 0
        
        print(f"{col}:")
        print(f"   ‚Ä¢ Original categories: {original}")
        print(f"   ‚Ä¢ Cleaned categories: {cleaned}")
        print(f"   ‚Ä¢ Reduction: {reduction} categories ({reduction_pct:.1f}%)")

# Show the cleaned value counts for CONTRIBUTING FACTOR VEHICLE 1
print(f"\nüéØ CLEANED VALUE COUNTS - CONTRIBUTING FACTOR VEHICLE 1:")
print("-" * 50)
cleaned_vc = crashes_sample['CONTRIBUTING FACTOR VEHICLE 1'].value_counts(dropna=False)
print(cleaned_vc)
print(f"Total unique categories: {len(cleaned_vc)}")

# Summary of major category groups
print(f"\nüìà MAJOR CONTRIBUTING FACTOR CATEGORIES:")
print("-" * 50)
major_categories = cleaned_vc.head(10)
for category, count in major_categories.items():
    percentage = count / len(crashes_sample) * 100
    print(f"   ‚Ä¢ {category:<30}: {count:>8,} ({percentage:>5.1f}%)")

print(f"\n‚úÖ CONTRIBUTING FACTOR STANDARDIZATION COMPLETE!")
print(f"   ‚Ä¢ Data quality improved through consistent naming")
print(f"   ‚Ä¢ Related factors grouped into logical categories")
print(f"   ‚Ä¢ Reduced complexity while maintaining meaning")

üßπ CONTRIBUTING FACTOR DATA STANDARDIZATION
üîß APPLYING STANDARDIZATION...
   ‚Ä¢ Processing CONTRIBUTING FACTOR VEHICLE 1...
   ‚Ä¢ Processing CONTRIBUTING FACTOR VEHICLE 2...
   ‚Ä¢ Processing CONTRIBUTING FACTOR VEHICLE 3...
   ‚Ä¢ Processing CONTRIBUTING FACTOR VEHICLE 4...
   ‚Ä¢ Processing CONTRIBUTING FACTOR VEHICLE 5...

üìä STANDARDIZATION RESULTS:
--------------------------------------------------
CONTRIBUTING FACTOR VEHICLE 1:
   ‚Ä¢ Original categories: 62
   ‚Ä¢ Cleaned categories: 19
   ‚Ä¢ Reduction: 43 categories (69.4%)
CONTRIBUTING FACTOR VEHICLE 2:
   ‚Ä¢ Original categories: 62
   ‚Ä¢ Cleaned categories: 19
   ‚Ä¢ Reduction: 43 categories (69.4%)
CONTRIBUTING FACTOR VEHICLE 3:
   ‚Ä¢ Original categories: 54
   ‚Ä¢ Cleaned categories: 18
   ‚Ä¢ Reduction: 36 categories (66.7%)
CONTRIBUTING FACTOR VEHICLE 4:
   ‚Ä¢ Original categories: 44
   ‚Ä¢ Cleaned categories: 14
   ‚Ä¢ Reduction: 30 categories (68.2%)
CONTRIBUTING FACTOR VEHICLE 5:
   ‚Ä¢ Original categorie

In [19]:
# Let's see the dramatic improvement in data consistency
print("After standardization - Contributing Factor Analysis")
print("=" * 60)

# Check each factor column
for col in factor_columns:
    print(f"\n{col}:")
    print(f"  Unique values: {crashes_sample[col].nunique()}")
    print(f"  Top 10 values:")
    top_values = crashes_sample[col].value_counts().head(10)
    for value, count in top_values.items():
        percentage = (count / len(crashes_sample)) * 100
        print(f"    {value}: {count:,} ({percentage:.2f}%)")

# Summary of the standardization
print("\n" + "=" * 60)
print("STANDARDIZATION SUMMARY")
print("=" * 60)

# Count records with standardized values (not Unspecified)
factor_data = []
for col in factor_columns:
    total = len(crashes_sample)
    unspecified = (crashes_sample[col] == 'Unspecified').sum()
    specified = total - unspecified
    factor_data.append({
        'Factor': col.replace('CONTRIBUTING FACTOR ', ''),
        'Total Records': f"{total:,}",
        'Unspecified': f"{unspecified:,} ({(unspecified/total)*100:.1f}%)",
        'Specified': f"{specified:,} ({(specified/total)*100:.1f}%)"
    })

import pandas as pd
summary_df = pd.DataFrame(factor_data)
print(summary_df.to_string(index=False))

print(f"\nTotal collision records processed: {len(crashes_sample):,}")
print(f"All contributing factors now have consistent, standardized values!")

After standardization - Contributing Factor Analysis

CONTRIBUTING FACTOR VEHICLE 1:
  Unique values: 18
  Top 10 values:
    Unspecified: 656,323 (33.27%)
    Driver Distraction: 425,604 (21.58%)
    Driver Skill Issues: 335,814 (17.02%)
    Unsafe Driving Behavior: 270,278 (13.70%)
    Driver Impairment: 107,607 (5.45%)
    Other Vehicular: 61,742 (3.13%)
    Environmental Conditions: 44,767 (2.27%)
    Reaction to Other Vehicle: 20,145 (1.02%)
    Vehicle Defects: 13,610 (0.69%)
    Oversized Vehicle: 11,663 (0.59%)

CONTRIBUTING FACTOR VEHICLE 2:
  Unique values: 18
  Top 10 values:
    Unspecified: 1,390,170 (70.47%)
    Driver Distraction: 93,429 (4.74%)
    Driver Skill Issues: 50,459 (2.56%)
    Unsafe Driving Behavior: 46,886 (2.38%)
    Other Vehicular: 30,331 (1.54%)
    Driver Impairment: 19,800 (1.00%)
    Environmental Conditions: 8,989 (0.46%)
    Reaction to Other Vehicle: 3,411 (0.17%)
    Pedestrian/Bicyclist Error: 2,685 (0.14%)
    Oversized Vehicle: 2,106 (0.11%)



In [20]:
# Convert CRASH DATE(datetime[ns64]) and CRASH TIME(str) to datetime
crashes_sample['CRASH DATETIME'] = pd.to_datetime(
    crashes_sample['CRASH DATE'].astype(str) + ' ' + crashes_sample['CRASH TIME'].astype(str),
    errors='coerce'
)
# Drop original columns
crashes_sample = crashes_sample.drop(columns=['CRASH DATE', 'CRASH TIME'])

print("\n‚úì Created 'CRASH DATETIME' column by combining 'CRASH DATE' and 'CRASH TIME'")
print(f"   Sample values:\n{crashes_sample['CRASH DATETIME'].head(3).to_string(index=False)}")


‚úì Created 'CRASH DATETIME' column by combining 'CRASH DATE' and 'CRASH TIME'
   Sample values:
2023-11-01 01:29:00
2021-09-11 09:35:00
2021-12-14 08:13:00


In [21]:
# Fill missing street information strategically
# For rows with OFF STREET only, fill ON STREET with OFF STREET value
off_only_mask = (crashes_sample['ON STREET NAME'].isna() & 
                 crashes_sample['CROSS STREET NAME'].isna() & 
                 crashes_sample['OFF STREET NAME'].notna())
crashes_sample.loc[off_only_mask, 'ON STREET NAME'] = crashes_sample.loc[off_only_mask, 'OFF STREET NAME']

# Fill remaining missing values
crashes_sample['ON STREET NAME'] = crashes_sample['ON STREET NAME'].fillna('Unknown Street')
crashes_sample['CROSS STREET NAME'] = crashes_sample['CROSS STREET NAME'].fillna('No Cross Street')
crashes_sample['OFF STREET NAME'] = crashes_sample['OFF STREET NAME'].fillna('No Off Street')

print(f"‚úì Filled missing street information strategically")
print(f"   ‚Ä¢ Moved OFF STREET to ON STREET where appropriate")
print(f"   ‚Ä¢ Filled remaining gaps with descriptive defaults")

‚úì Filled missing street information strategically
   ‚Ä¢ Moved OFF STREET to ON STREET where appropriate
   ‚Ä¢ Filled remaining gaps with descriptive defaults


In [22]:
# Fill missing vehicle type codes with 'Unknown'
vehicle_type_columns = ['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2', 
                       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 
                       'VEHICLE TYPE CODE 5']

for col in vehicle_type_columns:
    crashes_sample[col] = crashes_sample[col].fillna('Unknown')

print(f"‚úì Filled missing values in all vehicle type code columns with 'Unknown'")

‚úì Filled missing values in all vehicle type code columns with 'Unknown'


In [23]:
# Fill Number of Persons Injured and Number of Persons Killed missing values 
# with sum of PEDESTRIANS, CYCLISTS, MOTORISTS injured/killed
injured_columns = ['NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF CYCLIST INJURED', 'NUMBER OF MOTORIST INJURED']
killed_columns = ['NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST KILLED']
crashes_sample['NUMBER OF PERSONS INJURED'] = crashes_sample['NUMBER OF PERSONS INJURED'].fillna(
    crashes_sample[injured_columns].sum(axis=1)
)
crashes_sample['NUMBER OF PERSONS KILLED'] = crashes_sample['NUMBER OF PERSONS KILLED'].fillna(
    crashes_sample[killed_columns].sum(axis=1)
)
print(f"‚úì Filled missing values in 'NUMBER OF PERSONS INJURED' and 'NUMBER OF PERSONS KILLED' using component sums")

‚úì Filled missing values in 'NUMBER OF PERSONS INJURED' and 'NUMBER OF PERSONS KILLED' using component sums


In [24]:
# Clear any previous output and create missing stats analysis
missing_stats = pd.DataFrame({
    'Column': crashes_sample.columns,
    'Missing_Count': crashes_sample.isnull().sum(),
    'Missing_Percentage': (crashes_sample.isnull().sum() / len(crashes_sample) * 100).round(2),
    'Data_Type': crashes_sample.dtypes
})

# Sort by missing count (descending)
missing_stats = missing_stats.sort_values('Missing_Count', ascending=False)

# Missing Values Analysis for Crashes Dataset
print("üîç MISSING VALUES ANALYSIS - CRASHES DATASET")
print("="*50)
print("\nAll columns with their missing value statistics:")
print(missing_stats.to_string(index=False))

üîç MISSING VALUES ANALYSIS - CRASHES DATASET

All columns with their missing value statistics:
                       Column  Missing_Count  Missing_Percentage      Data_Type
CONTRIBUTING FACTOR VEHICLE 5        1963555               99.54         object
CONTRIBUTING FACTOR VEHICLE 4        1939895               98.34         object
CONTRIBUTING FACTOR VEHICLE 3        1830583               92.80         object
CONTRIBUTING FACTOR VEHICLE 2         322665               16.36         object
CONTRIBUTING FACTOR VEHICLE 1           7163                0.36         object
                     LOCATION              0                0.00         object
            CROSS STREET NAME              0                0.00         object
               ON STREET NAME              0                0.00         object
                      BOROUGH              0                0.00         object
                     ZIP CODE              0                0.00         object
                     LA

In [25]:
cleaned_file_path = 'data_cleaned/nyc_vehicle_crashes_cleaned.csv'
crashes_sample.to_csv(cleaned_file_path, index=False)
print(f"\n CLEANED DATA SAVED: {cleaned_file_path}")


 CLEANED DATA SAVED: data_cleaned/nyc_vehicle_crashes_cleaned.csv
