In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

print("Data Integration: - Crashes & Person Datasets")

Data Integration: - Crashes & Person Datasets


In [2]:
# Load cleaned datasets
print("\n LOADING CLEANED DATASETS")
print("-" * 80)

# Load cleaned crashes data
#crashes_cleaned = pd.read_csv('data/crashes_cleaned.csv', low_memory=False)
crashes_cleaned = pd.read_csv('data_cleaned/nyc_vehicle_crashes_cleaned.csv', low_memory=False)
print(f"‚úì Crashes (cleaned): {crashes_cleaned.shape}")
print(f"  Columns: {crashes_cleaned.columns.tolist()}")

# Load cleaned person data
#person_cleaned = pd.read_csv('data/person_cleaned.csv', low_memory=False)
person_cleaned = pd.read_csv('data_cleaned/person_cleaned.csv', low_memory=False)
print(f"\n‚úì Person (cleaned): {person_cleaned.shape}")
print(f"  Columns: {person_cleaned.columns.tolist()}")

# Display basic info
print(f"\n DATASET OVERVIEW:")
print(f"  Crashes: {len(crashes_cleaned):,} collision records")
print(f"  Person: {len(person_cleaned):,} person records (occupants, pedestrians, cyclists)")


 LOADING CLEANED DATASETS
--------------------------------------------------------------------------------
‚úì Crashes (cleaned): (1972666, 28)
  Columns: ['BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME', 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5', 'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5', 'CRASH DATETIME']
‚úì Crashes (cleaned): (1972666, 28)
  Columns: ['BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME', 'NUMBE

In [3]:
# 1- Pre-integration analysis: Check COLLISION_ID coverage
print("\n PRE-INTEGRATION ANALYSIS: COLLISION_ID COVERAGE")
print("=" * 80)

# Check for missing COLLISION_ID in both datasets
crashes_missing_id = crashes_cleaned['COLLISION_ID'].isna().sum()
person_missing_id = person_cleaned['COLLISION_ID'].isna().sum()

print(f"\n1. Missing COLLISION_ID:")
print(f"   ‚Ä¢ Crashes dataset: {crashes_missing_id:,} ({crashes_missing_id / len(crashes_cleaned) * 100:.2f}%)")
print(f"   ‚Ä¢ Person dataset: {person_missing_id:,} ({person_missing_id / len(person_cleaned) * 100:.2f}%)")

# Check unique COLLISION_IDs
crashes_unique_ids = crashes_cleaned['COLLISION_ID'].nunique()
person_unique_ids = person_cleaned['COLLISION_ID'].nunique()

print(f"\n2. Unique COLLISION_IDs:")
print(f"   ‚Ä¢ Crashes dataset: {crashes_unique_ids:,}")
print(f"   ‚Ä¢ Person dataset: {person_unique_ids:,}")

# Find common and unique IDs
crashes_ids = set(crashes_cleaned['COLLISION_ID'].dropna())
person_ids = set(person_cleaned['COLLISION_ID'].dropna())

common_ids = crashes_ids.intersection(person_ids)
crashes_only = crashes_ids - person_ids
person_only = person_ids - crashes_ids

print(f"\n3. COLLISION_ID Overlap:")
print(f"   ‚Ä¢ Common IDs (in both datasets): {len(common_ids):,}")
print(f"   ‚Ä¢ IDs only in Crashes: {len(crashes_only):,}")
print(f"   ‚Ä¢ IDs only in Person: {len(person_only):,}")
print(f"   ‚Ä¢ Match rate: {len(common_ids) / len(crashes_ids) * 100:.2f}%")

# Analyze person records per collision
print(f"\n4. Person Records per Collision:")
persons_per_collision = person_cleaned.groupby('COLLISION_ID').size()
print(f"   ‚Ä¢ Average: {persons_per_collision.mean():.2f} persons/collision")
print(f"   ‚Ä¢ Median: {persons_per_collision.median():.0f} persons/collision")
print(f"   ‚Ä¢ Max: {persons_per_collision.max():.0f} persons/collision")
print(f"   ‚Ä¢ Distribution:")
distribution = persons_per_collision.value_counts().sort_index().head(10)
for count, freq in distribution.items():
    print(f"     - {count} person(s): {freq:,} collisions ({freq / len(persons_per_collision) * 100:.1f}%)")


 PRE-INTEGRATION ANALYSIS: COLLISION_ID COVERAGE

1. Missing COLLISION_ID:
   ‚Ä¢ Crashes dataset: 0 (0.00%)
   ‚Ä¢ Person dataset: 0 (0.00%)

1. Missing COLLISION_ID:
   ‚Ä¢ Crashes dataset: 0 (0.00%)
   ‚Ä¢ Person dataset: 0 (0.00%)

2. Unique COLLISION_IDs:
   ‚Ä¢ Crashes dataset: 1,972,666
   ‚Ä¢ Person dataset: 1,589,868

2. Unique COLLISION_IDs:
   ‚Ä¢ Crashes dataset: 1,972,666
   ‚Ä¢ Person dataset: 1,589,868

3. COLLISION_ID Overlap:
   ‚Ä¢ Common IDs (in both datasets): 1,443,540
   ‚Ä¢ IDs only in Crashes: 529,126
   ‚Ä¢ IDs only in Person: 146,328
   ‚Ä¢ Match rate: 73.18%

4. Person Records per Collision:

3. COLLISION_ID Overlap:
   ‚Ä¢ Common IDs (in both datasets): 1,443,540
   ‚Ä¢ IDs only in Crashes: 529,126
   ‚Ä¢ IDs only in Person: 146,328
   ‚Ä¢ Match rate: 73.18%

4. Person Records per Collision:
   ‚Ä¢ Average: 3.66 persons/collision
   ‚Ä¢ Median: 4 persons/collision
   ‚Ä¢ Max: 77 persons/collision
   ‚Ä¢ Distribution:
     - 1 person(s): 260,356 collisions (

In [4]:
# 3- Perform the integration
print("\n PERFORMING DATA INTEGRATION")
print("=" * 80)

# Check if COLLISION_ID exists and has correct data type
print("Pre-merge data type check:")
print(f"  Crashes COLLISION_ID dtype: {crashes_cleaned['COLLISION_ID'].dtype}")
print(f"  Person COLLISION_ID dtype: {person_cleaned['COLLISION_ID'].dtype}")

# Ensure COLLISION_ID is same type (convert to int64 where possible)
crashes_cleaned['COLLISION_ID'] = pd.to_numeric(crashes_cleaned['COLLISION_ID'], errors='coerce').astype('Int64')
person_cleaned['COLLISION_ID'] = pd.to_numeric(person_cleaned['COLLISION_ID'], errors='coerce').astype('Int64')

# Perform INNER JOIN
print("\nExecuting INNER JOIN...")
start_time = pd.Timestamp.now()

integrated_data = crashes_cleaned.merge(
    person_cleaned,
    on='COLLISION_ID',
    how='inner',
    suffixes=('_CRASH', '_PERSON'),
    indicator=True
)

end_time = pd.Timestamp.now()
merge_duration = (end_time - start_time).total_seconds()

print(f"‚úì Join completed in {merge_duration:.2f} seconds")
print(f"\n INTEGRATION RESULTS:")
print(f"   ‚Ä¢ Original crashes: {len(crashes_cleaned):,} rows")
print(f"   ‚Ä¢ Original persons: {len(person_cleaned):,} rows")
print(f"   ‚Ä¢ Integrated dataset: {len(integrated_data):,} rows")
print(f"   ‚Ä¢ Expansion factor: {len(integrated_data) / len(crashes_cleaned):.2f}x")

# Analyze merge indicator
merge_stats = integrated_data['_merge'].value_counts()
print(f"\n MERGE STATISTICS:")
for merge_type, count in merge_stats.items():
    percentage = count / len(integrated_data) * 100
    if merge_type == 'both':
        print(f"   ‚Ä¢ Matched (crash + person data): {count:,} ({percentage:.1f}%)")

# Drop the merge indicator column
integrated_data = integrated_data.drop('_merge', axis=1)


 PERFORMING DATA INTEGRATION
Pre-merge data type check:
  Crashes COLLISION_ID dtype: int64
  Person COLLISION_ID dtype: int64

Executing INNER JOIN...
‚úì Join completed in 31.63 seconds

 INTEGRATION RESULTS:
   ‚Ä¢ Original crashes: 1,972,666 rows
   ‚Ä¢ Original persons: 5,817,930 rows
   ‚Ä¢ Integrated dataset: 5,258,664 rows
   ‚Ä¢ Expansion factor: 2.67x

 MERGE STATISTICS:
   ‚Ä¢ Matched (crash + person data): 5,258,664 (100.0%)
‚úì Join completed in 31.63 seconds

 INTEGRATION RESULTS:
   ‚Ä¢ Original crashes: 1,972,666 rows
   ‚Ä¢ Original persons: 5,817,930 rows
   ‚Ä¢ Integrated dataset: 5,258,664 rows
   ‚Ä¢ Expansion factor: 2.67x

 MERGE STATISTICS:
   ‚Ä¢ Matched (crash + person data): 5,258,664 (100.0%)


In [5]:
integrated_data.drop(columns=['CRASH_DATETIME'],inplace=True)

In [6]:
integrated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5258664 entries, 0 to 5258663
Data columns (total 43 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   BOROUGH                        object 
 1   ZIP CODE                       object 
 2   LATITUDE                       float64
 3   LONGITUDE                      float64
 4   LOCATION                       object 
 5   ON STREET NAME                 object 
 6   CROSS STREET NAME              object 
 7   OFF STREET NAME                object 
 8   NUMBER OF PERSONS INJURED      float64
 9   NUMBER OF PERSONS KILLED       float64
 10  NUMBER OF PEDESTRIANS INJURED  int64  
 11  NUMBER OF PEDESTRIANS KILLED   int64  
 12  NUMBER OF CYCLIST INJURED      int64  
 13  NUMBER OF CYCLIST KILLED       int64  
 14  NUMBER OF MOTORIST INJURED     int64  
 15  NUMBER OF MOTORIST KILLED      int64  
 16  CONTRIBUTING FACTOR VEHICLE 1  object 
 17  CONTRIBUTING FACTOR VEHICLE 2  object 
 18  CO

In [7]:
# 4- Post-integration data quality assessment
print("\n POST-INTEGRATION DATA QUALITY ASSESSMENT")
print("=" * 80)

# Check for duplicate columns (from suffixes)
duplicate_cols = [col for col in integrated_data.columns if '_CRASH' in col or '_PERSON' in col]
if len(duplicate_cols) > 0:
    print(f"\n DUPLICATE COLUMNS DETECTED (from merge suffixes):")
    for col in sorted(duplicate_cols):
        print(f"   ‚Ä¢ {col}")
    print(f"\n   Action required: Resolve {len(duplicate_cols)} duplicate columns")
else:
    print("\n‚úì No duplicate columns detected")

# Check for new missing values
print(f"\n MISSING VALUES IN PERSON COLUMNS:")
person_cols = [col for col in integrated_data.columns if col in person_cleaned.columns and col != 'COLLISION_ID']
missing_summary = []
for col in person_cols[:10]:  # Show first 10 person columns
    missing_count = integrated_data[col].isna().sum()
    missing_pct = missing_count / len(integrated_data) * 100
    missing_summary.append({
        'Column': col,
        'Missing': missing_count,
        'Percentage': f"{missing_pct:.1f}%"
    })

missing_df = pd.DataFrame(missing_summary)
print(missing_df.to_string(index=False))

# Data type consistency check
print(f"\n DATA TYPE CONSISTENCY:")
print(f"   ‚Ä¢ Total columns: {len(integrated_data.columns)}")
print(f"   ‚Ä¢ Numeric columns: {len(integrated_data.select_dtypes(include=[np.number]).columns)}")
print(f"   ‚Ä¢ Object/String columns: {len(integrated_data.select_dtypes(include=['object']).columns)}")
print(f"   ‚Ä¢ Datetime columns: {len(integrated_data.select_dtypes(include=['datetime64']).columns)}")


 POST-INTEGRATION DATA QUALITY ASSESSMENT

‚úì No duplicate columns detected

 MISSING VALUES IN PERSON COLUMNS:
               Column  Missing Percentage
          PERSON_TYPE        0       0.0%
        PERSON_INJURY        0       0.0%
         PED_LOCATION  5162328      98.2%
           PED_ACTION  5162424      98.2%
CONTRIBUTING_FACTOR_1  5162844      98.2%
CONTRIBUTING_FACTOR_2  5162971      98.2%
           PERSON_SEX        0       0.0%
             PED_ROLE        0       0.0%
           PERSON_AGE        0       0.0%
             EJECTION        0       0.0%

 DATA TYPE CONSISTENCY:
   ‚Ä¢ Total columns: 43
               Column  Missing Percentage
          PERSON_TYPE        0       0.0%
        PERSON_INJURY        0       0.0%
         PED_LOCATION  5162328      98.2%
           PED_ACTION  5162424      98.2%
CONTRIBUTING_FACTOR_1  5162844      98.2%
CONTRIBUTING_FACTOR_2  5162971      98.2%
           PERSON_SEX        0       0.0%
             PED_ROLE        0       0

In [9]:
integrated_data.isnull().sum()

BOROUGH                                0
ZIP CODE                               0
LATITUDE                               0
LONGITUDE                              0
LOCATION                               0
ON STREET NAME                         0
CROSS STREET NAME                      0
OFF STREET NAME                        0
NUMBER OF PERSONS INJURED              0
NUMBER OF PERSONS KILLED               0
NUMBER OF PEDESTRIANS INJURED          0
NUMBER OF PEDESTRIANS KILLED           0
NUMBER OF CYCLIST INJURED              0
NUMBER OF CYCLIST KILLED               0
NUMBER OF MOTORIST INJURED             0
NUMBER OF MOTORIST KILLED              0
CONTRIBUTING FACTOR VEHICLE 1       7174
CONTRIBUTING FACTOR VEHICLE 2     602468
CONTRIBUTING FACTOR VEHICLE 3    4625055
CONTRIBUTING FACTOR VEHICLE 4    5075155
CONTRIBUTING FACTOR VEHICLE 5    5197031
COLLISION_ID                           0
VEHICLE TYPE CODE 1                    0
VEHICLE TYPE CODE 2                    0
VEHICLE TYPE COD

In [10]:
integrated_data.nunique()

BOROUGH                                6
ZIP CODE                             235
LATITUDE                          110634
LONGITUDE                          81363
LOCATION                          361100
ON STREET NAME                    246894
CROSS STREET NAME                  16594
OFF STREET NAME                   231687
NUMBER OF PERSONS INJURED             30
NUMBER OF PERSONS KILLED               7
NUMBER OF PEDESTRIANS INJURED         14
NUMBER OF PEDESTRIANS KILLED           6
NUMBER OF CYCLIST INJURED              5
NUMBER OF CYCLIST KILLED               3
NUMBER OF MOTORIST INJURED            29
NUMBER OF MOTORIST KILLED              6
CONTRIBUTING FACTOR VEHICLE 1         18
CONTRIBUTING FACTOR VEHICLE 2         18
CONTRIBUTING FACTOR VEHICLE 3         17
CONTRIBUTING FACTOR VEHICLE 4         13
CONTRIBUTING FACTOR VEHICLE 5         12
COLLISION_ID                     1443540
VEHICLE TYPE CODE 1                 1709
VEHICLE TYPE CODE 2                 1910
VEHICLE TYPE COD

In [11]:
integrated_data[['CONTRIBUTING_FACTOR_1','CONTRIBUTING_FACTOR_2']].value_counts(dropna=False)

CONTRIBUTING_FACTOR_1                                  CONTRIBUTING_FACTOR_2                                
NaN                                                    NaN                                                      5162841
Unspecified                                            Unspecified                                                66400
Pedestrian/Bicyclist/Other Pedestrian Error/Confusion  Unspecified                                                10161
                                                       Pedestrian/Bicyclist/Other Pedestrian Error/Confusion       2398
Driver Inattention/Distraction                         Unspecified                                                 1884
                                                                                                                 ...   
View Obstructed/Limited                                Alcohol Involvement                                            1
                                                   

In [12]:
# 1. Combine both columns into one long Series
all_factors = pd.concat([
    integrated_data['CONTRIBUTING_FACTOR_1'],
    integrated_data['CONTRIBUTING_FACTOR_2']
])

# 2. Clean, Drop NaNs, and get Unique values
unique_factors = (
    all_factors
    .replace({'': np.nan, 'Unspecified': np.nan}) # Standardize Nulls
    .str.strip()                                   # Remove whitespace
    .replace({'Illnes': 'Illness'})                # Fix typo
    .dropna()                                      # Remove NaNs
    .unique()                                      # Get only unique values
)

# Check the result
print(f"Found {len(unique_factors)} unique factors.")
print(unique_factors)

Found 54 unique factors.
['Alcohol Involvement'
 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion'
 'Driver Inattention/Distraction' 'Failure to Yield Right-of-Way'
 'View Obstructed/Limited' 'Other Vehicular' 'Obstruction/Debris'
 'Traffic Control Disregarded' 'Turning Improperly'
 'Outside Car Distraction' 'Driver Inexperience'
 'Passing or Lane Usage Improper' 'Backing Unsafely' 'Drugs (illegal)'
 'Animals Action' 'Physical Disability' 'Pavement Slippery'
 'Following Too Closely' 'Unsafe Speed' 'Cell Phone (hand-Held)'
 'Passenger Distraction' 'Reaction to Uninvolved Vehicle'
 'Fatigued/Drowsy' 'Driverless/Runaway Vehicle' 'Illness'
 'Passing Too Closely' 'Eating or Drinking' 'Texting'
 'Failure to Keep Right' 'Listening/Using Headphones'
 'Unsafe Lane Changing' 'Aggressive Driving/Road Rage' 'Brakes Defective'
 'Lane Marking Improper/Inadequate' 'Fell Asleep' 'Lost Consciousness'
 'Traffic Control Device Improper/Non-Working' 'Cell Phone (hands-free)'
 'Glare' 'Oversized Vehi

In [13]:
# Define the mapping dictionary
factor_groups = {
    # 1. Distraction & Focus (Internal/External)
    'Driver Inattention/Distraction': 'Distraction',
    'Outside Car Distraction': 'Distraction',
    'Passenger Distraction': 'Distraction',
    'Cell Phone (hand-Held)': 'Distraction',
    'Cell Phone (hands-free)': 'Distraction',
    'Texting': 'Distraction',
    'Other Electronic Device': 'Distraction',
    'Using On Board Navigation Device': 'Distraction',
    'Eating or Drinking': 'Distraction',
    'Listening/Using Headphones': 'Distraction',

    # 2. Driver Violations & Mistakes
    'Failure to Yield Right-of-Way': 'Driver Violation',
    'Traffic Control Disregarded': 'Driver Violation',
    'Unsafe Speed': 'Driver Violation',
    'Following Too Closely': 'Driver Violation',
    'Passing or Lane Usage Improper': 'Driver Violation',
    'Passing Too Closely': 'Driver Violation',
    'Backing Unsafely': 'Driver Violation',
    'Turning Improperly': 'Driver Violation',
    'Failure to Keep Right': 'Driver Violation',
    'Unsafe Lane Changing': 'Driver Violation',
    'Aggressive Driving/Road Rage': 'Driver Violation',
    'Driver Inexperience': 'Driver Violation',

    # 3. Impairment & Health
    'Alcohol Involvement': 'Impairment/Health',
    'Drugs (illegal)': 'Impairment/Health',
    'Prescription Medication': 'Impairment/Health',
    'Illness': 'Impairment/Health',
    'Fatigued/Drowsy': 'Impairment/Health',
    'Fell Asleep': 'Impairment/Health',
    'Lost Consciousness': 'Impairment/Health',
    'Physical Disability': 'Impairment/Health',

    # 4. Environment & Road Conditions
    'View Obstructed/Limited': 'Environment',
    'Obstruction/Debris': 'Environment',
    'Pavement Slippery': 'Environment',
    'Pavement Defective': 'Environment',
    'Glare': 'Environment',
    'Lane Marking Improper/Inadequate': 'Environment',
    'Traffic Control Device Improper/Non-Working': 'Environment',
    'Shoulders Defective/Improper': 'Environment',
    'Other Lighting Defects': 'Environment',
    'Animals Action': 'Environment',

    # 5. Vehicle Defects
    'Brakes Defective': 'Vehicle Defect',
    'Steering Failure': 'Vehicle Defect',
    'Tire Failure/Inadequate': 'Vehicle Defect',
    'Accelerator Defective': 'Vehicle Defect',
    'Tow Hitch Defective': 'Vehicle Defect',
    'Headlights Defective': 'Vehicle Defect',
    'Windshield Inadequate': 'Vehicle Defect',
    'Tinted Windows': 'Vehicle Defect',
    'Oversized Vehicle': 'Vehicle Defect',

    # 6. Other / External Context
    'Other Vehicular': 'Other',
    'Reaction to Uninvolved Vehicle': 'Other',
    'Driverless/Runaway Vehicle': 'Other',
    'Vehicle Vandalism': 'Other',
    'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion': 'Pedestrian/Cyclist Error'
}

In [14]:
# 1. Create a temporary dataframe with the mapped values
mapped_factors = pd.DataFrame({
    'f1': integrated_data['CONTRIBUTING_FACTOR_1'].map(factor_groups),
    'f2': integrated_data['CONTRIBUTING_FACTOR_2'].map(factor_groups)
})

# 2. Define a function to join them intelligently
def distinct_join(row):
    # Collect values that are not None/NaN and not empty strings
    items = [x for x in [row['f1'], row['f2']] if pd.notna(x) and x != ""]

    # Remove duplicates while preserving order (e.g., "Distraction, Distraction" -> "Distraction")
    unique_items = list(dict.fromkeys(items))

    return "& ".join(unique_items)

# 3. Apply the function
integrated_data['CONTRIBUTING_FACTOR'] = mapped_factors.apply(distinct_join, axis=1)

In [15]:
integrated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5258664 entries, 0 to 5258663
Data columns (total 44 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   BOROUGH                        object 
 1   ZIP CODE                       object 
 2   LATITUDE                       float64
 3   LONGITUDE                      float64
 4   LOCATION                       object 
 5   ON STREET NAME                 object 
 6   CROSS STREET NAME              object 
 7   OFF STREET NAME                object 
 8   NUMBER OF PERSONS INJURED      float64
 9   NUMBER OF PERSONS KILLED       float64
 10  NUMBER OF PEDESTRIANS INJURED  int64  
 11  NUMBER OF PEDESTRIANS KILLED   int64  
 12  NUMBER OF CYCLIST INJURED      int64  
 13  NUMBER OF CYCLIST KILLED       int64  
 14  NUMBER OF MOTORIST INJURED     int64  
 15  NUMBER OF MOTORIST KILLED      int64  
 16  CONTRIBUTING FACTOR VEHICLE 1  object 
 17  CONTRIBUTING FACTOR VEHICLE 2  object 
 18  CO

In [16]:
# Replaces exact empty strings with 'Unspecified'
integrated_data['CONTRIBUTING_FACTOR'] = integrated_data['CONTRIBUTING_FACTOR'].replace({'': 'Unspecified'})

In [17]:
integrated_data['CONTRIBUTING_FACTOR'].value_counts(dropna=False)

CONTRIBUTING_FACTOR
Unspecified                                    5229298
Pedestrian/Cyclist Error                         12761
Driver Violation                                  4135
Distraction                                       3343
Other                                             1294
Pedestrian/Cyclist Error& Driver Violation        1232
Impairment/Health                                 1158
Driver Violation& Pedestrian/Cyclist Error         845
Environment                                        775
Distraction& Driver Violation                      730
Driver Violation& Distraction                      465
Pedestrian/Cyclist Error& Distraction              398
Distraction& Pedestrian/Cyclist Error              351
Impairment/Health& Pedestrian/Cyclist Error        293
Pedestrian/Cyclist Error& Environment              267
Pedestrian/Cyclist Error& Impairment/Health        181
Impairment/Health& Driver Violation                175
Environment& Pedestrian/Cyclist Error        

In [18]:
integrated_data.drop(columns=['CONTRIBUTING_FACTOR_1','CONTRIBUTING_FACTOR_2'],inplace=True)

In [19]:
integrated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5258664 entries, 0 to 5258663
Data columns (total 42 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   BOROUGH                        object 
 1   ZIP CODE                       object 
 2   LATITUDE                       float64
 3   LONGITUDE                      float64
 4   LOCATION                       object 
 5   ON STREET NAME                 object 
 6   CROSS STREET NAME              object 
 7   OFF STREET NAME                object 
 8   NUMBER OF PERSONS INJURED      float64
 9   NUMBER OF PERSONS KILLED       float64
 10  NUMBER OF PEDESTRIANS INJURED  int64  
 11  NUMBER OF PEDESTRIANS KILLED   int64  
 12  NUMBER OF CYCLIST INJURED      int64  
 13  NUMBER OF CYCLIST KILLED       int64  
 14  NUMBER OF MOTORIST INJURED     int64  
 15  NUMBER OF MOTORIST KILLED      int64  
 16  CONTRIBUTING FACTOR VEHICLE 1  object 
 17  CONTRIBUTING FACTOR VEHICLE 2  object 
 18  CO

In [20]:
integrated_data.isnull().sum()

BOROUGH                                0
ZIP CODE                               0
LATITUDE                               0
LONGITUDE                              0
LOCATION                               0
ON STREET NAME                         0
CROSS STREET NAME                      0
OFF STREET NAME                        0
NUMBER OF PERSONS INJURED              0
NUMBER OF PERSONS KILLED               0
NUMBER OF PEDESTRIANS INJURED          0
NUMBER OF PEDESTRIANS KILLED           0
NUMBER OF CYCLIST INJURED              0
NUMBER OF CYCLIST KILLED               0
NUMBER OF MOTORIST INJURED             0
NUMBER OF MOTORIST KILLED              0
CONTRIBUTING FACTOR VEHICLE 1       7174
CONTRIBUTING FACTOR VEHICLE 2     602468
CONTRIBUTING FACTOR VEHICLE 3    4625055
CONTRIBUTING FACTOR VEHICLE 4    5075155
CONTRIBUTING FACTOR VEHICLE 5    5197031
COLLISION_ID                           0
VEHICLE TYPE CODE 1                    0
VEHICLE TYPE CODE 2                    0
VEHICLE TYPE COD

In [21]:
integrated_data.drop(columns=['PED_ACTION','PED_LOCATION'],inplace=True)

In [22]:
integrated_data.isnull().sum()

BOROUGH                                0
ZIP CODE                               0
LATITUDE                               0
LONGITUDE                              0
LOCATION                               0
ON STREET NAME                         0
CROSS STREET NAME                      0
OFF STREET NAME                        0
NUMBER OF PERSONS INJURED              0
NUMBER OF PERSONS KILLED               0
NUMBER OF PEDESTRIANS INJURED          0
NUMBER OF PEDESTRIANS KILLED           0
NUMBER OF CYCLIST INJURED              0
NUMBER OF CYCLIST KILLED               0
NUMBER OF MOTORIST INJURED             0
NUMBER OF MOTORIST KILLED              0
CONTRIBUTING FACTOR VEHICLE 1       7174
CONTRIBUTING FACTOR VEHICLE 2     602468
CONTRIBUTING FACTOR VEHICLE 3    4625055
CONTRIBUTING FACTOR VEHICLE 4    5075155
CONTRIBUTING FACTOR VEHICLE 5    5197031
COLLISION_ID                           0
VEHICLE TYPE CODE 1                    0
VEHICLE TYPE CODE 2                    0
VEHICLE TYPE COD

In [23]:
integrated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5258664 entries, 0 to 5258663
Data columns (total 40 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   BOROUGH                        object 
 1   ZIP CODE                       object 
 2   LATITUDE                       float64
 3   LONGITUDE                      float64
 4   LOCATION                       object 
 5   ON STREET NAME                 object 
 6   CROSS STREET NAME              object 
 7   OFF STREET NAME                object 
 8   NUMBER OF PERSONS INJURED      float64
 9   NUMBER OF PERSONS KILLED       float64
 10  NUMBER OF PEDESTRIANS INJURED  int64  
 11  NUMBER OF PEDESTRIANS KILLED   int64  
 12  NUMBER OF CYCLIST INJURED      int64  
 13  NUMBER OF CYCLIST KILLED       int64  
 14  NUMBER OF MOTORIST INJURED     int64  
 15  NUMBER OF MOTORIST KILLED      int64  
 16  CONTRIBUTING FACTOR VEHICLE 1  object 
 17  CONTRIBUTING FACTOR VEHICLE 2  object 
 18  CO

In [24]:
integrated_data['PERSON_INJURY'] = integrated_data['PERSON_INJURY'].replace('Unspecified', 'Safe')

In [25]:
integrated_data['PERSON_INJURY'].value_counts(dropna=False)

PERSON_INJURY
Safe       4604674
Injured     650930
Killed        3060
Name: count, dtype: int64

In [26]:
integrated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5258664 entries, 0 to 5258663
Data columns (total 40 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   BOROUGH                        object 
 1   ZIP CODE                       object 
 2   LATITUDE                       float64
 3   LONGITUDE                      float64
 4   LOCATION                       object 
 5   ON STREET NAME                 object 
 6   CROSS STREET NAME              object 
 7   OFF STREET NAME                object 
 8   NUMBER OF PERSONS INJURED      float64
 9   NUMBER OF PERSONS KILLED       float64
 10  NUMBER OF PEDESTRIANS INJURED  int64  
 11  NUMBER OF PEDESTRIANS KILLED   int64  
 12  NUMBER OF CYCLIST INJURED      int64  
 13  NUMBER OF CYCLIST KILLED       int64  
 14  NUMBER OF MOTORIST INJURED     int64  
 15  NUMBER OF MOTORIST KILLED      int64  
 16  CONTRIBUTING FACTOR VEHICLE 1  object 
 17  CONTRIBUTING FACTOR VEHICLE 2  object 
 18  CO

In [27]:
integrated_data.nunique()

BOROUGH                                6
ZIP CODE                             235
LATITUDE                          110634
LONGITUDE                          81363
LOCATION                          361100
ON STREET NAME                    246894
CROSS STREET NAME                  16594
OFF STREET NAME                   231687
NUMBER OF PERSONS INJURED             30
NUMBER OF PERSONS KILLED               7
NUMBER OF PEDESTRIANS INJURED         14
NUMBER OF PEDESTRIANS KILLED           6
NUMBER OF CYCLIST INJURED              5
NUMBER OF CYCLIST KILLED               3
NUMBER OF MOTORIST INJURED            29
NUMBER OF MOTORIST KILLED              6
CONTRIBUTING FACTOR VEHICLE 1         18
CONTRIBUTING FACTOR VEHICLE 2         18
CONTRIBUTING FACTOR VEHICLE 3         17
CONTRIBUTING FACTOR VEHICLE 4         13
CONTRIBUTING FACTOR VEHICLE 5         12
COLLISION_ID                     1443540
VEHICLE TYPE CODE 1                 1709
VEHICLE TYPE CODE 2                 1910
VEHICLE TYPE COD

## Data Integrity Analysis: Injury/Fatality Count Validation

In [28]:
# Analyze injury/fatality count consistency using COLLISION_ID grouping
print("üîç INJURY/FATALITY COUNT VALIDATION ANALYSIS")
print("=" * 80)

print(f"\n1. Grouping by COLLISION_ID for validation...")
print(f"Total records in integrated_data: {len(integrated_data):,}")

# Group by COLLISION_ID and compare person counts vs crash summary counts
validation_data = integrated_data.groupby('COLLISION_ID').agg({
    'PERSON_INJURY': [
        ('person_injured_count', lambda x: (x == 'Injured').sum()),
        ('person_killed_count', lambda x: (x == 'Killed').sum()),
        ('total_persons', 'count')
    ],
    'NUMBER OF PERSONS INJURED': [('crash_injured_count', 'first')],
    'NUMBER OF PERSONS KILLED': [('crash_killed_count', 'first')]
})

# Flatten column names
validation_data.columns = ['person_injured_count', 'person_killed_count', 'total_persons', 
                          'crash_injured_count', 'crash_killed_count']
validation_data = validation_data.reset_index()

print(f"\n2. Created validation dataset with {len(validation_data):,} unique collisions")

# Calculate discrepancies
validation_data['injured_match'] = validation_data['person_injured_count'] == validation_data['crash_injured_count']
validation_data['killed_match'] = validation_data['person_killed_count'] == validation_data['crash_killed_count']
validation_data['both_match'] = validation_data['injured_match'] & validation_data['killed_match']

# Calculate differences
validation_data['injured_diff'] = validation_data['person_injured_count'] - validation_data['crash_injured_count']
validation_data['killed_diff'] = validation_data['person_killed_count'] - validation_data['crash_killed_count']

# Display validation results and anomaly analysis
print("\nüìä VALIDATION RESULTS")
print("=" * 50)

# Overall match statistics
total_collisions = len(validation_data)
injured_matches = validation_data['injured_match'].sum()
killed_matches = validation_data['killed_match'].sum()
both_matches = validation_data['both_match'].sum()

print(f"\nüéØ MATCH STATISTICS:")
print(f"   ‚Ä¢ Total collisions analyzed: {total_collisions:,}")
print(f"   ‚Ä¢ Injured counts match: {injured_matches:,} ({injured_matches/total_collisions*100:.2f}%)")
print(f"   ‚Ä¢ Killed counts match: {killed_matches:,} ({killed_matches/total_collisions*100:.2f}%)")
print(f"   ‚Ä¢ Both counts match: {both_matches:,} ({both_matches/total_collisions*100:.2f}%)")

# Anomaly percentages
injured_anomalies = total_collisions - injured_matches
killed_anomalies = total_collisions - killed_matches
both_anomalies = total_collisions - both_matches

print(f"\nüö® ANOMALY STATISTICS:")
print(f"   ‚Ä¢ Injured count anomalies: {injured_anomalies:,} ({injured_anomalies/total_collisions*100:.2f}%)")
print(f"   ‚Ä¢ Killed count anomalies: {killed_anomalies:,} ({killed_anomalies/total_collisions*100:.2f}%)")
print(f"   ‚Ä¢ Any count anomaly: {both_anomalies:,} ({both_anomalies/total_collisions*100:.2f}%)")

# Analyze difference patterns
print(f"\nüìà DIFFERENCE PATTERNS:")
print(f"\nInjured Count Differences:")
injured_diff_stats = validation_data['injured_diff'].value_counts().sort_index()
print(f"   ‚Ä¢ Most common differences:")
for diff, count in injured_diff_stats.head(10).items():
    print(f"     - Difference {diff:+}: {count:,} collisions ({count/total_collisions*100:.2f}%)")

print(f"\nKilled Count Differences:")
killed_diff_stats = validation_data['killed_diff'].value_counts().sort_index()
print(f"   ‚Ä¢ Most common differences:")
for diff, count in killed_diff_stats.head(10).items():
    print(f"     - Difference {diff:+}: {count:,} collisions ({count/total_collisions*100:.2f}%)")


üîç INJURY/FATALITY COUNT VALIDATION ANALYSIS

1. Grouping by COLLISION_ID for validation...
Total records in integrated_data: 5,258,664

2. Created validation dataset with 1,443,540 unique collisions

üìä VALIDATION RESULTS

üéØ MATCH STATISTICS:
   ‚Ä¢ Total collisions analyzed: 1,443,540
   ‚Ä¢ Injured counts match: 1,443,023 (99.96%)
   ‚Ä¢ Killed counts match: 1,443,528 (100.00%)
   ‚Ä¢ Both counts match: 1,443,016 (99.96%)

üö® ANOMALY STATISTICS:
   ‚Ä¢ Injured count anomalies: 517 (0.04%)
   ‚Ä¢ Killed count anomalies: 12 (0.00%)
   ‚Ä¢ Any count anomaly: 524 (0.04%)

üìà DIFFERENCE PATTERNS:

Injured Count Differences:
   ‚Ä¢ Most common differences:
     - Difference -19.0: 1 collisions (0.00%)
     - Difference -6.0: 2 collisions (0.00%)
     - Difference -5.0: 1 collisions (0.00%)
     - Difference -4.0: 14 collisions (0.00%)
     - Difference -3.0: 34 collisions (0.00%)
     - Difference -2.0: 71 collisions (0.00%)
     - Difference -1.0: 251 collisions (0.02%)
     -

In [None]:
# Save compressed dataset in multiple formats
print("\nüíæ SAVING COMPRESSED DATASET")
print("=" * 50)
original_size = len(integrated_data)
# CSV with compression
csv_path = 'integrated_crashes_person_compressed.csv'
integrated_data.to_csv(csv_path, index=False, compression='gzip')

# Parquet format (most efficient for deployment)
parquet_path = 'integrated_crashes_person_compressed.parquet'












print(f"   ‚Ä¢ Memory: {integrated_data.memory_usage(deep=True).sum()/1024**2:.2f} MB")print(f"   ‚Ä¢ Shape: {integrated_data.shape}")print(f"   ‚Ä¢ Pickle: {pickle_path}")print(f"   ‚Ä¢ Parquet: {parquet_path}")  print(f"   ‚Ä¢ CSV (gzipped): {csv_path}")print(f"‚úÖ Dataset saved in 3 formats:")integrated_data.to_pickle(pickle_path, compression='gzip')pickle_path = 'integrated_crashes_person_compressed.pkl'# Pickle format (fastest loading)integrated_data.to_parquet(parquet_path, index=False, compression='snappy')print(f"Data quality: {len(integrated_data)/original_size*100:.2f}% retained")

üßπ REMOVING INCONSISTENT RECORDS
Removed: 1,180 records
Remaining: 5,257,484 records
Data quality: 99.98% retained
Removed: 1,180 records
Remaining: 5,257,484 records
Data quality: 99.98% retained


## üì¶ Dataset Compression for Deployment

In [None]:
# Compress dataset for deployment
print("üì¶ COMPRESSING DATASET FOR DEPLOYMENT")

original_memory = integrated_data.memory_usage(deep=True).sum() / 1024**2
print(f"Original memory usage: {original_memory:.2f} MB")

# 1. Downcast numeric columns
print("\n1. Downcasting numeric columns...")
for col in integrated_data.select_dtypes(include=['float64']).columns:
    integrated_data[col] = pd.to_numeric(integrated_data[col], downcast='float')

for col in integrated_data.select_dtypes(include=['int64']).columns:
    if col != 'COLLISION_ID':  # Keep COLLISION_ID as original type
        integrated_data[col] = pd.to_numeric(integrated_data[col], downcast='integer')

# 2. Convert object columns to categorical where beneficial
print("2. Converting to categorical...")
categorical_candidates = ['BOROUGH', 'PERSON_TYPE', 'PERSON_INJURY', 'PERSON_SEX', 
                         'CONTRIBUTING_FACTOR', 'EJECTION', 'EMOTIONAL_STATUS', 
                         'POSITION_IN_VEHICLE', 'BODILY_INJURY', 'SAFETY_EQUIPMENT']

for col in categorical_candidates:
    if col in integrated_data.columns:
        unique_ratio = integrated_data[col].nunique() / len(integrated_data)
        if unique_ratio < 0.5:  # Convert if less than 50% unique values
            integrated_data[col] = integrated_data[col].astype('category')
            print(f"   ‚Ä¢ {col}: {integrated_data[col].nunique()} categories")

# 3. Drop Collision ID
integrated_data = integrated_data.drop(columns=['COLLISION_ID'])
print("   ‚Ä¢ Dropped COLLISION_ID column")

# 4. Compress coordinates to float32
if 'LATITUDE' in integrated_data.columns:
    integrated_data['LATITUDE'] = integrated_data['LATITUDE'].astype('float32')
if 'LONGITUDE' in integrated_data.columns:
    integrated_data['LONGITUDE'] = integrated_data['LONGITUDE'].astype('float32')

compressed_memory = integrated_data.memory_usage(deep=True).sum() / 1024**2
compression_ratio = (1 - compressed_memory/original_memory) * 100

print(f"\n‚úÖ COMPRESSION COMPLETE")
print(f"   ‚Ä¢ Original: {original_memory:.2f} MB")
print(f"   ‚Ä¢ Compressed: {compressed_memory:.2f} MB") 
print(f"   ‚Ä¢ Reduction: {compression_ratio:.1f}%")
print(f"   ‚Ä¢ Final shape: {integrated_data.shape}")

üì¶ COMPRESSING DATASET FOR DEPLOYMENT
Original memory usage: 8753.43 MB

1. Downcasting numeric columns...
2. Converting to categorical...
   ‚Ä¢ BOROUGH: 6 categories
   ‚Ä¢ PERSON_TYPE: 4 categories
   ‚Ä¢ PERSON_INJURY: 3 categories
   ‚Ä¢ PERSON_SEX: 3 categories
   ‚Ä¢ CONTRIBUTING_FACTOR: 47 categories
   ‚Ä¢ EJECTION: 6 categories
   ‚Ä¢ EMOTIONAL_STATUS: 8 categories
   ‚Ä¢ POSITION_IN_VEHICLE: 11 categories
   ‚Ä¢ BODILY_INJURY: 14 categories
   ‚Ä¢ SAFETY_EQUIPMENT: 10 categories
3. Checking for columns to drop...
   ‚Ä¢ Dropping 3 high-null columns: ['CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5']

‚úÖ COMPRESSION COMPLETE
   ‚Ä¢ Original: 8753.43 MB
   ‚Ä¢ Compressed: 5008.44 MB
   ‚Ä¢ Reduction: 42.8%
   ‚Ä¢ Final shape: (5257484, 37)


In [None]:
# 7- Save integrated dataset
print("\n SAVING INTEGRATED DATASET")
print("=" * 80)

output_path = 'integrated_crashes_person.csv'
integrated_data.to_csv(output_path, index=False)

print(f"‚úì Integrated dataset saved to: {output_path}")
print(f"  ‚Ä¢ Rows: {len(integrated_data):,}")
print(f"  ‚Ä¢ Columns: {len(integrated_data.columns)}")


 SAVING INTEGRATED DATASET
‚úì Integrated dataset saved to: integrated_crashes_person.csv
  ‚Ä¢ Rows: 5,258,664
  ‚Ä¢ Columns: 43
‚úì Integrated dataset saved to: integrated_crashes_person.csv
  ‚Ä¢ Rows: 5,258,664
  ‚Ä¢ Columns: 43
