In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime

# Suppress every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so warnings raised by any later cell
# are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Banner line identifying the notebook in the output.
print("Data Integration: - Crashes & Person Datasets")

In [None]:
# Read the two cleaned CSV exports that the integration step joins.
print("\n LOADING CLEANED DATASETS")
print("-" * 80)

# Collision-level table. low_memory=False disables pandas' chunked parsing so
# each column gets a single inferred dtype.
# (An earlier layout read this file from 'data/crashes_cleaned.csv'.)
crashes_cleaned = pd.read_csv(
    'data_cleaned/nyc_vehicle_crashes_cleaned.csv',
    low_memory=False,
)
crash_column_names = crashes_cleaned.columns.tolist()
print(f"✓ Crashes (cleaned): {crashes_cleaned.shape}")
print(f"  Columns: {crash_column_names}")

# Person-level table, loaded the same way.
# (An earlier layout read this file from 'data/person_cleaned.csv'.)
person_cleaned = pd.read_csv(
    'data_cleaned/person_cleaned.csv',
    low_memory=False,
)
person_column_names = person_cleaned.columns.tolist()
print(f"\n✓ Person (cleaned): {person_cleaned.shape}")
print(f"  Columns: {person_column_names}")

# Row counts of both tables side by side.
n_crash_rows = len(crashes_cleaned)
n_person_rows = len(person_cleaned)
print("\n DATASET OVERVIEW:")
print(f"  Crashes: {n_crash_rows:,} collision records")
print(f"  Person: {n_person_rows:,} person records (occupants, pedestrians, cyclists)")

In [None]:
# Pre-integration analysis: how well do the COLLISION_ID keys of the two
# tables line up before they are joined?
print("\n PRE-INTEGRATION ANALYSIS: COLLISION_ID COVERAGE")
print("=" * 80)


def _pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0


# Rows whose join key is missing in each dataset
crashes_missing_id = crashes_cleaned['COLLISION_ID'].isna().sum()
person_missing_id = person_cleaned['COLLISION_ID'].isna().sum()

print("\n1. Missing COLLISION_ID:")
print(f"   • Crashes dataset: {crashes_missing_id:,} ({_pct(crashes_missing_id, len(crashes_cleaned)):.2f}%)")
print(f"   • Person dataset: {person_missing_id:,} ({_pct(person_missing_id, len(person_cleaned)):.2f}%)")

# Number of distinct keys (nunique ignores missing values)
crashes_unique_ids = crashes_cleaned['COLLISION_ID'].nunique()
person_unique_ids = person_cleaned['COLLISION_ID'].nunique()

print("\n2. Unique COLLISION_IDs:")
print(f"   • Crashes dataset: {crashes_unique_ids:,}")
print(f"   • Person dataset: {person_unique_ids:,}")

# Set algebra on the non-null keys: shared IDs and IDs exclusive to each side.
# NOTE(review): the sets hold the raw column values. The merge cell coerces
# both keys to Int64 afterwards, so if the two CSVs were parsed with different
# key dtypes (e.g. str vs int) these figures will not match the actual join.
crashes_ids = set(crashes_cleaned['COLLISION_ID'].dropna())
person_ids = set(person_cleaned['COLLISION_ID'].dropna())

common_ids = crashes_ids & person_ids
crashes_only = crashes_ids - person_ids
person_only = person_ids - crashes_ids

print("\n3. COLLISION_ID Overlap:")
print(f"   • Common IDs (in both datasets): {len(common_ids):,}")
print(f"   • IDs only in Crashes: {len(crashes_only):,}")
print(f"   • IDs only in Person: {len(person_only):,}")
# Guarded: a crashes table with no usable IDs must not abort the cell with
# ZeroDivisionError.
print(f"   • Match rate: {_pct(len(common_ids), len(crashes_ids)):.2f}%")

# How many person rows hang off each collision
print("\n4. Person Records per Collision:")
persons_per_collision = person_cleaned.groupby('COLLISION_ID').size()
print(f"   • Average: {persons_per_collision.mean():.2f} persons/collision")
print(f"   • Median: {persons_per_collision.median():.0f} persons/collision")
print(f"   • Max: {persons_per_collision.max():.0f} persons/collision")
print("   • Distribution:")
# Frequency of each group size, limited to the ten smallest sizes
distribution = persons_per_collision.value_counts().sort_index().head(10)
n_collisions_with_persons = len(persons_per_collision)
for count, freq in distribution.items():
    print(f"     - {count} person(s): {freq:,} collisions ({_pct(freq, n_collisions_with_persons):.1f}%)")

In [None]:
# Integration: inner-join crashes to persons on COLLISION_ID
print("\n PERFORMING DATA INTEGRATION")
print("=" * 80)

# Report the key dtypes as loaded, before any normalisation
print("Pre-merge data type check:")
print(f"  Crashes COLLISION_ID dtype: {crashes_cleaned['COLLISION_ID'].dtype}")
print(f"  Person COLLISION_ID dtype: {person_cleaned['COLLISION_ID'].dtype}")

# Normalise both keys to nullable Int64 so they compare equal in the join.
# errors='coerce' turns any non-numeric ID into a missing value.
crashes_cleaned['COLLISION_ID'] = pd.to_numeric(crashes_cleaned['COLLISION_ID'], errors='coerce').astype('Int64')
person_cleaned['COLLISION_ID'] = pd.to_numeric(person_cleaned['COLLISION_ID'], errors='coerce').astype('Int64')

# pandas matches null join keys against each other (unlike SQL), which would
# pair every key-less crash with every key-less person. Keep such rows out of
# the join; the source frames themselves are left untouched.
crash_key_missing = crashes_cleaned['COLLISION_ID'].isna()
person_key_missing = person_cleaned['COLLISION_ID'].isna()
crashes_keyed = crashes_cleaned[~crash_key_missing] if crash_key_missing.any() else crashes_cleaned
persons_keyed = person_cleaned[~person_key_missing] if person_key_missing.any() else person_cleaned

n_crashes_excluded = int(crash_key_missing.sum())
n_persons_excluded = int(person_key_missing.sum())
if n_crashes_excluded or n_persons_excluded:
    print(
        "  Excluded rows without a usable COLLISION_ID: "
        f"{n_crashes_excluded:,} crashes, {n_persons_excluded:,} persons"
    )

# Perform INNER JOIN (overlapping non-key columns receive the suffixes below)
print("\nExecuting INNER JOIN...")
start_time = pd.Timestamp.now()

integrated_data = crashes_keyed.merge(
    persons_keyed,
    on='COLLISION_ID',
    how='inner',
    suffixes=('_CRASH', '_PERSON'),
    indicator=True
)

end_time = pd.Timestamp.now()
merge_duration = (end_time - start_time).total_seconds()

n_crashes = len(crashes_cleaned)
n_integrated = len(integrated_data)
# Guarded so an empty crashes table reports 0.00x instead of raising.
expansion_factor = n_integrated / n_crashes if n_crashes else 0.0

print(f"✓ Join completed in {merge_duration:.2f} seconds")
print("\n INTEGRATION RESULTS:")
print(f"   • Original crashes: {n_crashes:,} rows")
print(f"   • Original persons: {len(person_cleaned):,} rows")
print(f"   • Integrated dataset: {n_integrated:,} rows")
print(f"   • Expansion factor: {expansion_factor:.2f}x")

# Analyze merge indicator. '_merge' is categorical, so value_counts also
# yields zero-count entries for 'left_only'/'right_only'; only 'both' is shown.
merge_stats = integrated_data['_merge'].value_counts()
print("\n MERGE STATISTICS:")
for merge_type, count in merge_stats.items():
    if merge_type != 'both':
        continue
    # Guarded: an empty join result must not divide by zero.
    percentage = count / n_integrated * 100 if n_integrated else 0.0
    print(f"   • Matched (crash + person data): {count:,} ({percentage:.1f}%)")

# Drop the merge indicator column
integrated_data = integrated_data.drop('_merge', axis=1)

In [None]:
# Remove the CRASH_DATETIME column from the merged table (raises KeyError if
# the column is not present).
integrated_data = integrated_data.drop(columns=['CRASH_DATETIME'])

In [None]:
# Print a structural summary of the merged frame (columns, non-null counts,
# dtypes, memory usage).
integrated_data.info()

In [None]:
# Post-integration data quality assessment
print("\n POST-INTEGRATION DATA QUALITY ASSESSMENT")
print("=" * 80)

# The merge appends '_CRASH' / '_PERSON' to the END of overlapping column
# names, so only a trailing match identifies them. A substring test would also
# flag unrelated names that merely contain the text
# (e.g. a column such as 'NUMBER_OF_PERSONS_INJURED').
crash_suffix, person_suffix = '_CRASH', '_PERSON'
duplicate_cols = [
    col for col in integrated_data.columns
    if str(col).endswith((crash_suffix, person_suffix))
]
if duplicate_cols:
    print("\n DUPLICATE COLUMNS DETECTED (from merge suffixes):")
    for col in sorted(duplicate_cols):
        print(f"   • {col}")
    print(f"\n   Action required: Resolve {len(duplicate_cols)} duplicate columns")
else:
    print("\n✓ No duplicate columns detected")

# Missing values in the columns that came from the person table. A person
# column keeps its name unless it clashed with a crash column, in which case
# it now carries the '_PERSON' suffix — both forms are recognised here.
print("\n MISSING VALUES IN PERSON COLUMNS:")
person_source_cols = set(person_cleaned.columns)


def _is_person_column(col):
    """Return True if an integrated column originated from the person table."""
    if col == 'COLLISION_ID':
        return False
    if col in person_source_cols:
        return True
    name = str(col)
    return name.endswith(person_suffix) and name[:-len(person_suffix)] in person_source_cols


person_cols = [col for col in integrated_data.columns if _is_person_column(col)]

n_integrated_rows = len(integrated_data)
missing_summary = []
for col in person_cols[:10]:  # Show first 10 person columns
    missing_count = integrated_data[col].isna().sum()
    # Guarded so an empty integrated table reports 0.0% rather than nan.
    missing_pct = missing_count / n_integrated_rows * 100 if n_integrated_rows else 0.0
    missing_summary.append({
        'Column': col,
        'Missing': missing_count,
        'Percentage': f"{missing_pct:.1f}%"
    })

missing_df = pd.DataFrame(missing_summary)
print(missing_df.to_string(index=False))

# Column counts by dtype family
numeric_cols = integrated_data.select_dtypes(include=[np.number]).columns
object_cols = integrated_data.select_dtypes(include=['object']).columns
datetime_cols = integrated_data.select_dtypes(include=['datetime64']).columns

print("\n DATA TYPE CONSISTENCY:")
print(f"   • Total columns: {len(integrated_data.columns)}")
print(f"   • Numeric columns: {len(numeric_cols)}")
print(f"   • Object/String columns: {len(object_cols)}")
print(f"   • Datetime columns: {len(datetime_cols)}")

In [None]:
# Persist the integrated table as a CSV file in the working directory.
print("\n SAVING INTEGRATED DATASET")
print("=" * 80)

output_path = 'integrated_crashes_person.csv'
# index=False leaves the DataFrame's row index out of the file.
integrated_data.to_csv(output_path, index=False)

# Report what was written: shape is (rows, columns).
saved_rows, saved_cols = integrated_data.shape
print(f"✓ Integrated dataset saved to: {output_path}")
print(f"  • Rows: {saved_rows:,}")
print(f"  • Columns: {saved_cols}")

In [None]:
# Count missing values per column of the integrated table; as the cell's last
# expression, the resulting Series is what the notebook displays.
integrated_data.isnull().sum()