In [1]:
# Installs the `geohash` package (unpinned) into the kernel environment.
# NOTE(review): no import of this package appears in the visible cells — confirm it is still needed.
pip install geohash

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Installs the `python-geohash` package (unpinned) into the kernel environment.
# NOTE(review): no import of this package appears in the visible cells — confirm it is still needed.
pip install python-geohash





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
# Installs the `h3` package (unpinned) into the kernel environment.
# NOTE(review): no import of this package appears in the visible cells — confirm it is still needed.
pip install h3

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Installs `s2sphere` (unpinned); a later cell imports CellId and LatLng from it.
pip install s2sphere

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from s2sphere import CellId, LatLng
# Build an s2sphere LatLng from degree coordinates and store the id of the
# corresponding CellId under the row's 'S2' key.
# NOTE(review): `latitude`, `longitude` and `row` are not defined in this cell
# or in any earlier visible cell, so this raises NameError as written —
# presumably it was meant to run inside a per-row loop; confirm.
latlng = LatLng.from_degrees(latitude, longitude)
row['S2'] = CellId.from_lat_lng(latlng).id()

In [None]:
import csv

def load_data(filename, encoding=None):
    """Read a CSV file into a list of row dictionaries.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one dict per data row, keyed by the header row, as
        produced by ``csv.DictReader``.
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded inside quoted fields are preserved exactly as written.
    with open(filename, 'r', newline='', encoding=encoding) as file:
        return list(csv.DictReader(file))

# Load the three raw datasets as lists of row dicts. The paths are relative,
# so the CSV files must be in the kernel's current working directory.
crashes_data = load_data('Crashes.csv')
people_data = load_data('People.csv')
vehicles_data = load_data('Vehicles.csv')

In [2]:
# Helper functions for mean, median, mode
def calculate_mean(data, column):
    """Return the arithmetic mean of a column, ignoring missing entries.

    Args:
        data: Iterable of row mappings (e.g. ``csv.DictReader`` rows).
        column: Key of the field to average.

    Returns:
        The mean of every non-blank value converted with ``float``, or ``0``
        when the column holds no usable values.

    Raises:
        ValueError: If a non-blank value cannot be parsed as a float.
    """
    # csv.DictReader stores None for fields missing from a short row, so None
    # is treated the same as a blank string instead of crashing on .strip().
    values = [
        float(row[column])
        for row in data
        if row[column] is not None and row[column].strip()
    ]
    return sum(values) / len(values) if values else 0

def calculate_median(data, column):
    """Return the median of a column, ignoring missing entries.

    Args:
        data: Iterable of row mappings (e.g. ``csv.DictReader`` rows).
        column: Key of the field to summarise.

    Returns:
        The middle value (odd count) or the average of the two middle values
        (even count) of all non-blank entries converted with ``float``;
        ``0`` when the column holds no usable values.

    Raises:
        ValueError: If a non-blank value cannot be parsed as a float.
    """
    # None (a field missing from a short CSV row) counts as blank.
    values = sorted(
        float(row[column])
        for row in data
        if row[column] is not None and row[column].strip()
    )
    n = len(values)
    if n == 0:
        return 0
    middle = n // 2
    if n % 2 == 1:
        return values[middle]
    return (values[middle - 1] + values[middle]) / 2

def calculate_mode(data, column):
    """Return the most frequent non-blank value of a column.

    Values are compared after stripping surrounding whitespace. When several
    values share the highest count, the one encountered first wins.

    Args:
        data: Iterable of row mappings (e.g. ``csv.DictReader`` rows).
        column: Key of the field to summarise.

    Returns:
        The most common stripped value, or ``'Unknown'`` when the column has
        no non-blank values.
    """
    counts = {}
    for row in data:
        raw = row[column]
        # None (a field missing from a short CSV row) counts as blank.
        if raw is None:
            continue
        value = raw.strip()
        if value:
            counts[value] = counts.get(value, 0) + 1
    return max(counts, key=counts.get) if counts else 'Unknown'

# Cleaning Crashes Data
def clean_crashes_data(data):
    """Fill missing values in the crashes rows, modifying them in place.

    LATITUDE and LONGITUDE are filled with the column mean, REPORT_TYPE and
    BEAT_OF_OCCURRENCE with the column mode, and LOCATION is rebuilt from the
    (possibly filled) coordinates as ``"<lat>, <lon>"``.

    Note: ``calculate_mean`` returns 0 for a column with no usable values, so
    in that case ``'0'`` is written as the coordinate.

    Args:
        data: List of row dicts holding string values; it is mutated.

    Returns:
        The same ``data`` list that was passed in.
    """
    # All statistics are computed before any row is modified.
    fill_values = {
        'LATITUDE': str(calculate_mean(data, 'LATITUDE')),
        'LONGITUDE': str(calculate_mean(data, 'LONGITUDE')),
        'REPORT_TYPE': calculate_mode(data, 'REPORT_TYPE'),
        'BEAT_OF_OCCURRENCE': calculate_mode(data, 'BEAT_OF_OCCURRENCE'),
    }

    for row in data:
        for column, fill_value in fill_values.items():
            # `or ''` makes a None field (short CSV row) count as blank.
            if not (row[column] or '').strip():
                row[column] = fill_value

        # Both coordinates are non-blank at this point, so LOCATION can
        # always be generated; the former 'Unknown' branch was unreachable.
        row['LOCATION'] = f"{row['LATITUDE']}, {row['LONGITUDE']}"

    return data

# Cleaning People Data
def clean_people_data(data):
    """Fill missing values in the people rows, modifying them in place.

    AGE is filled with the column mean (as a string); the listed categorical
    columns are filled with their column mode.

    Args:
        data: List of row dicts holding string values; it is mutated.

    Returns:
        The same ``data`` list that was passed in.
    """
    # All statistics are computed before any row is modified.
    fill_values = {
        'AGE': str(calculate_mean(data, 'AGE')),
        'DRIVER_ACTION': calculate_mode(data, 'DRIVER_ACTION'),
        'DRIVER_VISION': calculate_mode(data, 'DRIVER_VISION'),
        'PHYSICAL_CONDITION': calculate_mode(data, 'PHYSICAL_CONDITION'),
        'BAC_RESULT': calculate_mode(data, 'BAC_RESULT'),
        'CITY': calculate_mode(data, 'CITY'),
        'STATE': calculate_mode(data, 'STATE'),
        'SEX': calculate_mode(data, 'SEX'),
        'SAFETY_EQUIPMENT': calculate_mode(data, 'SAFETY_EQUIPMENT'),
        'AIRBAG_DEPLOYED': calculate_mode(data, 'AIRBAG_DEPLOYED'),
        'EJECTION': calculate_mode(data, 'EJECTION'),
        'INJURY_CLASSIFICATION': calculate_mode(data, 'INJURY_CLASSIFICATION'),
    }

    for row in data:
        for column, fill_value in fill_values.items():
            # `or ''` makes a None field (short CSV row) count as blank.
            if not (row[column] or '').strip():
                row[column] = fill_value

    return data

# Cleaning Vehicles Data
def clean_vehicles_data(data):
    """Fill missing values in the vehicles rows, modifying them in place.

    VEHICLE_YEAR is filled with the column median; every other listed column
    is filled with its column mode. Fill values are always written as
    strings so the rows stay uniformly string-valued.

    Args:
        data: List of row dicts holding string values; it is mutated.

    Returns:
        The same ``data`` list that was passed in.
    """
    # All statistics are computed before any row is modified.
    fill_values = {
        'UNIT_TYPE': calculate_mode(data, 'UNIT_TYPE'),
        'MAKE': calculate_mode(data, 'MAKE'),
        'MODEL': calculate_mode(data, 'MODEL'),
        'VEHICLE_TYPE': calculate_mode(data, 'VEHICLE_TYPE'),
        'VEHICLE_USE': calculate_mode(data, 'VEHICLE_USE'),
        'LIC_PLATE_STATE': calculate_mode(data, 'LIC_PLATE_STATE'),
        # calculate_median yields a float, or the int 0 for an empty column;
        # str() covers both (the old isinstance(float) check missed the int).
        'VEHICLE_YEAR': str(calculate_median(data, 'VEHICLE_YEAR')),
        'TRAVEL_DIRECTION': calculate_mode(data, 'TRAVEL_DIRECTION'),
        'MANEUVER': calculate_mode(data, 'MANEUVER'),
        'OCCUPANT_CNT': calculate_mode(data, 'OCCUPANT_CNT'),
        'FIRST_CONTACT_POINT': calculate_mode(data, 'FIRST_CONTACT_POINT'),
    }

    for row in data:
        for column, fill_value in fill_values.items():
            # `or ''` makes a None field (short CSV row) count as blank.
            if not (row[column] or '').strip():
                row[column] = fill_value

    return data

# Apply cleaning functions.
# NOTE: each clean_* function modifies the rows in place and returns the same
# list it was given, so every cleaned_* name aliases its raw *_data list.
cleaned_crashes_data = clean_crashes_data(crashes_data)
cleaned_people_data = clean_people_data(people_data)
cleaned_vehicles_data = clean_vehicles_data(vehicles_data)

# Verify remaining missing values
def check_missing_values(data, dataset_name):
    """Print a per-column count of missing values for a dataset.

    A value is missing when it is ``None`` or a blank/whitespace-only string.
    Non-string values (such as the list ``csv.DictReader`` stores for surplus
    fields) are never counted as missing.

    Args:
        data: List of row dicts; may be empty.
        dataset_name: Label used in the printed report.

    Returns:
        None. The report is written to standard output.
    """
    # Seed with the first row's keys so every column is reported, in order,
    # even when it has no missing values. An empty dataset reports nothing.
    missing_count = {key: 0 for key in data[0]} if data else {}
    for row in data:
        for key, value in row.items():
            is_missing = value is None or (isinstance(value, str) and not value.strip())
            if is_missing:
                # .get() tolerates keys that were absent from the first row.
                missing_count[key] = missing_count.get(key, 0) + 1

    print(f"\nMissing Values Summary for {dataset_name}:")
    for key, count in missing_count.items():
        print(f"{key}: {count} missing values")
    total_missing = sum(missing_count.values())
    print(f"Total Missing Values in {dataset_name}: {total_missing}")

# Print the per-column missing-value report for each cleaned dataset.
check_missing_values(cleaned_crashes_data, "crashes_data")
check_missing_values(cleaned_people_data, "people_data")
check_missing_values(cleaned_vehicles_data, "vehicles_data")


Missing Values Summary for crashes_data:
RD_NO: 0 missing values
CRASH_DATE: 0 missing values
POSTED_SPEED_LIMIT: 0 missing values
TRAFFIC_CONTROL_DEVICE: 0 missing values
DEVICE_CONDITION: 0 missing values
WEATHER_CONDITION: 0 missing values
LIGHTING_CONDITION: 0 missing values
FIRST_CRASH_TYPE: 0 missing values
TRAFFICWAY_TYPE: 0 missing values
ALIGNMENT: 0 missing values
ROADWAY_SURFACE_COND: 0 missing values
ROAD_DEFECT: 0 missing values
REPORT_TYPE: 0 missing values
CRASH_TYPE: 0 missing values
DATE_POLICE_NOTIFIED: 0 missing values
PRIM_CONTRIBUTORY_CAUSE: 0 missing values
SEC_CONTRIBUTORY_CAUSE: 0 missing values
STREET_NO: 0 missing values
STREET_DIRECTION: 2 missing values
STREET_NAME: 1 missing values
BEAT_OF_OCCURRENCE: 0 missing values
NUM_UNITS: 0 missing values
MOST_SEVERE_INJURY: 7 missing values
INJURIES_TOTAL: 0 missing values
INJURIES_FATAL: 0 missing values
INJURIES_INCAPACITATING: 0 missing values
INJURIES_NON_INCAPACITATING: 0 missing values
INJURIES_REPORTED_NOT_E

In [3]:
def clean_crashes_data(data):
    """Fill missing values in the crashes rows, modifying them in place.

    LATITUDE and LONGITUDE are filled with the column mean; REPORT_TYPE,
    BEAT_OF_OCCURRENCE, STREET_DIRECTION, STREET_NAME and MOST_SEVERE_INJURY
    with the column mode. LOCATION is rebuilt from the (possibly filled)
    coordinates as ``"<lat>, <lon>"``.

    Note: ``calculate_mean`` returns 0 for a column with no usable values, so
    in that case ``'0'`` is written as the coordinate.

    Args:
        data: List of row dicts holding string values; it is mutated.

    Returns:
        The same ``data`` list that was passed in.
    """
    # All statistics are computed before any row is modified.
    fill_values = {
        'LATITUDE': str(calculate_mean(data, 'LATITUDE')),
        'LONGITUDE': str(calculate_mean(data, 'LONGITUDE')),
        'REPORT_TYPE': calculate_mode(data, 'REPORT_TYPE'),
        # Kept from the earlier definition of this function so that running
        # this cell on freshly loaded data does not leave the column unfilled.
        'BEAT_OF_OCCURRENCE': calculate_mode(data, 'BEAT_OF_OCCURRENCE'),
        'STREET_DIRECTION': calculate_mode(data, 'STREET_DIRECTION'),
        'STREET_NAME': calculate_mode(data, 'STREET_NAME'),
        'MOST_SEVERE_INJURY': calculate_mode(data, 'MOST_SEVERE_INJURY'),
    }

    for row in data:
        for column, fill_value in fill_values.items():
            # `or ''` makes a None field (short CSV row) count as blank.
            if not (row[column] or '').strip():
                row[column] = fill_value

        # Both coordinates are non-blank at this point, so LOCATION can
        # always be generated; the former 'Unknown' branch was unreachable.
        row['LOCATION'] = f"{row['LATITUDE']}, {row['LONGITUDE']}"

    return data

# Run the function on crashes_data.
# NOTE: clean_crashes_data modifies the rows in place and returns the same
# list, so cleaned_crashes_data and crashes_data refer to the same object and
# keep any fills applied by earlier cleaning calls.
cleaned_crashes_data = clean_crashes_data(crashes_data)


In [6]:
# Print the per-column missing-value report for each dataset again.
check_missing_values(cleaned_crashes_data, "crashes_data")
check_missing_values(cleaned_people_data, "people_data")
check_missing_values(cleaned_vehicles_data, "vehicles_data")



Missing Values Summary for crashes_data:
RD_NO: 0 missing values
CRASH_DATE: 0 missing values
POSTED_SPEED_LIMIT: 0 missing values
TRAFFIC_CONTROL_DEVICE: 0 missing values
DEVICE_CONDITION: 0 missing values
WEATHER_CONDITION: 0 missing values
LIGHTING_CONDITION: 0 missing values
FIRST_CRASH_TYPE: 0 missing values
TRAFFICWAY_TYPE: 0 missing values
ALIGNMENT: 0 missing values
ROADWAY_SURFACE_COND: 0 missing values
ROAD_DEFECT: 0 missing values
REPORT_TYPE: 0 missing values
CRASH_TYPE: 0 missing values
DATE_POLICE_NOTIFIED: 0 missing values
PRIM_CONTRIBUTORY_CAUSE: 0 missing values
SEC_CONTRIBUTORY_CAUSE: 0 missing values
STREET_NO: 0 missing values
STREET_DIRECTION: 0 missing values
STREET_NAME: 0 missing values
BEAT_OF_OCCURRENCE: 0 missing values
NUM_UNITS: 0 missing values
MOST_SEVERE_INJURY: 0 missing values
INJURIES_TOTAL: 0 missing values
INJURIES_FATAL: 0 missing values
INJURIES_INCAPACITATING: 0 missing values
INJURIES_NON_INCAPACITATING: 0 missing values
INJURIES_REPORTED_NOT_E

In [7]:
def clean_people_data(data):
    """Fill missing values in the people rows, modifying them in place.

    * AGE: column mean.
    * VEHICLE_ID: the placeholder ``'Unknown_Vehicle'``.
    * DAMAGE: median of the rows sharing the same INJURY_CLASSIFICATION,
      falling back to the overall median when that group has no DAMAGE
      values (or the classification itself is blank).
    * CITY, STATE, SEX, SAFETY_EQUIPMENT, AIRBAG_DEPLOYED, EJECTION,
      INJURY_CLASSIFICATION: column mode.

    NOTE(review): DAMAGE is parsed with ``float`` by ``calculate_median`` —
    this assumes the column is numeric; confirm against the data.

    Args:
        data: List of row dicts holding string values; it is mutated.

    Returns:
        The same ``data`` list that was passed in.
    """
    # All statistics are computed before any row is modified.
    mean_age = calculate_mean(data, 'AGE')
    median_damage = calculate_median(data, 'DAMAGE')

    mode_values = {
        'CITY': calculate_mode(data, 'CITY'),
        'STATE': calculate_mode(data, 'STATE'),
        'SEX': calculate_mode(data, 'SEX'),
        'SAFETY_EQUIPMENT': calculate_mode(data, 'SAFETY_EQUIPMENT'),
        'AIRBAG_DEPLOYED': calculate_mode(data, 'AIRBAG_DEPLOYED'),
        'EJECTION': calculate_mode(data, 'EJECTION'),
        'INJURY_CLASSIFICATION': calculate_mode(data, 'INJURY_CLASSIFICATION'),
    }

    # Bucket the rows that carry a DAMAGE value by INJURY_CLASSIFICATION in a
    # single pass. Groups without any DAMAGE value get no entry, so the
    # lookup below falls back to the overall median instead of a bogus 0.
    rows_by_injury = {}
    for candidate in data:
        injury = candidate['INJURY_CLASSIFICATION']
        if (injury or '').strip() and (candidate['DAMAGE'] or '').strip():
            rows_by_injury.setdefault(injury, []).append(candidate)
    injury_group_medians = {
        injury: calculate_median(group_rows, 'DAMAGE')
        for injury, group_rows in rows_by_injury.items()
    }

    for row in data:
        # `or ''` makes a None field (short CSV row) count as blank.
        if not (row['AGE'] or '').strip():
            row['AGE'] = str(mean_age)

        if not (row['VEHICLE_ID'] or '').strip():
            row['VEHICLE_ID'] = 'Unknown_Vehicle'

        # Uses the classification as it was before the mode fill below.
        if not (row['DAMAGE'] or '').strip():
            group = row['INJURY_CLASSIFICATION']
            row['DAMAGE'] = str(injury_group_medians.get(group, median_damage))

        for column, mode_value in mode_values.items():
            if not (row[column] or '').strip():
                row[column] = mode_value

    return data

# Apply the redefined people cleaning function.
# NOTE: it modifies people_data in place and returns the same list, so
# cleaned_people_data aliases people_data.
cleaned_people_data = clean_people_data(people_data)


In [8]:
def clean_vehicles_data(data):
    """Fill missing values in the vehicles rows, modifying them in place.

    VEHICLE_ID gets the placeholder ``'Unknown_Vehicle'``, VEHICLE_YEAR the
    column median (as a string), and the listed categorical columns their
    column mode.

    Args:
        data: List of row dicts holding string values; it is mutated.

    Returns:
        The same ``data`` list that was passed in.
    """
    # All statistics are computed before any row is modified. One table
    # replaces the former explicit VEHICLE_DEFECT / LIC_PLATE_STATE fills,
    # which the mode loop repeated anyway.
    fill_values = {
        'VEHICLE_ID': 'Unknown_Vehicle',
        'VEHICLE_YEAR': str(calculate_median(data, 'VEHICLE_YEAR')),
        'UNIT_TYPE': calculate_mode(data, 'UNIT_TYPE'),
        'MAKE': calculate_mode(data, 'MAKE'),
        'MODEL': calculate_mode(data, 'MODEL'),
        'VEHICLE_TYPE': calculate_mode(data, 'VEHICLE_TYPE'),
        'VEHICLE_USE': calculate_mode(data, 'VEHICLE_USE'),
        'LIC_PLATE_STATE': calculate_mode(data, 'LIC_PLATE_STATE'),
        'VEHICLE_DEFECT': calculate_mode(data, 'VEHICLE_DEFECT'),
        'TRAVEL_DIRECTION': calculate_mode(data, 'TRAVEL_DIRECTION'),
        'MANEUVER': calculate_mode(data, 'MANEUVER'),
        'OCCUPANT_CNT': calculate_mode(data, 'OCCUPANT_CNT'),
        'FIRST_CONTACT_POINT': calculate_mode(data, 'FIRST_CONTACT_POINT'),
    }

    for row in data:
        for column, fill_value in fill_values.items():
            # `or ''` makes a None field (short CSV row) count as blank.
            if not (row[column] or '').strip():
                row[column] = fill_value

    return data

# Apply the redefined vehicles cleaning function.
# NOTE: it modifies vehicles_data in place and returns the same list, so
# cleaned_vehicles_data aliases vehicles_data.
cleaned_vehicles_data = clean_vehicles_data(vehicles_data)


In [12]:
def clean_people_data(data):
    """Fill missing VEHICLE_ID and DAMAGE values in the people rows, in place.

    * VEHICLE_ID: the placeholder ``'Unknown_Vehicle'``.
    * DAMAGE: median of the rows sharing the same INJURY_CLASSIFICATION,
      falling back to the overall median when that group has no DAMAGE
      values (or the classification itself is blank).

    No other column is touched by this version of the function.

    NOTE(review): DAMAGE is parsed with ``float`` by ``calculate_median`` —
    this assumes the column is numeric; confirm against the data.

    Args:
        data: List of row dicts holding string values; it is mutated.

    Returns:
        The same ``data`` list that was passed in.
    """
    # All statistics are computed before any row is modified.
    median_damage = calculate_median(data, 'DAMAGE')

    # Bucket the rows that carry a DAMAGE value by INJURY_CLASSIFICATION in a
    # single pass. Groups without any DAMAGE value get no entry, so the
    # lookup below falls back to the overall median instead of a bogus 0.
    rows_by_injury = {}
    for candidate in data:
        injury = candidate['INJURY_CLASSIFICATION']
        if (injury or '').strip() and (candidate['DAMAGE'] or '').strip():
            rows_by_injury.setdefault(injury, []).append(candidate)
    injury_group_medians = {
        injury: calculate_median(group_rows, 'DAMAGE')
        for injury, group_rows in rows_by_injury.items()
    }

    for row in data:
        # `or ''` makes a None field (short CSV row) count as blank.
        if not (row['VEHICLE_ID'] or '').strip():
            row['VEHICLE_ID'] = 'Unknown_Vehicle'

        if not (row['DAMAGE'] or '').strip():
            group = row['INJURY_CLASSIFICATION']
            row['DAMAGE'] = str(injury_group_medians.get(group, median_damage))

    return data

# Apply the targeted cleaning function.
# NOTE: this definition fills only VEHICLE_ID and DAMAGE; people_data is
# modified in place, so fills applied by earlier cleaning calls remain.
cleaned_people_data = clean_people_data(people_data)


In [13]:
def clean_vehicles_data(data):
    """Fill missing VEHICLE_ID and VEHICLE_DEFECT values in place.

    VEHICLE_ID gets the placeholder ``'Unknown_Vehicle'`` and VEHICLE_DEFECT
    the column mode. No other column is touched by this version.

    Args:
        data: List of row dicts holding string values; it is mutated.

    Returns:
        The same ``data`` list that was passed in.
    """
    # The mode is computed before any row is modified.
    fill_values = {
        'VEHICLE_ID': 'Unknown_Vehicle',
        'VEHICLE_DEFECT': calculate_mode(data, 'VEHICLE_DEFECT'),
    }

    for row in data:
        for column, fill_value in fill_values.items():
            # `or ''` makes a None field (short CSV row) count as blank.
            if not (row[column] or '').strip():
                row[column] = fill_value

    return data

# Apply the targeted cleaning function.
# NOTE: this definition fills only VEHICLE_ID and VEHICLE_DEFECT;
# vehicles_data is modified in place, so fills applied by earlier cleaning
# calls remain.
cleaned_vehicles_data = clean_vehicles_data(vehicles_data)


In [14]:
# Verify missing values after the redefined cleaning functions have run:
# prints the per-column missing-value report for each dataset.
check_missing_values(cleaned_crashes_data, "crashes_data")
check_missing_values(cleaned_people_data, "people_data")
check_missing_values(cleaned_vehicles_data, "vehicles_data")



Missing Values Summary for crashes_data:
RD_NO: 0 missing values
CRASH_DATE: 0 missing values
POSTED_SPEED_LIMIT: 0 missing values
TRAFFIC_CONTROL_DEVICE: 0 missing values
DEVICE_CONDITION: 0 missing values
WEATHER_CONDITION: 0 missing values
LIGHTING_CONDITION: 0 missing values
FIRST_CRASH_TYPE: 0 missing values
TRAFFICWAY_TYPE: 0 missing values
ALIGNMENT: 0 missing values
ROADWAY_SURFACE_COND: 0 missing values
ROAD_DEFECT: 0 missing values
REPORT_TYPE: 0 missing values
CRASH_TYPE: 0 missing values
DATE_POLICE_NOTIFIED: 0 missing values
PRIM_CONTRIBUTORY_CAUSE: 0 missing values
SEC_CONTRIBUTORY_CAUSE: 0 missing values
STREET_NO: 0 missing values
STREET_DIRECTION: 0 missing values
STREET_NAME: 0 missing values
BEAT_OF_OCCURRENCE: 0 missing values
NUM_UNITS: 0 missing values
MOST_SEVERE_INJURY: 0 missing values
INJURIES_TOTAL: 0 missing values
INJURIES_FATAL: 0 missing values
INJURIES_INCAPACITATING: 0 missing values
INJURIES_NON_INCAPACITATING: 0 missing values
INJURIES_REPORTED_NOT_E

In [15]:
import csv

# Function to save cleaned data to a CSV file
def save_to_csv(data, filename, fieldnames=None):
    """Write a list of row dicts to a CSV file with a header row.

    Args:
        data: List of row dicts to write; may be empty.
        filename: Destination path; an existing file is overwritten.
        fieldnames: Optional column order for the header. Defaults to the
            keys of the first row.

    Returns:
        None.

    Raises:
        ValueError: From ``csv.DictWriter`` if a row contains a key that is
            not among the field names.
    """
    if fieldnames is None:
        # Without rows there is no first row to take the header from; an
        # empty header then produces an empty file instead of an IndexError.
        fieldnames = list(data[0].keys()) if data else []

    # newline='' lets the csv module control line endings.
    with open(filename, 'w', newline='') as file:
        if not data and not fieldnames:
            return
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

# Save each cleaned dataset to CSV. The output paths are relative, so the
# files are written to the kernel's current working directory.
save_to_csv(cleaned_crashes_data, 'Cleaned_Crashes.csv')
save_to_csv(cleaned_people_data, 'Cleaned_People.csv')
save_to_csv(cleaned_vehicles_data, 'Cleaned_Vehicles.csv')
