In [1]:
import pandas as pd
import numpy as np
import pyarrow

In [2]:
# Parse the source CSV with pandas' pyarrow parser engine.
# NOTE(review): the pandas docs describe this engine as the faster,
# multithreaded parser; a lower memory footprint is not guaranteed by it.
df = pd.read_csv(
    filepath_or_buffer='integrated_crashes_person.csv',
    engine='pyarrow',
)

In [3]:
# Write the freshly loaded frame to parquet.
# index=False keeps the DataFrame index out of the written file.
df.to_parquet(
    path='Final_Data.parquet',
    index=False,
)

In [4]:
# Rebind df to the frame read back from the parquet file (pyarrow reader).
df = pd.read_parquet(
    'Final_Data.parquet',
    engine='pyarrow',
)

In [5]:
# Print the row count plus each column's name and dtype for the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5257484 entries, 0 to 5257483
Data columns (total 39 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   BOROUGH                        object        
 1   ZIP CODE                       object        
 2   LATITUDE                       float64       
 3   LONGITUDE                      float64       
 4   LOCATION                       object        
 5   ON STREET NAME                 object        
 6   CROSS STREET NAME              object        
 7   OFF STREET NAME                object        
 8   NUMBER OF PERSONS INJURED      float64       
 9   NUMBER OF PERSONS KILLED       float64       
 10  NUMBER OF PEDESTRIANS INJURED  int64         
 11  NUMBER OF PEDESTRIANS KILLED   int64         
 12  NUMBER OF CYCLIST INJURED      int64         
 13  NUMBER OF CYCLIST KILLED       int64         
 14  NUMBER OF MOTORIST INJURED     int64         
 15  NUMBER OF MOTOR

In [6]:
# Number of distinct values per column; shown as the cell's output.
# Presumably used to judge which columns are small enough for 'category'.
df.nunique()

BOROUGH                               6
ZIP CODE                            235
LATITUDE                          82931
LONGITUDE                         53578
LOCATION                         361025
ON STREET NAME                   246835
CROSS STREET NAME                 16593
OFF STREET NAME                  231634
NUMBER OF PERSONS INJURED            30
NUMBER OF PERSONS KILLED              7
NUMBER OF PEDESTRIANS INJURED        14
NUMBER OF PEDESTRIANS KILLED          6
NUMBER OF CYCLIST INJURED             5
NUMBER OF CYCLIST KILLED              3
NUMBER OF MOTORIST INJURED           29
NUMBER OF MOTORIST KILLED             6
CONTRIBUTING FACTOR VEHICLE 1        18
CONTRIBUTING FACTOR VEHICLE 2        18
CONTRIBUTING FACTOR VEHICLE 3        16
CONTRIBUTING FACTOR VEHICLE 4        13
CONTRIBUTING FACTOR VEHICLE 5        12
VEHICLE TYPE CODE 1                1708
VEHICLE TYPE CODE 2                1910
VEHICLE TYPE CODE 3                 276
VEHICLE TYPE CODE 4                 105


In [7]:
# For every column, count the cells whose value is exactly 'Unspecified'.
unspecified_counts = df.eq('Unspecified').sum()

In [8]:
# Report only the columns in which 'Unspecified' occurs at least once.
has_unspecified = unspecified_counts > 0
print(unspecified_counts[has_unspecified])

CONTRIBUTING FACTOR VEHICLE 1    1100742
CONTRIBUTING FACTOR VEHICLE 2    3789164
CONTRIBUTING FACTOR VEHICLE 3     583970
CONTRIBUTING FACTOR VEHICLE 4     170687
CONTRIBUTING FACTOR VEHICLE 5      57070
CONTRIBUTING_FACTOR              5228118
dtype: int64


In [9]:
# Frame-wide: any cell equal to 'Unspecified' becomes the short code 'u'.
# Only whole-cell matches are affected (no regex / substring replacement).
df = df.replace(to_replace='Unspecified', value='u')

In [10]:
# Data Type Optimizations
# The target dtype of every converted column is collected in one mapping and
# applied with a single astype call.

# --- Ultra-Light Categories (<32,768 uniques) ---
category_columns = [
    'BOROUGH',
    'ZIP CODE',
    'CROSS STREET NAME',
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'CONTRIBUTING FACTOR VEHICLE 3',
    'CONTRIBUTING FACTOR VEHICLE 4',
    'CONTRIBUTING FACTOR VEHICLE 5',
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
    'VEHICLE TYPE CODE 3',
    'VEHICLE TYPE CODE 4',
    'VEHICLE TYPE CODE 5',
    'PERSON_TYPE',
    'PERSON_INJURY',
    'PERSON_SEX',
    'PED_ROLE',
    'EJECTION',
    'EMOTIONAL_STATUS',
    'POSITION_IN_VEHICLE',
    'BODILY_INJURY',
    'COMPLAINT',
    'SAFETY_EQUIPMENT',
    'CONTRIBUTING_FACTOR',
]

# --- Numeric Optimizations ---
# 'UInt8' (capital U) is pandas' nullable unsigned 8-bit integer dtype (0-255).
# NOTE(review): assumes every count/age value fits in that range - confirm.
small_count_columns = [
    'NUMBER OF PERSONS INJURED',
    'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED',
    'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED',
    'NUMBER OF MOTORIST KILLED',
    'PERSON_AGE',
]

optimized_dtypes = {
    **dict.fromkeys(category_columns, 'category'),
    # NOTE(review): float32 keeps roughly 7 significant digits for coordinates.
    'LATITUDE': 'float32',
    'LONGITUDE': 'float32',
    **dict.fromkeys(small_count_columns, 'UInt8'),
    # --- Datetime Optimization ---
    'CRASH DATETIME': 'datetime64[ns]',
}

# Apply all at once
df = df.astype(optimized_dtypes)

In [11]:
# Re-print the column/dtype summary after the astype conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5257484 entries, 0 to 5257483
Data columns (total 39 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   BOROUGH                        category      
 1   ZIP CODE                       category      
 2   LATITUDE                       float32       
 3   LONGITUDE                      float32       
 4   LOCATION                       object        
 5   ON STREET NAME                 object        
 6   CROSS STREET NAME              category      
 7   OFF STREET NAME                object        
 8   NUMBER OF PERSONS INJURED      UInt8         
 9   NUMBER OF PERSONS KILLED       UInt8         
 10  NUMBER OF PEDESTRIANS INJURED  UInt8         
 11  NUMBER OF PEDESTRIANS KILLED   UInt8         
 12  NUMBER OF CYCLIST INJURED      UInt8         
 13  NUMBER OF CYCLIST KILLED       UInt8         
 14  NUMBER OF MOTORIST INJURED     UInt8         
 15  NUMBER OF MOTOR

In [12]:
# Preview the first rows of the converted frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,...,PERSON_SEX,PED_ROLE,PERSON_AGE,EJECTION,EMOTIONAL_STATUS,POSITION_IN_VEHICLE,BODILY_INJURY,COMPLAINT,SAFETY_EQUIPMENT,CONTRIBUTING_FACTOR
0,BROOKLYN,11230,40.621792,-73.970024,"(40.62179, -73.970024)",OCEAN PARKWAY,AVENUE K,No Off Street,1,0,...,F,Driver,63,Not Ejected,Does Not Apply,Driver,Does Not Apply,Does Not Apply,Seat Belt,u
1,BROOKLYN,11230,40.621792,-73.970024,"(40.62179, -73.970024)",OCEAN PARKWAY,AVENUE K,No Off Street,1,0,...,M,Driver,26,Ejected,Conscious,Driver,Knee-Lower Leg Foot,Complaint of Pain or Nausea,No Equipment,u
2,BROOKLYN,11208,40.667202,-73.866501,"(40.667202, -73.8665)",1211 LORING AVENUE,No Cross Street,1211 LORING AVENUE,0,0,...,F,Driver,28,Not Ejected,Does Not Apply,Unknown,Does Not Apply,Does Not Apply,No Equipment,u
3,BROOKLYN,11208,40.667202,-73.866501,"(40.667202, -73.8665)",1211 LORING AVENUE,No Cross Street,1211 LORING AVENUE,0,0,...,F,Registrant,28,Not Ejected,Does Not Apply,Driver,Does Not Apply,Does Not Apply,No Equipment,u
4,BROOKLYN,11233,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,No Off Street,0,0,...,M,Pedestrian,46,Not Ejected,Does Not Apply,"Front passenger, if two or more persons, inclu...",Does Not Apply,Does Not Apply,No Equipment,u


In [13]:
# Distinct-value counts per column again, now on the converted frame
# (e.g. to compare against the counts taken before the dtype changes).
df.nunique()

BOROUGH                               6
ZIP CODE                            235
LATITUDE                          82931
LONGITUDE                         53578
LOCATION                         361025
ON STREET NAME                   246835
CROSS STREET NAME                 16593
OFF STREET NAME                  231634
NUMBER OF PERSONS INJURED            30
NUMBER OF PERSONS KILLED              7
NUMBER OF PEDESTRIANS INJURED        14
NUMBER OF PEDESTRIANS KILLED          6
NUMBER OF CYCLIST INJURED             5
NUMBER OF CYCLIST KILLED              3
NUMBER OF MOTORIST INJURED           29
NUMBER OF MOTORIST KILLED             6
CONTRIBUTING FACTOR VEHICLE 1        18
CONTRIBUTING FACTOR VEHICLE 2        18
CONTRIBUTING FACTOR VEHICLE 3        16
CONTRIBUTING FACTOR VEHICLE 4        13
CONTRIBUTING FACTOR VEHICLE 5        12
VEHICLE TYPE CODE 1                1708
VEHICLE TYPE CODE 2                1910
VEHICLE TYPE CODE 3                 276
VEHICLE TYPE CODE 4                 105


In [14]:
# Column/dtype summary once more.
# NOTE(review): df is not modified between the previous df.info() call and
# this one, so this cell looks redundant.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5257484 entries, 0 to 5257483
Data columns (total 39 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   BOROUGH                        category      
 1   ZIP CODE                       category      
 2   LATITUDE                       float32       
 3   LONGITUDE                      float32       
 4   LOCATION                       object        
 5   ON STREET NAME                 object        
 6   CROSS STREET NAME              category      
 7   OFF STREET NAME                object        
 8   NUMBER OF PERSONS INJURED      UInt8         
 9   NUMBER OF PERSONS KILLED       UInt8         
 10  NUMBER OF PEDESTRIANS INJURED  UInt8         
 11  NUMBER OF PEDESTRIANS KILLED   UInt8         
 12  NUMBER OF CYCLIST INJURED      UInt8         
 13  NUMBER OF CYCLIST KILLED       UInt8         
 14  NUMBER OF MOTORIST INJURED     UInt8         
 15  NUMBER OF MOTOR

In [15]:
# Store the two remaining street-name text columns as categoricals.
street_columns = ('ON STREET NAME', 'OFF STREET NAME')
for street_col in street_columns:
    df[street_col] = df[street_col].astype('category')

# memory_usage='deep' makes info() measure the actual bytes held by
# object (string) columns instead of only counting their pointers.
print("Real Memory Usage:")
df.info(memory_usage='deep')

Real Memory Usage:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5257484 entries, 0 to 5257483
Data columns (total 39 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   BOROUGH                        category      
 1   ZIP CODE                       category      
 2   LATITUDE                       float32       
 3   LONGITUDE                      float32       
 4   LOCATION                       object        
 5   ON STREET NAME                 category      
 6   CROSS STREET NAME              category      
 7   OFF STREET NAME                category      
 8   NUMBER OF PERSONS INJURED      UInt8         
 9   NUMBER OF PERSONS KILLED       UInt8         
 10  NUMBER OF PEDESTRIANS INJURED  UInt8         
 11  NUMBER OF PEDESTRIANS KILLED   UInt8         
 12  NUMBER OF CYCLIST INJURED      UInt8         
 13  NUMBER OF CYCLIST KILLED       UInt8         
 14  NUMBER OF MOTORIST INJURED     UInt8         
 

In [16]:
# Remove the LOCATION column (in the preview rows it appears to repeat the
# LATITUDE/LONGITUDE pair as a string - NOTE(review): confirm before relying on it).
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError
# for the already-removed column.
df = df.drop(columns='LOCATION', errors='ignore')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5257484 entries, 0 to 5257483
Data columns (total 38 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   BOROUGH                        category      
 1   ZIP CODE                       category      
 2   LATITUDE                       float32       
 3   LONGITUDE                      float32       
 4   ON STREET NAME                 category      
 5   CROSS STREET NAME              category      
 6   OFF STREET NAME                category      
 7   NUMBER OF PERSONS INJURED      UInt8         
 8   NUMBER OF PERSONS KILLED       UInt8         
 9   NUMBER OF PEDESTRIANS INJURED  UInt8         
 10  NUMBER OF PEDESTRIANS KILLED   UInt8         
 11  NUMBER OF CYCLIST INJURED      UInt8         
 12  NUMBER OF CYCLIST KILLED       UInt8         
 13  NUMBER OF MOTORIST INJURED     UInt8         
 14  NUMBER OF MOTORIST KILLED      UInt8         
 15  CONTRIBUTING FA

In [17]:
# Remove the OFF STREET NAME column.
# `columns=` already selects the axis, so the redundant `axis=1` is gone.
# Reassignment plus errors='ignore' lets the cell be re-run without a
# KeyError once the column has been removed.
df = df.drop(columns='OFF STREET NAME', errors='ignore')

In [18]:
# Show the remaining columns and their dtypes after the column drops.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5257484 entries, 0 to 5257483
Data columns (total 37 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   BOROUGH                        category      
 1   ZIP CODE                       category      
 2   LATITUDE                       float32       
 3   LONGITUDE                      float32       
 4   ON STREET NAME                 category      
 5   CROSS STREET NAME              category      
 6   NUMBER OF PERSONS INJURED      UInt8         
 7   NUMBER OF PERSONS KILLED       UInt8         
 8   NUMBER OF PEDESTRIANS INJURED  UInt8         
 9   NUMBER OF PEDESTRIANS KILLED   UInt8         
 10  NUMBER OF CYCLIST INJURED      UInt8         
 11  NUMBER OF CYCLIST KILLED       UInt8         
 12  NUMBER OF MOTORIST INJURED     UInt8         
 13  NUMBER OF MOTORIST KILLED      UInt8         
 14  CONTRIBUTING FACTOR VEHICLE 1  category      
 15  CONTRIBUTING FA

In [19]:
# Save the optimized frame. This is the same path the raw frame was written
# to earlier in the notebook, so that file is overwritten here.
df.to_parquet(
    path='Final_Data.parquet',
    index=False,
)

In [None]:
# Create a sample dataset for deployment testing
sample_size = 10000  # Adjust size as needed

# Cap the request at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population and replace=False (the default).
# random_state pins the draw so the same rows are selected on every run.
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)

# NOTE(review): categorical columns keep their full category lists after
# sampling, which inflates the deep memory figure below; consider
# .cat.remove_unused_categories() if a lean sample matters.
print(f"Sample dataset created: {df_sample.shape}")
print(f"Memory usage: {df_sample.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Save sample as parquet for testing
df_sample.to_parquet('Sample_Data.parquet', index=False)
print("✅ Sample saved as 'Sample_Data.parquet'")