# <b> NYCOpenData Collsion Data Analysis <b>
<hr>

## Install Requirements

In [1]:
import datetime as dt
import requests
import numpy as np
import pandas as pd

from pandas.api.types import is_datetime64_any_dtype


## Fetch NYCOpenData API

### Create Empty Dataframe

In [2]:
# Heaeders from source: https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data
collision_data_headers = [
    'crash_date', 'crash_time', 'borough', 'zip_code', 'latitude', 'longitude',
    'location', 'on_street_name', 'off_street_name', 'cross_street_name',
    'number_of_persons_injured', 'number_of_persons_killed',
    'number_of_pedestrians_injured', 'number_of_pedestrians_killed',
    'number_of_cyclist_injured', 'number_of_cyclist_killed',
    'number_of_motorist_injured', 'number_of_motorist_killed',
    'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
    'contributing_factor_vehicle_3', 'contributing_factor_vehicle_4',
    'contributing_factor_vehicle_5', 'collision_id', 'vehicle_type_code1',
    'vehicle_type_code2', 'vehicle_type_code_3', 'vehicle_type_code_4',
    'vehicle_type_code_5'
]

# Create empty DataFrame with headers
collision_data = pd.DataFrame(columns=collision_data_headers)

### Fetch Data

In [None]:
num_of_records = 100 # number of records per API fetch
starting_page = 0
ending_page = 100

while True:
    try:
        print(f"\n--------------------\nCalling URL with 'offset' or records {starting_page} to {ending_page}, with record per call size of {num_of_records}")
        response = requests.get(f'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit={num_of_records}&$offset={ending_page}')

        if response.status_code == 200:
            print("Response: ", response.status_code)
            data = pd.DataFrame(response.json())

            if not data.empty:
                collision_data = pd.concat([collision_data, data], ignore_index=True)
                print(f"✅ Fetched {len(data)} records, total records so far: {len(collision_data)}")
                starting_page = ending_page
                ending_page += 100

                # COMMENT OUT IF IN PRODUCTION
                if starting_page >= 500:
                    print("Reached Temporary Limit of 500 records.")
                    break

            if data.empty:
                print("No more data to fetch.")
                break

    except Exception as e:
        print("An error occured: ", e)


--------------------
Calling URL with 'offset' or records 0 to 100, with record per call size of 100
Response:  200
✅ Fetched 100 records, total records so far: 100

--------------------
Calling URL with 'offset' or records 100 to 200, with record per call size of 100
Response:  200
✅ Fetched 100 records, total records so far: 200

--------------------
Calling URL with 'offset' or records 200 to 300, with record per call size of 100
Response:  200
✅ Fetched 100 records, total records so far: 300

--------------------
Calling URL with 'offset' or records 300 to 400, with record per call size of 100
Response:  200
✅ Fetched 100 records, total records so far: 400

--------------------
Calling URL with 'offset' or records 400 to 500, with record per call size of 100
Response:  200
✅ Fetched 100 records, total records so far: 500
Reached Temporary Limit of 500 records.


### Create File for RAW Output and Copy For Analysis

In [4]:
# Raw NYCOpendata Output
collision_data.to_csv('RAW_collision_data.csv')

# For Analysis
collision_df = collision_data.copy()

<hr>

## EDA, Cleaning & Preparing for Analytics

### Schema Info & Preview Data

In [5]:
collision_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   crash_date                     500 non-null    object
 1   crash_time                     500 non-null    object
 2   borough                        334 non-null    object
 3   zip_code                       333 non-null    object
 4   latitude                       471 non-null    object
 5   longitude                      471 non-null    object
 6   location                       471 non-null    object
 7   on_street_name                 361 non-null    object
 8   off_street_name                224 non-null    object
 9   cross_street_name              139 non-null    object
 10  number_of_persons_injured      500 non-null    object
 11  number_of_persons_killed       500 non-null    object
 12  number_of_pedestrians_injured  500 non-null    object
 13  numbe

In [6]:
collision_df.head()

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,2021-12-10T00:00:00.000,1:10,,,40.662575,-73.93448,"{'latitude': '40.662575', 'longitude': '-73.93...",SCHENECTADY AVENUE,,,...,Unspecified,,,,4485706,Station Wagon/Sport Utility Vehicle,Sedan,,,
1,2021-12-09T00:00:00.000,0:00,,,40.7448,-73.953415,"{'latitude': '40.7448', 'longitude': '-73.9534...",47 ROAD,,,...,,,,,4485384,Sedan,,,,
2,2021-12-08T00:00:00.000,18:36,,,40.79672,-73.97618,"{'latitude': '40.79672', 'longitude': '-73.976...",HENRY HUDSON PARKWAY,,,...,Unspecified,,,,4484799,Sedan,Station Wagon/Sport Utility Vehicle,,,
3,2021-12-09T00:00:00.000,20:30,QUEENS,11004.0,40.752777,-73.70743,"{'latitude': '40.752777', 'longitude': '-73.70...",,,269-01 76 AVENUE,...,,,,,4485077,Sedan,,,,
4,2021-12-09T00:00:00.000,10:13,BROOKLYN,11203.0,40.638523,-73.92607,"{'latitude': '40.638523', 'longitude': '-73.92...",KINGS HIGHWAY,FARRAGUT ROAD,,...,Traffic Control Disregarded,,,,4485090,Sedan,Sedan,,,


### Check & Correct Duplicate Records - `collision_id`

In [7]:
# Compare total records vs unique records
total_records_column = collision_df.columns.get_loc('collision_id')
total_records = collision_df[collision_df.columns[total_records_column]].count()

unique_count = collision_df['collision_id'].nunique()

if unique_count == total_records:
    print(f"✅ All {total_records} records are unique.")
else:
    print(f"⚠️ Not all {total_records} records are unique. There are {total_records - unique_count} repeated records.\nProcessing removal.\nFinal count: {unique_count}")

    # Empty Dataframe for handling duplicates
    duplicates_df = pd.DataFrame(None)

    # Here are repeated records
    duplicates_df = collision_df[collision_df.duplicated(subset=['collision_id'], keep=False)].sort_values(by='collision_id')

    # Duplicates df to csv
    print(f"CSV generated with duplicates")
    duplicates_df.to_csv('collision_data_duplicates.csv')

    # Return updated dataframe
    collision_df = collision_df.drop_duplicates(subset=['collision_id'], keep='first', inplace=False)

✅ All 500 records are unique.


### 🕐 Datatime Validation & Clean Up & Transformation - `crash_date` and `crash_time`

#### Reformat date and time

In [8]:
# crash_date
if not is_datetime64_any_dtype(collision_df['crash_date']):
    collision_df['crash_date'] = pd.to_datetime(
        collision_df['crash_date'], errors = 'coerce'
    )

# crash_time
is_time_objects = collision_df['crash_time'].dropna().map(
    lambda x: isinstance(x, dt.time)
).all()

if not is_time_objects:
    collision_df['crash_time'] = pd.to_datetime(
        collision_df['crash_time'], errors = 'coerce', format='mixed'
    ).dt.time

collision_df[['crash_date','crash_time']].head()

Unnamed: 0,crash_date,crash_time
0,2021-12-10,01:10:00
1,2021-12-09,00:00:00
2,2021-12-08,18:36:00
3,2021-12-09,20:30:00
4,2021-12-09,10:13:00


#### Crate Column for Aggregate Count

In [9]:
# Create new columns to assist with time analysis
# crash_date_YM: Year-Month (YYYY-MM)

# -- test 1 -- 
# crash_date_Month: Month name (April, May, etc.)
collision_df['crash_date_YM'] = collision_df['crash_date'].dt.to_period('M')
collision_df['crash_date_Month'] = collision_df['crash_date'].dt.month_name()

# Reorder the columns for better readabilility
collision_df.insert(1, 'crash_date_YM', collision_df.pop('crash_date_YM'))
collision_df.insert(2, 'crash_date_Month', collision_df.pop('crash_date_Month'))

# Lets create categories for crash_time
collision_df['hour_classification'] = None
collision_df.insert(3, 'hour_classification', collision_df.pop('hour_classification'))


for index,row in collision_df.iterrows():
    # 11:00PM to 6:59AM = Night Time
    if row['crash_time'] >= dt.time(hour=23) or row['crash_time'] < dt.time(hour=7):
        collision_df.loc[index, 'hour_classification'] = 'Night'
    # 7:00 AM to 10:59AM = Morning
    elif row['crash_time'] >= dt.time(hour=7) or row['crash_time'] < dt.time(hour=11):
        collision_df.loc[index, 'hour_classification'] = 'Morning'
    # 11:00 AM to 1:59PM = Midday
    elif row['crash_time'] >= dt.time(hour=11) or row['crash_time'] < dt.time(hour=14):
        collision_df.loc[index, 'hour_classification'] = 'Midday'
    # 2:00 PM to 6:59PM = Afternoon
    elif row['crash_time'] >= dt.time(hour=14) or row['crash_time'] < dt.time(hour=19):
        collision_df.loc[index, 'hour_classification'] = 'Midday'
    else:
        # 7:00 PM to 10:59PM = Evening
        collision_df.loc[index, 'hour_classification'] = 'Evening'

# Lets make time readable to the normal user
collision_df['crash_time_fmat_12hr'] = collision_df['crash_time'].apply(lambda x: x.strftime('%I:%M %p'))
collision_df.insert(3, 'crash_time_fmat_12hr', collision_df.pop('crash_time_fmat_12hr'))

# Preview these new columns
collision_df.head()

Unnamed: 0,crash_date,crash_date_YM,crash_date_Month,crash_time_fmat_12hr,hour_classification,crash_time,borough,zip_code,latitude,longitude,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,2021-12-10,2021-12,December,01:10 AM,Night,01:10:00,,,40.662575,-73.93448,...,Unspecified,,,,4485706,Station Wagon/Sport Utility Vehicle,Sedan,,,
1,2021-12-09,2021-12,December,12:00 AM,Night,00:00:00,,,40.7448,-73.953415,...,,,,,4485384,Sedan,,,,
2,2021-12-08,2021-12,December,06:36 PM,Morning,18:36:00,,,40.79672,-73.97618,...,Unspecified,,,,4484799,Sedan,Station Wagon/Sport Utility Vehicle,,,
3,2021-12-09,2021-12,December,08:30 PM,Morning,20:30:00,QUEENS,11004.0,40.752777,-73.70743,...,,,,,4485077,Sedan,,,,
4,2021-12-09,2021-12,December,10:13 AM,Morning,10:13:00,BROOKLYN,11203.0,40.638523,-73.92607,...,Traffic Control Disregarded,,,,4485090,Sedan,Sedan,,,


#### Collisions by Year-Month

In [10]:
# collision_data['crash_date_YM'] = collision_data['crash_date'].dt.to_period('M')
year_month_count = collision_df.groupby(['crash_date_YM','crash_date_Month']).size()
year_month_count.to_csv('year_month_count.csv', index=True)

# Print snippet of analytics
year_month_count.head()

crash_date_YM  crash_date_Month
2019-05        May                   1
2020-01        January               1
2021-02        February              1
2021-03        March                 2
2021-04        April               133
dtype: int64

### 📍 Location Validation

#### Check if Lat, Long are Missing

In [11]:
geo_columns_to_check =  ['latitude', 'longitude']

missing_column = False
for col in geo_columns_to_check:
    if collision_df[col].isnull().any():
        missing_column = True
        print(f"❌ Missing values found in column: {col}")
    else:
        print(f"✅ No missing values in column: {col}")

❌ Missing values found in column: latitude
❌ Missing values found in column: longitude


#### New Lat and Long

In [12]:
# Ensure the columns lat and long are correct data types
collision_df['latitude'] = pd.to_numeric(collision_df['latitude'], errors='coerce').astype(float)
collision_df['longitude'] = pd.to_numeric(collision_df['longitude'], errors='coerce').astype(float)

# Craete flag for missing lat/long
collision_df['flag_orig_latlong_missing'] = False
collision_df['Lat_x'] = np.nan
collision_df['Lon_y'] = np.nan

# Check for missing lat/long and set flag
for index,row in collision_df.iterrows():
    if pd.isna(row['latitude']) or pd.isna(row['longitude']) or row['latitude'] == 0 or row['longitude'] == 0:
        collision_df.loc[index, "flag_orig_latlong_missing"] = True
        print(f"❌ Missing lat/long at index {index}, {row['collision_id']}: {row['latitude']}, {row['longitude']}")
    else:
        print(f"✅ Lat/Long found at at index {index}, {row['collision_id']}: {row['latitude']}, {row['longitude']}")
        collision_df.loc[index, "Lat_x"] = collision_df.loc[index, 'latitude']
        collision_df.loc[index, "Lon_y"] = collision_df.loc[index, 'longitude']

# Remove original lat/long columns
collision_df.drop(columns=['latitude', 'longitude'], axis=1, inplace=True)

✅ Lat/Long found at at index 0, 4485706: 40.662575, -73.93448
✅ Lat/Long found at at index 1, 4485384: 40.7448, -73.953415
✅ Lat/Long found at at index 2, 4484799: 40.79672, -73.97618
✅ Lat/Long found at at index 3, 4485077: 40.752777, -73.70743
✅ Lat/Long found at at index 4, 4485090: 40.638523, -73.92607
✅ Lat/Long found at at index 5, 4513697: 40.637833, -74.08193
✅ Lat/Long found at at index 6, 4513935: 40.797836, -73.96946
✅ Lat/Long found at at index 7, 4513794: 40.734375, -73.87342
✅ Lat/Long found at at index 8, 4514267: 40.699947, -73.736626
✅ Lat/Long found at at index 9, 4514237: 40.75632, -73.999275
✅ Lat/Long found at at index 10, 4513470: 40.69614, -73.81789
✅ Lat/Long found at at index 11, 4513751: 40.631687, -73.9205
✅ Lat/Long found at at index 12, 4514075: 40.88839, -73.84666
✅ Lat/Long found at at index 13, 4513857: 40.844425, -73.8639
✅ Lat/Long found at at index 14, 4514202: 40.833965, -73.8629
✅ Lat/Long found at at index 15, 4514347: 40.705738, -73.944695
✅ Lat/L

#### Indicate Number of Usable Records

In [13]:
# Count the records we cannot use for our final analysis
records_bad_latlon = collision_df[collision_df['flag_orig_latlong_missing'] == True].shape[0]
total_records = collision_df.shape[0]
usable_records = total_records - records_bad_latlon

# Output the number of usable records
if records_bad_latlon > 0:
    print(f"🧠 Based on location cleanup, we are able to perform an analysis on {usable_records} records. There were {records_bad_latlon} records which were omitted from the analysis due to improper capture of latitude and longitude by source.")
    # Generate CSV with flags for these missing lat/long records
    collision_df[collision_df['flag_orig_latlong_missing'] == True].to_csv('BADDATA_latlon.csv', index=True)

    # Override collision_df with usable records
    collision_df = collision_df[collision_df['flag_orig_latlong_missing'] == False]
else:
    print(f"✅ All {total_records} records are usable for analysis.")

🧠 Based on location cleanup, we are able to perform an analysis on 468 records. There were 32 records which were omitted from the analysis due to improper capture of latitude and longitude by source.


### Merge Contributing Factors & Vehicle Type Into Individual Lsits

In [14]:
def remove_duplicates_from_list(input_list):
    return list(pd.unique(pd.Series(input_list)))

# Combine contributing factors into a single list
collision_df['combined_contributing_factors'] = collision_df.apply(lambda row:
                                  list(filter(None, [
                                      row['contributing_factor_vehicle_1'],
                                      row['contributing_factor_vehicle_2'],
                                      row['contributing_factor_vehicle_3'],
                                      row['contributing_factor_vehicle_4'],
                                      row['contributing_factor_vehicle_5']
                                  ]))
                                  , axis=1)

# Remove duplicates from the lists
collision_df['combined_contributing_factors'] = collision_df['combined_contributing_factors'].apply(remove_duplicates_from_list)

# Combine vehicle types into a single list
collision_df['combined_vehicle_types'] = collision_df.apply(lambda row:
                                  list(filter(None, [
                                      row['vehicle_type_code1'],
                                      row['vehicle_type_code2'],
                                      row['vehicle_type_code_3'],
                                      row['vehicle_type_code_4'],
                                      row['vehicle_type_code_5']
                                  ]))
                                  , axis=1)

# Remove duplicates from the lists
collision_df['combined_vehicle_types'] = collision_df['combined_vehicle_types'].apply(remove_duplicates_from_list)

# Drop the original columns now that we have combined lists
collision_df.drop(columns=[
    'contributing_factor_vehicle_1', 
    'contributing_factor_vehicle_2',
    'contributing_factor_vehicle_3', 
    'contributing_factor_vehicle_4',
    'contributing_factor_vehicle_5', 
    'vehicle_type_code1',
    'vehicle_type_code2', 
    'vehicle_type_code_3', 
    'vehicle_type_code_4',
    'vehicle_type_code_5'
], axis=1, inplace=True)

# Remove 'Unspecified', 'nan', None, '' from the lists
remove_set = {'Unspecified', 'nan', None, '', 'unspecified' }


def clean_list(input_list):
    return [x for x in input_list if x not in remove_set and not pd.isna(x)]

collision_df['combined_contributing_factors'] = collision_df['combined_contributing_factors'].apply(clean_list)

collision_df['combined_vehicle_types'] = collision_df['combined_vehicle_types'].apply(clean_list)

collision_df.head()

Unnamed: 0,crash_date,crash_date_YM,crash_date_Month,crash_time_fmat_12hr,hour_classification,crash_time,borough,zip_code,location,on_street_name,...,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,collision_id,flag_orig_latlong_missing,Lat_x,Lon_y,combined_contributing_factors,combined_vehicle_types
0,2021-12-10,2021-12,December,01:10 AM,Night,01:10:00,,,"{'latitude': '40.662575', 'longitude': '-73.93...",SCHENECTADY AVENUE,...,0,0,0,0,4485706,False,40.662575,-73.93448,[Driver Inattention/Distraction],"[Station Wagon/Sport Utility Vehicle, Sedan]"
1,2021-12-09,2021-12,December,12:00 AM,Night,00:00:00,,,"{'latitude': '40.7448', 'longitude': '-73.9534...",47 ROAD,...,0,0,0,0,4485384,False,40.7448,-73.953415,[],[Sedan]
2,2021-12-08,2021-12,December,06:36 PM,Morning,18:36:00,,,"{'latitude': '40.79672', 'longitude': '-73.976...",HENRY HUDSON PARKWAY,...,0,0,0,0,4484799,False,40.79672,-73.97618,[Passing or Lane Usage Improper],"[Sedan, Station Wagon/Sport Utility Vehicle]"
3,2021-12-09,2021-12,December,08:30 PM,Morning,20:30:00,QUEENS,11004.0,"{'latitude': '40.752777', 'longitude': '-73.70...",,...,0,0,0,0,4485077,False,40.752777,-73.70743,[Illnes],[Sedan]
4,2021-12-09,2021-12,December,10:13 AM,Morning,10:13:00,BROOKLYN,11203.0,"{'latitude': '40.638523', 'longitude': '-73.92...",KINGS HIGHWAY,...,0,0,2,0,4485090,False,40.638523,-73.92607,[Traffic Control Disregarded],[Sedan]


### Output Cleaned Collision Data to DF

In [15]:
collision_df.to_csv('FINAL_collision_data.csv', index=False)