# <b> NYCOpenData Collsion Data Analysis <b>
<hr>

## Install Requirements

In [1]:
import datetime as dt
import requests
import numpy as np
import pandas as pd


## Fetch NYCOpenData API

### Create Empty Dataframe

In [2]:
# Heaeders from source: https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data
collision_data_headers = [
    'crash_date', 'crash_time', 'borough', 'zip_code', 'latitude', 'longitude',
    'location', 'on_street_name', 'off_street_name', 'cross_street_name',
    'number_of_persons_injured', 'number_of_persons_killed',
    'number_of_pedestrians_injured', 'number_of_pedestrians_killed',
    'number_of_cyclist_injured', 'number_of_cyclist_killed',
    'number_of_motorist_injured', 'number_of_motorist_killed',
    'contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
    'contributing_factor_vehicle_3', 'contributing_factor_vehicle_4',
    'contributing_factor_vehicle_5', 'collision_id', 'vehicle_type_code1',
    'vehicle_type_code2', 'vehicle_type_code_3', 'vehicle_type_code_4',
    'vehicle_type_code_5'
]

# Create empty DataFrame with headers
collision_data = pd.DataFrame(columns=collision_data_headers)

### Fetch Data

In [3]:
page_size = 50
starting_page = 0
ending_page = 50

while True:
    try:
        print(f"\n--------------------\nCalling URL with 'offset' or records {starting_page} to {ending_page}, with page size {page_size}")
        response = requests.get(f'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit={page_size}&$offset={ending_page}')

        if response.status_code == 200:
            print("Response: ", response.status_code)
            data = pd.DataFrame(response.json())

            if not data.empty:
                collision_data = pd.concat([collision_data, data], ignore_index=True)
                print(f"✅ Fetched {len(data)} records, total records so far: {len(collision_data)}")
                starting_page = ending_page
                ending_page += 50

                if starting_page >= 50:
                    print("Reached Temporary Limit of 50 records.")
                    break

            if data.empty:
                print("No more data to fetch.")
                break

    except Exception as e:
        print("An error occured: ", e)


--------------------
Calling URL with 'offset' or records 0 to 50, with page size 50
Response:  200
✅ Fetched 50 records, total records so far: 50
Reached Temporary Limit of 50 records.


### Create File for RAW Output and Copy For Analysis

In [4]:
# Raw NYCOpendata Output
collision_data.to_csv('RAW_collision_data.csv')

# For Analysis
collision_df = collision_data.copy()

<hr>

## EDA, Cleaning & Preparing for Analytics

### Schema Info & Preview Data

In [5]:
collision_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   crash_date                     50 non-null     object
 1   crash_time                     50 non-null     object
 2   borough                        29 non-null     object
 3   zip_code                       29 non-null     object
 4   latitude                       47 non-null     object
 5   longitude                      47 non-null     object
 6   location                       47 non-null     object
 7   on_street_name                 39 non-null     object
 8   off_street_name                22 non-null     object
 9   cross_street_name              11 non-null     object
 10  number_of_persons_injured      50 non-null     object
 11  number_of_persons_killed       50 non-null     object
 12  number_of_pedestrians_injured  50 non-null     object
 13  number_

In [6]:
collision_df.head()

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,cross_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,2021-07-09T00:00:00.000,0:43,,,40.720535,-73.88885,"{'latitude': '40.720535', 'longitude': '-73.88...",ELIOT AVENUE,,,...,,,,,4456659,Bus,,,,
1,2022-04-24T00:00:00.000,16:45,,,40.607685,-74.13892,"{'latitude': '40.607685', 'longitude': '-74.13...",STATEN ISLAND EXPRESSWAY,,,...,Unspecified,,,,4521660,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
2,2022-04-24T00:00:00.000,4:49,,,40.855972,-73.869896,"{'latitude': '40.855972', 'longitude': '-73.86...",BOSTON ROAD,BRONX PARK EAST,,...,Unspecified,,,,4521759,Station Wagon/Sport Utility Vehicle,Sedan,,,
3,2022-04-22T00:00:00.000,17:17,,,40.790276,-73.9396,"{'latitude': '40.790276', 'longitude': '-73.93...",EAST 107 STREET,,,...,,,,,4522226,E-Bike,,,,
4,2022-04-24T00:00:00.000,1:30,BROOKLYN,11220.0,40.642986,-74.01621,"{'latitude': '40.642986', 'longitude': '-74.01...",,,5610 4 AVENUE,...,,,,,4522015,Station Wagon/Sport Utility Vehicle,,,,


### Check & Correct Duplicate Records - `collision_id`

In [7]:
# Count unique collision ids
total_records = len(collision_df)
unique_count = collision_df['collision_id'].nunique()

if unique_count == total_records:
    print(f"✅ All {total_records} records are unique.")
else:
    print(f"⚠️ Not all {total_records} records are unique. There are {total_records - unique_count} repeated records.\nProcessing removal.\nFinal count: {unique_count}")

    # Empty Dataframe for handling duplicates
    duplicates_df = pd.DataFrame(None)

    # Here are repeated records
    duplicates_df = collision_df[collision_df.duplicated(subset=['collision_id'], keep=False)].sort_values(by='collision_id')

    # Duplicates df to csv
    print(f"CSV generated with duplicates")
    duplicates_df.to_csv('collision_data_duplicates.csv')

    # Return updated dataframe
    collision_df = collision_df.drop_duplicates(subset=['collision_id'], keep='first', inplace=False)

✅ All 50 records are unique.


### 🕐 Datatime Validation & Clean Up - `crash_date` and `crash_time`

#### Reformat date and time

In [8]:
# Convert 'crash_date' and 'crash_time' columns to datetime objects
collision_df['crash_date'] = pd.to_datetime(collision_df['crash_date'])
collision_df['crash_time'] = pd.to_datetime(collision_df['crash_time'], format='mixed').dt.time

print(f" •CRASH DATE INFO \n{collision_df['crash_date'].info()}\n")
print(f" •CRASH TIME INFO \n{collision_df['crash_time'].info()}")

collision_df[['crash_date','crash_time']].head()

<class 'pandas.core.series.Series'>
RangeIndex: 50 entries, 0 to 49
Series name: crash_date
Non-Null Count  Dtype         
--------------  -----         
50 non-null     datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 532.0 bytes
 •CRASH DATE INFO 
None

<class 'pandas.core.series.Series'>
RangeIndex: 50 entries, 0 to 49
Series name: crash_time
Non-Null Count  Dtype 
--------------  ----- 
50 non-null     object
dtypes: object(1)
memory usage: 532.0+ bytes
 •CRASH TIME INFO 
None


Unnamed: 0,crash_date,crash_time
0,2021-07-09,00:43:00
1,2022-04-24,16:45:00
2,2022-04-24,04:49:00
3,2022-04-22,17:17:00
4,2022-04-24,01:30:00


#### Crate Column for Aggregate Count

In [9]:
# Create new columns to assist with time analysis
# crash_date_YM: Year-Month (YYYY-MM)
# crash_date_Month: Month name (April, May, etc.)
collision_df['crash_date_YM'] = collision_df['crash_date'].dt.to_period('M')
collision_df['crash_date_Month'] = collision_df['crash_date'].dt.month_name()

# Reorder the columns for better readabilility
collision_df.insert(1, 'crash_date_YM', collision_df.pop('crash_date_YM'))
collision_df.insert(2, 'crash_date_Month', collision_df.pop('crash_date_Month'))

# Lets create categories for crash_time
collision_df['hour_classification'] = None
collision_df.insert(3, 'hour_classification', collision_df.pop('hour_classification'))

for index,row in collision_df.iterrows():
    # 11:00PM to 6:59AM = Night Time
    if row['crash_time'] >= dt.time(hour=23) or row['crash_time'] < dt.time(hour=7):
        collision_df.loc[index, 'hour_classification'] = 'Night'
    # 7:00 AM to 10:59AM = Morning
    elif row['crash_time'] >= dt.time(hour=7) or row['crash_time'] < dt.time(hour=11):
        collision_df.loc[index, 'hour_classification'] = 'Morning'
    # 11:00 AM to 1:59PM = Midday
    elif row['crash_time'] >= dt.time(hour=11) or row['crash_time'] < dt.time(hour=14):
        collision_df.loc[index, 'hour_classification'] = 'Midday'
    # 2:00 PM to 6:59PM = Afternoon
    elif row['crash_time'] >= dt.time(hour=14) or row['crash_time'] < dt.time(hour=19):
        collision_df.loc[index, 'hour_classification'] = 'Midday'
    else:
        # 7:00 PM to 10:59PM = Evening
        collision_df.loc[index, 'hour_classification'] = 'Evening'

# Lets make time readable to the normal user
collision_df['crash_time_fmat_12hr'] = collision_df['crash_time'].apply(lambda x: x.strftime('%I:%M %p'))
collision_df.insert(3, 'crash_time_fmat_12hr', collision_df.pop('crash_time_fmat_12hr'))

# Preview these new columns
collision_df.head()

Unnamed: 0,crash_date,crash_date_YM,crash_date_Month,crash_time_fmat_12hr,hour_classification,crash_time,borough,zip_code,latitude,longitude,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,2021-07-09,2021-07,July,12:43 AM,Night,00:43:00,,,40.720535,-73.88885,...,,,,,4456659,Bus,,,,
1,2022-04-24,2022-04,April,04:45 PM,Morning,16:45:00,,,40.607685,-74.13892,...,Unspecified,,,,4521660,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
2,2022-04-24,2022-04,April,04:49 AM,Night,04:49:00,,,40.855972,-73.869896,...,Unspecified,,,,4521759,Station Wagon/Sport Utility Vehicle,Sedan,,,
3,2022-04-22,2022-04,April,05:17 PM,Morning,17:17:00,,,40.790276,-73.9396,...,,,,,4522226,E-Bike,,,,
4,2022-04-24,2022-04,April,01:30 AM,Night,01:30:00,BROOKLYN,11220.0,40.642986,-74.01621,...,,,,,4522015,Station Wagon/Sport Utility Vehicle,,,,


#### Collisions by Year-Month

In [10]:
# collision_data['crash_date_YM'] = collision_data['crash_date'].dt.to_period('M')
year_month_count = collision_df.groupby(['crash_date_YM','crash_date_Month']).size()
year_month_count.to_csv('year_month_count.csv', index=False)

# Print snippet of analytics
year_month_count.head()

crash_date_YM  crash_date_Month
2021-06        June                 1
2021-07        July                 1
2021-12        December            23
2022-03        March                1
2022-04        April               24
dtype: int64

### 📍 Location Validation

#### Check if Lat, Long are Missing

In [11]:
geo_columns_to_check =  ['latitude', 'longitude']

missing_column = False
for col in geo_columns_to_check:
    if collision_df[col].isnull().any():
        missing_column = True
        print(f"❌ Missing values found in column: {col}")
    else:
        print(f"✅ No missing values in column: {col}")

❌ Missing values found in column: latitude
❌ Missing values found in column: longitude


#### New Lat and Long

In [12]:
# Ensure the columns lat and long are correct data types
collision_df['latitude'] = pd.to_numeric(collision_df['latitude'], errors='coerce').astype(float)
collision_df['longitude'] = pd.to_numeric(collision_df['longitude'], errors='coerce').astype(float)

# Craete flag for missing lat/long
collision_df['flag_orig_latlong_missing'] = False
collision_df['Lat_x'] = np.nan
collision_df['Lon_y'] = np.nan

for index,row in collision_df.iterrows():
    if pd.isna(row['latitude']) or pd.isna(row['longitude']) or row['latitude'] == 0 or row['longitude'] == 0:
        collision_df.loc[index, "flag_orig_latlong_missing"] = True
        print(f"❌ Missing lat/long at index {index}, {row['collision_id']}: {row['latitude']}, {row['longitude']}")
    else:
        print(f"✅ Lat/Long found at at index {index}, {row['collision_id']}: {row['latitude']}, {row['longitude']}")
        collision_df.loc[index, "Lat_x"] = collision_df.loc[index, 'latitude']
        collision_df.loc[index, "Lon_y"] = collision_df.loc[index, 'longitude']

✅ Lat/Long found at at index 0, 4456659: 40.720535, -73.88885
✅ Lat/Long found at at index 1, 4521660: 40.607685, -74.13892
✅ Lat/Long found at at index 2, 4521759: 40.855972, -73.869896
✅ Lat/Long found at at index 3, 4522226: 40.790276, -73.9396
✅ Lat/Long found at at index 4, 4522015: 40.642986, -74.01621
❌ Missing lat/long at index 5, 4521460: 0.0, 0.0
✅ Lat/Long found at at index 6, 4522156: 40.843906, -73.92413
✅ Lat/Long found at at index 7, 4521633: 40.89481, -73.86183
❌ Missing lat/long at index 8, 4522124: 0.0, 0.0
✅ Lat/Long found at at index 9, 4521937: 40.85169, -73.95238
✅ Lat/Long found at at index 10, 4521801: 40.861862, -73.91275
✅ Lat/Long found at at index 11, 4522285: 40.666256, -73.900215
✅ Lat/Long found at at index 12, 4521853: 40.767242, -73.986206
✅ Lat/Long found at at index 13, 4522228: 40.62417, -73.97048
✅ Lat/Long found at at index 14, 4521702: 40.679955, -73.97491
✅ Lat/Long found at at index 15, 4522167: 40.692356, -73.94282
✅ Lat/Long found at at index 

#### Indicate Number of Usable Records

In [13]:
# Count the records we cannot use for our final analysis
records_bad_latlon = collision_df[collision_df['flag_orig_latlong_missing'] == True].shape[0]
total_records = collision_df.shape[0]
usable_records = total_records - records_bad_latlon

# Output the number of usable records
if records_bad_latlon > 0:
    print(f"🧠 Based on location cleanup, we are able to perform an analysis on {usable_records} records. There were {records_bad_latlon} records which were omitted from the analysis due to improper capture of latitude and longitude.")
    # Generate CSV with flags for these missing lat/long records
    collision_df[collision_df['flag_orig_latlong_missing'] == True].to_csv('BADDATA_latlon.csv', index=True)

    # Override collision_df with usable records
    collision_df = collision_df[collision_df['flag_orig_latlong_missing'] == False]
else:
    print(f"✅ All {total_records} records are usable for analysis.")

🧠 Based on location cleanup, we are able to perform an analysis on 45 records. There were 5 records which were omitted from the analysis due to improper capture of latitude and longitude.


### Output Cleaned Collision Data to DF

In [14]:
collision_df.to_csv('FINAL_collision_data.csv', index=False)