## Joining 3 CSV Related Data



In [None]:
# Library
import pandas as pd

### Drivers Data

In [4]:

### Read the csv File and Load the data with specified dtype(data type) and low_memory flag
driverData = pd.read_csv("./Data-Sources/Crash_Reporting_-_Drivers_Data.csv",
                   dtype={'Local Case Number': 'str'},
                   low_memory=False)

print(driverData.columns)


Index(['Report Number', 'Local Case Number', 'Agency Name', 'ACRS Report Type',
       'Crash Date/Time', 'Route Type', 'Road Name', 'Cross-Street Name',
       'Off-Road Description', 'Municipality', 'Related Non-Motorist',
       'Collision Type', 'Weather', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse',
       'Non-Motorist Substance Abuse', 'Person ID', 'Driver At Fault',
       'Injury Severity', 'Circumstance', 'Driver Distracted By',
       'Drivers License State', 'Vehicle ID', 'Vehicle Damage Extent',
       'Vehicle First Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Vehicle Going Dir', 'Speed Limit',
       'Driverless Vehicle', 'Parked Vehicle', 'Vehicle Year', 'Vehicle Make',
       'Vehicle Model', 'Latitude', 'Longitude', 'Location'],
      dtype='object')


In [2]:

# Get the total number of rows and columns
driRows, driColumns = driverData.shape

# Print the total rows and columns of Drivers Data
print(f"Total rows: {driRows}")
print(f"Total columns: {driColumns}")


Total rows: 184897
Total columns: 39



### Non - Motorist Data

In [5]:

# Read the csv File and Load the data with specified dtype(data type) and low_memory flag
nonMotoristData = pd.read_csv("./Data-Sources/Crash_Reporting_-_Non-Motorists_Data_20240922.csv",
                              dtype={'Local Case Number': 'str'},
                              low_memory=False)

print(nonMotoristData.columns);

Index(['Report Number', 'Local Case Number', 'Agency Name', 'ACRS Report Type',
       'Crash Date/Time', 'Route Type', 'Road Name', 'Cross-Street Name',
       'Off-Road Description', 'Municipality', 'Related Non-Motorist',
       'Collision Type', 'Weather', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse',
       'Non-Motorist Substance Abuse', 'Person ID', 'Pedestrian Type',
       'Pedestrian Movement', 'Pedestrian Actions', 'Pedestrian Location',
       'At Fault', 'Injury Severity', 'Safety Equipment', 'Latitude',
       'Longitude', 'Location'],
      dtype='object')


In [6]:

# Get the total number of rows and columns
nMrows, nMcolumns = nonMotoristData.shape

# Print the total rows and columns of Non Motorist Data
print(f"Total rows: {nMrows}")
print(f"Total columns: {nMcolumns}")


Total rows: 6104
Total columns: 29



### Incidents Data 

In [7]:

# Read the csv File and Load the data with specified dtype(data type) and low_memory flag
incidentsData = pd.read_csv("./Data-Sources/Crash_Reporting_-_Incidents_Data_20240922.csv",
                            dtype={'Local Case Number' : 'str'},
                            low_memory=False)

print(incidentsData.columns);


Index(['Report Number', 'Local Case Number', 'Agency Name', 'ACRS Report Type',
       'Crash Date/Time', 'Hit/Run', 'Route Type', 'Lane Direction',
       'Lane Type', 'Number of Lanes', 'Direction', 'Distance',
       'Distance Unit', 'Road Grade', 'Road Name', 'Cross-Street Name',
       'Off-Road Description', 'Municipality', 'Related Non-Motorist',
       'At Fault', 'Collision Type', 'Weather', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse',
       'Non-Motorist Substance Abuse', 'First Harmful Event',
       'Second Harmful Event', 'Junction', 'Intersection Type',
       'Road Alignment', 'Road Condition', 'Road Division', 'Latitude',
       'Longitude', 'Location'],
      dtype='object')


In [15]:

# Get the total number of rows and columns
iRows, iColumns = incidentsData.shape

# Print the total rows and columns of Non Motorist Data
print(f"Total rows: {iRows}")
print(f"Total columns: {iColumns}")

Total rows: 104780
Total columns: 37



## Merged Datasets 

In [16]:
# Concatenate DataFrames
frames = [driverData, nonMotoristData, incidentsData]
result = pd.concat(frames, ignore_index=True, sort=False)

# Key Points:
# ignore_index=True: 
    # This reindexes the resulting DataFrame, which can be useful if you want a fresh index.
# sort=False: 
    # This keeps the original order of the columns as they appear in the concatenated DataFrames 
    # rather than sorting them alphabetically.

# Print the columns of the concatenated DataFrame
print(result.columns)

Index(['Report Number', 'Local Case Number', 'Agency Name', 'ACRS Report Type',
       'Crash Date/Time', 'Route Type', 'Road Name', 'Cross-Street Name',
       'Off-Road Description', 'Municipality', 'Related Non-Motorist',
       'Collision Type', 'Weather', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse',
       'Non-Motorist Substance Abuse', 'Person ID', 'Driver At Fault',
       'Injury Severity', 'Circumstance', 'Driver Distracted By',
       'Drivers License State', 'Vehicle ID', 'Vehicle Damage Extent',
       'Vehicle First Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Vehicle Going Dir', 'Speed Limit',
       'Driverless Vehicle', 'Parked Vehicle', 'Vehicle Year', 'Vehicle Make',
       'Vehicle Model', 'Latitude', 'Longitude', 'Location', 'Pedestrian Type',
       'Pedestrian Movement', 'Pedestrian Actions', 'Pedestrian Location',
       'At Fault', 'Safety Equipment', 'Hit/Run', 'Lane Direction',
       'Lane Type', 'Nu

In [17]:
# Print the total rows and columns of the concatenated data
rows, columns = result.shape
print(f"Total rows after concat: {rows}")
print(f"Total columns after concat: {columns}")

Total rows after concat: 295781
Total columns after concat: 60



### Common Columns

In [24]:

# Get the columns of each DataFrame
driver_columns = set(driverData.columns)
non_motorist_columns = set(nonMotoristData.columns)
incidents_columns = set(incidentsData.columns)

# Find common columns
common_columns = driver_columns.intersection(non_motorist_columns).intersection(incidents_columns)

print("Common columns in all DataFrames:")
for column in common_columns:
    print(column)


Common columns in all DataFrames:
Weather
Agency Name
Report Number
Longitude
Road Name
ACRS Report Type
Surface Condition
Crash Date/Time
Driver Substance Abuse
Cross-Street Name
Traffic Control
Municipality
Non-Motorist Substance Abuse
Location
Related Non-Motorist
Latitude
Light
Off-Road Description
Route Type
Local Case Number
Collision Type


In [26]:
# Number of Common Columns
print(len(common_columns))

21
