Briefly describe the Motor Vehicle Collisions dataset and its relevance to traffic safety in New York City.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pandas import Timestamp
import datetime as dt
from datetime import datetime


In [None]:

# Load the collisions CSV (expected in the working directory) into the
# notebook-wide DataFrame `data` that every later cell reads and mutates
data = pd.read_csv("Motor_Vehicle_Collisions.csv")
# Preview the first 100 rows as the cell output
data.head(100)

In [None]:
# List the column labels of the loaded frame
print(data.columns)

In [None]:
# Print the frame summary produced by DataFrame.info()
data.info()

In [None]:


# Convert 'CRASH DATE' to datetime format
data['CRASH DATE'] = pd.to_datetime(data['CRASH DATE'])

# 'CRASH DATE' holds a calendar date only, so its .dt.hour would be 0 for every
# row. The hour of day lives in 'CRASH TIME'; parse it into a temporary Series
# so the raw column is left untouched for later cells.
crash_clock = pd.to_datetime(data['CRASH TIME'], format='%H:%M')
data['HOUR'] = crash_clock.dt.hour

# Day, month and year come from the crash date itself
data['DAY'] = data['CRASH DATE'].dt.day
data['MONTH'] = data['CRASH DATE'].dt.month
data['YEAR'] = data['CRASH DATE'].dt.year

# Fill missing values in the 'BOROUGH' column with 'UNKNOWN'
data['BOROUGH'] = data['BOROUGH'].fillna('UNKNOWN')

# Fill missing values in the 'ZIP CODE' column with the mode (most frequent value)
data['ZIP CODE'] = data['ZIP CODE'].fillna(data['ZIP CODE'].mode().iloc[0])

# Fill missing coordinates with the column mean.
# NOTE(review): this gives every un-geocoded crash the same synthetic location;
# confirm that is acceptable for the map-based cells.
data['LATITUDE'] = data['LATITUDE'].fillna(data['LATITUDE'].mean())
data['LONGITUDE'] = data['LONGITUDE'].fillna(data['LONGITUDE'].mean())

# Fill missing contributing factors with 'Unspecified'. The spelling matches
# the label the later analysis cells search for (a case-sensitive match on
# 'Unspecified'), so filled rows fall into the same category instead of
# creating a second, upper-case one.
contributing_factor_columns = [f'CONTRIBUTING FACTOR VEHICLE {slot}' for slot in range(1, 6)]
data[contributing_factor_columns] = data[contributing_factor_columns].fillna('Unspecified')

# Fill missing values in vehicle type columns with 'UNKNOWN'
vehicle_type_columns = [f'VEHICLE TYPE CODE {slot}' for slot in range(1, 6)]
data[vehicle_type_columns] = data[vehicle_type_columns].fillna('UNKNOWN')

# Verify if there are any missing values left
print(data.isna().sum())


In [None]:
import pandas as pd

# Count the missing entries per column of `data` and show the tally
missing_values = data.isna().sum()
print(missing_values)


In [None]:
# Fill missing boroughs. The result is assigned back rather than calling
# fillna(inplace=True) on the column selection, which newer pandas versions
# warn about and which does not update the frame under copy-on-write. The
# placeholder is spelled 'UNKNOWN' to match the earlier preprocessing cell.
data['BOROUGH'] = data['BOROUGH'].fillna('UNKNOWN')

# Fill missing values in the 'ZIP CODE' column with the most frequent value
most_frequent_zip = data['ZIP CODE'].mode().iloc[0]
data['ZIP CODE'] = data['ZIP CODE'].fillna(most_frequent_zip)

# Drop the 'OFF STREET NAME' column (mostly missing and not needed for the analysis).
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(columns='OFF STREET NAME', errors='ignore')


In [None]:
# Re-count missing entries per column after the fill/drop step above
missing_values_after = data.isnull().sum()
print(missing_values_after)


                                              ---- Let's Start the data cleaning----
We need to make sure the data is clean before starting our analysis. As a reminder, we should check for:

1-Duplicate records
2-Consistent formatting
3-Missing values
4-Obviously wrong values (x)

In [None]:
# Number of rows that exactly repeat an earlier row (all columns equal)
dup_rows = data.duplicated().sum()
dup_rows

Now we will drop duplicate records

In [None]:
# Keep the first occurrence of each duplicated row; the original index labels
# are retained, so the index is no longer contiguous after this step
data = data.drop_duplicates()
data.head()

In [None]:
# Keep only rows that have both a 'LOCATION' and a primary contributing factor
required_columns = ['LOCATION', 'CONTRIBUTING FACTOR VEHICLE 1']
data = data.dropna(subset=required_columns)

                         visualize the distribution of all numerical variables in our dataset using histograms

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Select only the numerical columns in the dataset
numerical_columns = data.select_dtypes(include=[np.number]).columns

# Grid geometry: three plots per row, with at least one row so that
# plt.subplots never receives a zero-sized grid
plots_per_row = 3
num_rows = max(1, (len(numerical_columns) + plots_per_row - 1) // plots_per_row)

# squeeze=False always returns a 2-D array of axes. Without it a single-row
# grid yields a 1-D array and two-index access fails.
fig, axes = plt.subplots(num_rows, plots_per_row,
                         figsize=(15, num_rows * 5), squeeze=False)
flat_axes = axes.ravel()

# One histogram per numerical column, filled row by row
for ax, col_name in zip(flat_axes, numerical_columns):
    data[col_name].hist(bins=30, ax=ax)
    ax.set_xlabel(col_name)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of {col_name}')

# Hide the grid cells left over when the column count is not a multiple of three
for spare_ax in flat_axes[len(numerical_columns):]:
    spare_ax.set_visible(False)

# Adjust the layout of the subplots
plt.tight_layout()

# Show the histograms
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



# Restrict the frame to its numeric columns
numerical_columns = data.select_dtypes(include=[np.number]).columns
numeric_frame = data[numerical_columns]

# Draw the scatterplot matrix; corner=True keeps only the lower triangle
sns.pairplot(numeric_frame, corner=True)

# Render the figure
plt.show()


In [None]:
import pandas as pd
import numpy as np


# Select only the numerical columns in the dataset
numerical_columns = data.select_dtypes(include=[np.number]).columns

# Pairwise correlation of the numerical columns
correlation_matrix = data[numerical_columns].corr()

# Render the matrix as a colour-graded table with two decimals.
# Styler.set_precision was deprecated and later removed from pandas;
# Styler.format(precision=...) is its replacement.
styled_matrix = (
    correlation_matrix.style
    .background_gradient(cmap='coolwarm', axis=None)
    .format(precision=2)
)
display(styled_matrix)


     Let's analyze some of the relationships:

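LATITUDE and LONGITUDE: The correlation coefficient is -0.96, indicating a strong negative correlation. As the latitude increases, the longitude decreases, and vice versa. Within a single city there is no geographic reason for such a relationship; it is most likely an artifact of records with placeholder coordinates near (0, 0) — the summary statistics show a latitude minimum of 0 and a longitude maximum of 0 — which sit far from the real crash locations and pull the coefficient towards -1.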

NUMBER OF PERSONS INJURED and NUMBER OF MOTORIST INJURED: The correlation coefficient is 0.91, indicating a strong positive correlation. This means that when the number of motorists injured increases, the number of persons injured also tends to increase.

NUMBER OF PERSONS KILLED and NUMBER OF PEDESTRIANS KILLED: The correlation coefficient is 0.70, indicating a moderate to strong positive correlation. This suggests that when the number of pedestrians killed increases, the number of persons killed also tends to increase.

NUMBER OF CYCLIST INJURED and NUMBER OF PEDESTRIANS INJURED: The correlation coefficient is -0.03, indicating a very weak negative correlation. This suggests that there is almost no relationship between the number of cyclists injured and the number of pedestrians injured.

NUMBER OF PERSONS INJURED and COLLISION_ID: The correlation coefficient is 0.05, indicating a very weak positive correlation. This suggests that there is almost no relationship between the number of persons injured and the collision ID.

It's important to note that correlation does not imply causation. Just because two variables are correlated does not mean that one variable causes the other. It's also possible that an external factor influences both variables, or the correlation may simply be a coincidence. Always consider the context and domain knowledge when interpreting correlation coefficients

In [None]:

import pandas as pd



# Numeric column labels of the frame
numerical_columns = data.select_dtypes(include=[np.number]).columns

# Pairwise correlations between those columns
numeric_view = data[numerical_columns]
correlation_matrix = numeric_view.corr()

# Plain-text dump of the matrix
print(correlation_matrix)

This is a correlation matrix showing the correlation coefficients between pairs of numerical variables in your dataset. You can interpret the relationships between the variables based on these coefficients. A positive value indicates a positive relationship, a negative value indicates a negative relationship, and a value close to 0 indicates no relationship. The closer the value is to -1 or 1, the stronger the relationship.

Here are some insights from the correlation matrix:

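LATITUDE and LONGITUDE have a strong negative correlation (-0.962222). This suggests that as latitude increases, longitude decreases, and vice versa. This is more likely to reflect invalid placeholder coordinates (such as 0, 0) in the data than a real geographic pattern, so it should be re-checked after those records are removed.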

There's a strong positive relationship between NUMBER OF PERSONS INJURED and NUMBER OF MOTORIST INJURED (0.907074). This indicates that when more people are injured in a collision, it is likely that more motorists are injured as well.

NUMBER OF PERSONS KILLED has a moderately strong positive relationship with NUMBER OF PEDESTRIANS KILLED (0.698555) and NUMBER OF MOTORIST KILLED (0.656525), and a weak positive relationship with NUMBER OF CYCLIST KILLED (0.278279). This suggests that when more people are killed in a collision, it is likely that more pedestrians or motorists, and to a lesser extent cyclists, are killed as well.

Other relationships between variables are weak or negligible, as the correlation coefficients are close to 0.

In [None]:

# Print the frame summary again now that the cleaning cells have run
data.info()


                                              ----Done with data cleaning----


                                    -----------     Summary statistics      ---------------------------

In [None]:
# Descriptive statistics as returned by DataFrame.describe() with default arguments
basic_stats = data.describe()

# Show the resulting table
print(basic_stats)

There are nearly 2 million rows in the dataset. The latitude values range from 0 to 43.34444, while the longitude values range from -201.36 to 0. New York City lies at roughly 40.7 degrees latitude and -74 degrees longitude, so values such as 0 or -201.36 cannot be real crash locations; these ranges indicate errors or placeholder coordinates in the data. The average number of persons injured in each collision is 0.297, while the average number of persons killed is 0.0014. This suggests that most collisions are relatively minor and result in few or no injuries or fatalities. Similarly, the average number of pedestrians and cyclists injured or killed in each collision is also quite low.
The average number of motorists injured in each collision is 0.216, which is higher than the average number of pedestrians or cyclists injured, but still relatively low.
The value of 3.09 reported for COLLISION_ID is the mean of the identifier values themselves (most likely displayed in scientific notation), not a number of collisions per ID. Because COLLISION_ID is only a record identifier, this statistic has no analytical meaning and does not indicate repeat incidents.

In [None]:
# Assuming data has been loaded into a pandas DataFrame called 'data'

# Tally collisions for each calendar component, ordered by the component value
collisions_by_year = data['YEAR'].value_counts().sort_index()
collisions_by_month = data['MONTH'].value_counts().sort_index()
collisions_by_day = data['DAY'].value_counts().sort_index()
collisions_by_hour = data['HOUR'].value_counts().sort_index()

# Print the four tallies in order; every heading after the first starts with a blank line
time_tallies = (
    ("Collisions by year:", collisions_by_year),
    ("\nCollisions by month:", collisions_by_month),
    ("\nCollisions by day:", collisions_by_day),
    ("\nCollisions by hour:", collisions_by_hour),
)
for heading, tally in time_tallies:
    print(heading)
    print(tally)


We can see that the number of collisions per year increased steadily from 2012 to 2018, with a peak of 230871 collisions in 2018. After 2018, the number of collisions decreased significantly, with only 26841 collisions reported in 2023; the 2023 figure may cover only part of the year, so it is not directly comparable with earlier years. When looking at the data by month, it appears that the number of collisions is relatively consistent throughout the year, with a slight increase in the summer months (June to August) and a peak in October. The data by day shows that the number of collisions is relatively consistent throughout the month, with no significant differences between days. However, it is worth noting that the number of collisions on the 31st is significantly lower than on other days. This is likely due to the fact that not all months have 31 days.
It is possible that the COVID-19 pandemic had an impact on the number of collisions in 2020 and the years that followed. During the pandemic, many countries implemented lockdowns and restrictions on movement to slow the spread of the virus. This likely led to a decrease in the number of vehicles on the road and a corresponding decrease in the number of collisions. In addition, many people may have changed their transportation patterns during the pandemic, opting to work from home or use alternative modes of transportation such as walking or biking. This could also have contributed to a decrease in the number of collisions. It is worth noting that the data shows a significant decrease in the number of collisions in 2020 and 2022 compared to previous years. This is consistent with the hypothesis that the COVID-19 pandemic had an impact on the number of collisions, although the decline that begins in 2019 predates the pandemic and must have other causes.

In [None]:
# Frequency of each primary contributing factor, keeping the ten largest
factor_frequencies = data['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()
most_common_factors = factor_frequencies.iloc[:10]
print("Top 10 most common contributing factors:")
print(most_common_factors)


The most common contributing factor to collisions is "Unspecified," followed by "Driver Inattention/Distraction" and "Failure to Yield Right-of-Way." These three factors alone account for a significant proportion of collisions. Other common contributing factors include "Following Too Closely," "Backing Unsafely," and "Other Vehicular." These factors suggest that driver behavior plays a significant role in many collisions. It is worth noting that "Fatigued/Drowsy" is also among the top 10 most common contributing factors. This suggests that driver fatigue may be an important issue to address in efforts to reduce the number of collisions. Overall, this data provides some useful insights into the factors that contribute to collisions. Understanding these factors can help inform efforts to improve road safety and reduce the number of collisions.

In [None]:
# Frequency of each primary vehicle type, keeping the ten largest
vehicle_frequencies = data['VEHICLE TYPE CODE 1'].value_counts()
most_common_vehicles = vehicle_frequencies.iloc[:10]
print("Top 10 most common vehicle types involved in collisions:")
print(most_common_vehicles)


Sedans and station wagons/sport utility vehicles are the most common types of vehicles involved in collisions. These two vehicle types alone account for a significant proportion of collisions.
It is also worth noting that passenger vehicles and sport utility/station wagon vehicles are also among the top 10 most common vehicle types involved in collisions. This suggests that these types of vehicles are commonly involved in collisions. Other common vehicle types involved in collisions include taxis, pick-up trucks, and vans. The presence of taxis on this list may be due to the fact that taxis are often on the road for longer periods of time and may be more likely to be involved in collisions as a result.

In [None]:
# Collision counts per borough and per zip code, most frequent first
collisions_by_borough = data['BOROUGH'].value_counts()
collisions_by_zip_code = data['ZIP CODE'].value_counts()

# Report both tallies; the second heading is preceded by a blank line
for heading, tally in (
    ("Collisions by borough:", collisions_by_borough),
    ("\nCollisions by zip code:", collisions_by_zip_code),
):
    print(heading)
    print(tally)


The borough with the highest number of collisions is Brooklyn, followed by Queens and Manhattan. The borough with the lowest number of collisions is Staten Island.

In [None]:
# Rows whose primary vehicle type is anything other than the 'UNKNOWN' placeholder
known_type_mask = data['VEHICLE TYPE CODE 1'].ne('UNKNOWN')
filtered_vehicle_data = data[known_type_mask]

# Ten most frequent vehicle types among those rows
known_type_frequencies = filtered_vehicle_data['VEHICLE TYPE CODE 1'].value_counts()
most_common_vehicles = known_type_frequencies.iloc[:10]

print("Most common vehicle types involved in collisions, excluding 'UNKNOWN':")
print(most_common_vehicles)


In [None]:
# Full frequency table of the primary contributing factor
contributing_factors = data['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()

# Show the ten most frequent entries
top_ten_factors = contributing_factors.iloc[:10]
print(top_ten_factors)


# How does the number of collisions vary by time (hour, day, month, year)?


In [None]:
import pandas as pd

# Parse the date and time columns. The time format is given explicitly, the
# same way the later feature-engineering cell does, so pandas does not have to
# guess a format on a column of this size.
data['CRASH DATE'] = pd.to_datetime(data['CRASH DATE'])
data['CRASH TIME'] = pd.to_datetime(data['CRASH TIME'], format='%H:%M')

# Hour comes from the crash time; day, month and year from the crash date
data['HOUR'] = data['CRASH TIME'].dt.hour
data['DAY'] = data['CRASH DATE'].dt.day
data['MONTH'] = data['CRASH DATE'].dt.month
data['YEAR'] = data['CRASH DATE'].dt.year

# Collision counts per year, month, day of month and hour, ordered by the time value
collisions_by_year = data['YEAR'].value_counts().sort_index()
collisions_by_month = data['MONTH'].value_counts().sort_index()
collisions_by_day = data['DAY'].value_counts().sort_index()
collisions_by_hour = data['HOUR'].value_counts().sort_index()

# Display the number of collisions by time
for heading, tally in (
    ("Collisions by year:", collisions_by_year),
    ("\nCollisions by month:", collisions_by_month),
    ("\nCollisions by day:", collisions_by_day),
    ("\nCollisions by hour:", collisions_by_hour),
):
    print(heading)
    print(tally)


In [None]:
# Collision counts keyed by borough and by zip code
collisions_by_borough = data['BOROUGH'].value_counts()
collisions_by_zip = data['ZIP CODE'].value_counts()

# Print each tally under its heading
area_tallies = {
    "Collisions by borough:": collisions_by_borough,
    "\nCollisions by zip code:": collisions_by_zip,
}
for heading, tally in area_tallies.items():
    print(heading)
    print(tally)

In [None]:

# Stack the five 'VEHICLE TYPE CODE n' columns end to end into one Series
vehicle_type_sources = [f'VEHICLE TYPE CODE {slot}' for slot in range(1, 6)]
vehicle_types = pd.concat([data[source] for source in vehicle_type_sources])

# Frequency of every vehicle type across all five slots
vehicle_type_counts = vehicle_types.value_counts()

# Report the frequencies, most common first
print("Most common vehicle types involved in collisions:")
print(vehicle_type_counts)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Columns to standardise: coordinates plus the eight casualty counts
numerical_columns = [
    'LATITUDE',
    'LONGITUDE',
    'NUMBER OF PERSONS INJURED',
    'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED',
    'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED',
    'NUMBER OF MOTORIST KILLED',
]

# Standardize the data
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data[numerical_columns])

# Rebuild the frame on data's own index. Rows were removed earlier without
# resetting the index, so a default 0..n-1 index would align the scaled values
# with the wrong rows (and leave NaN where labels do not match) on assignment.
standardized_df = pd.DataFrame(standardized_data, columns=numerical_columns, index=data.index)

# Replace the original columns with the standardized ones.
# NOTE(review): every later cell that reads these columns now sees z-scores
# rather than raw counts/coordinates — confirm that is intended.
data[numerical_columns] = standardized_df[numerical_columns]


In [None]:
# d. Standardize or normalize data if required, especially when working with different scales or units.

In [None]:

# Coordinates followed by the injured/killed count for each road-user group
casualty_columns = [
    f'NUMBER OF {group} {outcome}'
    for group in ('PERSONS', 'PEDESTRIANS', 'CYCLIST', 'MOTORIST')
    for outcome in ('INJURED', 'KILLED')
]
numerical_columns = ['LATITUDE', 'LONGITUDE'] + casualty_columns

# Descriptive statistics for just those columns
summary_stats = data[numerical_columns].describe()
print(summary_stats)


In [None]:
import matplotlib.pyplot as plt

# Vertical bars: one per borough, height = number of collisions
borough_counts = data['BOROUGH'].value_counts()
fig, ax = plt.subplots()
ax.bar(borough_counts.index, borough_counts.values)
ax.set_title('Collisions by Borough')
ax.set_xlabel('Borough')
ax.set_ylabel('Number of Collisions')
plt.show()


In [None]:
plt.figure(figsize=(10, 5))  # Set figure size

# One bin per clock hour, centred on the integer value. With bins=24 the edges
# are spread evenly between the smallest and largest hour, so bars drift away
# from the integer tick marks; explicit half-integer edges keep each bar on its tick.
hour_bin_edges = [hour - 0.5 for hour in range(25)]
plt.hist(data['HOUR'], bins=hour_bin_edges)

plt.title('Collisions by Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Collisions')
plt.xticks(range(0, 24))  # Set x-axis ticks from 0 to 23
plt.show()

In [None]:
# Horizontal bars for the ten most frequent primary vehicle types
vehicle_counts = data['VEHICLE TYPE CODE 1'].value_counts()
leading_types = vehicle_counts.iloc[:10]
fig, ax = plt.subplots()
ax.barh(leading_types.index, leading_types.values)
ax.set_title('Vehicle Types Involved in Collisions')
ax.set_xlabel('Number of Collisions')
ax.set_ylabel('Vehicle Type')
# Flip the y axis so the most frequent type is drawn at the top
ax.invert_yaxis()
plt.show()

In [None]:
# Bar chart of every primary contributing factor, most frequent first
factor_counts = data['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(factor_counts.index, factor_counts.values)
ax.set(title='Contributing Factors', xlabel='Factor', ylabel='Number of Collisions')
# Factor names are long, so the tick labels are turned vertical
ax.tick_params(axis='x', rotation=90)
plt.show()

# Perform exploratory data analysis (EDA):
a. Calculate basic summary statistics, such as mean, median, mode, standard deviation, etc., to get an overall understanding of the dataset.


Mean: The mean or average of a numerical column can be calculated using the mean() function in Pandas.

In [None]:
# Mean, median, first mode and standard deviation for each column in numerical_columns
for col in numerical_columns:
    series = data[col]
    print(f"Column: {col}")
    print(f"Mean: {series.mean()}")
    print(f"Median: {series.median()}")
    print(f"Mode: {series.mode()[0]}")
    print(f"Standard Deviation: {series.std()}")
    print(" ")

In [None]:
# Injured/killed count columns for each road-user group
numerical_columns = [
    f'NUMBER OF {group} {outcome}'
    for group in ('PERSONS', 'PEDESTRIANS', 'CYCLIST', 'MOTORIST')
    for outcome in ('INJURED', 'KILLED')
]

# Mean, median, first mode and standard deviation per casualty column
for column in numerical_columns:
    values = data[column]
    print(f"Summary Statistics for {column}")
    print(f"Mean: {values.mean()}")
    print(f"Median: {values.median()}")
    print(f"Mode: {values.mode()[0]}")
    print(f"Standard Deviation: {values.std()}")
    print("\n")

In [None]:
# Average of the persons-injured and persons-killed columns within each borough
severity_columns = ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']
per_borough = data.groupby('BOROUGH')
borough_summary = per_borough[severity_columns].mean()

# Show one row per borough
print(borough_summary)

In [None]:
import matplotlib.pyplot as plt

# Scatter of persons injured (x) against persons killed (y), one point per row
fig, ax = plt.subplots()
ax.scatter(data['NUMBER OF PERSONS INJURED'], data['NUMBER OF PERSONS KILLED'])
ax.set_title('Persons Injured vs Persons Killed in Motor Vehicle Collisions')
ax.set_xlabel('Number of Persons Injured')
ax.set_ylabel('Number of Persons Killed')
plt.show()

In [None]:
# Correlation (Series.corr default) between the injured and killed counts
injured = data['NUMBER OF PERSONS INJURED']
killed = data['NUMBER OF PERSONS KILLED']
correlation = injured.corr(killed)
print(f"Correlation between number of persons injured and killed: {correlation}")

It seems that there is a very weak positive correlation between the number of persons injured and the number of persons killed in the motor vehicle collisions dataset. However, it's important to note that correlation does not imply causation and there may be other factors at play that influence the relationship between these variables. Additionally, the scatter plot above is a useful way to visualize this relationship and to gain a better understanding of the nature and strength of the correlation.

In [None]:
# Make sure CRASH DATE is a datetime column (month/day/year text format)
data['CRASH DATE'] = pd.to_datetime(data['CRASH DATE'], format='%m/%d/%Y')

# Calendar date of each crash as a plain date object
data['DATE'] = data['CRASH DATE'].dt.date

# Rows per calendar date, returned as a two-column frame (DATE, counts)
daily_groups = data.groupby('DATE')
collisions_per_day = daily_groups.size().reset_index(name='counts')

# Line plot of the daily totals over time
fig, ax = plt.subplots()
ax.plot(collisions_per_day['DATE'], collisions_per_day['counts'])
ax.set_title('Number of Collisions by Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Collisions')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Injured/killed count columns for each road-user group
num_cols = [
    f'NUMBER OF {group} {outcome}'
    for group in ('PERSONS', 'PEDESTRIANS', 'CYCLIST', 'MOTORIST')
    for outcome in ('INJURED', 'KILLED')
]

# One separate box plot figure per column, titled with the column name
for column_name in num_cols:
    ax = sns.boxplot(x=data[column_name])
    ax.set_title(column_name)
    plt.show()


In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Casualty count columns used as clustering features
num_cols = [
    f'NUMBER OF {group} {outcome}'
    for group in ('PERSONS', 'PEDESTRIANS', 'CYCLIST', 'MOTORIST')
    for outcome in ('INJURED', 'KILLED')
]

# Z-score each feature: subtract the column mean, divide by the column std
feature_frame = data[num_cols]
data_std = (feature_frame - feature_frame.mean()) / feature_frame.std()

# Project the standardised features onto two principal components
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_std)

# Fit three clusters on the projection, then label every projected row
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(data_pca)
labels = kmeans.predict(data_pca)

# Scatter of the two components, coloured by cluster label
first_component, second_component = data_pca[:, 0], data_pca[:, 1]
plt.scatter(first_component, second_component, c=labels, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('KMeans Clustering with PCA')
plt.show()


In [None]:
# Labels of the numeric columns currently in the frame
numerical_cols = data.select_dtypes(include=np.number).columns

# Pairwise correlations between them
numeric_view = data[numerical_cols]
corr_matrix = numeric_view.corr()

# Plain-text dump of the matrix
print(corr_matrix)


The correlation matrix shows the correlation coefficients between all pairs of numeric variables in the dataset. The values in the matrix range from -1 to 1, where 1 indicates a perfect positive correlation, 0 indicates no correlation, and -1 indicates a perfect negative correlation.

From the matrix, we can see that latitude and longitude are strongly negatively correlated. This is not an inherent property of the two coordinates, which are independent axes; it is more likely caused by invalid placeholder coordinates (for example latitude 0 with longitude 0) in the data. We can also see that the number of persons injured and the number of motorist injured are strongly positively correlated, which makes sense since motor vehicles are often involved in accidents that cause injuries.

Additionally, we can see that there is a moderate positive correlation between the number of persons injured and the cluster variable, which indicates that there may be some relationship between the location of accidents and the severity of injuries. However, further analysis is needed to investigate this relationship more deeply.

Overall, the correlation matrix provides a useful starting point for identifying potential relationships between variables in the dataset.

In [None]:
!pip install matplotlib seaborn




In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming your dataset is in a Pandas DataFrame named 'data'

# Bar chart: the 15 most frequent primary vehicle types
top_n = 15
vehicle_type_counts = data['VEHICLE TYPE CODE 1'].value_counts().head(top_n)
plt.figure(figsize=(12, 6))
sns.barplot(x=vehicle_type_counts.index, y=vehicle_type_counts.values)
plt.title('Number of Accidents by Vehicle Type (Top 15)', fontsize=14)
plt.xlabel('Vehicle Type', fontsize=12)
plt.ylabel('Number of Accidents', fontsize=12)
plt.xticks(rotation=90, fontsize=10)
plt.show()

# Scatter plot: crash coordinates coloured by borough
plt.figure(figsize=(12, 6))
sns.scatterplot(x='LONGITUDE', y='LATITUDE', data=data, alpha=0.5, s=10, hue='BOROUGH', palette='viridis')
plt.title('Accidents Distribution by Location', fontsize=14)
plt.xlabel('Longitude', fontsize=12)
plt.ylabel('Latitude', fontsize=12)
plt.legend(fontsize=10)
plt.show()

# Heatmap (correlation matrix). The frame also holds text and datetime columns,
# and recent pandas versions raise instead of silently skipping them in
# DataFrame.corr(), so the numeric columns are selected explicitly first.
plt.figure(figsize=(12, 6))
correlation_matrix = data.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', annot_kws={"size": 8})
plt.title('Correlation Matrix Heatmap', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()


In [None]:
!pip install plotly



In [None]:
import pandas as pd
import plotly.express as px

# Assuming our dataset is in a Pandas DataFrame named 'data'

# Bar chart (Top 15 Vehicle Types)
top_n = 15
vehicle_type_counts = data['VEHICLE TYPE CODE 1'].value_counts().head(top_n)
fig = px.bar(x=vehicle_type_counts.index, y=vehicle_type_counts.values,
             labels={'x': 'Vehicle Type', 'y': 'Number of Accidents'},
             title='Number of Accidents by Vehicle Type (Top 15)')
fig.show()

# Interactive scatter plot of crash coordinates, coloured by borough
fig = px.scatter(data_frame=data, x='LONGITUDE', y='LATITUDE',
                 color='BOROUGH', hover_name='COLLISION_ID',
                 title='Accidents Distribution by Location',
                 labels={'LONGITUDE': 'Longitude', 'LATITUDE': 'Latitude', 'BOROUGH': 'Borough'})
fig.update_traces(marker=dict(size=6, opacity=0.7))
fig.show()

# Heatmap (correlation matrix). Only numeric columns are correlated: calling
# DataFrame.corr() on a frame that still holds text/datetime columns raises
# in recent pandas versions.
correlation_matrix = data.select_dtypes(include='number').corr()
fig = px.imshow(correlation_matrix, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
fig.update_layout(title='Correlation Matrix Heatmap',
                  xaxis=dict(tickmode='array', tickvals=list(range(len(correlation_matrix.columns))), ticktext=correlation_matrix.columns),
                  yaxis=dict(tickmode='array', tickvals=list(range(len(correlation_matrix.index))), ticktext=correlation_matrix.index))
fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Assuming your dataset is in a Pandas DataFrame named 'data'

# Rows that have both a borough and a primary contributing factor
hierarchy_columns = ['BOROUGH', 'CONTRIBUTING FACTOR VEHICLE 1']
filtered_data = data.dropna(subset=hierarchy_columns)

# Names of the ten most frequent contributing factors
factor_frequencies = filtered_data['CONTRIBUTING FACTOR VEHICLE 1'].value_counts()
top_factors = factor_frequencies.head(10).index.tolist()

# Keep only the rows whose factor is one of those ten
in_top_factors = filtered_data['CONTRIBUTING FACTOR VEHICLE 1'].isin(top_factors)
filtered_data = filtered_data[in_top_factors]

# Sunburst: boroughs on the inner ring, contributing factors on the outer ring
fig = px.sunburst(filtered_data, path=['BOROUGH', 'CONTRIBUTING FACTOR VEHICLE 1'], title='Accidents by Borough and Top 10 Contributing Factors')
fig.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming our dataset is in a Pandas DataFrame named 'data'

# Convert 'CRASH DATE' to datetime objects
data['CRASH_DATE'] = pd.to_datetime(data['CRASH DATE'])

# Count accidents per calendar month. Resampling on the column (on=...)
# leaves the frame's index alone; replacing it in place with set_index would
# give every later cell a non-unique datetime index as a side effect of
# drawing this one chart.
# NOTE(review): newer pandas releases prefer the 'ME' alias over 'M' — confirm
# against the installed version.
monthly_accidents = data.resample('M', on='CRASH_DATE').size()

# Plot the number of accidents per month
plt.figure(figsize=(15, 6))
plt.plot(monthly_accidents)
plt.title('Number of Accidents per Month')
plt.xlabel('Month')
plt.ylabel('Number of Accidents')
plt.show()


In [None]:
!pip install geopandas


In [None]:
!pip install folium


In [None]:
# List the columns currently present in the frame
print(data.columns)

In [None]:
# Day of the week as an integer, Monday=0 through Sunday=6
data['DAY_OF_WEEK'] = data['CRASH DATE'].dt.weekday


In [None]:
# Row-wise sum of the persons-injured and persons-killed columns
persons_injured = data['NUMBER OF PERSONS INJURED']
persons_killed = data['NUMBER OF PERSONS KILLED']
data['TOTAL_PERSONS'] = persons_injured + persons_killed


In [None]:
# True when DAY_OF_WEEK is 5 or 6 (Saturday or Sunday with Monday=0)
weekend_codes = [5, 6]
data['IS_WEEKEND'] = data['DAY_OF_WEEK'].isin(weekend_codes)


In [None]:
# Parsed copies of the date and time columns; the time copy keeps only the clock part
data['CRASH_DATE'] = pd.to_datetime(data['CRASH DATE'])
parsed_clock = pd.to_datetime(data['CRASH TIME'], format='%H:%M')
data['CRASH_TIME'] = parsed_clock.dt.time

# Hour of the crash, read off each time object
data['CRASH_HOUR'] = data['CRASH_TIME'].map(lambda clock: clock.hour)

def get_time_of_day(hour):
    """Map an hour number to a coarse part-of-day label.

    Returns "morning" for 6-11, "afternoon" for 12-17, "evening" for 18-23
    and "night" for every other value (including 0-5).
    """
    # Half-open [start, end) hour bands, checked in order
    day_bands = (
        (6, 12, "morning"),
        (12, 18, "afternoon"),
        (18, 24, "evening"),
    )
    for start, end, label in day_bands:
        if start <= hour < end:
            return label
    return "night"

# Label each crash hour with its part of the day
data['TIME_OF_DAY'] = data['CRASH_HOUR'].map(get_time_of_day)


In [None]:
# Injured + killed per road-user group: target column -> group name used in the source columns
casualty_groups = {
    'TOTAL_PEDESTRIAN_CASUALTIES': 'PEDESTRIANS',
    'TOTAL_CYCLIST_CASUALTIES': 'CYCLIST',
    'TOTAL_MOTORIST_CASUALTIES': 'MOTORIST',
}
for total_column, group in casualty_groups.items():
    data[total_column] = data[f'NUMBER OF {group} INJURED'] + data[f'NUMBER OF {group} KILLED']


In [None]:
# Flag rows whose first or second contributing factor is "unspecified".
# case=False also catches the upper-case placeholder written by the cleaning
# cell, and na=False makes a missing factor count as "no match" instead of
# propagating NaN into the boolean column.
# NOTE(review): an unspecified factor is not by itself evidence of a
# hit-and-run; the column name is kept so downstream cells still work —
# confirm the intended definition.
unspecified_first = data['CONTRIBUTING FACTOR VEHICLE 1'].str.contains('Unspecified', case=False, na=False)
unspecified_second = data['CONTRIBUTING FACTOR VEHICLE 2'].str.contains('Unspecified', case=False, na=False)
data['IS_HIT_AND_RUN'] = unspecified_first | unspecified_second


In [None]:
def get_season(date):
    """Return the season name for a date.

    Boundaries (inclusive): spring 21 Mar - 20 Jun, summer 21 Jun - 22 Sep,
    fall 23 Sep - 20 Dec, winter 21 Dec - 20 Mar.

    Args:
        date: object exposing ``month`` and ``day`` (e.g. ``pd.Timestamp``).

    Returns:
        "winter", "spring", "summer" or "fall"; ``None`` for a missing date.
    """
    if pd.isna(date):
        return None

    # Compare on (month, day) only. Anchoring winter to 21 Dec of the date's
    # own year made every date from 1 Jan to 20 Mar fall outside all four
    # windows and come back as None.
    month_day = (date.month, date.day)
    if (3, 21) <= month_day <= (6, 20):
        return "spring"
    if (6, 21) <= month_day <= (9, 22):
        return "summer"
    if (9, 23) <= month_day <= (12, 20):
        return "fall"
    # Everything else is 21 Dec - 20 Mar, which wraps around the year end
    return "winter"

# Label each crash date with its season
data['SEASON'] = data['CRASH DATE'].map(get_season)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of accidents in each time-of-day band
time_of_day_order = ['morning', 'afternoon', 'evening', 'night']
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='TIME_OF_DAY', data=data, order=time_of_day_order)
ax.set_title('Accidents Count by Time of Day')
ax.set_xlabel('Time of Day')
ax.set_ylabel('Accident Count')
plt.show()


In [None]:
# Accident counts per day of the week, coloured by the weekend flag
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='DAY_OF_WEEK', data=data, hue='IS_WEEKEND')
ax.set_title('Accidents Count by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Accident Count')
plt.show()


In [None]:
# Plotting casualties count by season (stacked by road-user group)
casualty_columns = [
    'TOTAL_PEDESTRIAN_CASUALTIES',
    'TOTAL_CYCLIST_CASUALTIES',
    'TOTAL_MOTORIST_CASUALTIES',
]
season_casualties = data.groupby('SEASON')[casualty_columns].sum()

# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(...) would only leave an empty extra figure behind.
ax = season_casualties.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set_title('Casualties Count by Season')
ax.set_xlabel('Season')
ax.set_ylabel('Casualty Count')
plt.show()


In [None]:
# Re-check the column names after the feature-engineering cells
print(data.columns)

In [None]:
# Sum of TOTAL_PERSONS per day of the week; ci=None skips the error bars
plt.figure(figsize=(10, 6))
ax = sns.barplot(x='DAY_OF_WEEK', y='TOTAL_PERSONS', data=data, estimator=sum, ci=None)
ax.set_title('Total Persons Involved in Accidents by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Total Persons')
plt.show()


In [None]:
# Number of recorded accidents per borough
ax = sns.countplot(data=data, x='BOROUGH')
ax.set_title('Accidents by Borough')
plt.xticks(rotation=45)  # tilt the borough names so they do not overlap
plt.show()


In [None]:
# Ten contributing factors (vehicle 1) with the highest TOTAL_PERSONS sum.
# The column is selected before summing so only TOTAL_PERSONS is aggregated;
# summing every column first is wasteful and fails on non-numeric columns in
# recent pandas versions.
data_by_factor = (
    data.groupby('CONTRIBUTING FACTOR VEHICLE 1')['TOTAL_PERSONS']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# NOTE: TOTAL_PERSONS is injured + killed, although the labels say "Injured"
data_by_factor.plot(kind='bar')
plt.title('Total Persons Injured by Contributing Factor')
plt.xlabel('Contributing Factor')
plt.ylabel('Total Persons Injured')
plt.show()


In [None]:
# Accident counts for the ten most frequent primary vehicle types
top_vehicle_types = data['VEHICLE TYPE CODE 1'].value_counts().index[:10]
ax = sns.countplot(data=data, x='VEHICLE TYPE CODE 1', order=top_vehicle_types)
ax.set_title('Accidents by Vehicle Type')
plt.xticks(rotation=100)
plt.show()


In [None]:
# Sum of TOTAL_PERSONS per season. ci=None matches the day-of-week bar plot
# and avoids bootstrapping confidence intervals over the whole dataset, which
# is slow and not meaningful for a plain sum.
# NOTE: TOTAL_PERSONS is injured + killed, although the title says "Injured"
sns.barplot(data=data, x='SEASON', y='TOTAL_PERSONS', estimator=sum, ci=None)
plt.title('Total Persons Injured by Season')
plt.show()


In [None]:
pip install seaborn


The code below generates a heatmap displaying the correlations between all numeric variables in the dataset. With the "coolwarm" colormap, positive correlations are displayed in red, while negative correlations are displayed in blue. The stronger the correlation, the more intense the color.

Keep in mind that correlation doesn't imply causation. High correlation between two variables can help identify relationships, but it's important to investigate further and consider other factors before making conclusions.

In addition, multicollinearity can cause issues when building regression models. If you identify strong correlations between independent variables, consider using techniques like dimensionality reduction or variable selection to address this issue.


In [None]:
# Calculate the correlation matrix on numeric and boolean columns only.
# The frame also holds string, datetime and time columns, which cannot be
# correlated and make DataFrame.corr() raise in recent pandas versions.
numeric_data = data.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_data.corr()

# Round the correlation matrix to 2 decimal places
rounded_corr_matrix = corr_matrix.round(2)

# Print the rounded correlation matrix
print(rounded_corr_matrix)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate the correlation matrix on numeric and boolean columns only.
# The frame also holds string, datetime and time columns, which cannot be
# correlated and make DataFrame.corr() raise in recent pandas versions.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()

# Create a heatmap of the correlation matrix; the colour scale is pinned to
# the full [-1, 1] correlation range so colours are comparable across runs
plt.figure(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=.5, vmin=-1, vmax=1)

# Customize the plot
plt.title("Correlation Matrix Heatmap")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

# Show the plot
plt.show()


In [None]:
# Fill missing values for numeric columns with each column's median
numeric_columns = data.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    data[col] = data[col].fillna(data[col].median())

# Fill missing values for categorical columns: take the next valid value
# (backward fill) first, then the previous valid value (forward fill) for any
# gaps left at the end. The dedicated .bfill()/.ffill() methods are used
# because fillna(method=...) is deprecated in recent pandas versions.
categorical_columns = data.select_dtypes(include=[object]).columns
for col in categorical_columns:
    data[col] = data[col].bfill().ffill()


In [None]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear
data = pd.get_dummies(data, columns=['BOROUGH', 'TIME_OF_DAY', 'SEASON'], drop_first=True)


In [None]:
# Remove the helper date/time columns created earlier and the location/street
# name columns (presumably free-text fields not usable as model features)
data = data.drop(columns=['CRASH_DATE', 'CRASH_TIME', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME'])


In [None]:
# Target is the IS_HIT_AND_RUN flag; every remaining column is a feature
target_column = 'IS_HIT_AND_RUN'
y = data[target_column]
X = data.drop(columns=[target_column])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Inspect the dtypes of the training features and of the target
print(X_train.dtypes)
print(y_train.dtypes)



In [None]:
# Make sure 'CRASH DATE' is a datetime column
data['CRASH DATE'] = pd.to_datetime(data['CRASH DATE'])
# Treat ZIP codes as text labels rather than numbers
# NOTE(review): if the column is numeric, the strings keep the numeric
# formatting (e.g. a trailing ".0" for floats) -- confirm this is acceptable
data['ZIP CODE'] = data['ZIP CODE'].astype(str)

In [None]:
# List the columns after encoding and type conversion
print(data.columns)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

# Data cleaning and preparation
# Only the location fields and the casualty counts are used by this model, so
# only those columns are read from the CSV instead of loading the whole file.
model_columns = [
    'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
]
df = pd.read_csv("Motor_Vehicle_Collisions.csv", usecols=model_columns, low_memory=False)
# usecols keeps the file's column order; re-select to get the order listed above
df = df[model_columns]

# Replace empty strings with NaN values
df = df.replace('', np.nan)

# Convert 'ZIP CODE' column to numeric, forcing non-numeric values to NaN
df['ZIP CODE'] = pd.to_numeric(df['ZIP CODE'], errors='coerce')

# Fill missing values with the mean of that column for numeric columns only
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# One-hot encoding for the 'BOROUGH' column
df = pd.get_dummies(df, columns=['BOROUGH'])

# Split the data into training and testing sets.
# NOTE(review): the features still contain the pedestrian/cyclist/motorist
# injured and killed counts, which are components of the two targets. The
# resulting scores therefore mostly reflect that relationship (target
# leakage) -- confirm whether these columns should remain features.
X = df.drop(['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED'], axis=1)
y_injured = df['NUMBER OF PERSONS INJURED']
y_killed = df['NUMBER OF PERSONS KILLED']
# A single call splits the features and both targets on the same rows, instead
# of splitting twice and overwriting X_train / X_test
(X_train, X_test,
 y_injured_train, y_injured_test,
 y_killed_train, y_killed_test) = train_test_split(
    X, y_injured, y_killed, test_size=0.2, random_state=42
)

# Fit a linear regression model for predicting the number of persons injured
lr_injured = LinearRegression()
lr_injured.fit(X_train, y_injured_train)

# Fit a linear regression model for predicting the number of persons killed
lr_killed = LinearRegression()
lr_killed.fit(X_train, y_killed_train)

# Evaluate the models (score() returns R^2 on the held-out test rows)
injured_score = lr_injured.score(X_test, y_injured_test)
killed_score = lr_killed.score(X_test, y_killed_test)

print(f"R^2 score for predicting the number of persons injured: {injured_score}")
print(f"R^2 score for predicting the number of persons killed: {killed_score}")


The R^2 score (coefficient of determination) is a statistical measure that represents the proportion of the variance in the dependent variable (in this case, the number of persons injured or killed) that can be predicted by the independent variables (features) in the model. A score of 1 indicates that the model perfectly predicts the dependent variable from the independent variables, while a score of 0 indicates that the model explains none of the variance, i.e. it does no better than always predicting the mean. On held-out data the score can even be negative when the model performs worse than that baseline.

In our case, we have the following R^2 scores:

R^2 score for predicting the number of persons injured: 0.9912693647103508
R^2 score for predicting the number of persons killed: 0.968012761315077
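These scores are very close to 1, which indicates that the linear regression models predict the number of persons injured and killed almost perfectly from the features in the dataset. Keep in mind, however, that the features include the pedestrian, cyclist, and motorist injury and fatality counts, which are components of the totals being predicted, so these scores largely reflect that built-in relationship rather than genuine predictive power.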

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
import numpy as np

# Load and preprocess our dataset as before...

# Features are every column except the two targets
target_columns = ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED']
X = df.drop(columns=target_columns)
y_injured = df['NUMBER OF PERSONS INJURED']
y_killed = df['NUMBER OF PERSONS KILLED']

# One (unfitted) linear regression estimator per target
lr_injured = LinearRegression()
lr_killed = LinearRegression()

# 5-fold cross-validation; each array holds one score per fold
cv_injured_scores = cross_val_score(lr_injured, X, y_injured, cv=5)
cv_killed_scores = cross_val_score(lr_killed, X, y_killed, cv=5)

# Average the fold scores to get one R^2 figure per target
injured_mean_score = cv_injured_scores.mean()
killed_mean_score = cv_killed_scores.mean()

print(f"Mean R^2 score for predicting the number of persons injured (5-fold cross-validation): {injured_mean_score}")
print(f"Mean R^2 score for predicting the number of persons killed (5-fold cross-validation): {killed_mean_score}")


These results indicate that our models are performing well on predicting the number of persons injured and killed in motor vehicle collisions. The mean R^2 scores from 5-fold cross-validation are close to the R^2 scores obtained from the initial train-test split:

Mean R^2 score for predicting the number of persons injured (5-fold cross-validation): 0.991481399024942
Mean R^2 score for predicting the number of persons killed (5-fold cross-validation): 0.9838041399138382
These high R^2 scores suggest that the models are able to explain a large proportion of the variance in the target variables (number of persons injured and killed) based on the input features. It also indicates that the models are likely generalizing well to unseen data, as the cross-validation scores are consistent across different folds.

We can now consider our models validated and ready for deployment or further analysis. We might want to explore feature importance, test the models on new data, or examine the residuals to ensure that the model assumptions are being met. Additionally, we could investigate other algorithms or perform hyperparameter tuning to see if we can further improve the performance of our models.

In [None]:
# Show which estimator class lr_injured is
print(type(lr_injured))

# A fitted LinearRegression exposes coef_, so its presence is used as the check
is_fitted = hasattr(lr_injured, 'coef_')
print("The model has been fitted." if is_fitted else "The model has not been fitted.")


In [None]:
# Train one linear regression per target on the training split;
# fit() returns the fitted estimator, so creation and fitting are chained
lr_injured = LinearRegression().fit(X_train, y_injured_train)  # persons injured
lr_killed = LinearRegression().fit(X_train, y_killed_train)  # persons killed


In [None]:
# Coefficients of each fitted model, one per feature column
importance_injured = lr_injured.coef_
importance_killed = lr_killed.coef_

# Report, feature by feature, the coefficient in both models
for feature, injured_coef, killed_coef in zip(X.columns, importance_injured, importance_killed):
    print(f"{feature} importance for predicting the number of persons injured: {injured_coef}")
    print(f"{feature} importance for predicting the number of persons killed: {killed_coef}")


These outputs represent the importance of each feature in predicting the number of persons injured and killed using the linear regression models. The importance is represented by the coefficients of the linear regression models. These coefficients indicate how much the dependent variable (number of persons injured or killed) is expected to change when the corresponding feature value increases by 1 unit, holding all other features constant. Because the features are not standardized, the size of a coefficient also depends on the scale of its feature, so coefficients of features measured in different units are not directly comparable.

For instance, the coefficient for "NUMBER OF PEDESTRIANS INJURED" in the persons injured model is 0.9971412315417622, meaning that for every additional pedestrian injured, the number of persons injured is expected to increase by about 0.997, holding all other factors constant.

Similarly, for the persons killed model, the coefficient for "NUMBER OF PEDESTRIANS KILLED" is 1.0000199679308033, indicating that for each additional pedestrian killed, the number of persons killed is expected to increase by about 1.000, holding all other factors constant.