In [151]:
import pandas as pd
import altair as alt

# Folder holding the cleaned CSV exports, relative to this notebook.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because other cells may refer to it.
dir = '../Data'

# Load both cleaned datasets from that folder.
collisions = pd.read_csv(f'{dir}/collisions_clean.csv')
weather = pd.read_csv(f'{dir}/weather_clean.csv')

# Switch Altair to the VegaFusion data transformer. This stays the last
# expression of the cell, so its return value is what the cell displays.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [152]:
# Store the first four characters of the stringified crash date as YEAR
# (NOTE(review): assumes the date text starts with the year — confirm the format).
collisions['YEAR'] = collisions['CRASH DATE'].astype(str).str.slice(stop=4)

## Is there any type of vehicle more prone to participate in accidents?

In [153]:
# Inspect the distinct values stored in the first-vehicle type column.
collisions["VEHICLE TYPE CODE 1"].unique()

array(['VAN', 'CAR', 'BUS', 'TRACTOR', 'TAXI', 'SCOOTER', 'TRUCK',
       'MOTORCYCLE', 'BICYCLE', 'OTHERS', 'AMBULANCE', 'FDNY', 'UNKNOWN'],
      dtype=object)

Given that we aggregated the vehicle types into just 13 classes, a bar chart is again the first option that comes to mind.

In [154]:
# Only the vehicle-type column is needed: bar heights are row counts.
df = collisions[['VEHICLE TYPE CODE 1']]

(
    alt.Chart(df)
    .encode(
        # One bar per vehicle type, with tilted category labels.
        alt.X('VEHICLE TYPE CODE 1:N').title('Vehicle Type').axis(labelAngle=-45),
        # Bar height is the number of rows in each category.
        alt.Y('count():Q').title('Number of Collisions'),
    )
    .mark_bar()
    .properties(title='Number of Collisions by Vehicle Type', width=500, height=200)
    .configure_mark(color='lightblue')
)

The result is decent, but some classes are barely noticeable. We want to clarify that this question cannot be answered with the available data: we would need the traffic percentage or proportion of every type of vehicle to determine whether one vehicle type is more prone to crashes than the others. For example, there are many more cars than ambulances, so the total number of car crashes is much larger than the total number of ambulance crashes; without the traffic proportions we cannot really say whether one type of vehicle is more prone to accidents than the others.

Having stated this, we thought that a polar area chart could be a good option to make the classes with fewer crashes more noticeable. We are aware that comparing areas is more difficult; however, we believe that the main conclusions for the question (erroneous, for the reasons explained above) are still clear and understandable.

In [209]:
# Only the vehicle-type column is needed: the radius encodes a row count.
df = collisions[['VEHICLE TYPE CODE 1']]

# Polar area chart built from the narrowed frame (previously `df` was created
# but the chart was fed the full `collisions` frame).
alt.Chart(df).encode(
    # One wedge per vehicle type.
    alt.Theta("VEHICLE TYPE CODE 1:N").stack(True),
    # Square-root radius scale so low-count categories stay visible;
    # rangeMin matches the inner radius of the arcs below.
    alt.Radius("count()").scale(type="sqrt", zero=True, rangeMin=20),
    color=alt.Color("VEHICLE TYPE CODE 1:N", title="Vehicle Type"),
).mark_arc(innerRadius=20, stroke="#fff").properties(
    # Same title as the bar chart of the same data, so the figure stands alone.
    title='Number of Collisions by Vehicle Type'
)

In [157]:
# Show the column labels of the collisions frame.
collisions.columns

Index(['COLLISION_ID', 'DAY NAME', 'CRASH DATE', 'CRASH MOMENT',
       'CRASH TIME INTERVAL', 'BOROUGH', 'ZIP CODE', 'LOCATION', 'STREET NAME',
       'NUMBER OF INJURED', 'NUMBER OF KILLED',
       'CONTRIBUTING FACTOR VEHICLE 1', 'VEHICLE TYPE CODE 1', 'YEAR'],
      dtype='object')

In [158]:
# Casualties per collision: NUMBER OF INJURED plus NUMBER OF KILLED, element-wise.
collisions['TOTAL INJURED/KILLED'] = collisions['NUMBER OF INJURED'].add(collisions['NUMBER OF KILLED'])

We consider that a lollipop chart can also be useful, since it allows an easy comparison of several instances. The result, shown below, is similar to that of a typical bar chart: simple and effective, but with the same problem that some vehicle types (the ones with the lowest crash counts) are barely noticeable and impossible to differentiate.

In [207]:
# Aggregate once per vehicle type:
#   - TOTAL INJURED/KILLED: sum of the per-collision casualty column
#   - TOTAL COLLISIONS: number of rows in the group ('size'), so a collision
#     with a missing casualty value is still counted as a collision
# Column order matches what later cells expect: type, casualties, collisions.
df = (
    collisions
    .groupby('VEHICLE TYPE CODE 1')
    .agg(**{
        'TOTAL INJURED/KILLED': ('TOTAL INJURED/KILLED', 'sum'),
        'TOTAL COLLISIONS': ('TOTAL INJURED/KILLED', 'size'),
    })
    .reset_index()
)

# Encodings shared by the stick and the head, defined once so the two layers
# cannot drift apart.
lollipop_base = alt.Chart(df).encode(
    x=alt.X('TOTAL COLLISIONS:Q',
            title='Total Collisions'),
    y=alt.Y('VEHICLE TYPE CODE 1:N',
            title='Vehicle Type',
            # Vehicle types ordered from most to fewest collisions.
            sort=alt.EncodingSortField(field="TOTAL COLLISIONS", order="descending"),
            axis=alt.Axis(labelAngle=0, grid=True)),
    color=alt.Color('VEHICLE TYPE CODE 1:N',
                    title='Vehicle Type',
                    legend=None),
)

# Stick: a thin bar from zero to the collision count.
lolli = lollipop_base.mark_bar(size=3)

# Head: a fixed-size circle at the collision count, carrying the tooltip.
pop = lollipop_base.mark_circle(
    size=80,
    opacity=1
).encode(
    tooltip=['VEHICLE TYPE CODE 1:N', 'TOTAL COLLISIONS:Q', 'TOTAL INJURED/KILLED:Q']
).properties(
    # The chart does not encode the contributing factor, so the title no
    # longer mentions it.
    title='Lollipop Plot of Collisions by Vehicle Type'
)

lolli + pop

Finally, to add some extra information to the plot, we decided to encode the total number of injured/killed per type of vehicle with the size of the lollipop. However this produces...

**TODO:** Normalize the number of injured/killed by the number of crashes so that the circles are easier to see.

In [208]:
# Reuses `df` (per-vehicle-type totals) built in the previous lollipop cell.

# Encodings shared by the stick and the head, defined once so the two layers
# cannot drift apart.
lollipop_base = alt.Chart(df).encode(
    x=alt.X('TOTAL COLLISIONS:Q',
            title='Total Collisions'),
    y=alt.Y('VEHICLE TYPE CODE 1:N',
            title='Vehicle Type',
            # Vehicle types ordered from most to fewest collisions.
            sort=alt.EncodingSortField(field="TOTAL COLLISIONS", order="descending"),
            axis=alt.Axis(labelAngle=0, grid=True)),
    color=alt.Color('VEHICLE TYPE CODE 1:N',
                    title='Vehicle Type',
                    legend=None),
)

# Stick: a thin bar from zero to the collision count.
lolli = lollipop_base.mark_bar(size=2)

# Head: a circle whose size encodes the total number of injured/killed.
pop = lollipop_base.mark_circle(
    opacity=1
).encode(
    size=alt.Size('TOTAL INJURED/KILLED:Q',
                  title='Total Injured/Killed'),
    tooltip=['VEHICLE TYPE CODE 1:N', 'TOTAL COLLISIONS:Q', 'TOTAL INJURED/KILLED:Q']
).properties(
    # The chart does not encode the contributing factor, so the title no
    # longer mentions it.
    title='Lollipop Plot of Collisions by Vehicle Type'
)

lolli + pop