In [None]:
import pandas as pd
import altair as alt

# Relative path of the folder that holds the cleaned CSV files.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is left
# unchanged here because other cells may still reference it.
dir = '../Data'

# Load the pre-cleaned weather and collision tables.
weather = pd.read_csv(f'{dir}/weather_clean.csv')
collisions = pd.read_csv(f'{dir}/collisions_clean.csv')

# Switch Altair to the VegaFusion data transformer.
# NOTE(review): presumably required because the full `collisions` table is
# passed directly to several charts and would exceed Altair's default row
# limit -- confirm.
alt.data_transformers.enable("vegafusion")

In [None]:
# YEAR is the first four characters of CRASH DATE, kept as text (the charts
# in this notebook use the string values '2018' and '2020').
# NOTE(review): assumes CRASH DATE starts with a four-digit year -- confirm
# against the cleaned CSV.
crash_date_text = collisions['CRASH DATE'].astype(str)
collisions['YEAR'] = crash_date_text.str.slice(stop=4)

## Are accidents more frequent during weekdays or weekends? Is there any difference between before COVID-19 and after?

In [None]:
# Day names treated as weekdays; any other value (including a missing day
# name) is labelled 'Weekend', exactly as before.
WEEKDAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']

# Label every collision once, vectorised, and store the label on `collisions`
# because later cells read collisions['WEEKDAY'].
is_weekday = collisions['DAY NAME'].isin(WEEKDAY_NAMES)
collisions['WEEKDAY'] = is_weekday.map({True: 'Weekday', False: 'Weekend'})

# Number of collisions per (year, day type). `size()` replaces the former
# dummy COUNT column + `count()`; the result no longer carries the leftover
# 'DAY NAME' count column, which no chart used.
df = (
    collisions.groupby(['YEAR', 'WEEKDAY'])
    .size()
    .reset_index(name='COUNT')
)

# Make the two groups comparable: divide the yearly total by the number of
# day names in the group (5 for weekdays, 2 for weekend days). COUNT is thus
# an average per day-of-week, not a count per calendar date.
days_in_group = df['WEEKDAY'].map({'Weekday': 5, 'Weekend': 2})
df['COUNT'] = df['COUNT'] / days_in_group

In [None]:
# Position encodings shared by the line layer and the point layer.
year_axis = alt.X('YEAR:N', title='Year', axis=alt.Axis(labelAngle=0))
per_day_axis = alt.Y('COUNT:Q', title='Collisions per Day')
trend_base = alt.Chart(df).encode(x=year_axis, y=per_day_axis)

# Line layer: one line per day type; this layer owns the legend.
slope = trend_base.mark_line().encode(
    color=alt.Color('WEEKDAY:N', legend=alt.Legend(title='Day Type'))
)

# Point layer: solid markers on each yearly value, without a second legend.
pts = trend_base.mark_point(filled=True, opacity=1).encode(
    color=alt.Color('WEEKDAY:N', legend=None)
)

alt.layer(slope, pts).properties(
    width=100,
    height=300,
    title='Collisions per Day Type by Year',
)

In [None]:
# Yearly casualty totals and the share of injured/killed people per 10 collisions.
by_year = collisions.groupby('YEAR')

# Sum of injured and killed people per year.
df = by_year[['NUMBER OF INJURED', 'NUMBER OF KILLED']].sum().reset_index()
df['TOTAL INJURED/KILLED'] = df['NUMBER OF INJURED'] + df['NUMBER OF KILLED']

# Number of collisions per year (non-null COLLISION_ID values), indexed by YEAR.
df2 = by_year[['COLLISION_ID']].count()

# `on='YEAR'` matches the YEAR column of `df` with the YEAR index level of `df2`.
df = df.merge(df2, on='YEAR')

# Injured + killed people per 10 collisions.
df['TOTAL PER 10'] = 10 * df['TOTAL INJURED/KILLED'] / df['COLLISION_ID']

# The former trailing `df.reset_index()` was dropped: YEAR is already a
# regular column here, so it only added a meaningless 'index' column.
df

In [None]:
# Number of distinct values in 'VEHICLE TYPE CODE 1'; a missing value counts
# as one distinct entry.
collisions['VEHICLE TYPE CODE 1'].nunique(dropna=False)

In [None]:
# Build the 20-row table behind the pictogram: 10 icons per year, with ids
# running from 1 to 20 across both years.
ICONS_PER_YEAR = 10

# How many of the 10 icons are tagged 'kill/injured' in each year.
# NOTE(review): these counts are hard-coded; presumably rounded from the
# per-10-collisions figures computed earlier -- confirm.
highlighted_per_year = {'2018': 5, '2020': 9}

icon_rows = [
    {
        'color': 'kill/injured' if position < n_highlighted else 'non',
        'year': year,
    }
    for year, n_highlighted in highlighted_per_year.items()
    for position in range(ICONS_PER_YEAR)
]

data = pd.DataFrame(icon_rows)
data.insert(0, 'id', range(1, len(data) + 1))
data.head()

In [None]:

# SVG path string that is passed to `alt.ShapeValue` as a custom point shape
# in the pictogram chart (presumably a car silhouette, going by the name).
car = ("M640 320V368C640 385.7 625.7 400 608 400H574.7C567.1 445.4 527.6 480 480 480C432.4 480 392.9 445.4 385.3 400H254.7C247.1 445.4 207.6 480 160 480C112.4 480 72.94 445.4 65.33 400H32C14.33 400 0 385.7 0 368V256C0 228.9 16.81 205.8 40.56 196.4L82.2 92.35C96.78 55.9 132.1 32 171.3 32H353.2C382.4 32 409.1 45.26 428.2 68.03L528.2 193C591.2 200.1 640 254.8 640 319.1V320zM171.3 96C158.2 96 146.5 103.1 141.6 116.1L111.3 192H224V96H171.3zM272 192H445.4L378.2 108C372.2 100.4 362.1 96 353.2 96H272V192zM525.3 400C527 394.1 528 389.6 528 384C528 357.5 506.5 336 480 336C453.5 336 432 357.5 432 384C432 389.6 432.1 394.1 434.7 400C441.3 418.6 459.1 432 480 432C500.9 432 518.7 418.6 525.3 400zM205.3 400C207 394.1 208 389.6 208 384C208 357.5 186.5 336 160 336C133.5 336 112 357.5 112 384C112 389.6 112.1 394.1 114.7 400C121.3 418.6 139.1 432 160 432C180.9 432 198.7 418.6 205.3 400z"
)

# Pictogram: one custom-shaped icon per row of `data`, one facet row per year.

# Fixed two-colour palette for the 'kill/injured' / 'non' tags.
outcome_colors = alt.Scale(
    domain=['kill/injured', 'non'],
    range=['#d62728', '#ff7f0e'],
)

# One facet row per year, labelled with a rotated header.
year_facet = alt.Row(
    "year:O",
    spacing=0,
    header=alt.Header(labelAngle=-90, title='Year'),
)

icon_grid = (
    alt.Chart(data)
    # row = ceil(id / 10): ids 1-10 fall in row 1, ids 11-20 in row 2.
    .transform_calculate(row="ceil(datum.id/10)")
    # col = id - row * 10: the position inside the row (values -9 .. 0).
    .transform_calculate(col="datum.id - datum.row*10")
    .mark_point(filled=True, size=0.04)
    .encode(
        x=alt.X("col:O", axis=None),
        y=alt.Y("row:O", axis=None),
        shape=alt.ShapeValue(car),
        row=year_facet,
        color=alt.Color('color:N', scale=outcome_colors, legend=None),
    )
    # NOTE(review): this window transform writes a field named `x` that no
    # encoding reads; it is kept only to leave the chart spec unchanged.
    .transform_window(x='rank(id)', groupby=['year'])
    .properties(
        width=900,
        height=130,
        title='Count of Collisions with Injuries and Kills out of 10',
    )
)
icon_grid

The first and simpler option is to make a bar plot. Since we want the names of the days to be legible, the idea is to make a horizontal bar chart.

In [None]:
# Horizontal bar chart: number of collisions for each day of the week.

# Chronological order of the days, used for both the axis and the colour domain.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Grey for Monday-Friday, yellow highlight for Saturday and Sunday.
day_colors = ['lightgray'] * 5 + ['#fcef00'] * 2

collisions_axis = alt.X('count():Q', title='Number of Collisions')
day_axis = alt.Y('DAY NAME:O', sort=day_order, title='Day of the Week')
day_highlight = alt.Color(
    'DAY NAME:O',
    scale=alt.Scale(domain=day_order, range=day_colors),
    legend=None,
)

alt.Chart(collisions).mark_bar().encode(
    x=collisions_axis,
    y=day_axis,
    color=day_highlight,
).properties(
    width=500,
    height=200,
    title='Number of Collisions by Day of the Week',
)

As mentioned, this option is simple but effective at its task. However, we also intend to show whether there is a difference between before and after COVID-19 (that is, between 2018 and 2020), so future iterations of the visualization should take this into account.

The next visualization intends to answer the same question, but also tries to show whether some period of the day (morning, afternoon, night) has more crashes than the others. Nevertheless, the outcome is not really useful, since comparing the bars and the segments inside them is quite difficult. Moreover, the colors catch the attention, so identifying the days with more crashes is harder (and takes longer, since more variables are encoded) than in the previous visualization.

In [None]:
# Stacked horizontal bars (one segment per moment of the day) with a count
# label on every segment.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Data, stacking order and position encodings shared by both layers.
# The calculated `order` field maps each moment to 1/2/3 so the segments are
# stacked Morning -> Afternoon -> Night.
moment_base = alt.Chart(
    collisions[['DAY NAME', 'CRASH MOMENT']]
).transform_calculate(
    order="{'Morning': 1, 'Afternoon': 2, 'Night': 3}[datum['CRASH MOMENT']]"
).encode(
    x=alt.X('count():Q', title='Number of Collisions', stack='zero'),
    y=alt.Y('DAY NAME:N', sort=day_order, title='Day of the Week'),
    order=alt.Order('order:O'),
)

# Bar layer: segments coloured by moment of the day.
moment_colors = alt.Scale(
    domain=['Morning', 'Afternoon', 'Night'],
    range=['lightblue', 'lightgreen', 'pink'],
)
bars = moment_base.mark_bar().encode(
    color=alt.Color('CRASH MOMENT:N', title='Moment of the Day', scale=moment_colors),
).properties(
    width=500,
    height=200,
    title='Number of Collisions by Day of the Week',
)

# Label layer: the count of each segment, right-aligned at the segment's end.
# `detail` keeps one label per moment without adding a colour encoding.
text = moment_base.mark_text(
    align='right',
    color='black',
    baseline='middle',
).encode(
    detail=alt.Detail('CRASH MOMENT:N'),
    text=alt.Text('count():Q'),
)

bars + text

To deal with the differentiation between 2018 and 2020, an interesting option is a paired horizontal bar chart. Since there are only two classes inside every row, the comparison between instances from different rows is not as difficult as it would be if there were more categories. Therefore, we consider that this visualization accomplishes its objectives, but we will keep iterating to see if a better result can be found.

In [None]:
# Paired horizontal bars: for every day of the week, one bar per year.
df = collisions[['YEAR', 'DAY NAME']]

day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# The year is shown through colour only, so its own axis is fully hidden.
hidden_year_axis = alt.Y(
    'YEAR:O',
    axis=alt.Axis(grid=False, ticks=False, labels=False),
    title='',
)
year_colors = alt.Color(
    'YEAR:N',
    scale=alt.Scale(domain=['2018', '2020'], range=['lightblue', 'lightgreen']),
    legend=alt.Legend(title='Year'),
)
# One facet row per day, in chronological order.
day_rows = alt.Row(
    'DAY NAME:O',
    header=alt.Header(title='Day of the Week'),
    sort=day_order,
    title='Day of the Week',
)

# Count the collisions per (day, year) inside the chart, then draw the bars.
paired_bars = alt.Chart(df).transform_aggregate(
    count='count()',
    groupby=['DAY NAME', 'YEAR'],
).mark_bar().encode(
    x=alt.X('count:Q', title='Number of Collisions'),
    y=hidden_year_axis,
    color=year_colors,
    row=day_rows,
).properties(
    title='Number of Collisions by Day of the Week and Year'
)
paired_bars

In [None]:
# Show the column labels of `collisions` (inspection only; nothing is modified).
collisions.columns

In [None]:
# Paired horizontal bars: for each day type (Weekday / Weekend), one bar per year.
# Relies on collisions['WEEKDAY'], which is created in an earlier cell.
df = collisions[['YEAR', 'DAY NAME', 'WEEKDAY']]

# NOTE(review): the bars show raw totals, so 'Weekday' pools five day names
# and 'Weekend' only two; compare years within a row rather than rows with
# each other.
alt.Chart(df).transform_aggregate(
    count='count()',
    groupby=['WEEKDAY', 'YEAR']
).mark_bar().encode(
    x=alt.X('count:Q',
            title='Number of Collisions'),
    # The year is shown through colour only, so its own axis is hidden.
    y=alt.Y('YEAR:O',
            axis=alt.Axis(grid=False, ticks=False, labels=False),
            title=''),
    color=alt.Color('YEAR:N',
                    scale=alt.Scale(domain=['2018', '2020'],
                                    range=['lightblue', 'lightgreen']),
                    legend=alt.Legend(title='Year')),
    # Fix: the facet is the day type, so sort by its actual values and label
    # it 'Day Type' (the previous sort list and titles were copied from the
    # day-of-week chart and did not match the WEEKDAY field).
    row=alt.Row('WEEKDAY:O',
                header=alt.Header(title='Day Type', labelAngle=0),
                sort=['Weekday', 'Weekend'],
                title='Day Type')
).properties(
    title='Number of Collisions by Day Type and Year'
)

Another option we considered useful was a small-multiples version of the initial bar chart, one for each year. Having only two bar charts that share the same axis and are aligned makes the comparison between instances of the different charts quite easy and, of course, the comparison between instances of the same chart is as good as in the initial plot.

In [None]:
# Small multiples of the day-of-week bar chart: one panel (column) per year.

# Fix: list the days chronologically, as in the other day-of-week charts.
# Sorting by the 'DAY NAME' field itself ordered them alphabetically
# (Friday, Monday, Saturday, ...).
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

alt.Chart(collisions).mark_bar().encode(
    # Explicit title for consistency with the other charts (the default
    # would be 'Count of Records').
    x=alt.X('count():Q', title='Number of Collisions'),
    y=alt.Y('DAY NAME:O',
            title='Day of the Week',
            sort=day_order),
    # The column header already names the year, so no colour legend is needed.
    color=alt.Color('YEAR:O',
                    scale=alt.Scale(domain=['2018', '2020'], range=['lightblue', 'lightgreen']),
                    legend=None),
    column=alt.Column('YEAR:O', title='Year'),
).properties(
    width=200,
    height=200,
    title='Number of Collisions by Day of the Week (2018 vs. 2020)'
)

In [None]:
# Overlay of the 2018 and 2020 day-of-week bar charts on a single set of axes.

# Days in chronological order (the previous field-based sort was alphabetical).
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Shared colour encoding. The legend is now shown: without it the two
# overlaid years could not be told apart.
year_color = alt.Color(
    'YEAR:O',
    scale=alt.Scale(domain=['2018', '2020'], range=['blue', 'green']),
    legend=alt.Legend(title='Year'),
)

# One semi-transparent bar layer per year. Separate layers (instead of one
# chart coloured by year) keep the bars overlapping rather than stacked.
year_layers = [
    alt.Chart(collisions).mark_bar(
        opacity=0.4
    ).encode(
        x=alt.X('count():Q', title='Number of Collisions'),
        y=alt.Y('DAY NAME:O', title='Day of the Week', sort=day_order),
        color=year_color,
    ).transform_filter(
        alt.datum.YEAR == year
    )
    for year in ('2018', '2020')
]

alt.layer(*year_layers).properties(
    width=600,
    height=200,
    title='Number of Collisions by Day of the Week (2018 vs. 2020)'
)