In [None]:
import pandas as pd
import altair as alt

# Directory holding the cleaned CSV exports, relative to this notebook.
# NOTE(review): this name shadows the builtin `dir`; it is left unchanged
# because other cells may refer to it.
dir = '../Data'

# Switch Altair to the "vegafusion" data transformer before any chart is built.
alt.data_transformers.enable("vegafusion")

# Load the three pre-processed tables used throughout the notebook.
weather = pd.read_csv(f'{dir}/weather_clean.csv')
weather_aggregated = pd.read_csv(f'{dir}/weather_aggregated.csv')
collisions = pd.read_csv(f'{dir}/collisions_clean.csv')

# YEAR is the first four characters of the CRASH DATE text.
# NOTE(review): assumes CRASH DATE starts with a four-digit year -- confirm
# against the CSV contents.
_crash_date_text = collisions['CRASH DATE'].astype(str)
collisions['YEAR'] = _crash_date_text.str[:4]

In [None]:
# Parse DATE and store it back as 'YYYY-MM-DD' text.
_weather_dates = pd.to_datetime(weather_aggregated['DATE'])
weather_aggregated['DATE'] = _weather_dates.dt.strftime('%Y-%m-%d')

# Preview the first rows.
weather_aggregated.head()

## Is there a correlation between weather conditions and accidents?

In [None]:
# Parse CRASH DATE and store it back as 'YYYY-MM-DD' text.
_crash_dates = pd.to_datetime(collisions['CRASH DATE'])
collisions['CRASH DATE'] = _crash_dates.dt.strftime('%Y-%m-%d')

# Count the COLLISION_ID values recorded on each crash date.
collisions_by_day = (
    collisions
    .groupby('CRASH DATE')[['COLLISION_ID']]
    .count()
    .reset_index()
)
collisions_by_day.head()

In [None]:
# Join the weather table with the per-date collision counts (default inner
# join, so only dates present in both tables are kept), then drop the
# duplicated key column and give the count column a descriptive name.
merged_data = (
    pd.merge(
        weather_aggregated,
        collisions_by_day,
        left_on='DATE',
        right_on='CRASH DATE',
    )
    .drop(columns=['CRASH DATE'])
    .rename(columns={'COLLISION_ID': 'COLLISION COUNT'})
)
merged_data.head()

In [None]:
# Summary statistics of the merged table, shown as the cell output.
merged_data.describe()

In [None]:
# Treat the wind (Beaufort) and precipitation scales as categorical values.
merged_data['BEAUFORT_SCALE'] = merged_data['BEAUFORT_SCALE'].astype('category')
merged_data['PRCP_SCALE'] = merged_data['PRCP_SCALE'].astype('category')

# Min temperature is 11, max temperature is 29.
# Bins are 2 degrees wide and left-closed (right=False), so the last edge must
# be 30 for the [28, 30) bin to exist. With range(10, 30, 2) the last edge was
# 28, so every day with a mean temperature of 28 or more fell outside all bins
# and got TEMP_SCALE = NaN.
bin_edges = list(range(10, 31, 2))
merged_data['TEMP_SCALE'] = pd.cut(merged_data['MEAN_TEMP'], bins=bin_edges, right=False)

In [None]:
# Constant 1 on every row, so that summing this column within a group gives
# the number of rows in that group.
merged_data['CASES_COUNT'] = 1

In [None]:
# Total collisions and number of rows for each wind/temperature combination.
# Both grouping keys are categorical, so `observed` is set explicitly: the
# implicit default is deprecated and differs between pandas versions.
# observed=False keeps one row for every category combination, including
# those with no data.
agg_data = (
    merged_data
    .groupby(['BEAUFORT_SCALE', 'TEMP_SCALE'], observed=False)
    .agg({'COLLISION COUNT': 'sum', 'CASES_COUNT': 'sum'})
    .reset_index()
)

# Change the column types: wind scale stays categorical, temperature bins
# become their text labels.
agg_data['BEAUFORT_SCALE'] = agg_data['BEAUFORT_SCALE'].astype('category')
agg_data['TEMP_SCALE'] = agg_data['TEMP_SCALE'].astype('str')

# Collisions per row in each group. Combinations with no rows give 0 / 0,
# which is NaN.
agg_data['NORMALIZED_COLLISION_COUNT'] = agg_data['COLLISION COUNT'] / agg_data['CASES_COUNT']

agg_data.head()

In [None]:
# Fill NaN values of NORMALIZED_COLLISION_COUNT with 0 (currently disabled).
#agg_data['NORMALIZED_COLLISION_COUNT'].fillna(0, inplace=True)

Given that there is almost no rain in summertime, we decided for this first visualization to analyze only the categorized average daily wind speed and the average daily temperature. Since we want to encode two keys (wind and temperature) and one value (number of crashes), a heatmap is a good option. To build it, we also binned the temperature, taking into account the maximum and minimum temperatures registered. The result is correct and relatively easy to interpret, but values are missing for some combinations, which is to be expected since the weather does not usually vary much within a single season.

In [None]:
# Heatmap: temperature bin on x, Beaufort wind scale on y, colour showing the
# normalised collision count (COLLISION COUNT divided by CASES_COUNT).
# The chart title used to name the precipitation scale although x encodes the
# temperature bins, and the legend said "Collision Count" although the colour
# is the normalised value; both labels now describe what is plotted.
alt.Chart(agg_data).mark_rect().encode(
    x=alt.X('TEMP_SCALE:O',
            title='Temperature Scale',
            axis=alt.Axis(labelAngle=0)),
    y=alt.Y('BEAUFORT_SCALE:N',
            title='Beaufort Scale'),
    color=alt.Color('NORMALIZED_COLLISION_COUNT:Q',
                    scale=alt.Scale(range=['#f0fff1', '#5603ad']),
                    title='Mean Collisions per Day')
).properties(
    width=500,
    height=300,
    title='Mean Daily Collision Count by Beaufort Scale and Temperature Scale'
)

In [None]:
# List the columns available in the merged table, shown as the cell output.
merged_data.columns

The other option we consider useful is a set of small multiples: scatterplots of collisions versus each of the three weather variables (precipitation, temperature and wind speed). This visualization is a simple and clear way to show whether there is any correlation between the number of accidents and these meteorological factors.

In [None]:
# Mark styling shared by the three scatterplots.
_points = alt.Chart(merged_data).mark_point(filled=True, size=100, opacity=0.5)

# Every panel plots the collision count on y.
_collisions_y = alt.Y('COLLISION COUNT:Q', title='Number of Collisions')

# Panel 1: collisions vs. mean temperature, coloured by year.
t1 = _points.encode(
    x=alt.X(
        'MEAN_TEMP:Q',
        title='Mean Temperature',
        scale=alt.Scale(domain=[10, 30]),
    ),
    y=_collisions_y,
    color=alt.Color('year(DATE):N', title='Year'),
).properties(title='Number of Collisions by Mean Temperature')

# Panel 2: collisions vs. precipitation, coloured by year.
t2 = _points.encode(
    x=alt.X('PRCP:Q', title='Precipitation'),
    y=_collisions_y,
    color=alt.Color('year(DATE):N', title='Year'),
).properties(title='Number of Collisions by Precipitation')

# Panel 3: collisions vs. wind speed; this panel also sets an explicit
# two-colour range for the year encoding.
t3 = _points.encode(
    x=alt.X(
        'AWND:Q',
        scale=alt.Scale(domain=[1, 6.5]),
        title='Wind Speed',
    ),
    y=_collisions_y,
    color=alt.Color(
        'year(DATE):N',
        scale=alt.Scale(range=['#A7C9C7', '#8367C7']),
        title='Year',
    ),
).properties(title='Number of Collisions by Wind Speed')

# Show the three panels side by side.
t1 | t2 | t3

Since we are already using a heatmap for another question, we decided to use the small multiples of scatterplots. As mentioned, this option is effective and lets the user see at a glance whether there is any clear correlation. The main disadvantage of this plot is that it may occupy more space than other options.