In [2]:
import pandas as pd
import altair as alt

# Folder that holds the input CSV files, relative to this notebook.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept here because other cells may still reference it.
dir = '../Data'

# Switch Altair's data transformer to VegaFusion before any chart is built.
alt.data_transformers.enable("vegafusion")

# Read the three input tables.
weather = pd.read_csv(f'{dir}/weather_clean.csv')
collisions = pd.read_csv(f'{dir}/collisions_clean.csv')
weather_aggregated = pd.read_csv(f'{dir}/weather_aggregated.csv')

# YEAR is the first four characters of CRASH DATE rendered as text
# (assumes the value starts with a four-digit year -- TODO confirm).
collisions['YEAR'] = collisions['CRASH DATE'].astype(str).str.slice(0, 4)

In [3]:
# Parse DATE and write it back as 'YYYY-MM-DD' text.
weather_aggregated = weather_aggregated.assign(
    DATE=lambda frame: pd.to_datetime(frame['DATE']).dt.strftime('%Y-%m-%d')
)
weather_aggregated.head()

Unnamed: 0,DATE,AWND,PRCP,TMAX,TMIN,TOBS,BEAUFORT_SCALE,PRCP_SCALE,MEAN_TEMP
0,2018-06-01,1.625,4.354762,24.921429,17.371429,18.683333,Light Breeze,Slight,20.325397
1,2018-06-02,3.4875,5.296296,29.330769,18.8,21.825,Gentle Breeze,Slight,23.31859
2,2018-06-03,5.0,6.153165,23.8,12.353846,15.125,Gentle Breeze,Slight,17.092949
3,2018-06-04,2.8875,13.223256,20.7,10.392308,10.75,Light Breeze,Slight,13.947436
4,2018-06-05,3.4875,0.801266,23.092857,11.5,13.233333,Gentle Breeze,Slight,15.942063


## Is there a correlation between weather conditions and accidents?

In [4]:
# Rewrite CRASH DATE as 'YYYY-MM-DD' text so it can be matched against the
# weather table's DATE column, which uses the same format.
collisions['CRASH DATE'] = (
    pd.to_datetime(collisions['CRASH DATE'])
    .dt.strftime('%Y-%m-%d')
)

# One row per day; COLLISION_ID holds the number of non-null ids on that day.
collisions_by_day = (
    collisions
    .groupby('CRASH DATE', as_index=False)['COLLISION_ID']
    .count()
)
collisions_by_day.head()

Unnamed: 0,CRASH DATE,COLLISION_ID
0,2018-06-01,751
1,2018-06-02,622
2,2018-06-03,525
3,2018-06-04,698
4,2018-06-05,688


In [5]:
# Join daily weather with daily collision counts on the calendar day, then
# drop the duplicate date column and give the count a descriptive name.
merged_data = (
    pd.merge(weather_aggregated, collisions_by_day, left_on='DATE', right_on='CRASH DATE')
    .drop(columns=['CRASH DATE'])
    .rename(columns={'COLLISION_ID': 'COLLISION COUNT'})
)
merged_data.head()

Unnamed: 0,DATE,AWND,PRCP,TMAX,TMIN,TOBS,BEAUFORT_SCALE,PRCP_SCALE,MEAN_TEMP,COLLISION COUNT
0,2018-06-01,1.625,4.354762,24.921429,17.371429,18.683333,Light Breeze,Slight,20.325397,751
1,2018-06-02,3.4875,5.296296,29.330769,18.8,21.825,Gentle Breeze,Slight,23.31859,622
2,2018-06-03,5.0,6.153165,23.8,12.353846,15.125,Gentle Breeze,Slight,17.092949,525
3,2018-06-04,2.8875,13.223256,20.7,10.392308,10.75,Light Breeze,Slight,13.947436,698
4,2018-06-05,3.4875,0.801266,23.092857,11.5,13.233333,Gentle Breeze,Slight,15.942063,688


In [6]:
# Summary statistics for the numeric columns of the merged weather/collision table.
merged_data.describe()

Unnamed: 0,AWND,PRCP,TMAX,TMIN,TOBS,MEAN_TEMP,COLLISION COUNT
count,244.0,244.0,244.0,244.0,244.0,244.0,244.0
mean,3.100534,4.261346,27.988266,18.638808,19.7006,22.109225,474.344262
std,0.911683,7.272843,3.793126,3.675226,3.653244,3.549131,189.606079
min,1.475,0.0,17.392308,6.785714,7.7,11.334921,196.0
25%,2.4375,0.019747,25.476786,16.659821,17.3125,20.287056,300.0
50%,2.966071,0.897468,28.259066,19.194048,20.339286,22.611111,428.5
75%,3.5875,5.289804,30.973626,21.42761,22.351429,24.767262,657.25
max,6.1625,49.176531,35.366667,25.435714,26.766667,29.055556,845.0


In [7]:
# Store the wind and precipitation labels as categoricals.
merged_data['BEAUFORT_SCALE'] = merged_data['BEAUFORT_SCALE'].astype('category')
merged_data['PRCP_SCALE'] = merged_data['PRCP_SCALE'].astype('category')

# Bin MEAN_TEMP into left-closed intervals of TEMP_BIN_WIDTH degrees.
# The edges are derived from the data so that the last edge is always above the
# maximum: with right=False a value equal to or above the final edge falls
# outside every bin and becomes NaN. The previous hard-coded range(10, 30, 2)
# stopped at 28, so days with a mean temperature of 28 or more (the observed
# maximum is about 29.06) were silently dropped from every later groupby.
TEMP_BIN_WIDTH = 2
temp_lower_edge = int(merged_data['MEAN_TEMP'].min() // TEMP_BIN_WIDTH) * TEMP_BIN_WIDTH
temp_upper_edge = (int(merged_data['MEAN_TEMP'].max() // TEMP_BIN_WIDTH) + 1) * TEMP_BIN_WIDTH
bin_edges = list(range(temp_lower_edge, temp_upper_edge + TEMP_BIN_WIDTH, TEMP_BIN_WIDTH))
merged_data['TEMP_SCALE'] = pd.cut(merged_data['MEAN_TEMP'], bins=bin_edges, right=False)

In [8]:
# Constant 1 per row, so that summing CASES_COUNT in a groupby counts rows.
merged_data = merged_data.assign(CASES_COUNT=1)

In [9]:
# Total collisions and number of rows (days) for every combination of wind
# category and temperature bin.
# Both grouping keys are categorical, so `observed` is passed explicitly:
# relying on the default raises a FutureWarning and the default is changing.
# observed=False keeps the current result, i.e. combinations that never occur
# still appear as rows (with sums of 0), giving the heat map a full grid.
agg_data = (
    merged_data
    .groupby(['BEAUFORT_SCALE', 'TEMP_SCALE'], observed=False)
    .agg({'COLLISION COUNT': 'sum', 'CASES_COUNT': 'sum'})
    .reset_index()
)

# Keep the wind label categorical; turn the temperature intervals into their
# text form such as "[10, 12)" (presumably so the chart library can handle
# them -- verify).
agg_data['BEAUFORT_SCALE'] = agg_data['BEAUFORT_SCALE'].astype('category')
agg_data['TEMP_SCALE'] = agg_data['TEMP_SCALE'].astype('str')

# Mean collisions per row of merged_data in each cell. Empty combinations
# compute 0 / 0 and therefore come out as NaN.
agg_data['NORMALIZED_COLLISION_COUNT'] = agg_data['COLLISION COUNT'] / agg_data['CASES_COUNT']

agg_data.head()

  agg_data = merged_data.groupby(['BEAUFORT_SCALE', 'TEMP_SCALE']).agg({'COLLISION COUNT': 'sum', 'CASES_COUNT': 'sum'}).reset_index()


Unnamed: 0,BEAUFORT_SCALE,TEMP_SCALE,COLLISION COUNT,CASES_COUNT,NORMALIZED_COLLISION_COUNT
0,Gentle Breeze,"[10, 12)",307,1,307.0
1,Gentle Breeze,"[12, 14)",801,3,267.0
2,Gentle Breeze,"[14, 16)",2844,5,568.8
3,Gentle Breeze,"[16, 18)",1581,3,527.0
4,Gentle Breeze,"[18, 20)",6193,13,476.384615


In [10]:
# Replace NaN in NORMALIZED_COLLISION_COUNT with 0.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas and does not update the frame under copy-on-write.
agg_data['NORMALIZED_COLLISION_COUNT'] = agg_data['NORMALIZED_COLLISION_COUNT'].fillna(0)

In [11]:
# Heat map of mean collisions per day: temperature bin on x, wind category on y.
# The titles name the fields that are actually encoded; the chart title used to
# say "Precipitation Scale" although the x axis is the temperature bin, and the
# legend said "Collision Count" although the colour is the per-day mean.
alt.Chart(agg_data).mark_rect().encode(
    x=alt.X('TEMP_SCALE:O',
            title='Temperature Scale',
            axis=alt.Axis(labelAngle=0)),
    y=alt.Y('BEAUFORT_SCALE:N',
            title='Beaufort Scale'),
    color=alt.Color('NORMALIZED_COLLISION_COUNT:Q', title='Mean Collisions per Day')
).properties(
    width=500,
    height=300,
    title='Mean Daily Collisions by Beaufort Scale and Temperature Scale'
)

In [15]:
# List the columns currently present in merged_data.
merged_data.columns

Index(['DATE', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'TOBS', 'BEAUFORT_SCALE',
       'PRCP_SCALE', 'MEAN_TEMP', 'COLLISION COUNT', 'TEMP_SCALE',
       'CASES_COUNT'],
      dtype='object')

In [36]:
def collisions_scatter(data, x_field, x_title, chart_title, x_domain=None):
    """Scatter plot of daily collision count against one weather variable.

    Parameters
    ----------
    data : DataFrame with the columns `x_field`, 'COLLISION COUNT' and 'DATE'.
    x_field : name of the quantitative column drawn on the x axis.
    x_title : x-axis title.
    chart_title : title shown above the chart.
    x_domain : optional [low, high] pair fixing the x-axis range; when omitted
        the x scale is left at Altair's default.

    Returns
    -------
    The Altair chart, with points coloured by the year of DATE.
    """
    x_options = {'title': x_title}
    if x_domain is not None:
        x_options['scale'] = alt.Scale(domain=x_domain)

    return alt.Chart(data).mark_point(
        filled=True,
        size=100,
        opacity=0.5
    ).encode(
        x=alt.X(f'{x_field}:Q', **x_options),
        y=alt.Y('COLLISION COUNT:Q',
                title='Number of Collisions'),
        color=alt.Color('year(DATE):N',
                        title='Year'),
    ).properties(
        title=chart_title
    )


# The three panels differ only in the x variable, its titles and axis range.
t1 = collisions_scatter(merged_data, 'MEAN_TEMP', 'Mean Temperature',
                        'Number of Collisions by Mean Temperature',
                        x_domain=[10, 30])

t2 = collisions_scatter(merged_data, 'PRCP', 'Precipitation',
                        'Number of Collisions by Precipitation')

t3 = collisions_scatter(merged_data, 'AWND', 'Wind Speed',
                        'Number of Collisions by Wind Speed',
                        x_domain=[1, 6.5])

# Show the three panels side by side.
t1 | t2 | t3