In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Input files, relative to the notebook's working directory.
accidents_path = 'datasets/accidents.pkl'
weather_path = 'datasets/hourly_weather.pkl'
join_path = 'datasets/dataset.csv'

# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df_accidents = pd.read_pickle(accidents_path)
df_weather = pd.read_pickle(weather_path)
# NOTE(review): the recorded run of this cell failed with FileNotFoundError on
# this CSV; every cell that reads df_join needs the file to be present.
df_join = pd.read_csv(join_path)

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/dataset.csv'

In [None]:
# Preview the first rows of the accidents frame.
df_accidents.head()

In [None]:
# Preview the first rows of the weather frame.
df_weather.head()

In [None]:
# Preview the first rows of the frame read from 'datasets/dataset.csv'.
df_join.head()

In [None]:
# List the column names available in the weather frame.
df_weather.columns

In [None]:
# Summary statistics for the columns of df_join.
df_join.describe()

### Visualizations of accident data

**Target Distribution**

In [None]:
# Count of rows per value of the 'Accident' column, drawn as outlined bars
# (fully transparent fill, thick coloured edges).
ax = sns.countplot(
    x='Accident',
    data=df_join,
    facecolor=(0, 0, 0, 0),
    linewidth=5,
    edgecolor=sns.color_palette("dark", 3),
)
ax.set_title('Fig.1) Accident Distribution', fontweight='bold')
ax.set_ylabel('Count', fontweight='bold')
ax.set_xlabel('Accident', fontweight='bold')
plt.show()

In [None]:
# Columns of df_join to compare across the values of 'Accident'.
cols_list = ['air_temperature', 'water_temperature', 'wind_gust_max_10min',
       'wind_speed_avg_10min', 'wind_force_avg_10min', 'wind_direction',
       'windchill', 'barometric_pressure_qfe', 'precipitation', 'dew_point',
       'global_radiation', 'humidity', 'water_level']

# Grid shape: three panels per row, with as many rows as the list needs
# (ceil division; 5 rows for the 13 columns above).
n_cols = 3
n_rows = -(-len(cols_list) // n_cols)

plt.figure(figsize=(15, 15))

# One box plot per column, split by 'Accident'; panels are numbered from 1.
for i, col in enumerate(cols_list, start=1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(y=col, x='Accident', data=df_join)
    plt.title('Fig.{}: {}'.format(i, col), fontweight='bold')
    plt.ylabel('')  # the column name is already in the panel title

# Lay out once, after every panel has its title and labels, so that the last
# panel's title is taken into account as well.
plt.tight_layout()
plt.show()

In [None]:
# Mean of the 'air_temperature' column of the weather frame.
df_weather['air_temperature'].mean()

In [None]:
# List the column names available in the accidents frame.
df_accidents.columns

In [None]:
# Number of accidents per severity category, as a {category: count} dict.
df_accidents['AccidentSeverityCategory'].value_counts().to_dict()

In [None]:
# Boolean masks: True where the accident record is flagged (== 1) for the
# given road-user type.
involves_ped = df_accidents['AccidentInvolvingPedestrian'] == 1
involves_bike = df_accidents['AccidentInvolvingBicycle'] == 1
involves_moto = df_accidents['AccidentInvolvingMotorcycle'] == 1

# Accidents flagged for all three types at once.
all_ = int((involves_ped & involves_bike & involves_moto).sum())

# NOTE(review): despite the "_only_" names, these count every accident flagged
# for the type, including accidents that are also flagged for another type.
# Confirm whether exclusive counts were intended before using them.
bicycle_only_ = int(involves_bike.sum())
moto_only_ = int(involves_moto.sum())
ped_only_ = int(involves_ped.sum())

# Pairwise overlaps (each still includes accidents flagged for all three).
moto_bicycle_ = int((involves_moto & involves_bike).sum())
moto_ped_ = int((involves_moto & involves_ped).sum())
ped_bicycle_ = int((involves_ped & involves_bike).sum())

**Distribution of accidents across hours:**

In [None]:
# Number of accidents recorded for each value of 'Hour', as {hour: count}.
hourly_accs = df_accidents['Hour'].value_counts().to_dict()

# Re-key hour 0 as 24 so that it is drawn at the right-hand end of the axis.
# Guarded: when no accident was recorded at hour 0 there is nothing to move,
# and an unconditional pop would raise KeyError.
if 0.0 in hourly_accs:
    hourly_accs[24.0] = hourly_accs.pop(0.0)

# Materialise keys/values once so positions, heights and tick labels are
# guaranteed to share the same order.
hours = list(hourly_accs.keys())
counts = list(hourly_accs.values())
labels = [int(hour) for hour in hours]

# Bar chart of the absolute counts, each bar ticked with its integer hour.
# No `label=` is passed: the bars are raw counts and no legend is drawn.
plt.title('Fig.1) Absolute Number of Accidents per time of day', fontweight = 'bold')
plt.bar(hours, counts, color='darkblue', edgecolor='white', tick_label=labels)
plt.ylabel('Number of Accidents')
plt.xlabel('Time of day')
plt.show()

**Mean and median number of accidents per calendar month:**

In [None]:
from itertools import groupby
import calendar

# Years and months to tabulate.
years = list(range(2011, 2020))
months = list(range(1, 13))

# dict_1: year (as a string) -> list of accident counts, one entry per month
# in `months` order. A count is the number of rows of df_accidents matching
# that Year and Month.
dict_1 = {
    str(year): [
        df_accidents[(df_accidents['Year'] == year) & (df_accidents['Month'] == month)].shape[0]
        for month in months
    ]
    for year in years
}

# monthly_avgs_dict: first three letters of the month name -> [mean, median]
# of that month's counts taken across all years in dict_1.
monthly_avgs_dict = {}
for month_idx in range(12):
    same_month_counts = [counts[month_idx] for counts in dict_1.values()]
    short_name = calendar.month_name[month_idx + 1][:3]
    monthly_avgs_dict[short_name] = [np.mean(same_month_counts), np.median(same_month_counts)]

# Bar heights: means in bars1, medians in bars2.
bars1 = [stats[0] for stats in monthly_avgs_dict.values()]
bars2 = [stats[1] for stats in monthly_avgs_dict.values()]

barWidth = 0.3

# Left-hand bar positions, and the same positions shifted by one bar width.
r1 = np.arange(len(bars1))
r2 = [pos + barWidth for pos in r1]

# Grouped bars: mean next to median for every month.
plt.bar(r1, bars1, width=barWidth, color='darkblue', edgecolor='white', label='Mean')
plt.bar(r2, bars2, width=barWidth, color='green', edgecolor='white', label='Median')

plt.title('Fig.2: Mean and Median of Accidents per Month', fontweight = 'bold')
plt.xlabel('Month', fontweight='bold')
plt.ylabel('Number of Accidents', fontweight='bold')
# Ticks sit between the two bars of each group.
tick_positions = [pos + (barWidth-0.15) for pos in range(len(bars1))]
plt.xticks(tick_positions, list(monthly_avgs_dict))

plt.legend()
plt.show()

**Mean and median of monthly accident counts per year:**

In [None]:
# yearly_avgs_dict: year key -> [mean, median] of the list stored for that
# year in dict_1 (dict_1 is built by an earlier cell).
yearly_avgs_dict = {
    year: [np.mean(counts), np.median(counts)]
    for year, counts in dict_1.items()
}

# Bar heights: means in bars1, medians in bars2.
bars1 = [stats[0] for stats in yearly_avgs_dict.values()]
bars2 = [stats[1] for stats in yearly_avgs_dict.values()]

barWidth = 0.3

# Left-hand bar positions, and the same positions shifted by one bar width.
r1 = np.arange(len(bars1))
r2 = [pos + barWidth for pos in r1]

# Grouped bars: mean next to median for every year.
plt.bar(r1, bars1, width=barWidth, color='darkblue', edgecolor='white', label='Mean')
plt.bar(r2, bars2, width=barWidth, color='green', edgecolor='white', label='Median')

plt.title('Fig.3: Mean and Median of Accidents per Year', fontweight = 'bold')
plt.xlabel('Year', fontweight='bold')
plt.ylabel('Number of Accidents', fontweight='bold')
# Ticks sit between the two bars of each group.
tick_positions = [pos + (barWidth-0.15) for pos in range(len(bars1))]
plt.xticks(tick_positions, list(yearly_avgs_dict))

plt.legend()
plt.show()

**Number of accidents per location:**

In [None]:
# {value of 'AccidentLocation_CHLV95_E': number of accidents with that value}
location_dict = df_accidents['AccidentLocation_CHLV95_E'].value_counts().to_dict()
# Materialise keys/values once; they are reused for the scatter and the line.
locations = list(location_dict.keys())
counts = list(location_dict.values())

plt.title('Fig.4) Distribution of accidents across Locations', fontweight = 'bold')
plt.scatter(locations, counts, label = '# accidents')
plt.ylabel('Absolute number of accidents')
plt.xlabel('Accident Location (Code)')
# Horizontal reference line at the mean count, spanning the observed
# location range.
plt.hlines(np.mean(counts), min(locations), max(locations),
           color = 'black', label = 'Average # accidents \nper location')
plt.legend(loc = 'upper left')
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the Legend object's repr.
plt.show()

### Visualizations of weather data

In [None]:
# List the column names available in the weather frame.
df_weather.columns

**Average temperature per month:**

In [None]:
from itertools import groupby
import calendar

# Years and months to tabulate.
years = np.arange(2011,2020,1).tolist()
months = np.arange(1,13,1).tolist()

# dict_2: year (as a string) -> list of mean 'air_temperature' values, one
# entry per month in `months` order.
dict_2 = {}

for j in years:
    temperature_list = []

    for i in months:
        in_month = (df_weather['Year'] == j) & (df_weather['Month'] == i)
        # Mean of the readings for this year/month (NaN when there are no
        # rows). The original appended the row count here instead.
        temperature_list.append(df_weather.loc[in_month, 'air_temperature'].mean())
    # Store under dict_2. The original wrote to dict_1, which overwrote the
    # accident counts built earlier and left dict_2 empty.
    dict_2['{}'.format(j)] = temperature_list

**Average precipitation per month:**

In [None]:
a