In [None]:
from seaborn.utils import np, os, plt, pd
import seaborn as sns
import copy
from sklearn import feature_selection as s_fs, preprocessing as s_prep, model_selection as s_ms, metrics as s_mtr
# from mlxtend import frequent_patterns as mfp

In [None]:
# Notebook-wide pandas display settings: three decimal places, and no cap on
# the number of columns shown.
pd.set_option('display.precision', 3)
pd.set_option('display.max_columns', None)

In [None]:
from project_methods import RoadAccidents as proj

In [None]:
# Seaborn style applied to the figures drawn from here on.
sns.set_style('whitegrid')

#### Read in datasets

In [None]:
# Load the 2019 accidents table.
# read_csv already returns a brand-new DataFrame, so the former copy.deepcopy
# wrapper only doubled peak memory without protecting anything.
file_path = "Road Safety Data - Accidents 2019.csv"
accidents = pd.read_csv(file_path, low_memory=False)

In [None]:
# Load the 2019 vehicles table.
# read_csv already returns a brand-new DataFrame, so the former copy.deepcopy
# wrapper only doubled peak memory without protecting anything.
file_path = "Road Safety Data- Vehicles 2019.csv"
vehicles = pd.read_csv(file_path, low_memory=False)

In [None]:
# Load the 2019 casualties table.
# read_csv already returns a brand-new DataFrame, so the former copy.deepcopy
# wrapper only doubled peak memory without protecting anything.
file_path = "Road Safety Data - Casualties 2019.csv"
casualties = pd.read_csv(file_path, low_memory=False)

In [None]:
# variable lookup spreadsheet
# sheet_name=None loads every sheet, so var_look is a dict of
# {sheet name: DataFrame} rather than a single frame.
file_path = 'variable lookup.xls'
var_look = pd.read_excel(file_path, sheet_name=None)

In [None]:
# cas_adjustment lookup CSV (read with pandas defaults)
file_path = 'cas_adjustment_lookup_2019.csv'
cas_adj_lookup = pd.read_csv(file_path)

In [None]:
# accidents.info()

In [None]:
# vehicles.info()

In [None]:
# casualties.info()

In [None]:
# cas_adj_lookup.info()

#### First look at each dataset

In [None]:
# accidents.head()

In [None]:
# vehicles.head()

In [None]:
# casualties.head()

In [None]:
# cas_adj_lookup.head()

In [None]:
# var_look.keys() # sheet names for feature description

#### Convert dataframe columns to lowercase

In [None]:
# Replace the column labels with the cleaned-up names returned by the project helper.
accidents.columns = proj.cleanup_cols(accidents)
# accidents.columns

In [None]:
# Replace the column labels with the cleaned-up names returned by the project helper.
vehicles.columns = proj.cleanup_cols(vehicles)
# vehicles.columns

In [None]:
# Replace the column labels with the cleaned-up names returned by the project helper.
casualties.columns = proj.cleanup_cols(casualties)
# casualties.columns

In [None]:
# Replace the column labels with the cleaned-up names returned by the project helper.
cas_adj_lookup.columns = proj.cleanup_cols(cas_adj_lookup)
# cas_adj_lookup.columns

## MERGE ACCIDENTS, VEHICLES, CASUALTIES DATASETS

In [None]:
# Inner join of accidents with vehicles on accident_index; accidents without
# a matching vehicle row (and vice versa) are dropped.
acc_veh = accidents.merge(vehicles, how='inner', on='accident_index')
# acc_veh.info()

In [None]:
# Inner join of the accident/vehicle frame with casualties on accident_index.
# NOTE(review): the key is accident_index only, so each vehicle row of an
# accident is paired with every casualty row of that accident (many-to-many).
# Confirm this row multiplication is intended for the counts computed later.
acc_veh_cas = acc_veh.merge(casualties, how='inner', on='accident_index')
# acc_veh_cas.info()

## DATA CLEANING

#### Missing Data Analysis
In order to work efficiently with missing data, I convert the null placeholder, -1, back to NaN.

In [None]:
# Per the note above, -1 is the datasets' null placeholder; the helper is
# expected to turn it back into NaN across the merged frame.
nan_acc = proj.placeholder_to_nan(acc_veh_cas)
# nan_acc.info()

In [None]:
# Length of the nulls-only summary — presumably the number of columns that
# contain at least one null (null_checker is defined in project_methods).
len(proj.null_checker(nan_acc, only_nulls=True))

In [None]:
# Plot the null counts (with percentages) per column of the merged frame and
# save the figure. Title typo fixed: "Mising" -> "Missing".
proj.visualize_nulls(nan_acc, plot_title='Number of Missing Values per Column in the Original Datasets: Accident, Casualties & Vehicles',
                     fig_size=(10, 12), annot_size=9, top_labe_gap=8699, include_perc=True, use_bar=True, savefig=True)


**DROPPING NULL ENTRIES IN COLUMNS WHERE NULLS <= 2.5% OF TOTAL NUMBER OF ENTRIES:** <br>
I do not expect these dropped entries to significantly affect the information in the dataset.

In [None]:
# Null share per column (in_perc=True, nulls-only), sorted ascending.
null_freq = proj.null_checker(nan_acc, in_perc=True, only_nulls=True).sort_values()
# Labels of the columns whose null share is at most 2.5. Despite the name, these
# columns are not dropped; they select which ROWS get dropped further down.
drop_cols = null_freq.index[null_freq <= 2.5]
# print(list(drop_cols))

The following features each have null values in at most 2.5% of their entries, i.e. an insignificant number of missing values:<br>
('pedestrian_location', 'sex_of_driver', 'light_conditions', 'casualty_type', 'journey_purpose_of_driver', 'pedestrian_road_maintenance_worker', 'latitude', 'longitude', 'location_northing_osgr', 'location_easting_osgr', 'speed_limit', 'bus_or_coach_passenger', 'time', '2nd_road_number', 'vehicle_type', 'car_passenger', 'sex_of_casualty', 'road_surface_conditions', 'junction_detail', 'carriageway_hazards', 'special_conditions_at_site', 'towing_and_articulation', 'age_of_casualty', 'age_band_of_casualty', 'pedestrian_crossing-physical_facilities', 'pedestrian_crossing-human_control').
<br>

In [None]:
# DROPPING NULL ENTRIES
# Removes every row that is null in any of the low-null columns selected above;
# the columns themselves are kept.
nan_acc = nan_acc.dropna(subset=drop_cols)

In [None]:
# Re-plot the null counts after the row drop; per the title, only columns with
# more than 2.5% missing values are expected to remain. Saved as impute_nulls.png.
proj.visualize_nulls(nan_acc, include_perc=False, plot_title='Columns with Missing Values > 2.5% of Dataset Rows', 
                     fig_size=(10, 5), annot_size=9, fig_filename='impute_nulls.png', savefig=True)

#### IMPUTATION BEGINS!!

In [None]:
# Shape of the nulls-only summary before imputation.
proj.null_checker(nan_acc, only_nulls=True).shape

In [None]:
# Impute the remaining nulls; the strategy is implemented in project_methods.
clean_acc = proj.systematically_impute_all_nans(nan_acc)

In [None]:
# Sanity check after imputation: the nulls-only summary is expected to be empty.
proj.null_checker(clean_acc, only_nulls=True)

#### IMPUTATION COMPLETE!!!

In [None]:
# From here on `accidents` names the merged, imputed frame (the same object as
# clean_acc, not a copy) instead of the raw accidents table loaded at the top.
accidents = clean_acc
# accidents.info()

#### Data Type Checks

In [None]:
# Ask the project helper for the 'number' and 'str' feature names, then select
# those columns from the frame.
num_cols = proj.get_features_with_dtypes(accidents, feat_datatype='number')
str_cols = proj.get_features_with_dtypes(accidents, feat_datatype='str')
acc_num_cols = accidents[num_cols]
acc_str_cols = accidents[str_cols]

In [None]:
# acc_num_cols.dtypes

In [None]:
# acc_str_cols.dtypes

In [None]:
# acc_num_cols.head()

In [None]:
# acc_str_cols.head()

In [None]:
# cast Date column to datetime data type
# pandas >= 2.0 refuses astype() with the unit-less np.datetime64 dtype, so the
# column is parsed with pd.to_datetime instead.
# NOTE(review): dayfirst=True assumes the raw dates are written DD/MM/YYYY
# (UK style); ISO-formatted dates parse the same either way. Confirm against the CSV.
accidents['date'] = pd.to_datetime(accidents['date'], dayfirst=True)

In [None]:
# accidents.info()

In [None]:
# Checkpoint the cleaned frame to CSV (row index not written).
# NOTE(review): absolute, machine-specific path; the notebook cannot run on
# another machine without editing it.
fname = 'w:\\MSc-AIDS-UoH\\Trimester_2\\UoH_Big_Data_and_Data_Mining\\Project\\Report\\RoadAccidentsOutput\\clean_accidents.csv'
accidents.to_csv(fname, index=False)

In [None]:
# accidents.info()

#### Data Cleaning Complete

## Feature Engineering

In [None]:
# Reload the cleaned checkpoint written at the end of the data-cleaning section.
# NOTE: read_csv is called without parse_dates, so 'date' comes back as plain
# strings, not datetimes.
fname = 'w:\\MSc-AIDS-UoH\\Trimester_2\\UoH_Big_Data_and_Data_Mining\\Project\\Report\\RoadAccidentsOutput\\clean_accidents.csv'
accidents = pd.read_csv(fname, low_memory=False)
# accidents.head()

In [None]:
# accidents.info()

In [None]:
# Presumably adds the derived columns that later cells reference (e.g.
# month_name, is_weekend, part_of_day) — see project_methods.
# NOTE(review): `accidents` is rebound to the result, so re-running this cell
# feeds already-engineered data back into the helper.
accidents = proj.engineer_useful_features(accidents)

# accidents.info()

In [None]:
# Checkpoint the feature-engineered frame; later sections reload this file.
# NOTE(review): absolute, machine-specific path.
fname = 'w:\\MSc-AIDS-UoH\\Trimester_2\\UoH_Big_Data_and_Data_Mining\\Project\\Report\\RoadAccidentsOutput\\accidents_ext.csv'
accidents.to_csv(fname, index=False)

## DESCRIPTIVE STATISTICS

In [None]:
# Reload the feature-engineered checkpoint for the descriptive statistics.
fname = 'w:\\MSc-AIDS-UoH\\Trimester_2\\UoH_Big_Data_and_Data_Mining\\Project\\Report\\RoadAccidentsOutput\\accidents_ext.csv'
accidents = pd.read_csv(fname, low_memory=False)
# accidents.head()

In [None]:
# Same dtype split as in the data-cleaning section, recomputed on the
# feature-engineered frame.
num_cols = proj.get_features_with_dtypes(accidents, feat_datatype='number')
str_cols = proj.get_features_with_dtypes(accidents, feat_datatype='str')
acc_num_cols = accidents[num_cols]
acc_str_cols = accidents[str_cols]

In [None]:
# num = int(len(num_cols)/10 + 0.5)
# print(num, len(num_cols))
# print(list(range(0, (num+1)*10, 10)))
# all_stats = acc_num_cols.describe(percentiles=[0.5],
#                                   datetime_is_numeric=True).transpose()
# all_stats.columns = all_stats.columns.astype(str).str.replace('50%', 'median')
# for n in range(0, (num+1)*10, 10):
#     end = n+10
#     display(all_stats.iloc[n: end])

In [None]:
# acc_str_cols.describe().transpose()

## DISTRIBUTION OF VARIABLES

In [None]:
# Switches the seaborn style from 'whitegrid' (set at the top) to 'darkgrid';
# the change stays in effect for every later figure.
sns.set_style('darkgrid')
# Distribution plots are produced and saved by the project helper.
proj.visualize_distributions(accidents, savefig=True)

## CORRELATION ANALYSIS

In [None]:
# Reload the feature-engineered checkpoint for the correlation analysis.
fname = 'w:\\MSc-AIDS-UoH\\Trimester_2\\UoH_Big_Data_and_Data_Mining\\Project\\Report\\RoadAccidentsOutput\\accidents_ext.csv'
accidents = pd.read_csv(fname, low_memory=False)
# accidents.info()

In [None]:
# Correlation analysis over the whole frame; output is produced by the helper.
proj.correlation_analyser(accidents)

### TREND ANALYSIS

#### General aggregated frequency table for daily accidents per hour

In [None]:
# Aggregated lookup table built by the project helper; later cells read its
# 'total_count' column together with time-related columns.
general_agg = proj.generate_aggregated_lookup(accidents)
# general_agg

In [None]:
# var_look (the dict of lookup sheets) is passed along with the data —
# presumably to translate coded values into labels; see project_methods.
proj.show_correlated_variables(accidents, var_look)

In [None]:
# Trend plots for the full dataset; 'general' is the tag handed to the helper.
proj.visualize_trends(accidents, 'general')

In [None]:
# Quarterly breakdown for the full dataset (tag 'general').
proj.quarterly_observations(accidents, 'general')

In [None]:
# Monthly breakdown for the full dataset (tag 'general').
proj.monthly_observations(accidents, 'general')

In [None]:
# top district per month
cols = ['month', 'month_name', 'local_authority_district']
districts = proj.assign_district(accidents)
# Month columns side by side with the district column returned by the helper.
df = pd.concat([accidents[cols[:-1]], districts], axis=1)
# Preview only: displaying the whole frame floods the notebook output
# (the sibling highway cell keeps this display switched off).
display(df.head())
total = proj.rank_top_occurrences(df, top_n=3)
# display(total)

# Bar chart of the ranked counts per month, coloured by district.
# Title typo fixed: "Districs" -> "Districts".
proj.plot_bar(y=total[cols[1]], x=total['total_count'], condition_on=total[cols[-1]],
             figsize=(8, 15), paletter=None, annotate=True, savefig=True,
             plot_title='Top 3 Districts per Monthly Accidents', fig_filename='top_district_mn.png')

In [None]:
# top highway per month
cols = ['month', 'month_name', 'local_authority_highway']
highways = proj.assign_highway(accidents)
# Month columns side by side with the highway column returned by the helper.
df = pd.concat([accidents[cols[:-1]], highways], axis=1)
# display(df)
total = proj.rank_top_occurrences(df, top_n=3)
# display(total)

# Bar chart of the ranked counts per month, coloured by highway authority.
proj.plot_bar(y=total[cols[1]], x=total['total_count'], condition_on=total[cols[-1]],
             figsize=(8, 15), paletter=None, annotate=True, savefig=True, xlim=(0, 1000),
             plot_title='Top 3 Highways per Monthly Accidents', fig_filename='top_highway_mn.png')

In [None]:
# Rank the most frequent combinations of DST flag, casualty severity and light
# conditions (top_n=4 is interpreted by the helper).
proj.rank_top_occurrences(accidents[['is_dst', 'casualty_severity', 'light_conditions']], top_n=4)

In [None]:
# Weekly breakdown for the full dataset (tag 'general').
proj.weekly_observations(accidents, 'general')

In [None]:
# NOTE(review): 19 is presumably a week number and 'is_weekend' the column to
# split on; confirm against show_weekly's signature.
proj.show_weekly(accidents, 19, 'is_weekend')

In [None]:
# Aggregated counts for the is_weekend == 1 subset, reduced to week number and
# count. `x` is only consumed by the commented-out exploration below.
x = proj.generate_aggregated_lookup(proj.get_accidents_when(accidents, 'is_weekend', 1))[['week_num', 'total_count']]
# display(x)
# x.groupby('week_num').sum().sort_values('total_count', ascending=False)

In [None]:
# Sum total_count per day_num and list the days from highest to lowest total.
cols = ['day_num', 'total_count']
proj.generate_aggregated_lookup(accidents)[cols].groupby(cols[0]).sum().sort_values(cols[-1], ascending=False)

In [None]:
# Show the month name and day of the rows selected with day_num 359.
proj.get_accidents_when(accidents, 'day_num', 359)[['month_name', 'day']]

In [None]:
# Day numbers returned by the helper for week 32.
proj.get_daynums_in_wknum(accidents, 32)

In [None]:
# Day numbers returned by the helper for week 38.
proj.get_daynums_in_wknum(accidents, 38)

In [None]:
# Day numbers returned by the helper for week 52.
proj.get_daynums_in_wknum(accidents, 52)

In [None]:
# NOTE(review): repeats the week-38 lookup from two cells earlier; either a
# different week was intended or this cell is redundant.
proj.get_daynums_in_wknum(accidents, 38)

In [None]:
# Daily breakdown for the full dataset (tag 'general').
proj.daily_observations(accidents, 'general')

In [None]:
# Sum total_count per (day_num, inactive_hour) from the general aggregate table,
# ordered by those keys and flattened back to ordinary columns.
cols = ['day_num', 'inactive_hour', 'total_count']
total_inacv_day = general_agg[cols].groupby(cols[:-1]).sum().sort_index().reset_index()
# total_inacv_day

In [None]:
# Row counts per (month_name, day, day_name) for the rows selected with
# day_num values 102 and 359.
proj.get_accidents_when(accidents, 'day_num', [102, 359])[['month_name', 'day', 'day_name']].value_counts()

In [None]:
# Hourly breakdown for the full dataset (tag 'general').
proj.hourly_observations(accidents, 'general')

In [None]:
# Hourly trend for the full dataset (tag 'general').
proj.hourly_trend(accidents, 'general')

In [None]:
# Seasonal breakdown for the full dataset (tag 'general').
proj.seasonal_observations(accidents, 'general')

In [None]:
# Daylight-saving-time breakdown for the full dataset (tag 'general').
proj.dst_observations(accidents, 'general')

### PREMIER LEAGUE INFLUENCE

In [None]:
# Rows selected with is_offseason == 0 (league in season).
# NOTE(review): despite the `_wkend` suffix, no weekend filter is applied here.
onseason_wkend = proj.get_accidents_when(accidents, col1='is_offseason', col1_is=0)
# onseason_wkend

In [None]:
# Trend plots for the in-season subset, tagged 'League_onseason'.
proj.visualize_trends(onseason_wkend, 'League_onseason')

In [None]:
# Rows selected with is_offseason == 1 (league off season).
# NOTE(review): the variable name looks like a typo for "offseason", and no
# weekend filter is applied despite the `_wkend` suffix.
offseasoff_wkend = proj.get_accidents_when(accidents, col1='is_offseason', col1_is=1)
# offseasoff_wkend

In [None]:
# Trend plots for the off-season subset, tagged 'League_offseason'.
proj.visualize_trends(offseasoff_wkend, 'League_offseason')

In [None]:
# Rank (is_offseason, part_of_day, day_name) combinations on the labelled frame
# and chart the counts, one colour per day name.
cols = ['is_offseason', 'part_of_day',  'day_name']
acc_df = proj.get_accidents_with_labels(accidents)
counts = proj.rank_top_occurrences(acc_df[cols], top_n=3, min_count_allowed=2000)
# Axis labels are built from every column except the last (the colour key).
labes = proj.create_label_from_ranking(counts[cols[:-1]], exclude_last_col=False)
proj.plot_bar(
    counts['total_count'],
    labes,
    condition_on=counts[cols[-1]],
    annotate=True,
    plot_title='League Season Relationship to Other Variables',
    paletter={
        'Sunday': 'green',
        'Monday': 'blue',
        'Tuesday': 'yellow',
        'Wednesday': 'darkorange',
        'Thursday': 'red',
        'Friday': 'black',
        'Saturday': 'gray',
    },
    savefig=True,
    fig_filename='pl_rlship.png',
)

In [None]:
# Rank (is_offseason, is_weekend, accident_severity) combinations on the
# labelled frame and chart the counts, one colour per severity.
cols = ['is_offseason', 'is_weekend',  'accident_severity']
acc_df = proj.get_accidents_with_labels(accidents)
counts = proj.rank_top_occurrences(acc_df[cols], top_n=3, min_count_allowed=2000)
# Axis labels are built from every column except the last (the colour key).
labes = proj.create_label_from_ranking(counts[cols[:-1]], exclude_last_col=False)
proj.plot_bar(
    counts['total_count'],
    labes,
    condition_on=counts[cols[-1]],
    annotate=True,
    plot_title='League Season Relationship to Other Variables',
    paletter={'fatal': 'black', 'serious': 'red', 'minor': 'gray'},
    savefig=True,
    fig_filename='pl_rlship_wknd.png',
)

In [None]:
# Frequent-itemset mining via the project helper, with a minimum support of 0.01.
# Depends on `acc_df` (the labelled frame) built in the preceding cell.
cols = ['is_offseason', 'local_authority_district', 'local_authority_highway', 'accident_severity']
result = proj.run_apriori(acc_df[cols], min_support=0.01)
# result

In [None]:
# Uses `cols` and `acc_df` as left by the preceding cells: rank the column
# combinations and chart the counts, one colour per severity.
counts = proj.rank_top_occurrences(acc_df[cols], top_n=3, min_count_allowed=2000)
# Axis labels are built from every column except the last (the colour key).
labes = proj.create_label_from_ranking(counts[cols[:-1]], exclude_last_col=False)
proj.plot_bar(
    counts['total_count'],
    labes,
    condition_on=counts[cols[-1]],
    annotate=True,
    plot_title='League Season Relationship to Other Variables',
    paletter={'fatal': 'black', 'serious': 'red', 'minor': 'gray'},
    savefig=True,
    fig_filename='pl_rlship_distr.png',
)

### SEVERITY ANALYSIS

In [None]:
# Reload the feature-engineered checkpoint for the severity analysis and print
# its column/dtype summary.
fname = 'w:\\MSc-AIDS-UoH\\Trimester_2\\UoH_Big_Data_and_Data_Mining\\Project\\Report\\RoadAccidentsOutput\\accidents_ext.csv'
accidents = pd.read_csv(fname, low_memory=False)
accidents.info()

In [None]:
# Rebuild the aggregated lookup table from the freshly reloaded frame.
general_agg = proj.generate_aggregated_lookup(accidents)
# general_agg

In [None]:
# Sum total_count per (season_num, season, day_of_week, day_name).
# NOTE(review): the name says "hr", but no hour column takes part in the grouping.
cols = ['season_num', 'season', 'day_of_week', 'day_name', 'total_count']
acc_hr_dayname = general_agg[cols].groupby(cols[:-1]).sum().reset_index()
# acc_hr_dayname

In [None]:
# Rows selected with season == 'summer'.
summer = proj.get_accidents_when(accidents, 'season', 'summer')
# summer

In [None]:
cols = ['is_dst', 'casualty_severity']
# Row count per (is_dst, casualty_severity), ordered by those keys.
# reset_index(name=...) names the count column directly. The previous
# str.replace('0', 'total_count') on the labels was fragile: it would also
# have rewritten any other column label containing the character '0'.
# 'casualty_severity' is renamed to 'severity', as before.
agg = (
    accidents[cols]
    .groupby(cols)
    .size()
    .sort_index()
    .reset_index(name='total_count')
    .rename(columns={'casualty_severity': 'severity'})
)
# agg

In [None]:
# Alternative source for the mapping (kept for reference):
# guide = dict(var_look['Casualty Severity'].to_records(index=False))
guide = {1: 'fatal', 2: 'serious', 3: 'minor'}
# guide2 = {1:'morning', 2:'afternoon', 3:'evening', 4:'night'}
# Add a text 'casualty_severity' column mapped from the numeric 'severity' codes;
# codes missing from `guide` become NaN.
agg['casualty_severity'] = agg['severity'].map(guide)
# agg['part_of_day'] = agg['part_of_day_num'].map(guide2)
# display(agg)

In [None]:
# selected = proj.get_accidents_when(accidents, 'quarter', 0)
# selected = pd.concat([agg, words], axis=1,)
#                      keys=['quarter', 'casualty_severity_num', 'casualty_severity'])
# selected

In [None]:
# Column chart of casualty counts by DST flag, coloured by severity label,
# drawn on an explicit axis and then written out through the project's writer.
fig, ax = plt.subplots(figsize=(25, 12), dpi=200)
proj.plot_column(
    agg[cols[0]],
    agg['total_count'],
    agg[cols[1]],
    include_perc=True,
    axis=ax,
    y_labe='total_count',
    x_labe='DST',
    xy_labe_size=12,
    top_labe_gap=5000,
    paletter={'fatal': 'red', 'serious': 'yellow', 'minor': 'gray'},
    h_labe_shift=-0.1,
    annot_size=14,
    ylim=(0, 140000),
    plot_title='2019 UK CASUALTY SEVERITY DURING DST',
)
# Note: this rebinds the notebook-level `fname` to the figure file name.
fname = 'cas_sev_dst.png'
proj.fig_writer(fname, fig)

In [None]:
# Rows selected with casualty_severity == 1 and is_weekend == 0
# (1 is 'fatal' in the severity guide used earlier).
selected = proj.get_accidents_when(accidents, 'casualty_severity', 1,
                                   'is_weekend', 0)
# display(selected)

# Map with all accidents in gray and the selected subset highlighted in red.
proj.plot_accident_map(accidents, selected, main_color='gray', focus_color='red',
                       plot_title='WEEKDAY ACCIDENT FATALITY', savefig=True, 
                       fig_filename='wkday_fatal_accidents.png')

In [None]:
# Top-ten-districts chart with the helper's default column/value selection;
# 'fatal' is passed as the output suffix.
proj.visualize_top_ten_districts(accidents, savefig=True, suffix='fatal')

In [None]:
# Two-column frame: the district column returned by the helper next to the
# is_weekend flag.
cols = ['local_authority_district', 'is_weekend']
distrs = proj.assign_district(accidents)
df = pd.concat([distrs, accidents[cols[1]]],
              axis=1)
# df

In [None]:
# Row count per (district, is_weekend), largest first. reset_index(name=...)
# names the count column directly instead of string-replacing the default '0'
# label, which would also have touched any other label containing '0'.
selected = (
    df.groupby(cols)
    .size()
    .sort_values(ascending=False)
    .reset_index(name='total_count')
)
# First ten rows with is_weekend == 1, i.e. the ten largest weekend counts.
top10_wkend_districts = selected.loc[selected[cols[1]] == 1].iloc[:10]
# top10_wkend_districts

In [None]:
# Top-ten-districts chart on is_weekend; the selected value is left at the
# helper's default (the weekday variant in the next cell passes 0 explicitly).
proj.visualize_top_ten_districts(accidents, col='is_weekend', plot_title='TOP TEN DISTRICT WITH HIGHEST WEEKEND ACCIDENTS', 
                            suffix='wkend', savefig=True)

In [None]:
# Top-ten-districts chart for is_weekend == 0 (weekdays).
proj.visualize_top_ten_districts(accidents, col='is_weekend', select_col_val=0, 
                                 plot_title='TOP TEN DISTRICT WITH HIGHEST WEEKDAY ACCIDENTS', 
                                 suffix='wkday', savefig=True)

In [None]:
# Top-ten-districts chart on is_dst, with the helper's default selected value.
proj.visualize_top_ten_districts(accidents, col='is_dst', plot_title='TOP TEN DISTRICT WITH HIGHEST DST ACCIDENTS', 
                            suffix='dst', savefig=True)

In [None]:
# Top-ten-districts chart on is_offseason, with the helper's default selected value.
proj.visualize_top_ten_districts(accidents, col='is_offseason', plot_title='TOP TEN DISTRICT WITH HIGHEST LEAGUE OFFSEASON ACCIDENTS', 
                            suffix='offseason', savefig=True)

In [None]:
# Top-ten-districts chart for is_offseason == 0 (league in season).
proj.visualize_top_ten_districts(accidents, col='is_offseason', select_col_val=0,
                                 plot_title='TOP TEN DISTRICT WITH HIGHEST LEAGUE ONSEASON ACCIDENTS', 
                            suffix='onseason', savefig=True)

## a) HYPOTHESIS TESTING


In [None]:
# Reload the feature-engineered checkpoint for the hypothesis tests.
fname = 'w:\\MSc-AIDS-UoH\\Trimester_2\\UoH_Big_Data_and_Data_Mining\\Project\\Report\\RoadAccidentsOutput\\accidents_ext.csv'
accidents = pd.read_csv(fname, low_memory=False)
# accidents.info()

In [None]:
# Rebuild the aggregated lookup table from the freshly reloaded frame.
general_agg = proj.generate_aggregated_lookup(accidents)
# general_agg

In [None]:
#### H1: September total weekend accidents >  May weekend accidents
# Compares month_name 'September' (bigger set) with 'May' (smaller set),
# restricted to is_weekend == 1.
cols = ['month', 'month_name', 'is_weekend', 'total_count']
X1_name='September Weekend accidents'
X2_name='May Weekend accidents'
proj.test_hypotheses(accidents, agg_cols=cols, focus_col='month_name', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='September', smaller_set_vals='May',
               second_condition_col='is_weekend', second_condition_val=1)

In [None]:
#### H1: September weekly weekend accidents >  May weekend accidents
# Same comparison as the previous cell, with week_num added to the aggregation columns.
cols = ['month', 'month_name', 'week_num', 'is_weekend', 'total_count']
X1_name='September weekly Weekend accidents'
X2_name='May weekly Weekend accidents'
proj.test_hypotheses(accidents, agg_cols=cols, focus_col='month_name', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='September', smaller_set_vals='May',
               second_condition_col='is_weekend', second_condition_val=1)

In [None]:
#### H1: monthly minor accidents > serious accidents
# Runs on the labelled frame, where accident_severity holds text labels
# ('minor', 'serious') instead of codes.
cols = ['month', 'accident_severity', 'total_count']
X1_name='Minor monthly accidents'
X2_name='Serious monthly accidents'
acc_df = proj.get_accidents_with_labels(accidents)
proj.test_hypotheses(acc_df, agg_cols=cols, focus_col='accident_severity', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='minor', smaller_set_vals='serious',)

In [None]:
#### H1: week 51 daily accidents > week 52 
# The aggregation is per (week_num, day_num), i.e. daily counts within each
# week, so the set labels now say "daily" (they previously said "weekly",
# contradicting both the hypothesis above and the hourly sibling cell).
cols = ['week_num', 'day_num', 'total_count']
X1_name='wk51 daily accidents'
X2_name='wk52 daily accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col='week_num', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=51, smaller_set_vals=52,)

In [None]:
#### H1: week 51 hourly accidents > week 52
# Aggregation columns are (week_num, hour); weeks 51 and 52 are compared.
cols = ['week_num', 'hour', 'total_count']
X1_name='wk51 hourly accidents'
X2_name='wk52 hourly accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col='week_num', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=51, smaller_set_vals=52,)

In [None]:
#### H1: January weekly weekday accidents > February weekday accidents
# Compares month_name 'January' with 'February', restricted to is_weekend == 0.
cols = ['month', 'month_name','week_num', 'is_weekend', 'total_count']
X1_name='January weekly Weekday accidents'
X2_name='February weekly Weekday accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col='month_name', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='January', smaller_set_vals='February',
               second_condition_col='is_weekend', second_condition_val=0)

### Hours of Day

In [None]:
#### H1: Monthly afternoon accidents > evening
# Aggregation columns include month; part_of_day 'afternoon' is compared with 'evening'.
cols = ['part_of_day_num', 'part_of_day', 'month', 'total_count']
X1_name='Monthly Afternoon accidents'
X2_name='Monthly Evening accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col='part_of_day', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='afternoon', smaller_set_vals='evening',)

In [None]:
#### H1: Daily afternoon accidents > evening
# (The heading used to say "nighttime", but the values compared are
# 'afternoon' and 'evening'.)
cols = ['part_of_day_num', 'part_of_day', 'day_num', 'total_count']
X1_name='Daily Afternoon accidents'
X2_name='Daily Evening accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col='part_of_day', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='afternoon', smaller_set_vals='evening',)

In [None]:
#### H1: Hourly afternoon accidents > evening
# (The heading used to say "nighttime", but the values compared are
# 'afternoon' and 'evening'.)
cols = ['part_of_day_num', 'part_of_day', 'hour', 'total_count']
X1_name='Hourly Afternoon accidents'
X2_name='Hourly Evening accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col='part_of_day', bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='afternoon', smaller_set_vals='evening',)

In [None]:
#### H1: daily active-time accidents > inactive-time accidents
# inactive_hour == 0 is the "active" (bigger) set, inactive_hour == 1 the smaller one.
X1_name='Daily active time accidents'
X2_name='Daily inactive time accidents'
cols = ['day_num', 'inactive_hour', 'total_count']
proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=0, smaller_set_vals=1,)

#### H1: Friday weekly accidents > Sunday accidents

In [None]:
# H1: Friday > Sunday, weekly accidents (focus column is day_name).
cols = ['week_num', 'day_of_week', 'day_name', 'total_count']
X1_name='Weekly Friday accidents'
X2_name='Weekly Sunday accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[2], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='Friday', smaller_set_vals='Sunday',)

In [None]:
# H1: Friday > Sunday, monthly accidents (focus column is day_name).
cols = ['month', 'day_of_week', 'day_name', 'total_count']
X1_name='Monthly Friday accidents'
X2_name='Monthly Sunday accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[2], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='Friday', smaller_set_vals='Sunday',)

In [None]:
# H1: Friday > Sunday, hourly accidents (focus column is day_name).
cols = ['hour', 'day_of_week', 'day_name', 'total_count']
X1_name='hourly Friday accidents'
X2_name='hourly Sunday accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[2], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='Friday', smaller_set_vals='Sunday',)

#### H1:  Hourly weekday accidents > Weekend  accidents

In [None]:
# H1: hourly weekday accidents > weekend accidents.
# is_weekend == 0 is the bigger set, is_weekend == 1 the smaller one.
cols = ['hour', 'is_weekend', 'total_count']
X1_name='hourly weekday accidents'
X2_name='hourly weekend accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=0, smaller_set_vals=1,)

#### H1: wkday > wkend weekly

In [None]:
# H1: weekly weekday accidents > weekend accidents.
# is_weekend == 0 is the bigger set, is_weekend == 1 the smaller one.
# NOTE(review): the same test (different set labels only) is run again under
# "Accidents on Weekdays > Weekends" further down.
cols = ['week_num', 'is_weekend', 'total_count']
X1_name='weekly Weekday accidents'
X2_name='weekly Weekend accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=0, smaller_set_vals=1,)

#### H1: wkday monthly accidents > wkend 

In [None]:
# H1: monthly weekday accidents > weekend accidents.
# is_weekend == 0 is the bigger set, is_weekend == 1 the smaller one.
# NOTE(review): the same test (different set labels only) is run again under
# "Accidents on Weekdays > Weekends" further down.
cols = ['month', 'is_weekend', 'total_count']
X1_name='monthly Weekday accidents'
X2_name='monthly Weekend accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=0, smaller_set_vals=1,)

#### H1:Summer fridays > winter fridays

In [None]:
# H1: summer Fridays > winter Fridays, weekly accidents.
# Focus column is season; rows are restricted to day_name == 'Friday'.
cols = ['week_num', 'season', 'day_name', 'total_count']
X1_name='Summer weekly Friday accidents'
X2_name='Winter weekly Friday accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='summer', smaller_set_vals='winter',
               second_condition_col='day_name', second_condition_val='Friday')

In [None]:
# H1: summer Fridays > winter Fridays, monthly accidents.
# Focus column is season; rows are restricted to day_name == 'Friday'.
cols = ['month', 'season', 'day_name', 'total_count']
X1_name='Summer monthly Friday accidents'
X2_name='Winter monthly Friday accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='summer', smaller_set_vals='winter',
               second_condition_col='day_name', second_condition_val='Friday')

#### Day Time (hours 7 to 18, i.e. 07:00–18:59) Vs. Night Time (hours 19 to 6, i.e. 19:00–06:59)
NULL HYPOTHESIS: <br>
Generally speaking, the total accident count during day time is less than or equal to the total accident count during night time.<br>
ALTERNATE HYPOTHESIS:<br>
Generally speaking, the accident count during day time is higher than night time accident count.

In [None]:
# H1: total daytime accidents (hours 7..18) > night-time accidents (hours 19..23, 0..6).
cols = ['hour', 'total_count']
X1_name='Total day Working Time (7am to 18pm)'
X2_name='Total night Time (19pm to 6am)'

proj.test_hypotheses(
    accidents,
    agg_cols=cols,
    focus_col=cols[0],
    bigger_set_name=X1_name,
    smaller_set_name=X2_name,
    # 7, 8, ..., 18
    bigger_set_vals=list(range(7, 19)),
    # 19, ..., 23 followed by 0, ..., 6
    smaller_set_vals=list(range(19, 24)) + list(range(0, 7)),
)

In [None]:
# H1: weekly daytime accidents (hours 7..18) > night-time accidents (hours 19..23, 0..6).
cols = ['hour', 'week_num', 'total_count']
X1_name='weekly day Working Time (7am to 18pm)'
X2_name='weekly night Time (19pm to 6am)'

proj.test_hypotheses(
    accidents,
    agg_cols=cols,
    focus_col=cols[0],
    bigger_set_name=X1_name,
    smaller_set_name=X2_name,
    # 7, 8, ..., 18
    bigger_set_vals=list(range(7, 19)),
    # 19, ..., 23 followed by 0, ..., 6
    smaller_set_vals=list(range(19, 24)) + list(range(0, 7)),
)

In [None]:
# H1: monthly daytime accidents (hours 7..18) > night-time accidents (hours 19..23, 0..6).
cols = ['hour', 'month', 'total_count']
X1_name='monthly day Working Time (7am to 18pm)'
X2_name='monthly night Time (19pm to 6am)'

proj.test_hypotheses(
    accidents,
    agg_cols=cols,
    focus_col=cols[0],
    bigger_set_name=X1_name,
    smaller_set_name=X2_name,
    # 7, 8, ..., 18
    bigger_set_vals=list(range(7, 19)),
    # 19, ..., 23 followed by 0, ..., 6
    smaller_set_vals=list(range(19, 24)) + list(range(0, 7)),
)

#### H1: active time hourly accidents > inactive time accidents

In [None]:
# H1: hourly active-time accidents > inactive-time accidents.
# inactive_hour == 0 is the "active" (bigger) set, inactive_hour == 1 the smaller one.
cols = ['hour', 'inactive_hour', 'total_count']
X1_name='Hourly active time accidents'
X2_name='Hourly inactive time accidents'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=0,
                smaller_set_vals=1,)

### SIGNIFICANT DAY OF WEEK

#### H1: Inactive hour accidents on Fridays, Saturdays, Sundays > other days

In [None]:
# Focus column is day_name: Fri/Sat/Sun (3 days) against Mon-Thu (4 days),
# restricted to inactive_hour == 1. The groups differ in size and
# balance_unequal=False is passed explicitly (its effect is defined in project_methods).
cols = ['day_of_week', 'day_name', 'inactive_hour', 'total_count', ]
X1_name = 'inactive hour accidents on Fri-Sat-Sun'
X2_name = 'inactive hour accidents on Mon-Tue-Wed-Thur'
proj.test_hypotheses(accidents, cols, focus_col=cols[1], bigger_set_name=X1_name,
                    smaller_set_name=X2_name, bigger_set_vals=['Friday', 'Saturday', 'Sunday'],
                    smaller_set_vals=['Monday', 'Tuesday', 'Wednesday', 'Thursday'],
                    second_condition_col=cols[2], second_condition_val=1, balance_unequal=False)

In [None]:
# Sum total_count per (day_of_week, day_name, inactive_hour) and chart it per
# day name, coloured by the inactive_hour flag (0 = green, 1 = gray).
cols = ['day_of_week', 'day_name', 'inactive_hour', 'total_count']
agg_df = proj.generate_aggregated_lookup(accidents)
agg_df = agg_df[cols].groupby(cols[:-1]).sum().reset_index()
# Title corrected: it was copy-pasted from the part-of-day chart in the next
# cell, although this figure (rlship_dayname_inav.png) is split by inactive_hour.
proj.plot_bar(agg_df[cols[-1]], agg_df[cols[1]], agg_df[cols[2]],
              paletter={0:'green', 1:'gray'},
             plot_title='Relationship Between Day of Week and Inactive Hours', annotate=True,
             savefig=True, fig_filename='rlship_dayname_inav.png',)# xlim=(0, 19000))

In [None]:
# Sum total_count per (part_of_day, day_of_week, day_name) and chart it per
# day name, coloured by part of day.
cols = ['part_of_day', 'day_of_week', 'day_name', 'total_count']
agg_df = proj.generate_aggregated_lookup(accidents)
agg_df = agg_df[cols].groupby(cols[:-1]).sum().reset_index()
proj.plot_bar(
    agg_df[cols[-1]],
    agg_df[cols[2]],
    agg_df[cols[0]],
    paletter={
        'morning': 'green',
        'afternoon': 'yellow',
        'evening': 'gray',
        'night': 'black',
    },
    plot_title='Relationship Between Day of Week and Part of Day',
    annotate=True,
    savefig=True,
    fig_filename='rlship_dayname_pod.png',
    xlim=(0, 19000),
)

#### Night hours accidents on Fridays-Saturdays-Sundays > Mondays-Tuesdays-Wednesdays-Thursdays

In [None]:
# Focus column is day_name: Fri/Sat/Sun (3 days) against Mon-Thu (4 days),
# restricted to part_of_day == 'night'. balance_unequal=False is passed
# explicitly (its effect is defined in project_methods).
cols = ['day_of_week', 'day_name', 'part_of_day', 'total_count', ]
X1_name = 'nighttime accidents on Fri-Sat-Sun'
X2_name = 'nighttime accidents on Mon-Tue-Wed-Thur'
proj.test_hypotheses(accidents, cols, focus_col=cols[1], bigger_set_name=X1_name,
                    smaller_set_name=X2_name, bigger_set_vals=['Friday', 'Saturday', 'Sunday'],
                    smaller_set_vals=['Monday', 'Tuesday', 'Wednesday', 'Thursday'],
                    second_condition_col=cols[2], second_condition_val='night', balance_unequal=False)

In [None]:
# H1: total Thursday+Friday accidents > Saturday+Sunday accidents.
# Here day_name sits at index 1 of cols (the sibling cells below use index 2).
cols = ['day_of_week', 'day_name', 'total_count']
X1_name='Total Thursday_Friday'
X2_name='Total Saturday_Sunday'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=['Thursday', 'Friday'],
                smaller_set_vals=['Saturday', 'Sunday'],)

In [None]:
# H1: weekly Thursday+Friday accidents > Saturday+Sunday accidents (focus column is day_name).
cols = ['week_num', 'day_of_week', 'day_name', 'total_count']
X1_name='weekly Thursday_Friday'
X2_name='weekly Saturday_Sunday'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[2], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=['Thursday', 'Friday'],
                smaller_set_vals=['Saturday', 'Sunday'],)

In [None]:
# H1: monthly Thursday+Friday accidents > Saturday+Sunday accidents (focus column is day_name).
cols = ['month', 'day_of_week', 'day_name', 'total_count']
X1_name='monthly Thursday_Friday'
X2_name='monthly Saturday_Sunday'

proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[2], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=['Thursday', 'Friday'],
                smaller_set_vals=['Saturday', 'Sunday'],)

In [None]:
cols = ['hour', 'day_of_week', 'day_name', 'total_count']
X1_name='hourly Thursday_Friday'
X2_name='hourly Saturday_Sunday'

# H1: hourly accidents on Thursday+Friday > hourly accidents on Saturday+Sunday
# (hour is part of the aggregation columns; groups selected on day_name).
proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[2], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=['Thursday', 'Friday'],
                smaller_set_vals=['Saturday', 'Sunday'],)

#### H1: Accidents on Weekdays > Weekends

In [None]:
cols = ['week_num', 'is_weekend', 'total_count']
X1_name='Weekly Weekdays'
X2_name='Weekly Weekends'

# H1: weekly accidents on weekdays (is_weekend == 0) > on weekends (is_weekend == 1)
proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=0,
                smaller_set_vals=1,)

In [None]:
cols = ['month', 'is_weekend', 'total_count']
X1_name='monthly weekdays'
X2_name='monthly weekends'

# H1: monthly accidents on weekdays (is_weekend == 0) > on weekends (is_weekend == 1)
proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=0,
                smaller_set_vals=1,)

In [None]:
cols = ['season', 'week_num', 'is_weekend', 'total_count']
X1_name='Summer Weekdays'
X2_name='winter Weekdays'

# H1: weekly weekday accidents in summer > weekly weekday accidents in winter.
# Groups are selected on season; the second condition is is_weekend == 0.
proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[0], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='summer',
                smaller_set_vals='winter',
               second_condition_col=cols[2], second_condition_val=0, balance_unequal=False)

In [None]:
cols = ['season', 'week_num', 'is_weekend', 'total_count']
X1_name='Summer Weekends'
X2_name='winter Weekends'

# H1: weekly weekend accidents in summer > weekly weekend accidents in winter.
# Groups are selected on season; the second condition is is_weekend == 1.
proj.test_hypotheses(accidents, agg_cols=cols, focus_col=cols[0], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals='summer',
                smaller_set_vals='winter',
               second_condition_col=cols[2], second_condition_val=1, balance_unequal=False)

In [None]:
# NOTE(review): neither `quarterly_impact` nor `general_agg` is defined in the part of
# the notebook reviewed here, and every other helper in this section is called through
# `proj` -- confirm both names exist on a fresh kernel (Restart & Run All).
quarterly_impact(general_agg, [3, 1], ['friday', 'sunday'])

## b) FOR MOTORBIKES

In [None]:
from seaborn.utils import np, os, plt, pd
import seaborn as sns
import copy
from sklearn import feature_selection as s_fs, preprocessing as s_prep, model_selection as s_ms, metrics as s_mtr
# from mlxtend import frequent_patterns as mfp

In [None]:
# Use the fully-qualified option keys, as the first cell of the notebook does.
# The bare patterns 'precision' and 'max_columns' also match the styler.* options
# in recent pandas releases, which makes set_option raise OptionError
# ("Pattern matched multiple keys").
pd.set_option('display.precision', 3,
              'display.max_columns', None)

In [None]:
from project_methods import RoadAccidents as proj

In [None]:
sns.set_style('darkgrid')

In [None]:
# Reload the accidents table from accidents_ext.csv. This rebinds `accidents`,
# replacing the frame read from the raw 2019 CSV at the top of the notebook.
# NOTE(review): absolute, machine-specific Windows path -- the notebook cannot run on
# another machine until this points at a local copy of accidents_ext.csv.
fname = 'w:\\MSc-AIDS-UoH\\Trimester_2\\UoH_Big_Data_and_Data_Mining\\Project\\Report\\RoadAccidentsOutput\\accidents_ext.csv'
accidents = pd.read_csv(fname, low_memory=False)
accidents.info()

In [None]:
mbike_acc = proj.get_mbike_accidents(accidents)
mbike_acc

#### Visualize Trends

In [None]:
proj.visualize_trends(mbike_acc, 'motorbike')

In [None]:
proj.seasonal_observations(mbike_acc, 'mbike')

In [None]:
proj.hourly_observations(mbike_acc, 'mbike')

In [None]:
cols = ['month', 'month_name','day_of_week', 'day_name', 'total_count']
# Total motorbike accidents for every (month, day of week) combination.
agg_df = proj.generate_aggregated_lookup(mbike_acc)
agg_df = agg_df[cols].groupby(cols[:-1]).sum().reset_index()
# Positional series: total_count, month_name, day_name. The palette is keyed by day
# name, so the title names "Day of Week" (the data has no part-of-day column here).
proj.plot_bar(agg_df[cols[-1]], agg_df[cols[1]], agg_df[cols[3]], annotate=True,
              plot_title='Monthly Motorbike Accidents per Day of Week',
              paletter={'Sunday': 'green', 'Monday': 'blue', 'Tuesday': 'yellow', 'Wednesday': 'darkorange',
                        'Thursday': 'red', 'Friday': 'black', 'Saturday': 'gray'}, figsize=(10, 18),
              savefig=True, fig_filename='rlship_mon_pod_mbike.png')

In [None]:
# Map of 2019 motorbike accidents; the figure is saved as uk_mbike.png.
# `accidents` is passed alongside the motorbike subset (presumably as the background
# layer -- confirm in project_methods.plot_accident_map).
proj.plot_accident_map(accidents, mbike_acc, plot_title='UK 2019 MOTORBIKE ACCIDENTS',
                       alpha=None, savefig=True, fig_filename='uk_mbike.png')

In [None]:
proj.visualize_top_ten_districts(mbike_acc, savefig=True, suffix='mbike',
                                plot_title='Top Ten Districts with Fatal Motorbike Accidents',)

In [None]:
proj.visualize_top_ten_districts(mbike_acc, col='is_weekend', 
                                 plot_title='Top Ten Districts with Highest Weekend Motorbike Accidents',
                                 savefig=True, suffix='mbike_wkend')

In [None]:
proj.visualize_top_ten_districts(mbike_acc, col='is_weekend', select_col_val=0,
                                 plot_title='Top Ten Districts with Highest Weekday Motorbike Accidents',
                                 savefig=True, suffix='mbike_wkday')

### i. Significant Hours of  Day 

In [None]:
# H1: total motorbike accidents at 21:00 > total motorbike accidents at 09:00
cols = ['hour', 'total_count']
X2_name='Motorbike total accidents at 9am'
X1_name='Motorbike total accidents at 21pm'

# Groups are selected on hour: 21 is the "bigger" set, 9 the "smaller" set.
proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[0], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=21,
                smaller_set_vals=9,)

In [None]:
# H1: weekly motorbike accidents at 21:00 > weekly motorbike accidents at 09:00
cols = ['week_num', 'hour', 'total_count']

X2_name='Motorbike weekly accidents at 9am'
X1_name='Motorbike weekly accidents at 21pm'

# week_num is part of the aggregation columns; groups are selected on hour.
proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=21,
                smaller_set_vals=9,)

In [None]:
# H1: monthly motorbike accidents in hours 12-19 > monthly motorbike accidents in hours 0-7
cols = ['month', 'hour', 'total_count']
X1_name='Motorbike monthly accidents between 12pm - 19pm'
X2_name='Motorbike monthly accidents between 0am - 7am'

# month is part of the aggregation columns; groups are selected on hour.
proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[1], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=[12, 13, 14, 15, 16, 17, 18, 19],
                smaller_set_vals=[0, 1, 2, 3, 4, 5, 6, 7],)

In [None]:
#### H1: hourly 12pm - 19pm > 0am - 7am
cols = ['hour', 'total_count']
X1_name='Motorbike hourly accidents between 12pm - 19pm'
X2_name='Motorbike hourly accidents between 0am - 7am'

# acc_df = proj.get_accidents_with_labels(accidents)
proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[0], bigger_set_name=X1_name, 
                smaller_set_name=X2_name, bigger_set_vals=[12, 13, 14, 15, 16, 17, 18, 19],
                smaller_set_vals=[0, 1, 2, 3, 4, 5, 6, 7],)
#                second_condition_col=cols[2], second_condition_val=0, balance_unequal=False)

#### H1: 8am - 11.59am > 20pm - 23.59pm

In [None]:
#### H1: total 8am - 11.59am > 20pm - 23.59pm
cols = ['hour', 'total_count']

proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col='hour',
                bigger_set_name='Motorbike accidents between 8am - 11.59am',
               smaller_set_name='Motorbike accidents between 20pm - 23.59pm',
               bigger_set_vals=[8, 9, 10, 11], smaller_set_vals=[20, 21,22,23])

In [None]:
# H1: monthly motorbike accidents in hours 8-11 > monthly motorbike accidents in hours 20-23
# (the aggregation columns are hour and month, matching the "monthly" report labels).
cols = ['hour', 'month', 'total_count']

proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col='hour',
                bigger_set_name='Motorbike monthly accidents between 8am - 11.59am',
               smaller_set_name='Motorbike monthly accidents between 20pm - 23.59pm',
               bigger_set_vals=[8, 9, 10, 11], smaller_set_vals=[20, 21,22,23])

### ii. Significant Days of Week 

#### H1: Motorbike accidents on Thursdays-Fridays > Saturdays-Sundays

In [None]:
# H1: total accidents on Thursday&Friday > Saturday&Sunday
X1_name='Thursday_Friday'
X2_name='Saturday_Sunday'
cols = ['day_of_week', 'day_name', 'total_count']

proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[1],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals=['Thursday', 'Friday'], smaller_set_vals=['Saturday', 'Sunday'])

#### H1: monthly accidents on Friday > Sunday

In [None]:
# H1: total accidents on Friday > Sunday
X1_name='Friday total'
X2_name='Sunday total'
cols = ['day_of_week', 'day_name', 'total_count']

proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[1],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals='Friday', smaller_set_vals='Sunday')

In [None]:
# H1: monthly accidents on Friday > Sunday
X1_name='Friday monthly'
X2_name='Sunday monthly'
cols = ['month', 'day_of_week', 'day_name', 'total_count']

proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[2],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals='Friday', smaller_set_vals='Sunday')

#### H1: Motorbike accidents on Weekdays > Weekends

In [None]:
# H1: monthly accidents per weekday > weekend
cols = ['month', 'is_weekend', 'total_count']
X1_name='Weekday monthly'
X2_name='Weekend monthly'

proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[1],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals=0, smaller_set_vals=1)

In [None]:
# H1: weekly accidents per weekday > weekend
cols = ['week_num', 'is_weekend', 'total_count']
X1_name='Weekday weekly'
X2_name='Weekend weekly'

proj.test_hypotheses(mbike_acc, agg_cols=cols, focus_col=cols[1],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals=0, smaller_set_vals=1)

#### H1: summertime Friday-Saturdays accidents > winter Friday-Saturdays

In [None]:
cols = ['season', 'week_num', 'day_name', 'total_count']
X1_name='Summer weekly Weekend'
X2_name='Winter weekly Weekend'

mbike_df = proj.get_accidents_with_labels(mbike_acc)
proj.test_hypotheses(mbike_df, agg_cols=cols, focus_col=cols[0],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals='summer', smaller_set_vals='winter',
                     second_condition_col=cols[2], second_condition_val=['Friday', 'Saturday'], balance_unequal=False)

#### H1: summertime Saturday accidents > winter Saturday accidents

In [None]:
# H1: weekly motorbike accidents on summer Saturdays > on winter Saturdays.
# The second condition restricts day_name to Saturday only, so the report labels
# say "Saturday" rather than "Weekend".
cols = ['season', 'week_num', 'day_name', 'total_count']
X1_name='Summer weekly Saturday'
X2_name='Winter weekly Saturday'

mbike_df = proj.get_accidents_with_labels(mbike_acc)
proj.test_hypotheses(mbike_df, agg_cols=cols, focus_col=cols[0],
                     bigger_set_name=X1_name,
                     smaller_set_name=X2_name,
                     bigger_set_vals='summer', smaller_set_vals='winter',
                     second_condition_col=cols[2], second_condition_val=['Saturday'], balance_unequal=False)

## c) FOR PEDESTRIANS

In [None]:
ped_acc = proj.get_pedestrian_accidents(accidents)
# ped_acc

#### Visualize Trends

In [None]:
proj.plot_accident_map(accidents, ped_acc, plot_title='UK 2019 Accidents Involving Pedestrians',
                       alpha=None, savefig=True, fig_filename='uk_pedestrian_acc.png')

In [None]:
proj.visualize_top_ten_districts(ped_acc, col='casualty_severity', select_col_val=1,
                                suffix='ped_fatal', plot_title='Top Ten Districts With Highest Pedestrian Fatality',
                                savefig=True)

In [None]:
# Top ten districts by pedestrian accidents on weekends (is_weekend == 1).
# A dedicated suffix keeps this figure apart from the 'ped_fatal' fatality plot
# (suffix presumably names the saved file -- confirm in project_methods).
proj.visualize_top_ten_districts(ped_acc, col='is_weekend', select_col_val=1,
                                 suffix='ped_wkend',
                                 plot_title='Top Ten Districts With Highest Weekend Accidents for Pedestrians',
                                 savefig=True)

In [None]:
proj.visualize_top_ten_districts(ped_acc, col='season', select_col_val='summer',
                                suffix='ped_ssn', 
                                 plot_title='Top Ten Districts With Highest Summer Accidents for Pedestrians',
                                savefig=True)

In [None]:
proj.visualize_trends(ped_acc, 'pedestrians')

In [None]:
proj.seasonal_observations(ped_acc, suffix='ped_ssn')

### i. Significant Hours of  Day

#### H1: 12pm - 19pm > 0am - 7am

In [None]:
# H1: pedestrian accidents in hours 12-19 > pedestrian accidents in hours 0-7.
cols = ['hour', 'total_count']
X1_name='pedestrian accidents between 12pm - 19pm'
X2_name='pedestrian accidents between 0am - 7am'

ped_df = proj.get_accidents_with_labels(ped_acc)
proj.test_hypotheses(ped_df, agg_cols=cols, focus_col=cols[0],
                     bigger_set_name=X1_name,
                     smaller_set_name=X2_name,
                     bigger_set_vals=[12, 13, 14, 15, 16, 17, 18, 19],
                     smaller_set_vals=[0, 1, 2, 3, 4, 5, 6, 7],)

#### H1: 8am - 11.59am > 20pm - 23.59pm

In [None]:
cols = ['hour', 'total_count']
X1_name='pedestrian accidents between 8am - 11.59am'
X2_name='pedestrian accidents between 20pm - 23.59pm'

# ped_df = proj.get_accidents_with_labels(ped_acc)
proj.test_hypotheses(ped_acc, agg_cols=cols, focus_col=cols[0],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals=[8, 9, 10, 11], smaller_set_vals=[20, 21, 22, 23],)
#                      second_condition_col=cols[2], second_condition_val=['Saturday'], balance_unequal=False)

### ii. Significant Days of Week 

In [None]:
# Total pedestrian accidents per day of week.
cols = ['day_of_week', 'day_name', 'total_count']
# Build the aggregated lookup here so the cell runs on a fresh kernel;
# `general_agg_ped` is not assigned anywhere else in this section.
general_agg_ped = proj.generate_aggregated_lookup(ped_acc)
total_acc_dow = general_agg_ped[cols].groupby(cols[:-1]).sum().reset_index()
total_acc_dow

#### H1: Pedestrian accidents on Thursdays-Fridays > Saturdays-Sundays

In [None]:
cols = ['day_of_week', 'day_name', 'total_count']
X1_name='pedestrian accidents on  Thursdays-Fridays'
X2_name='pedestrian accidents on Saturdays-Sundays'

# ped_df = proj.get_accidents_with_labels(ped_acc)
proj.test_hypotheses(ped_acc, agg_cols=cols, focus_col=cols[1],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals=['Thursday', 'Friday'], smaller_set_vals=['Saturday', 'Sunday'],)
#                      second_condition_col=cols[2], second_condition_val=['Saturday'], balance_unequal=False)

#### H1: Pedestrian accidents on Weekdays > Weekends

In [None]:
cols = ['week_num', 'is_weekend', 'total_count']
X1_name='pedestrian accidents on  weekdays'
X2_name='pedestrian accidents on weekends'

# ped_df = proj.get_accidents_with_labels(ped_acc)
proj.test_hypotheses(ped_acc, agg_cols=cols, focus_col=cols[1],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals=0, smaller_set_vals=1,)
#                      second_condition_col=cols[2], second_condition_val=['Saturday'], balance_unequal=False)

In [None]:
cols = ['month', 'month_name','day_of_week', 'day_name', 'total_count']
# Total pedestrian accidents for every (month, day of week) combination.
agg_df = proj.generate_aggregated_lookup(ped_acc)
agg_df = agg_df[cols].groupby(cols[:-1]).sum().reset_index()
# Positional series: total_count, month_name, day_name. The palette is keyed by day
# name, so the title names "Day of Week" (the data has no part-of-day column here).
proj.plot_bar(agg_df[cols[-1]], agg_df[cols[1]], agg_df[cols[3]], annotate=True,
              plot_title='Monthly Pedestrian Accidents per Day of Week',
              paletter={'Sunday': 'green', 'Monday': 'blue', 'Tuesday': 'yellow', 'Wednesday': 'darkorange',
                        'Thursday': 'red', 'Friday': 'black', 'Saturday': 'gray'}, figsize=(10, 18),
              savefig=True, fig_filename='rlship_mon_pod_ped.png')

In [None]:
cols = ['month', 'is_weekend', 'total_count']
X1_name='pedestrian accidents on  weekdays'
X2_name='pedestrian accidents on weekends'

# ped_df = proj.get_accidents_with_labels(ped_acc)
proj.test_hypotheses(ped_acc, agg_cols=cols, focus_col=cols[1],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals=0, smaller_set_vals=1,)
#                      second_condition_col=cols[2], second_condition_val=['Saturday'], balance_unequal=False)

In [None]:
# H1: autumn weekly friday > winter friday
cols = ['week_num', 'day_name', 'season', 'total_count']
X1_name='pedestrian autumn accidents on fridays'
X2_name='pedestrian winter accidents on fridays'

# ped_df = proj.get_accidents_with_labels(ped_acc)
proj.test_hypotheses(ped_acc, agg_cols=cols, focus_col=cols[2],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals='autumn', smaller_set_vals='winter',
                     second_condition_col=cols[1], second_condition_val=['Friday'], balance_unequal=False)

In [None]:
# H1: autumn monthly friday > winter friday
cols = ['month_name', 'day_name', 'season', 'total_count']
X1_name='pedestrian autumn accidents on fridays'
X2_name='pedestrian winter accidents on fridays'

# ped_df = proj.get_accidents_with_labels(ped_acc)
proj.test_hypotheses(ped_acc, agg_cols=cols, focus_col=cols[2],
                bigger_set_name=X1_name,
               smaller_set_name=X2_name,
               bigger_set_vals='autumn', smaller_set_vals='winter',
                     second_condition_col=cols[1], second_condition_val=['Friday'], balance_unequal=False)

## d) IMPACT OF DAYLIGHT SAVINGS ON ACCIDENTS
2019 Daylight Saving Time (DST) = 1am March 31 to 2am October 27

In [None]:
dst_acc = proj.get_accidents_when(accidents, 'is_dst', 1)
# dst_acc

#### Visualize Trends

In [None]:
cols = ['month', 'month_name','day_of_week', 'day_name', 'total_count']
# Total accidents during DST for every (month, day of week) combination.
agg_df = proj.generate_aggregated_lookup(dst_acc)
agg_df = agg_df[cols].groupby(cols[:-1]).sum().reset_index()
# Positional series: total_count, month_name, day_name. The palette is keyed by day
# name, so the title names "Day of Week" (the data has no part-of-day column here).
proj.plot_bar(agg_df[cols[-1]], agg_df[cols[1]], agg_df[cols[3]], annotate=True,
              plot_title='Monthly DST Accidents per Day of Week',
              paletter={'Sunday': 'green', 'Monday': 'blue', 'Tuesday': 'yellow', 'Wednesday': 'darkorange',
                        'Thursday': 'red', 'Friday': 'black', 'Saturday': 'gray'}, figsize=(10, 18),
              savefig=True, fig_filename='rlship_mon_pod_dst.png')

In [None]:
# Top ten districts during DST, selecting value 1 of the helper's default column
# (title and suffix indicate fatal casualties).
proj.visualize_top_ten_districts(dst_acc, select_col_val=1, suffix='dst_fatal',
                                 plot_title='Top Ten Districts with Highest Casualty Fatality During DST',
                                 savefig=True)

In [None]:
# Top ten districts by weekend accidents (is_weekend == 1) during DST.
proj.visualize_top_ten_districts(dst_acc, col='is_weekend', select_col_val=1, suffix='dst_wkend',
                                 plot_title='Top Ten Districts with Highest Weekend Accidents During DST',
                                 savefig=True)

In [None]:
# Top ten districts by weekday accidents (is_weekend == 0) during DST.
proj.visualize_top_ten_districts(dst_acc, col='is_weekend', select_col_val=0, suffix='dst_wkday',
                                 plot_title='Top Ten Districts with Highest Weekday Accidents During DST',
                                 savefig=True)

In [None]:
proj.visualize_trends(dst_acc, 'Daylight Savings Time')

#### COMPARING FIRST WEEK OF DST TO WEEK BEFORE DST

impact plan:
1. select accidents for period (week_num, day_num, etc.)
2. groupby day_name and hour
3. pyramid, difference plot
4. test for significance

#### The week (7 days) BEFORE the start of DST
1am 24-3-2019 to 0am 31-3-2019

In [None]:
dst_first_week = proj.dst_first_week(accidents)
display(dst_first_week)
wk_before_dst = proj.dst_prior_week(accidents)
display(wk_before_dst)

In [None]:
print(dst_first_week['week_num'].unique(), wk_before_dst['week_num'].unique())

In [None]:
# Aggregated lookups for the week before DST and for the first week of DST.
cols = ['day_of_week', 'day_name', 'hour']
general_agg_wk_before_dst = proj.generate_aggregated_lookup(wk_before_dst, cols)
# `general_agg_wk1` is not assigned anywhere else in this section; build it from
# dst_first_week so the cell runs on a fresh kernel. The default lookup is used
# because later cells index it by both 'hour' and 'day_num'.
general_agg_wk1 = proj.generate_aggregated_lookup(dst_first_week)

# Hourly totals for each of the two weeks.
cols = ['hour', 'total_count']
week_bf_hr = general_agg_wk_before_dst[cols].groupby(cols[:-1]).sum().sort_index().reset_index()
dst_wk1_hr = general_agg_wk1[cols].groupby(cols[:-1]).sum().sort_index().reset_index()

# Week before DST is passed first (left legend), first week of DST second (right legend).
proj.plot_pyramid(week_bf_hr['total_count'], dst_wk1_hr['total_count'],
                  plot_title='Comparison of Week Before & First Week of DST',
                  left_legend='Week Before DST', right_legend='DST First Week', left_labe_shift=350,
                  savefig=True, fig_filename='pyramid_dst_wk1.png')

In [None]:
proj.plot_diff(week_bf_hr['total_count'], dst_wk1_hr['total_count'],
          plot_title='Differences in Week Before Vs. 1st Week of DST', 
          left_legend='DST Prior Week', right_legend='DST First Week', left_labe_shift=35,
         savefig=True, fig_filename='diff_dst_wk1.png')

In [None]:
proj.visualize_trends(wk_before_dst, 'wk_before_dst')

In [None]:
proj.visualize_trends(dst_first_week, 'dst week 1')

#### Hypothesis Testing for impact

**H1: 24-hour accidents in the week before DST > DST week 1**

In [None]:
# display(week_bf_hr)
# display(dst_wk1_hr)

proj.report_a_significance(week_bf_hr['total_count'], dst_wk1_hr['total_count'],
                           X1_name='Week Before DST Hourly total',
                          X2_name='DST Hourly total')

**H1: 9am - 16.59pm accidents in the week BEFORE dst > in dst wk 1**

In [None]:
selected_hrs = [9, 10, 11, 12, 13, 14, 15, 16]
wk_bf_9_16 = proj.get_accidents_when(week_bf_hr, 'hour', selected_hrs)
# display(wk_bf_9_16)
dst_wk1_9_16 = proj.get_accidents_when(dst_wk1_hr, 'hour', selected_hrs)
# display(dst_wk1_9_16)

proj.report_a_significance(wk_bf_9_16['total_count'], dst_wk1_9_16['total_count'],
                           X1_name='Week Before DST 9am - 16.59pm',
                          X2_name='DST Week 1 9am - 16.59pm')

**H1: 9am - 11.59am accidents in the week BEFORE dst > in dst wk 1**

In [None]:
# H1: accidents in hours 9-11 in the week before DST > in the first week of DST.
selected_hrs = [9, 10, 11]
wk_bf_9_11 = proj.get_accidents_when(week_bf_hr, 'hour', selected_hrs)
dst_wk1_9_11 = proj.get_accidents_when(dst_wk1_hr, 'hour', selected_hrs)

# Hours 9, 10 and 11 end at 11.59am, so the report labels say "am".
proj.report_a_significance(wk_bf_9_11['total_count'], dst_wk1_9_11['total_count'],
                           X1_name='Week Before DST 9am - 11.59am',
                           X2_name='DST Week 1 9am - 11.59am')

**H1: 19pm - 21.59pm accidents in the week BEFORE dst > in dst wk 1**

In [None]:
# H1: accidents in hours 19-21 in the week before DST > in the first week of DST.
selected_hrs = [19, 20, 21]
# Dedicated names, so the 9-11 selections from the previous test are not overwritten
# with evening data.
wk_bf_19_21 = proj.get_accidents_when(week_bf_hr, 'hour', selected_hrs)
dst_wk1_19_21 = proj.get_accidents_when(dst_wk1_hr, 'hour', selected_hrs)

# Report labels name the hours actually selected.
proj.report_a_significance(wk_bf_19_21['total_count'], dst_wk1_19_21['total_count'],
                           X1_name='Week Before DST 19pm - 21.59pm',
                           X2_name='DST Week 1 19pm - 21.59pm')

**H1: 5am - 5.59am accidents in dst wk 1 > in the week BEFORE dst**

In [None]:
# Re-aggregate both weeks to one row per (day_num, hour) pair, then keep hour 5.
# This rebinds week_bf_hr / dst_wk1_hr, which the earlier cells built per hour only,
# so re-running those earlier tests after this cell uses different frames.
# NOTE(review): general_agg_wk_before_dst was generated with
# ['day_of_week', 'day_name', 'hour'] -- confirm the lookup still exposes 'day_num'.
cols = ['day_num', 'hour', 'total_count']
week_bf_hr = general_agg_wk_before_dst[cols].groupby(cols[:-1]).sum().reset_index()
# display(week_bf_hr)
wk_bf_5am = proj.get_accidents_when(week_bf_hr, 'hour', 5)
# display(wk_bf_5am)
dst_wk1_hr = general_agg_wk1[cols].groupby(cols[:-1]).sum().reset_index()
dst_wk1_5am = proj.get_accidents_when(dst_wk1_hr, 'hour', 5)
# display(dst_wk1_5am)

# DST week 1 is passed first (as X1) and the week before DST second (as X2).
proj.report_a_significance(dst_wk1_5am['total_count'], wk_bf_5am['total_count'],
                           X2_name='Week Before DST 5am',
                          X1_name='DST Week 1 5am')

**H1: 18pm - 18.59pm accidents in dst wk 1 > in the week BEFORE dst**

In [None]:
# Per-(day_num, hour) totals for both weeks, restricted to hour 18.
# `cols` is set here so the cell does not depend on whichever cell last assigned it.
cols = ['day_num', 'hour', 'total_count']
week_bf_hr = general_agg_wk_before_dst[cols].groupby(cols[:-1]).sum().reset_index()
wk_bf_18pm = proj.get_accidents_when(week_bf_hr, 'hour', 18)
dst_wk1_hr = general_agg_wk1[cols].groupby(cols[:-1]).sum().reset_index()
dst_wk1_18pm = proj.get_accidents_when(dst_wk1_hr, 'hour', 18)

# DST week 1 is passed first (as X1) and the week before DST second (as X2).
proj.report_a_significance(dst_wk1_18pm['total_count'], wk_bf_18pm['total_count'],
                           X2_name='Week Before DST 18pm',
                           X1_name='DST Week 1 18pm')

In [None]:
# dst hourly average
cols = ['day_num', 'hour', 'total_count']
day_hr_count = proj.generate_aggregated_lookup(dst_acc, cols[:-1])
# display(day_hr_count)
dst_avg_hr = day_hr_count[cols].groupby(cols[1]).mean()
# display(dst_avg_hr)

#### The week (7 days) AFTER DST
3am 27-10-2019 to 2am 3-11-2019

In [None]:
week_aft_dst = proj.week_after_dst(accidents)
# week_aft_dst

In [None]:
proj.visualize_trends(week_aft_dst, 'Week_after_dst')

In [None]:
dst_last_wk = proj.dst_last_week(accidents)
# dst_last_wk

In [None]:
proj.visualize_trends(dst_last_wk, 'DST Last Week')

In [None]:
# hourly comparisons
cols = ['hour', 'total_count']
total_dst_lastwk_hr = proj.generate_aggregated_lookup(dst_last_wk)[cols].groupby(cols[:-1]).sum().reset_index()
# display(total_dst_lastwk_hr)
# 
wk_after_hr = proj.generate_aggregated_lookup(week_aft_dst)[cols].groupby(cols[:-1]).sum().reset_index()
# display(wk_after_hr)

In [None]:
proj.plot_pyramid(total_dst_lastwk_hr['total_count'], wk_after_hr['total_count'], left_legend='DST LAST WEEK', right_legend='WEEK AFTER DST',
              left_labe_shift=350, plot_title='COMPARISON OF DST LAST WEEK VS WEEK AFTER DST', #xlim=(-300, 300),
               savefig=True, fig_filename='pyramid_dst_last_after.png')

In [None]:
proj.plot_diff(total_dst_lastwk_hr['total_count'], wk_after_hr['total_count'], left_legend='DST LAST WEEK', right_legend='WEEK AFTER DST',
              left_labe_shift=200, plot_title='DIFFERENCES IN DST LAST WEEK VS WEEK AFTER DST', xlim=(-300, 300),
               savefig=True, fig_filename='diff_dst_last_after.png')

**H1: 24-hour accidents in the week after DST > last week of DST**

In [None]:
proj.report_a_significance(wk_after_hr['total_count'], total_dst_lastwk_hr['total_count'], 
                          X2_name='Last Week of DST', X1_name='Week After DST')

**H1: 7am – 12.59pm accidents in the week after DST > the last week of DST**

In [None]:
cols = ['day_num', 'hour', 'total_count']

selected_hrs = [7, 8, 9, 10, 11, 12]
lastwk_am_7_12 = proj.get_accidents_when(total_dst_lastwk_hr, 'hour', selected_hrs)
# display(lastwk_am_7_12)
wkafter_am_7_12 = proj.get_accidents_when(wk_after_hr, 'hour', selected_hrs)
# display(wkafter_am_7_12)
proj.report_a_significance(wkafter_am_7_12['total_count'], lastwk_am_7_12['total_count'], X1_name='Week After', X2_name='DST Last Week')

**H1: 17pm - 17.59pm accidents in the week after DST > the last week of DST**

In [None]:
# H1: accidents at hour 17 in the week after DST > in the last week of DST,
# using the hourly totals of each week.
selected_hrs = [17]

lastwk_pm_17 = proj.get_accidents_when(total_dst_lastwk_hr, 'hour', selected_hrs)
# The week-after selection must be assigned before it is displayed and tested;
# with the assignment commented out this cell raised NameError on a fresh kernel.
wkafter_pm_17 = proj.get_accidents_when(wk_after_hr, 'hour', selected_hrs)
display(wkafter_pm_17)
proj.report_a_significance(wkafter_pm_17['total_count'], lastwk_pm_17['total_count'], X1_name='Week After', X2_name='DST Last Week')

In [None]:
cols = ['hour', 'day_num', 'total_count']
total_lastwk_hr = proj.generate_aggregated_lookup(dst_last_wk)
# display(total_lastwk_hr)

total_dst_lastwk_hr = total_lastwk_hr[cols].groupby(cols[:-1]).sum().reset_index()
# display(total_dst_lastwk_hr)

total_wk_after_hr = proj.generate_aggregated_lookup(week_aft_dst)
wk_after_hr = total_wk_after_hr[cols].groupby(cols[:-1]).sum().reset_index()
# display(wk_after_hr)

In [None]:
# weekly 17pm - 17.59pm

selected_hrs = [17]#, 8, 9, 10, 11, 12]

lastwk_pm_17 = proj.get_accidents_when(total_dst_lastwk_hr, 'hour', selected_hrs)
# display(lastwk_pm_17)
wkafter_pm_17 = proj.get_accidents_when(wk_after_hr, 'hour', selected_hrs)
# display(wkafter_pm_17)
proj.report_a_significance(wkafter_pm_17['total_count'], lastwk_pm_17['total_count'], X1_name='Week After', X2_name='DST Last Week')

**H1: 9am - 10.59am accidents in the week after DST > the last week of DST**

In [None]:
selected_hrs = [9,10]# 10, 11, 12]

lastwk_am_9_10 = proj.get_accidents_when(total_dst_lastwk_hr, 'hour', selected_hrs)
# display(lastwk_am_9_10)
wkafter_am_9_10 = proj.get_accidents_when(wk_after_hr, 'hour', selected_hrs)
# display(wkafter_am_9_10)
proj.report_a_significance(wkafter_am_9_10['total_count'], lastwk_am_9_10['total_count'],
                           X1_name='Week After 9 - 10.59am', X2_name='DST Last Week 9 - 10.59am')

## e) IMPACT OF SUNRISE AND SUNSET TIMES ON ACCIDENTS

In [None]:
#sunrise impact
# Build the sunrise/sunset lookup frame; the next cell reads its
# 'full_hour', 'rise_start', 'rise_end' and 'set_start' columns.
rise_set_df = proj.assign_sunrise_sunset(accidents, as_boundaries=True)
# rise_set_df

In [None]:
# Row labels of rise_set_df whose 'full_hour' falls in a window of width 2
# just before / just after the sunrise and sunset boundaries.
# Series.between is inclusive at both ends by default; the 0.01 offsets keep the
# boundary value itself out of each window.
# (units of 'full_hour' are presumably hours - TODO confirm)
two_hrs_before_rise = rise_set_df.loc[rise_set_df['full_hour'].between(rise_set_df['rise_start'] - 2, 
                                                                     rise_set_df['rise_start'] - 0.01)].index
# display(two_hrs_before_rise)

two_hrs_after_rise = rise_set_df.loc[rise_set_df['full_hour'].between(rise_set_df['rise_end'] + 0.01, 
                                                                     rise_set_df['rise_end'] + 2)].index
# display(two_hrs_after_rise)

two_hrs_before_set = rise_set_df.loc[rise_set_df['full_hour'].between(rise_set_df['set_start'] - 2,
                                                                   rise_set_df['set_start'] - 0.01)].index

# NOTE(review): the "after sunrise" window is anchored on 'rise_end', but this
# "after sunset" window is anchored on 'set_start' (same column as the "before"
# window). If rise_set_df has a 'set_end' column this is probably a copy-paste
# slip - confirm against proj.assign_sunrise_sunset.
two_hrs_after_set = rise_set_df.loc[rise_set_df['full_hour'].between(rise_set_df['set_start'] + 0.01, 
                                                                     rise_set_df['set_start'] + 2)].index
# display(two_hrs_after_set)

### SUNRISE IMPACT

In [None]:
# Select accident rows by the index labels computed from rise_set_df.
# (assumes rise_set_df shares its index with `accidents` - TODO confirm)
accidents_before_sunrise = accidents.loc[two_hrs_before_rise]
accidents_after_sunrise = accidents.loc[two_hrs_after_rise]

In [None]:
# accidents_before_sunrise

In [None]:
# Hourly accident totals within the two-hour windows around sunrise, one column
# plot per window; hours at or above that window's mean are coloured red.
cols = ['hour', 'total_count']
before_agg = proj.generate_aggregated_lookup(accidents_before_sunrise)
# This assignment was commented out, so the cell either raised NameError on a
# fresh kernel or silently plotted a stale frame left behind by the sunset cell.
total_before_hr = before_agg[cols].groupby(cols[:-1]).sum()
display(total_before_hr)

avg = total_before_hr['total_count'].mean()
cmap = {i:'red' if v >= avg else 'gray' for i, v in zip(total_before_hr.index, total_before_hr['total_count'])}
# display(cmap)
proj.plot_column(total_before_hr.index, total_before_hr['total_count'],
                 plot_title='Accidents Within Two Hours Before Sunrise',
                paletter=cmap)
plt.show()

after_agg = proj.generate_aggregated_lookup(accidents_after_sunrise)
total_after_hr = after_agg[cols].groupby(cols[:-1]).sum()
# display(total_after_hr)


avg = total_after_hr['total_count'].mean()
cmap = {i:'red' if v >= avg else 'gray' for i, v in zip(total_after_hr.index, total_after_hr['total_count'])}
# Title fixed: this plot shows the AFTER-sunrise window (it was labelled "Before").
proj.plot_column(total_after_hr.index, total_after_hr['total_count'],
                 plot_title='Accidents Within Two Hours After Sunrise',
                paletter=cmap)

In [None]:
# Aggregated lookups for the two sunrise windows. These repeat the calls the
# preceding plotting cell makes for before_agg / after_agg, under names that the
# later sunset cells do not overwrite.
before_rise_agg = proj.generate_aggregated_lookup(accidents_before_sunrise)
after_rise_agg = proj.generate_aggregated_lookup(accidents_after_sunrise)

In [None]:
# Hourly totals for each sunrise window, padded with zero-count rows so both
# frames list the same hours before they are compared.
cols = ['hour', 'total_count']
before_sunrise_hr = before_rise_agg[cols].groupby(cols[:-1]).sum().reset_index()
# display(before_sunrise_hr)
after_sunrise_hr = after_rise_agg[cols].groupby(cols[:-1]).sum().reset_index()
# display(after_sunrise_hr)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# on every pandas version.
# NOTE(review): the padded hours are hard-coded for this dataset - re-check them
# if the input data or the window definition changes.
before_sunrise_hr = (
    pd.concat([before_sunrise_hr,
               pd.DataFrame({'hour': [8, 9, 10], 'total_count': [0, 0, 0]})])
    .sort_values('hour')
    .reset_index(drop=True)
)

# display(before_sunrise_hr)

after_sunrise_hr = (
    pd.concat([after_sunrise_hr,
               pd.DataFrame({'hour': [2, 3], 'total_count': [0, 0]})])
    .sort_values('hour')
    .reset_index(drop=True)
)

# display(after_sunrise_hr)

In [None]:
# Index each frame by its OWN 'hour' column (the column is kept as well).
# The after-frame used to be indexed with before_sunrise_hr['hour']; set_index
# applies a passed Series positionally, so any difference in the two hour lists
# would have mislabelled the rows (or raised on a length mismatch).
before_sunrise_hr = before_sunrise_hr.set_index(before_sunrise_hr['hour'])
# display(before_sunrise_hr)
after_sunrise_hr = after_sunrise_hr.set_index(after_sunrise_hr['hour'])
# display(after_sunrise_hr)

In [None]:
# Hourly counts before vs. after sunrise; the figure is saved as 'sunrise_impact.png'.
proj.plot_diff(
    before_sunrise_hr['total_count'],
    after_sunrise_hr['total_count'],
    left_legend='Within 2 Hours Before Sunrise',
    plot_title='Accidents Within 2 Hours Before Sunrise Vs After Sunrise',
    right_legend='Within 2 Hours After Sunrise',
    xlim=(-6500, 12800),
    left_labe_shift=5000,
    fig_w=12,
    fig_filename='sunrise_impact.png',
    savefig=True,
)

####  H1: Accidents after sunrise > accidents before sunrise

In [None]:
# Significance report: after-sunrise counts (X1) against before-sunrise counts (X2).
X1_name = 'Accidents 2 hrs after sunrise'
X2_name = 'Accidents 2 hrs before sunrise'
proj.report_a_significance(
    after_sunrise_hr['total_count'],
    before_sunrise_hr['total_count'],
    X1_name=X1_name,
    X2_name=X2_name,
    balance=False,
)

#### h1: Accidents at 7 - 9.59am after sunrise > before sunrise

In [None]:
# Restrict both sunrise frames to hours 7, 8 and 9, then rerun the significance report.
is_after_7_9 = after_sunrise_hr['hour'].isin([7, 8, 9])
after_am_7_9 = after_sunrise_hr.loc[is_after_7_9]
is_before_7_9 = before_sunrise_hr['hour'].isin([7, 8, 9])
before_am_7_9 = before_sunrise_hr.loc[is_before_7_9]

X1_name = 'Accidents after sunrise at 7am - 9.59am'
X2_name = 'Accidents before sunrise at 7am - 9.59am'
proj.report_a_significance(
    after_am_7_9['total_count'],
    before_am_7_9['total_count'],
    X1_name=X1_name,
    X2_name=X2_name,
    balance=False,
)

### SUNSET IMPACT

In [None]:
# Select accident rows by the sunset-window index labels computed from rise_set_df.
# (assumes rise_set_df shares its index with `accidents` - TODO confirm)
accidents_before_sunset = accidents.loc[two_hrs_before_set]
accidents_after_sunset = accidents.loc[two_hrs_after_set]

In [None]:
# accidents_before_sunset

In [None]:
# Hourly accident totals within the two-hour windows around sunset, one column
# plot per window; hours at or above that window's mean are coloured red.
cols = ['hour', 'total_count']
before_agg = proj.generate_aggregated_lookup(accidents_before_sunset)
total_before_hr = before_agg[cols].groupby(cols[:-1]).sum()
# display(total_before_hr)

avg = total_before_hr['total_count'].mean()
cmap = {i:'red' if v >= avg else 'gray' for i, v in zip(total_before_hr.index, total_before_hr['total_count'])}
# display(cmap)
proj.plot_column(total_before_hr.index, total_before_hr['total_count'],
                 plot_title='Accidents Within Two Hours Before Sunset',
                paletter=cmap)
plt.show()

after_agg = proj.generate_aggregated_lookup(accidents_after_sunset)
total_after_hr = after_agg[cols].groupby(cols[:-1]).sum()
# display(total_after_hr)


avg = total_after_hr['total_count'].mean()
cmap = {i:'red' if v >= avg else 'gray' for i, v in zip(total_after_hr.index, total_after_hr['total_count'])}
# Title fixed: this plot shows the AFTER-sunset window (it was labelled "Before").
proj.plot_column(total_after_hr.index, total_after_hr['total_count'],
                 plot_title='Accidents Within Two Hours After Sunset',
                paletter=cmap)

In [None]:
# Aggregated lookups for the two sunset windows. These repeat the calls the
# preceding plotting cell makes for before_agg / after_agg, under dedicated names.
before_set_agg = proj.generate_aggregated_lookup(accidents_before_sunset)
after_set_agg = proj.generate_aggregated_lookup(accidents_after_sunset)

In [None]:
# Hourly totals for each sunset window, padded with zero-count rows so both
# frames list the same hours before they are compared.
cols = ['hour', 'total_count']
before_sunset_hr = before_set_agg[cols].groupby(cols[:-1]).sum().reset_index()
# display(before_sunset_hr)
after_sunset_hr = after_set_agg[cols].groupby(cols[:-1]).sum().reset_index()
# display(after_sunset_hr)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# on every pandas version.
# NOTE(review): the padded hours are hard-coded for this dataset - re-check them
# if the input data or the window definition changes.
before_sunset_hr = (
    pd.concat([before_sunset_hr,
               pd.DataFrame({'hour': [22, 23], 'total_count': [0, 0]})])
    .sort_values('hour')
    .reset_index(drop=True)
)

# display(before_sunset_hr)

after_sunset_hr = (
    pd.concat([after_sunset_hr,
               pd.DataFrame({'hour': [13, 14], 'total_count': [0, 0]})])
    .sort_values('hour')
    .reset_index(drop=True)
)

# display(after_sunset_hr)

In [None]:
# Index each frame by its OWN 'hour' column (the column is kept as well).
# The after-frame used to be indexed with before_sunset_hr['hour']; set_index
# applies a passed Series positionally, so any difference in the two hour lists
# would have mislabelled the rows (or raised on a length mismatch).
before_sunset_hr = before_sunset_hr.set_index(before_sunset_hr['hour'])
# display(before_sunset_hr)
after_sunset_hr = after_sunset_hr.set_index(after_sunset_hr['hour'])
# display(after_sunset_hr)

In [None]:
# Hourly counts before vs. after sunset; the figure is saved as 'sunset_impact.png'.
proj.plot_diff(
    before_sunset_hr['total_count'],
    after_sunset_hr['total_count'],
    left_legend='Within 2 Hours Before Sunset',
    plot_title='Accidents Within 2 Hours Before Sunset Vs After Sunset',
    right_legend='Within 2 Hours After Sunset',
    xlim=(-10000, 12800),
    left_labe_shift=4500,
    left_vlabe_shift=18,
    right_vlabe_shift=-12,
    fig_w=8,
    fig_filename='sunset_impact.png',
    savefig=True,
)

####  H1: Accidents before sunset > accidents after sunset

In [None]:
# Significance report: before-sunset counts (X1) against after-sunset counts (X2).
X1_name = 'Accidents 2 hrs before sunset'
X2_name = 'Accidents 2 hrs after sunset'
proj.report_a_significance(
    before_sunset_hr['total_count'],
    after_sunset_hr['total_count'],
    X1_name=X1_name,
    X2_name=X2_name,
    balance=False,
)

#### h1: Accidents at 14 - 15.59pm before sunset > after sunset

In [None]:
# Restrict both sunset frames to hours 14 and 15, then rerun the significance report.
before_pm_14_15 = before_sunset_hr.loc[before_sunset_hr['hour'].isin([14, 15])]
display(before_pm_14_15)
after_pm_14_15 = after_sunset_hr.loc[after_sunset_hr['hour'].isin([14, 15])]
display(after_pm_14_15)

X1_name = 'Accidents before sunset at 14pm - 15.59pm'
# Label fixed: the window is 14:00 - 15:59, so the suffix is "pm" (it read "15.59am").
X2_name = 'Accidents after sunset at 14pm - 15.59pm'
proj.report_a_significance(before_pm_14_15['total_count'], after_pm_14_15['total_count'],
                          X1_name=X1_name, X2_name=X2_name, balance=False)

## f) RELATIONSHIP OF VEHICULAR VARIABLES TO ACCIDENTS


#### Read in datasets

In [None]:
# Vehicle-level column names used by the vehicular analysis.
vehicle_variables = [
    'vehicle_reference',
    'vehicle_type',
    'towing_and_articulation',
    'vehicle_manoeuvre',
    'vehicle_location_restricted_lane',
    'junction_location',
    'skidding_and_overturning',
    'hit_object_in_carriageway',
    'vehicle_leaving_carriageway',
    'hit_object_off_carriageway',
    '1st_point_of_impact',
    'was_vehicle_left_hand_drive',
    'journey_purpose_of_driver',
    'sex_of_driver',
    'age_of_driver',
    'age_band_of_driver',
    'engine_capacity_cc',
    'propulsion_code',
    'age_of_vehicle',
    'driver_imd_decile',
    'driver_home_area_type',
    'vehicle_imd_decile',
]

In [None]:
# Project helper that plots the vehicle variables for the merged accidents frame.
proj.visualize_vehicle_variables(accidents)

In [None]:
# Vehicle type x left-hand drive x junction location; saved as 'veh_type_lhd.png'.
cols = ['vehicle_type', 'was_vehicle_left_hand_drive', 'junction_location']
proj.visualize_3_variable_rlship(
    accidents,
    cols,
    y_labe=f'{cols[0]} & {cols[1]}',
    x_labe='accidents',
    plot_title='Highest Occurrences of Vehicle Type, Left Hand Drive & Junction Location',
    min_count_allowed=1000,
    savefig=True,
    fig_fname='veh_type_lhd.png',
)

In [None]:
# print(var_look.keys())

In [None]:
# display(accidents,)# proj.null_checker(pd.DataFrame(accidents), only_nulls=True))

In [None]:
# Propulsion code x vehicle age x vehicle type; saved as 'veh_prop_age.png'.
cols = ['propulsion_code', 'age_of_vehicle', 'vehicle_type']
proj.visualize_3_variable_rlship(
    accidents,
    cols,
    y_labe=f'{cols[0]} & {cols[1]}',
    x_labe='accidents',
    plot_title=f'Highest Occurrences of {cols[0]}, {cols[1]} & {cols[-1]}',
    top_n=2,
    figsize=(6, 6),
    savefig=True,
    xlim=(0, 35000),
    min_count_allowed=2000,
    fig_fname='veh_prop_age.png',
)

In [None]:
# vehicle_variables = ['vehicle_reference', 'vehicle_type', 'towing_and_articulation', 'vehicle_manoeuvre',
#                      'vehicle_location_restricted_lane', 'junction_location', 'skidding_and_overturning', 
#                      'hit_object_in_carriageway', 'vehicle_leaving_carriageway', 'hit_object_off_carriageway',
#                      '1st_point_of_impact', 'was_vehicle_left_hand_drive', 'journey_purpose_of_driver', 'sex_of_driver',
#                      'age_of_driver', 'age_band_of_driver', 'engine_capacity_cc', 'propulsion_code', 'age_of_vehicle',
#                      'driver_imd_decile', 'driver_home_area_type', 'vehicle_imd_decile']

In [None]:
# Restricted lane x junction location x journey purpose; saved as 'veh_lane_loc.png'.
cols = ['vehicle_location_restricted_lane', 'junction_location', 'journey_purpose_of_driver']
proj.visualize_3_variable_rlship(
    accidents,
    cols,
    y_labe=f'{cols[0]} & {cols[1]}',
    x_labe='accidents',
    plot_title=f'Highest Occurrences of {cols[0]}, {cols[1]} & {cols[-1]} ',
    top_n=3,
    xlim=(0, 70000),
    figsize=(10, 6),
    annot_size=13,
    xy_labe_size=14,
    min_count_allowed=1000,
    savefig=True,
    fig_fname='veh_lane_loc.png',
)

In [None]:
# Manoeuvre x restricted lane x propulsion code; saved as 'veh_type_lane.png'.
cols = ['vehicle_manoeuvre', 'vehicle_location_restricted_lane', 'propulsion_code']
proj.visualize_3_variable_rlship(
    accidents,
    cols,
    y_labe=f'{cols[0]} & {cols[1]}',
    x_labe='accidents',
    plot_title=f'Highest Occurrences of {cols[0]}, {cols[1]} & {cols[-1]} ',
    min_count_allowed=1000,
    top_n=3,
    xlim=(0, 65000),
    figsize=(8, 14),
    savefig=True,
    fig_fname='veh_type_lane.png',
)

In [None]:
# Vehicle type x speed limit x propulsion code; saved as 'veh_type_spd.png'.
cols = ['vehicle_type', 'speed_limit', 'propulsion_code']
proj.visualize_3_variable_rlship(
    accidents,
    cols,
    y_labe=f'{cols[0]} & {cols[1]}',
    x_labe='accidents',
    plot_title=f'Highest Occurrences of {cols[0]}, {cols[1]} & {cols[-1]}',
    top_n=2,
    figsize=(8, 8),
    annot_size=11,
    xy_labe_size=11,
    xlim=(0, 75000),
    min_count_allowed=2000,
    savefig=True,
    fig_fname='veh_type_spd.png',
)

In [None]:
# Driver age band x driver sex x accident severity, coloured by severity.
# The figure is not saved here (savefig is left at the helper's default).
cols = ['age_band_of_driver', 'sex_of_driver', 'accident_severity']
proj.visualize_3_variable_rlship(
    accidents,
    cols,
    y_labe=f'{cols[0]} & {cols[1]}',
    x_labe='accidents',
    plot_title=f'Highest Occurrences of {cols[0]}, {cols[1]} & {cols[-1]}',
    top_n=1,
    xlim=(0, 70000),
    figsize=(15, 24),
    annot_size=13,
    xy_labe_size=14,
    paletter={'fatal': 'black', 'serious': 'red', 'minor': 'gray'},
    min_count_allowed=1000,
)

### ASSOCIATION RULE:
WHERE:<br>
Vehicle_type = Car; <br>
Propulsion = Petrol/Heavy oil;<br>
Engine_capacity = 124, 1598, 1968;<br>
Vehicle_age = 7; <br>
left_hand_drive = No

In [None]:
# Rows matching the association rule stated in the markdown above:
# car, petrol / heavy oil, one of three engine capacities, 7-year-old vehicle,
# not left-hand drive.
cond = (
    (accidents['vehicle_type'] == 'Car')
    & accidents['propulsion_code'].isin(['Petrol', 'Heavy oil'])
    & accidents['engine_capacity_cc'].isin([124.0, 1598.0, 1968.0])
    & (accidents['age_of_vehicle'] == 7.0)
    & (accidents['was_vehicle_left_hand_drive'] == 'No')
)
accidents.loc[cond]

In [None]:
# Reduced vehicle/driver column list (plus the severity outcome) for the apriori run below.
vehicle_variables = [
    'vehicle_type',
    'vehicle_manoeuvre',
    'vehicle_location_restricted_lane',
    'junction_location',
    'was_vehicle_left_hand_drive',
    'journey_purpose_of_driver',
    'sex_of_driver',
    'engine_capacity_cc',
    'propulsion_code',
    'age_of_vehicle',
    'vehicle_imd_decile',
    'accident_severity',
]

Frequent driver variables: <br>
1. restricted_lane = 'on main carriageway not on restricted lane' & right hand drive <br>
2. restricted_lane = 'on main carriageway not on restricted lane' & driver from urban home area
3. right hand drive & driver from urban home area
4. restricted_lane = 'on main carriageway not on restricted lane' &   right hand drive & driver from urban home area

In [None]:
# Labelled copy of the accidents data, then an apriori run over the vehicle columns.
acc_df = proj.get_accidents_with_labels(accidents)
display(acc_df)

# NOTE(review): here run_apriori receives a pre-sliced frame, whereas later cells
# call run_apriori(acc_df, <column list>, min_support=0.1) - confirm which call
# shape the helper expects.
result = proj.run_apriori(acc_df[vehicle_variables], min_support=0.1)
display(result)

In [None]:
# Count combinations of restricted lane, vehicle type and left-hand drive.
ranking_cols = ['vehicle_location_restricted_lane', 'vehicle_type', 'was_vehicle_left_hand_drive']
counts = proj.rank_top_occurrences(acc_df[ranking_cols], min_count_allowed=2000)
display(counts)
# The last ranking column is passed as condition_on; the rest feed the bar labels.
x, y = counts.drop(ranking_cols[-1], axis=1), counts[ranking_cols[-1]]
labes = proj.create_label_from_ranking(x, exclude_last_col=False)
display(labes)

# y_labe used the stale notebook-global `cols` from an unrelated earlier cell, so the
# axis label named the wrong variables; it is now built from ranking_cols.
proj.plot_bar(y=labes, x=counts['total_count'], condition_on=y, annotate=True, annot_size=12, xlim=(0, 235000),
              plot_title='Association Between Vehicle Variables', paletter=None, figsize=(8, 7), xy_labe_size=9,
              x_labe='accidents', y_labe=f"{ranking_cols[:-1]}", dpi=250, savefig=True, fig_filename='veh_typ_lane_lhd.png')

In [None]:
# Count combinations of restricted lane, journey purpose and left-hand drive.
ranking_cols = ['vehicle_location_restricted_lane', 'journey_purpose_of_driver', 'was_vehicle_left_hand_drive']
counts = proj.rank_top_occurrences(acc_df[ranking_cols], min_count_allowed=2000)
display(counts)
# The last ranking column is passed as condition_on; the rest feed the bar labels.
x, y = counts.drop(ranking_cols[-1], axis=1), counts[ranking_cols[-1]]
labes = proj.create_label_from_ranking(x, exclude_last_col=False)
display(labes)

# y_labe used the stale notebook-global `cols` from an unrelated earlier cell, so the
# axis label named the wrong variables; it is now built from ranking_cols.
proj.plot_bar(y=labes, x=counts['total_count'], condition_on=y, annotate=True, annot_size=12, xlim=(0, 235000),
              plot_title='Association Between Vehicle Variables', paletter=None, figsize=(8, 7), xy_labe_size=9,
              x_labe='accidents', y_labe=f"{ranking_cols[:-1]}", dpi=250, savefig=True, fig_filename='veh_purp_lane_lhd.png')

In [None]:

# Count combinations of restricted lane, driver sex and left-hand drive.
ranking_cols = ['vehicle_location_restricted_lane', 'sex_of_driver', 'was_vehicle_left_hand_drive']
counts = proj.rank_top_occurrences(acc_df[ranking_cols], min_count_allowed=2000)
display(counts)
# The last ranking column is passed as condition_on; the rest feed the bar labels.
x, y = counts.drop(ranking_cols[-1], axis=1), counts[ranking_cols[-1]]
labes = proj.create_label_from_ranking(x, exclude_last_col=False)
display(labes)

# y_labe used the stale notebook-global `cols` from an unrelated earlier cell, so the
# axis label named the wrong variables; it is now built from ranking_cols.
proj.plot_bar(y=labes, x=counts['total_count'], condition_on=y, annotate=True, annot_size=12, xlim=(0, 235000),
              plot_title='Association Between Vehicle Variables', paletter=None, figsize=(8, 7), xy_labe_size=9,
              x_labe='accidents', y_labe=f"{ranking_cols[:-1]}", dpi=250, savefig=True, fig_filename='veh_sex_lane_lhd.png')

In [None]:
# Count combinations of five vehicle/driver variables, conditioned on accident severity.
ranking_cols = ['vehicle_location_restricted_lane', 'journey_purpose_of_driver', 'sex_of_driver',
                'vehicle_type', 'was_vehicle_left_hand_drive', 'accident_severity']
counts = proj.rank_top_occurrences(acc_df[ranking_cols], top_n=5, min_count_allowed=2000)
display(counts)
# The last ranking column (severity) is passed as condition_on; the rest feed the bar labels.
x, y = counts.drop(ranking_cols[-1], axis=1), counts[ranking_cols[-1]]
labes = proj.create_label_from_ranking(x, exclude_last_col=False)
display(labes)

# y_labe used the stale notebook-global `cols` from an unrelated earlier cell, so the
# axis label named the wrong variables; it is now built from ranking_cols.
proj.plot_bar(y=labes, x=counts['total_count'], condition_on=y, annotate=True, annot_size=12, xlim=(0, 75000),
              plot_title='Association Between Vehicle Variables', figsize=(8, 7), xy_labe_size=9,
              paletter={'fatal':'black', 'serious':'red', 'minor':'gray'},
              x_labe='accidents', y_labe=f"{ranking_cols[:-1]}", dpi=250, savefig=True, fig_filename='veh_sev_purp__sex_lane_lhd.png')

In [None]:
# Count combinations of vehicle type, left-hand drive and restricted lane,
# conditioned on accident severity.
ranking_cols = ['vehicle_type', 'was_vehicle_left_hand_drive', 'vehicle_location_restricted_lane',
       'accident_severity']
acc_df = proj.get_accidents_with_labels(accidents)
counts = proj.rank_top_occurrences(acc_df[ranking_cols], top_n=2, min_count_allowed=2000)
display(counts)

# Split the ranking: label columns vs. the severity column passed as condition_on.
x = counts.drop(ranking_cols[-1], axis=1)
y = counts[ranking_cols[-1]]

labe = proj.create_label_from_ranking(x, exclude_last_col=False)
display(labe)

# NOTE(review): fig_filename 'veh_typ_lane_lhd.png' is also used by an earlier
# cell in this section, so running both overwrites that figure - consider a
# distinct filename.
proj.plot_bar(y=labe, x=counts['total_count'], condition_on=y, annotate=True, annot_size=12,xlim=(0, 175000),
                 plot_title=f'Association Between Vehicle Variables', figsize=(8, 10), xy_labe_size=9,
              paletter={'fatal':'black', 'serious':'red', 'minor':'gray'},
                 x_labe='accidents', y_labe=f"{ranking_cols[:-1]}", dpi=250, savefig=True, 
              fig_filename='veh_typ_lane_lhd.png')

## g) CONDITIONS (WEATHER, GEOGRAPHIC LOCATION, SITUATION) THAT GENERATE MORE ACCIDENTS

In [None]:
# Column groups for the conditions analysis: geography, situation and weather.
geo_variables = [
    'local_authority_district',
    'local_authority_highway',
    '1st_road_class',
    'police_force',
]

situation_variables = [
    'road_type',
    'road_surface_conditions',
    '1st_road_class',
    '2nd_road_class',
    'junction_detail',
    'junction_control',
    'speed_limit',
    'pedestrian_crossing_human_control',
    'day_name',
    'month_name',
    'is_offseason',
    'pedestrian_crossing_physical_facilities',
    'light_conditions',
    'carriageway_hazards',
    'pedestrian_location',
    'pedestrian_movement',
    'car_passenger',
    'bus_or_coach_passenger',
    'pedestrian_road_maintenance_worker',
    'casualty_type',
    'special_conditions_at_site',
    'urban_or_rural_area',
]

weather_variables = [
    'weather_conditions',
    'season',
    'road_surface_conditions',
]

In [None]:
# Labelled accidents frame, then an apriori run over the geographic columns.
acc_df = proj.get_accidents_with_labels(accidents)
display(acc_df)

# NOTE(review): this call passes the column list as a second argument, unlike the
# earlier run_apriori(acc_df[vehicle_variables], ...) call - confirm the signature.
apr_result = proj.run_apriori(acc_df, geo_variables, min_support=0.1)

In [None]:
# Show the apriori result for the geographic variables computed in the previous cell.
display(apr_result)

In [None]:
# Rebuild the labelled accidents frame (same call as two cells above) and show it.
acc_df = proj.get_accidents_with_labels(accidents)
display(acc_df)

In [None]:
# Apriori run over the weather columns; overwrites the geographic apr_result.
apr_result = proj.run_apriori(acc_df, weather_variables, min_support=0.1)

In [None]:
# Show the apriori result for the weather variables computed in the previous cell.
display(apr_result)

In [None]:
# Keep only the rows whose 'num_sets' value is 2.
use = apr_result.loc[apr_result['num_sets'] == 2]
use

In [None]:
# start = 0
# stop = 5
# NOTE(review): index label 8 is hard-coded; this raises KeyError if the apriori
# output changes and no row labelled 8 survives the num_sets == 2 filter.
print(use.loc[8]['itemsets'])

In [None]:

# Rank combinations of the first two weather variables (weather_conditions, season).
counts = proj.rank_top_occurrences(acc_df[weather_variables[:-1]], min_count_allowed=2000)
counts

In [None]:
# Build the bar labels from the ranking table.
labe = proj.create_label_from_ranking(counts)
labe

In [None]:
# Bar chart of the weather/season ranking, conditioned on the second-to-last column
# of `counts`; saved as 'weather_ssn_surf.png'.
proj.plot_bar(
    counts['total_count'],
    labe,
    counts.iloc[:, -2],
    figsize=(6, 5),
    annotate=True,
    xlim=(0, 70000),
    savefig=True,
    fig_filename='weather_ssn_surf.png',
    paletter={'autumn': 'black', 'spring': 'green', 'summer': 'yellow', 'winter': 'gray'},
    x_labe='accidents',
    plot_title='Association Between Weather Variables',
    annot_size=12,
)

In [None]:
# Rank combinations of all three weather variables.
counts = proj.rank_top_occurrences(acc_df[weather_variables], min_count_allowed=2000)
counts

In [None]:
# Reorder the ranking columns so 'season' comes last before building the labels.
ranking_cols = ['weather_conditions', 'road_surface_conditions', 'season']
labe = proj.create_label_from_ranking(counts[ranking_cols])
labe

In [None]:
# Inspect the first generated label.
labe.iloc[0]

In [None]:
# Bar chart of the three-variable weather ranking, conditioned on season.
# Saving is left disabled here.
proj.plot_bar(
    counts['total_count'],
    labe,
    counts.loc[:, 'season'],
    figsize=(6, 5),
    annotate=True,
    xlim=(0, 70000),
    paletter={'autumn': 'black', 'spring': 'green', 'summer': 'yellow', 'winter': 'gray'},
    x_labe='accidents',
    plot_title='Association Between Weather Variables',
    annot_size=12,
)

#### SITUATIONAL INFLUENCE

In [None]:
# Rank combinations of the situational (pedestrian / junction / hazard) variables.
# The last entry was the truncated name '1st_road_', which is not a column and made
# acc_df[ranking_cols] raise KeyError; '1st_road_class' is the column used in
# geo_variables and situation_variables.
ranking_cols = ['bus_or_coach_passenger', 'pedestrian_location', 'pedestrian_movement', 'pedestrian_crossing_human_control',
               'junction_control', 'carriageway_hazards', 'pedestrian_road_maintenance_worker',
                'pedestrian_crossing_physical_facilities', '1st_road_class']
counts = proj.rank_top_occurrences(acc_df[ranking_cols], min_count_allowed=2000)
counts
# acc_df.value_counts().sort_values(ascending=False).reset_index()

In [None]:
# Build the bar labels from the situational ranking table.
labe = proj.create_label_from_ranking(counts)
labe

In [None]:
# Bar chart of the situational ranking, conditioned on the last ranking column;
# saved as 'situation_bus_loc.png'.
proj.plot_bar(
    counts['total_count'],
    labe,
    counts[ranking_cols[-1]],
    figsize=(6, 5),
    annotate=True,
    xlim=(0, 280000),
    savefig=True,
    fig_filename='situation_bus_loc.png',
    x_labe='accidents',
    plot_title='Association Between Variables',
    annot_size=12,
)

In [None]:
# Project helper that plots distributions for the labelled accidents frame.
proj.visualize_distributions(acc_df)

In [None]:
# Rank combinations of light, speed, home area, road type and weekend flag,
# with 'part_of_day' as the final (conditioning) column.
ranking_cols = ['light_conditions', 'speed_limit', 'driver_home_area_type', 'road_type', 'is_weekend',
                'part_of_day']
counts = proj.rank_top_occurrences(acc_df[ranking_cols], min_count_allowed=2000)
display(counts)

labe = proj.create_label_from_ranking(counts, exclude_last_col=True)
# NOTE(review): labe[0] is a label-based lookup; other cells use labe.iloc[0].
display(labe[0])

In [None]:
# Bar chart of the light/speed/road-type ranking, coloured by part of day;
# saved as 'situation_light_spd_rtype2.png'.
proj.plot_bar(
    counts['total_count'],
    labe,
    counts[ranking_cols[-1]],
    figsize=(6, 5),
    annotate=True,
    savefig=True,
    fig_filename='situation_light_spd_rtype2.png',
    xlim=(0, 39000),
    x_labe='accidents',
    plot_title='Association Between Variables',
    annot_size=12,
    paletter={'morning': 'green', 'afternoon': 'yellow', 'evening': 'gray', 'night': 'black'},
)

Association Rules:
1. not a bus passenger & 
2. carriageway hazards = None & 
3. pedestrian_location = Not a pedestrian &
4. pedestrian_crossing_human_control = None within 50 metres &
5. urban_or_rural_area = Urban &
6. pedestrian_road_maintenance_worker = No / Not applicable &
7. pedestrian_movement = Not a Pedestrian

Association Rules:
1. road_type = Single carriageway &
2. carriageway_hazards = None & 
3. bus_or_coach_passenger = Not a bus or coach passenger
4. pedestrian_location = Not a Pedestrian &
5. pedestrian_crossing_human_control = None within 50 metres
6. pedestrian_road_maintenance_worker = No / Not applicable &
7. junction_control = Give way or uncontrolled &
8. pedestrian_movement = Not a Pedestrian
9. pedestrian_crossing_physical_facilities = No physical crossing facilities within 50 metres

In [None]:
# Weather frequency ranking, then accident outcome (severity) per weather condition.
weather = proj.assign_weather(accidents)
weather_freq = weather.value_counts()
print(weather)
proj.plot_bar(y=weather_freq.index, x=weather_freq, plot_title='Weather Ranking for Accident Occurrence',
              annotate=True, savefig=True, fig_filename='weather_ranking.png')

severity = proj.assign_severity(accidents)
df = pd.concat([weather, severity], axis=1)

cols = [severity.name, weather.name, 'total_count']
print(cols)
# Name the count column explicitly. The old code renamed it by replacing the text
# '0' in every column name, which only works while value_counts() yields an unnamed
# Series (column 0); on pandas >= 2 the column is called 'count' and total[cols]
# raised KeyError. It would also have mangled any real column name containing '0'.
total = df.value_counts().rename(cols[-1]).sort_index().reset_index()
total = total[cols].groupby(cols[:-1]).sum().reset_index()
display(total)

proj.plot_bar(x=total[cols[-1]], y=total[cols[1]], condition_on=total[cols[0]],
              plot_title='Accident Outcome per Weather Conditions',
              paletter={'fatal':'black', 'serious':'red', 'minor':'gray'},
              annotate=True, savefig=True, fig_filename='outcome_weather_ranking.png')


In [None]:
    # Rank districts with proj.rank_top10 and save the bar chart as 'top10_district.png'.
    # NOTE(review): this cell is indented at top level; IPython tolerates it, but it
    # breaks if the notebook is exported to a plain .py script.
    districts = proj.assign_district(accidents)
    top10_districts = proj.rank_top10(districts, use_bar=True, x_labe='total_count', y_labe='district',
                    plot_title='Top Ten Accidents per District', savefig=True, fig_fname='top10_district.png')

In [None]:
# Project helper that plots the top ten districts; saving is left disabled.
proj.plot_top10_districts(accidents, )#savefig=True)

In [None]:
# top 10 highway
highways = proj.assign_highway(accidents)
# NOTE(review): rank_top10 is called twice with different argument shapes
# (frame + column name here, a prepared object below) - confirm both are supported.
top10_highway = proj.rank_top10(accidents, 'local_authority_highway', use_bar=True, 
                                plot_title='Top Ten Highway Involved in Accidents',)
proj.rank_top10(highways, use_bar=True,#accidents, 'local_authority_highway', use_bar=True, 
                 plot_title='Top Ten Highway Involved in Accidents', 
                savefig=True, fig_fname='hist_top10_highway.png' )
# top10_highway is reused: first the ranking, then the accident rows on those highways.
top10_highway = proj.get_accidents_when(accidents, 'local_authority_highway', list(top10_highway.index))
display(top10_highway)
proj.plot_accident_map(accidents, top10_highway, point_at='highway', plot_title='UK Top Ten Highway Involved in Accidents',
                       savefig=True, fig_filename='uk_top10_highway.png')

In [None]:
# Rank the default severity series against accident severity and plot the counts.
y = proj.assign_severity(accidents, 'accident_severity')
x = proj.assign_severity(accidents)
cols = [x.name, y.name]
print(cols)
df = pd.concat([x, y], axis=1)
ranked = proj.rank_top_occurrences(df, cols[-1])
# `paletter` was passed twice (None and the dict), which is a SyntaxError
# ("keyword argument repeated"); only the explicit severity colour map is kept.
proj.plot_bar(y=ranked[x.name], x=ranked['total_count'], condition_on=ranked[y.name],
              annotate=True, paletter={'fatal':'black', 'serious':'red', 'minor':'gray'})

In [None]:
# NOTE(review): `cols` is assigned here but not used in this cell, and its entries
# do not match the three series combined below.
cols = ['road_type', 'road_surface', 'light_conditions', 'casualty_severity']
rtype = proj.assign_road_type(accidents)
rsurf = proj.assign_road_surface(accidents)
jpurp = proj.assign_journey_purpose(accidents)

# Combine road type, road surface and journey purpose side by side and rank the combinations.
df = pd.concat([rtype, rsurf, jpurp], axis=1)
counts = proj.rank_top_occurrences(df)
display(counts)

In [None]:
def _type_and_surface_label(row):
    """Return the bar label for one ranking row (road type + road surface)."""
    return f" {row['road_type']} Road on {row['road_surface_conditions']} surface"


# One label per ranking row, then a bar chart coloured by journey purpose;
# saved as 'driver_var_jpurp.png'.
categ_indwx = counts[['road_type', 'road_surface_conditions']].apply(_type_and_surface_label, axis=1)
cmap = {'Not known': 'gray', 'Journey as part of work': 'green', 'Other': 'blue'}
display(categ_indwx)
proj.plot_bar(
    counts['total_count'],
    categ_indwx,
    counts['journey_purpose_of_driver'],
    paletter=cmap,
    y_labe='Type and Surface of Road',
    x_labe='Accident Count',
    xy_labe_size=8,
    annotate=True,
    figsize=(8, 14),
    annot_size=12,
    xlim=(0, 90000),
    plot_title='Accident Relationship Between Type and Surface of Road and Journey Purpose',
    savefig=True,
    fig_filename='driver_var_jpurp.png',
)

In [None]:
# Rebuild the labelled accidents frame and rank combinations of the geographic variables.
acc_df = proj.get_accidents_with_labels(accidents)
display(acc_df)
counts = proj.rank_top_occurrences(acc_df[geo_variables], min_count_allowed=1000)
counts

In [None]:
# '1st_road_class' is split off (used as condition_on in the next cell); the
# remaining columns are turned into the bar labels.
x = counts.drop('1st_road_class', axis=1)
y = counts['1st_road_class']
labes = proj.create_label_from_ranking(x, exclude_last_col=False)
labes


In [None]:
# Bar chart of the geographic ranking, conditioned on `y`; saved as 'geo_variables.png'.
proj.plot_bar(
    x=counts['total_count'],
    y=labes,
    condition_on=y,
    annotate=True,
    annot_size=11,
    figsize=(7, 15),
    xlim=(0, 4000),
    plot_title='Association Between Geographic Variables',
    savefig=True,
    fig_filename='geo_variables.png',
)

### DRIVER'S AGE AND CASUALTY GENDER

In [None]:
# Full list of vehicle-level column names (restores the 22-entry list).
vehicle_variables = [
    'vehicle_reference',
    'vehicle_type',
    'towing_and_articulation',
    'vehicle_manoeuvre',
    'vehicle_location_restricted_lane',
    'junction_location',
    'skidding_and_overturning',
    'hit_object_in_carriageway',
    'vehicle_leaving_carriageway',
    'hit_object_off_carriageway',
    '1st_point_of_impact',
    'was_vehicle_left_hand_drive',
    'journey_purpose_of_driver',
    'sex_of_driver',
    'age_of_driver',
    'age_band_of_driver',
    'engine_capacity_cc',
    'propulsion_code',
    'age_of_vehicle',
    'driver_imd_decile',
    'driver_home_area_type',
    'vehicle_imd_decile',
]

In [None]:
# Project helper that plots casualty outcomes; figures are saved (savefig=True).
proj.visualize_casualty_outcomes_for(accidents, savefig=True)

In [None]:
# Rank combinations of driver age band, driver sex and accident severity.
# min_count_allowed=1 is the lowest threshold used in this notebook.
acc_df = proj.get_accidents_with_labels(accidents)
cols = ['age_band_of_driver', 'sex_of_driver', 'accident_severity']
counts = proj.rank_top_occurrences(acc_df[cols], min_count_allowed=1)
display(counts)

In [None]:
# Labels from the ranking, then a bar chart coloured by severity;
# saved as 'driver_age_sex.png'.
labes = proj.create_label_from_ranking(counts)
proj.plot_bar(
    counts['total_count'],
    labes,
    counts[cols[-1]],
    figsize=(8, 12),
    paletter={'fatal': 'black', 'serious': 'red', 'minor': 'gray'},
    annotate=True,
    plot_title='Relationship Between Age & Sex of Driver to Accident Outcome',
    savefig=True,
    fig_filename='driver_age_sex.png',
)

In [None]:
# Rank combinations of driver age band, sex and journey purpose against accident severity.
acc_df = proj.get_accidents_with_labels(accidents)
cols = ['age_band_of_driver', 'sex_of_driver', 'journey_purpose_of_driver','accident_severity']
counts = proj.rank_top_occurrences(acc_df[cols], min_count_allowed=2000)
display(counts)

In [None]:
# Build the bar labels from the ranking and inspect the first one.
labes  = proj.create_label_from_ranking(counts)
display(labes.iloc[0])

In [None]:
# Bar chart of the driver-variable ranking, coloured by severity;
# saved as 'driver_age_sex_purp.png'.
proj.plot_bar(
    counts['total_count'],
    labes,
    counts[cols[-1]],
    figsize=(8, 18),
    paletter={'fatal': 'black', 'serious': 'red', 'minor': 'gray'},
    annotate=True,
    plot_title='Relationship Between Driver Variables to Accident Outcome',
    savefig=True,
    fig_filename='driver_age_sex_purp.png',
)

In [None]:
# Casualty outcomes per month; saved as 'outcome_per_month.png'.
agg_df = proj.visualize_severity(
    accidents,
    plot_title='Accident Casualty Outcomes per Month',
    xlim=(0, 25000),
    savefig=True,
    fig_filename='outcome_per_month.png',
)
display(agg_df)



In [None]:
# Adds an 'is_offseason' column to `accidents` in place; the later cells pass
# 0 and 1 as its values (PL season on / off, per their plot titles).
accidents['is_offseason'] = proj.assign_pl_offseason(accidents)
# accidents['is_league_weekend'] = is_prem_wkend(accidents)

In [None]:
# Show the accidents frame with the new 'is_offseason' column.
accidents

In [None]:
# Accident totals per (is_offseason, is_weekend); saved as 'wkend_offseason.png'.
cols = ['is_offseason', 'is_weekend']

agg = proj.generate_aggregated_lookup(accidents, cols)
prem_offssn = (
    agg[cols + ['total_count']]
    .groupby(cols)
    .sum()
    .reset_index()
)

proj.plot_column(
    prem_offssn['is_offseason'],
    prem_offssn['total_count'],
    prem_offssn['is_weekend'],
    plot_title='Weekend/Weekday Accident When Premier League Season is Off/On',
    savefig=True,
    fig_filename='wkend_offseason.png',
)

In [None]:
# Accident totals per (is_offseason, day_of_week, day_name), coloured by day name;
# saved as 'pl_dayname_ssn.png'.
cols = ['is_offseason', 'day_of_week', 'day_name']
color_mapping = {
    'Sunday': 'green',
    'Monday': 'blue',
    'Tuesday': 'yellow',
    'Wednesday': 'darkorange',
    'Thursday': 'red',
    'Friday': 'black',
    'Saturday': 'gray',
}

agg = proj.generate_aggregated_lookup(accidents, cols)
prem_weekend_day = (
    agg[cols + ['total_count']]
    .groupby(cols)
    .sum()
    .reset_index()
)
display(prem_weekend_day)
proj.plot_column(
    prem_weekend_day[cols[0]],
    prem_weekend_day['total_count'],
    prem_weekend_day[cols[-1]],
    paletter=color_mapping,
    plot_title='Day of Week Accidents during Premier League On/OffSeason',
    savefig=True,
    fig_filename='pl_dayname_ssn.png',
)

In [None]:
# Top ten districts for is_offseason == 0 (PL season on, per the plot title).
cols = ['is_offseason']

agg_df = proj.visualize_top_ten_districts(
    accidents,
    cols[0],
    0,
    plot_title='Top Ten Districts with Highest Accidents When PL Season is On',
    savefig=True,
    suffix='pl_season',
)
display(agg_df)



In [None]:
# Top ten districts for is_offseason == 1 (PL season off, per the plot title).
cols = ['is_offseason']

agg_df = proj.visualize_top_ten_districts(
    accidents,
    cols[0],
    1,
    plot_title='Top Ten Districts with Highest Accidents When PL Season is Off',
    savefig=True,
    suffix='pl_offseason',
)
display(agg_df)



In [None]:
# Casualty outcomes per part of day; saved as 'outcome_per_pod.png'.
cols = ['part_of_day']

agg_df = proj.visualize_severity(
    accidents,
    cols,
    cols[0],
    plot_title='Accident Casualty Outcomes per Part of Day',
    savefig=True,
    fig_filename='outcome_per_pod.png',
)
display(agg_df)



In [None]:
# Show the aggregated lookup for `cols` as left by the previous cell
# (['part_of_day'] when the notebook is run top to bottom).
proj.generate_aggregated_lookup(accidents, cols)

In [None]:
# Casualty outcomes per weekday/weekend flag; saved as 'outcome_per_wkend.png'.
cols = ['is_weekend']

agg_df = proj.visualize_severity(
    accidents,
    cols,
    cols[0],
    plot_title='Accident Casualty Outcomes per Weekday/Weekend',
    savefig=True,
    fig_filename='outcome_per_wkend.png',
)
display(agg_df)



In [None]:
# Casualty outcomes per season; saved as 'outcome_per_ssn.png'.
cols = ['season']

agg_df = proj.visualize_severity(
    accidents,
    cols,
    cols[0],
    plot_title='Accident Casualty Outcomes per Season',
    xlim=(0, 75000),
    savefig=True,
    fig_filename='outcome_per_ssn.png',
)
display(agg_df)



In [None]:
# Casualty outcomes split by the Premier League on/off-season flag;
# saved as 'outcome_pl_ssn.png'.
cols = ['is_offseason']

agg_df = proj.visualize_severity(
    accidents,
    cols,
    use_as_x=cols[0],
    plot_title='Accident Casualty Outcomes per PL On/Off Season',
    savefig=True,
    fig_filename='outcome_pl_ssn.png',
)
display(agg_df)



___PREDICTIVE MODELLING IS CONTINUED IN THE SECOND NOTEBOOK___