In [None]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hijri_converter import convert


In [None]:
# Parse the CSV once; its first column is used as the row index.
car_accident_data = pd.read_csv('final_data.csv', index_col=0)
# `data` is an independent copy, so columns added to one frame never appear on the other.
data = car_accident_data.copy()

# Analysis of date/ time related columns

## creating hijri months for deeper analysis

In [None]:
def to_hijri(date):
    """Return the Hijri calendar date for a Gregorian input as a 'YYYY-MM-DD' string.

    The input is parsed with `pd.to_datetime`; only its year, month and day are
    handed to `hijri_converter`. Month and day are zero-padded to two digits.
    """
    greg = pd.to_datetime(date).date()
    hijri = convert.Gregorian(greg.year, greg.month, greg.day).to_hijri()
    return '{}-{:02d}-{:02d}'.format(hijri.year, hijri.month, hijri.day)


# Hijri date string for every record, derived from the 'datetime' column.
data['hijri'] = data['datetime'].apply(to_hijri)
# Hijri month number (1-12) -> month name, built from the names in calendar order.
hijri_month_names = dict(enumerate(
    [
        'Muharram',
        'Safar',
        'Rabi Al-Awwal',
        'Rabi Al-Thani',
        'Jumada Al-Awwal',
        'Jumada Al-Thani',
        'Rajab',
        "Sha'ban",
        'Ramadan',
        'Shawwal',
        "Dhu Al-Qa'dah",
        'Dhu Al-Hijjah',
    ],
    start=1,
))


def get_hijri_month(hijri_date_str):
    """Return the Hijri month name for a 'YYYY-MM-DD' Hijri date string.

    The month is read from the second '-'-separated field and looked up in
    `hijri_month_names`.
    """
    month_field = hijri_date_str.split('-')[1]
    return hijri_month_names[int(month_field)]

# Month name for each record's Hijri date string.
data['hijri_month_name'] = data['hijri'].map(get_hijri_month)


In [None]:
# Give both frames a parsed 'datetime' column and a full month-name column ('%B').
for frame in (data, car_accident_data):
    frame['datetime'] = pd.to_datetime(frame['datetime'])
    frame['month'] = frame['datetime'].dt.strftime('%B')

## Gregorian Month vs Hijri Month

In [None]:
data['gregorian_month'] = pd.to_datetime(data['datetime']).dt.strftime('%B')

# Calendar order of the months in each system; used to order the bars.
hijri_month_order_corrected = [
    'Muharram', 'Safar', 'Rabi Al-Awwal', 'Rabi Al-Thani',
    'Jumada Al-Awwal', 'Jumada Al-Thani', 'Rajab', "Sha'ban",
    'Ramadan', 'Shawwal', "Dhu Al-Qa'dah", 'Dhu Al-Hijjah',
]
gregorian_month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

sns.set_style("whitegrid")

# One row, two panels: Gregorian months on the left, Hijri months on the right.
fig, (ax_gregorian, ax_hijri) = plt.subplots(1, 2, figsize=(15, 6))

sns.countplot(data=data, y='gregorian_month', order=gregorian_month_order,
              color='skyblue', ax=ax_gregorian)
ax_gregorian.set_title('Accident Count in Each Gregorian Month')

sns.countplot(data=data, y='hijri_month_name', order=hijri_month_order_corrected,
              color='lightcoral', ax=ax_hijri)
ax_hijri.set_title('Accident Count in Each Hijri Month')

fig.tight_layout()
plt.show()


### Heatmap 

In [None]:
data['gregorian_year'] = pd.to_datetime(data['datetime']).dt.year
# The first four characters of the 'YYYY-MM-DD' Hijri string are the year.
data['hijri_year'] = data['hijri'].str.slice(0, 4).astype(int)

# Year x month accident counts. `reindex` puts the columns in calendar order and
# adds an all-zero column for any month with no records, where plain column
# selection would raise KeyError.
pivot_gregorian = data.pivot_table(
    index='gregorian_year', columns='gregorian_month',
    values='datetime', aggfunc='count', fill_value=0,
).reindex(columns=gregorian_month_order, fill_value=0)

pivot_hijri = data.pivot_table(
    index='hijri_year', columns='hijri_month_name',
    values='hijri', aggfunc='count', fill_value=0,
).reindex(columns=hijri_month_order_corrected, fill_value=0)

fig, (ax_gregorian, ax_hijri) = plt.subplots(1, 2, figsize=(18, 12))

sns.heatmap(pivot_gregorian, cmap='Blues', annot=True, fmt='d',
            cbar_kws={'label': 'Accident Count'}, ax=ax_gregorian)
ax_gregorian.set_title('Monthly Accident Count per Year (Gregorian)')

sns.heatmap(pivot_hijri, cmap='Reds', annot=True, fmt='d',
            cbar_kws={'label': 'Accident Count'}, ax=ax_hijri)
ax_hijri.set_title('Monthly Accident Count per Year (Hijri)')

fig.tight_layout()
plt.show()


## Accident Count Comparison: Full Month vs First Five Days

In [None]:
# Split 'YYYY-MM-DD' once; field 1 is the month and field 2 is the day.
hijri_fields = data['hijri'].str.split('-', expand=True)
data['hijri_month'] = hijri_fields[1].astype(int)
data['hijri_day'] = hijri_fields[2].astype(int)

# Preview the parsed columns next to the source string.
data[['hijri', 'hijri_month', 'hijri_day']].head()


In [None]:
# Accident counts per Hijri month over all days of the month.
accidents_by_month_total = data.groupby('hijri_month_name').size()

# The same counts restricted to Hijri days 1 through 5.
first_five_days_data = data.loc[data['hijri_day'].le(5)]
accidents_by_month_first_five_days = first_five_days_data.groupby('hijri_month_name').size()

### Graph comparison: first five days vs. entire Hijri month

In [None]:
fig, ax = plt.subplots(figsize=(14, 7))

# Put both series on the same 12-month index in calendar order. Sorting the index
# would order the months alphabetically, and a month with no accidents in its
# first five days would be missing from one series and shift its bars.
first_five_by_month = accidents_by_month_first_five_days.reindex(
    hijri_month_order_corrected, fill_value=0)
total_by_month = accidents_by_month_total.reindex(
    hijri_month_order_corrected, fill_value=0)

# position=0 / position=1 place the two 0.4-wide bars side by side at each tick.
first_five_by_month.plot(kind='bar', ax=ax, color='skyblue', position=0, width=0.4, label="First Five Days")
total_by_month.plot(kind='bar', ax=ax, color='salmon', position=1, width=0.4, label="Entire Month")

ax.set_title("Number of Accidents in the First Five Days vs. Entire Hijri Month")
ax.set_ylabel("Number of Accidents")
ax.set_xlabel("Hijri Month")
ax.legend()

plt.tight_layout()
plt.show()


## Accident Count for Each Day of the Week

In [None]:
# Pastel palette; one color per weekday bar.
color_palette = sns.color_palette("pastel")
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Number of records per weekday, Monday through Sunday.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='day_of_week', order=weekday_order, palette=color_palette, ax=ax)
ax.set_title('Observation Count for Each Day of the Week')
ax.set_ylabel('Number of Observations')
ax.set_xlabel('Day of the Week')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


## Accident Count for Each hour of the day

In [None]:
# Parse 'datetime' (unparseable values become NaT) and derive the hour of day.
data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')
data['hour'] = data['datetime'].dt.hour

# Number of accidents in each hour of the day.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, x='hour', palette="Spectral", ax=ax)
ax.set_title('Number of Accidents Each Hour')
ax.set_xlabel('Hour')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## Light category vs hour of the day

In [None]:
data['hour'] = pd.to_datetime(data['datetime']).dt.hour

# Rows: hour of day; columns: light category; cells: record count (0 where absent).
hourly_light_counts = (
    data
    .groupby(['hour', 'light'])
    .size()
    .unstack()
    .fillna(0)
)

fig, ax = plt.subplots(figsize=(15, 8))
hourly_light_counts.plot(kind='bar', stacked=True, colormap='Set3', ax=ax)

ax.set_title('Observation Count for Each Light Category by Hour of the Day')
ax.set_ylabel('Number of Observations')
ax.set_xlabel('Hour of the Day')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(title='Light Category')
fig.tight_layout()
plt.show()


## Day of the week grouped by season

In [None]:
# Weekday counts, split into one bar per season.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(
    data=data,
    x='day_of_week',
    hue='season',
    palette='Set3',
    order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ax=ax,
)
ax.set_title('Count of Observations for Each Day of the Week Grouped by Season')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Count of Observations')
ax.legend(title='Season', title_fontsize='13', fontsize='11')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


# Vehicle analysis

## Vehicle color vs time of the day

In [None]:
car_accident_data['datetime'] = pd.to_datetime(car_accident_data['datetime'])
car_accident_data['hour'] = car_accident_data['datetime'].dt.hour

# Six-hour windows; a row whose hour matches none of them is labelled 'Unknown'.
accident_hour = car_accident_data['hour']
conditions = [
    accident_hour.ge(6) & accident_hour.lt(12),   # Morning
    accident_hour.ge(12) & accident_hour.lt(18),  # Afternoon
    accident_hour.ge(18) & accident_hour.lt(24),  # Evening
    accident_hour.ge(0) & accident_hour.lt(6),    # Night
]
choices = ['Morning', 'Afternoon', 'Evening', 'Night']
car_accident_data['time_of_day'] = np.select(conditions, choices, default='Unknown')

# Rows: vehicle color; columns: time of day; cells: accident count.
color_time_grouped = (
    car_accident_data
    .groupby(['vehicle_color', 'time_of_day'])
    .size()
    .unstack()
    .fillna(0)
)
# Share of each color's accidents that falls in each time window (rows sum to 100).
color_totals = color_time_grouped.sum(axis=1)
color_time_percentages = color_time_grouped.div(color_totals, axis=0) * 100

ax_counts = color_time_grouped.plot(kind='bar', stacked=True, figsize=(15, 10), colormap='Set2')
ax_counts.set_title('Frequency of Accidents by Vehicle Color and Time of Day')
ax_counts.set_xlabel('Vehicle Color')
ax_counts.set_ylabel('Frequency of Accidents')
plt.setp(ax_counts.get_xticklabels(), rotation=45)
ax_counts.legend(title='Time of Day')
ax_counts.figure.tight_layout()
plt.show()

fig_pct, ax_pct = plt.subplots(figsize=(10, 8))
sns.heatmap(color_time_percentages, annot=True, cmap='coolwarm', fmt=".0f", ax=ax_pct)
ax_pct.set_title('Percentage of Accidents by Vehicle Color and Time of Day')
ax_pct.set_xlabel('Time of Day')
ax_pct.set_ylabel('Vehicle Color')
plt.show()

print(color_time_percentages)


## Vehicle year analysis

In [None]:
# Split 'vehicle_year' into five equal-frequency (quantile) bins and count each bin.
if 'vehicle_year' not in car_accident_data.columns:
    print("The column 'vehicle_year' does not exist. Please provide the correct column name.")
else:
    car_accident_data['vehicle_year_bin_eq_freq'] = pd.qcut(
        car_accident_data['vehicle_year'], q=5, precision=0)
    vehicle_year_eq_freq_frequency = car_accident_data['vehicle_year_bin_eq_freq'].value_counts()

    print(vehicle_year_eq_freq_frequency)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# NOTE(review): 'vehicle_year_bin_eq_width' is not created in any visible cell;
# presumably it is already a column of final_data.csv. Confirm.
vehicle_year_eq_width_frequency = car_accident_data['vehicle_year_bin_eq_width'].value_counts()
vehicle_year_eq_freq_frequency = car_accident_data['vehicle_year_bin_eq_freq'].value_counts()

# Two stacked panels: equal-width bins on top, equal-frequency bins below.
fig, (ax_eq_width, ax_eq_freq) = plt.subplots(2, 1, figsize=(15, 10))
panels = [
    (ax_eq_width, vehicle_year_eq_width_frequency,
     'Distribution of Vehicle Ages (Equal Width Bins)', 'Vehicle Year Bins (Equal Width)'),
    (ax_eq_freq, vehicle_year_eq_freq_frequency,
     'Distribution of Vehicle Ages (Equal Frequency Bins)', 'Vehicle Year Bins (Equal Frequency)'),
]
for panel_ax, frequency, panel_title, panel_xlabel in panels:
    sns.barplot(x=frequency.index, y=frequency.values, palette='Set2', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Frequency')
    plt.setp(panel_ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()


## Vehicle category and victim counts

In [None]:
# Spread of 'victim_counts' for each vehicle category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=car_accident_data, x='vehicle_category', y='victim_counts', palette='Set2', ax=ax)
ax.set_title('Distribution of Victim Counts Across Vehicle Categories')
ax.set_xlabel('Vehicle Category')
ax.set_ylabel('Victim Counts')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()


The box plot illustrates the distribution of victim counts across different vehicle categories. Here are some observations:

The median victim count appears to be slightly higher for accidents involving buses, which might be expected given the larger passenger capacity of buses.

Accidents involving trucks also show a wider interquartile range and some outliers with higher victim counts, possibly indicating more severe accidents when trucks are involved.

Passenger vehicles, motorcycles, and bicycles have lower median victim counts and narrower interquartile ranges, suggesting these accidents might be less severe on average.

These insights could be indicative of the varying levels of accident severity associated with different vehicle categories.

# Nearby factors and the frequency of accidents

In [None]:
# Number of accidents in each nearby-cluster category, most frequent first.
nearby_factors_frequency = car_accident_data['nearby_cluster_categories'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=nearby_factors_frequency.values,
    y=nearby_factors_frequency.index,
    palette='Set2',
    ax=ax,
)
ax.set_title('Frequency of Accidents by Nearby Factors (Cluster Categories)')
ax.set_xlabel('Frequency')
ax.set_ylabel('Nearby Factors (Cluster Categories)')
plt.show()


# Distribution of Victim Categories by Nearby Cluster Categories

In [None]:
sns.set_style("whitegrid")

fig, ax = plt.subplots(figsize=(14, 10))

# One bar per (nearby cluster category, victim category) pair.
sns.countplot(data=data, y='nearby_cluster_categories', hue='victim_categories',
              palette='Set2', ax=ax)

# The title and labels name the columns actually plotted; the previous text was
# copied from the guilty-violations plot and described different columns.
ax.set_title('Distribution of Victim Categories within Nearby Cluster Categories', fontsize=16)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Nearby Cluster Categories', fontsize=12)
ax.legend(title='Victim Categories')

plt.show()

In [None]:
# Rows: nearby cluster category; columns: victim category; cells: record count.
nearby_victim_grouped = (
    data
    .groupby(['nearby_cluster_categories', 'victim_categories'])
    .size()
    .unstack(fill_value=0)
)

# Each count as a percentage of its row total, so every row sums to 100.
cluster_totals = nearby_victim_grouped.sum(axis=1)
victim_nearby_percentage = nearby_victim_grouped.div(cluster_totals, axis=0) * 100

# Rounded to two decimals, with the cluster category moved back to a regular column.
victim_nearby_percentage_rounded = victim_nearby_percentage.round(2).reset_index()
victim_nearby_percentage_rounded


Pedestrian and Commercial Area Traffic Accidents:

- Pedestrian Victims: 32.95%
- Child Victims: 9.09%

Regulated Intersection Near Residential Area Traffic Accidents:

- Pedestrian Victims: 19.69%
- Child Victims: 13.39%

Residential Houses, Individual Buildings, Unregulated Intersections:

- Pedestrian Victims: 19.64%
- Child Victims: 15.85%

Unregulated Intersection near Residential Houses:

- Pedestrian Victims: 16.36%
- Child Victims: 12.42%

Residential Individual Houses Area & Gas Stations:

- Pedestrian Victims: 15.65%
- Child Victims: 18.70%

Prevalence of Accidents Involving Pedestrians and Children:

Areas categorized as "Pedestrian and Commercial Area Traffic Accidents," "Residential Houses, Individual Buildings, Unregulated Intersections," and similar clusters show a higher percentage of accidents involving pedestrians and children.
Particularly, the "Residential Houses, Individual Buildings, Unregulated Intersections" cluster has a notable percentage of accidents involving pedestrians (19.64%) and children (15.85%).

Gas Station Accidents and Vehicle Victims:


The "Gas Station Accidents" cluster shows a high percentage of vehicle victims (89.89%), which could be indicative of its proximity to highways or major roads.
The design of gas stations' entrances and exits, high-speed traffic, and possibly inadequate signage could be contributing factors to the high occurrence of vehicle accidents in these areas.

Safety Enhancement Recommendations:


For areas with a high percentage of pedestrian and child victims, implementing safety measures such as enhanced pedestrian crossings, traffic calming measures, and education campaigns on road safety could be beneficial.
In areas near gas stations, improved signage, regulated speed limits, and better-designed entrances and exits could potentially reduce the risk of accidents.

Further Analysis:


Delving into additional variables like road conditions, lighting conditions, and exact accident locations could provide a more nuanced understanding of the factors contributing to accidents in these identified clusters.

# Male vs Female

In [None]:
# Number of records per 'guilty_gender' value, most frequent first.
gender_proportions = car_accident_data['guilty_gender'].value_counts()

# Bar colors are assigned by position: first bar blue, second bar pink.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(gender_proportions.index, gender_proportions.values, color=['blue', 'pink'])
ax.set_xlabel('Gender')
ax.set_ylabel('Frequency')
ax.set_title('Accident Frequency by Gender')
plt.show()

# Distribution of Accident Severity by Victim Violations Cluster Categories

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("whitegrid")

# One bar per (victim violations cluster, victim health status) pair.
fig, ax = plt.subplots(figsize=(14, 10))
sns.countplot(
    data=data,
    y='victim_violations_cluster_categories',
    hue='victim_health_status',
    palette='Set2',
    ax=ax,
)

ax.set_title('Distribution of Accident Severity within Victim Violations Cluster Categories', fontsize=16)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Victim Violations Cluster Categories', fontsize=12)
ax.legend(title='Severity')

plt.show()


The plot above shows the distribution of accident severity within various victim violation cluster categories, colored by the health status of victims. Based on the visual, here are some detailed observations:

1. **Severity Distributions**: The plot reveals different patterns of victim health status across various violation categories. This indicates how certain types of violations correlate with more severe health outcomes.

2. **Specific Categories**:
   - **High Frequency of Severe Outcomes**: Some categories, possibly including "Lack of Reflective Elements Violation" if it shows darker colors (indicating severe outcomes like 'Fatal' or 'Severe'), suggest that these violations are more likely to result in serious injuries or fatalities.
   - **Categories with Lower Severity**: Other categories might predominantly show lighter colors (indicating 'Minor' injuries), suggesting that violations in these categories are less likely to result in serious health impacts.

3. **Intervention Points**:
   - **For High-Risk Categories**: If categories like "Lack of Reflective Elements Violation" show a high frequency of severe outcomes, targeted interventions such as awareness campaigns about the importance of reflective gear or enhanced enforcement of existing laws could be beneficial.
   - **For Less Severe Categories**: Educational programs could still be beneficial, focusing on preventing such violations and promoting safer practices.

4. **Policy and Planning**:
   - **Resource Allocation**: The data can help prioritize which violations need more immediate attention in terms of road safety initiatives or law enforcement resource allocation.
   - **Legislative Review**: If certain violations consistently lead to severe outcomes, it might be necessary to review and possibly strengthen the legislation related to these violations.

5. **Community Engagement**:
   - **Awareness Programs**: Particularly for violations that lead to severe outcomes, community awareness programs could be crucial in reducing these incidents. For instance, teaching about the importance of reflective materials at community centers or through school programs.

# Distribution of Guilty Health Status by Guilty Violations Cluster Categories


In [None]:
sns.set_style("whitegrid")

# One bar per (guilty violations cluster, guilty health status) pair.
fig, ax = plt.subplots(figsize=(14, 10))
sns.countplot(
    data=data,
    y='guilty_violations_cluster_categories',
    hue='guilty_health_status',
    palette='Set2',
    ax=ax,
)

ax.set_title('Distribution of Guilty Health Status within Guilty Violations Cluster Categories', fontsize=16)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Guilty Violations Cluster Categories', fontsize=12)

plt.show()


Violation of Safety Belt Usage Rules by Driver:

A notable insight from the graph is the significantly higher count of deceased individuals when the violation involves the non-usage of safety belts by drivers. This finding underscores the critical importance of adhering to safety belt usage regulations, which are designed to protect individuals in the event of a collision. The data strongly supports ongoing public education and enforcement efforts to ensure compliance with safety belt usage rules, as this simple preventive measure can significantly reduce fatalities in road accidents.


Non-Compliance with Traffic Rules by Unqualified Driver without Auto Insurance:

Additionally, there is a concerning number of deceased and injured individuals in accidents where the guilty party is an unqualified driver without auto insurance, who also violated traffic rules. This category of violations encapsulates multiple layers of risk, including lack of proper driving qualifications and non-compliance with traffic regulations. The absence of auto insurance further exacerbates the situation, potentially leaving victims without recourse for medical or repair expenses. This finding suggests a need for stricter enforcement of driving qualifications and insurance regulations, alongside efforts to ensure compliance with traffic rules.

# Distribution of Guilty Driving Experience by Guilty Violations Cluster Categories

In [None]:
# One bar per (guilty violations cluster, driving-experience bin) pair.
fig, ax = plt.subplots(figsize=(14, 10))
sns.countplot(
    data=data,
    y='guilty_violations_cluster_categories',
    hue='guilty_driving_experience_bin',
    palette='Set2',
    ax=ax,
)

ax.set_title('Distribution of Guilty Driving Experience within Guilty Violations Cluster Categories', fontsize=16)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Guilty Violations Cluster Categories', fontsize=12)
# Anchor the legend's upper-right corner to the upper-right corner of the axes.
ax.legend(title='Guilty Driving Experience (years)', bbox_to_anchor=(1, 1), loc='upper right')

plt.show()


Using numerical analysis to clarify and deepen the understanding based on the distribution of guilty driving experience within guilty violations cluster categories, here are detailed insights and recommendations:

1. **Incidence Distribution**:
   - **1.1** Categories such as "Unknown Violations in Car Accidents" and "Accident Due to Control by Person Without Driving License" exhibit the highest counts across multiple experience bins. This suggests a general need for targeted interventions across all experience levels.
   - **1.2** Less experienced drivers (0-5 years) are prominently featured in specific violation categories, indicating potential areas where beginner drivers may struggle.

2. **Targeted Training Programs**:
   - **2.1** Initiate beginner-specific training modules that address the most common violations among drivers with 0-5 years of experience. These programs should focus on practical skills and the importance of adhering to road rules.
   - **2.2** Develop refresher courses for drivers with 10-20 years of experience, as this group also frequently appears in several high-incidence categories, possibly due to overconfidence or outdated driving habits.

### Create a grouped bar chart to analyze 'severity' and 'category' together


In [None]:
# Rows: accident category; columns: severity; cells: record count (0 where absent).
grouped_data = data.groupby(['category', 'severity']).size().unstack().fillna(0)

# Order the categories from most to fewest records overall.
category_totals = grouped_data.sum(axis=1)
grouped_data = grouped_data.loc[category_totals.sort_values(ascending=False).index]

fig, ax = plt.subplots(figsize=(12, 10))
grouped_data.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Distribution of Severity within each Category')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


#### Converting the count data to percentage for a better comparison across categories with different total counts


In [None]:
# Each severity count as a percentage of its category total (rows sum to 100).
category_totals = grouped_data.sum(axis=1)
grouped_percentage = grouped_data.div(category_totals, axis=0) * 100

fig, ax = plt.subplots(figsize=(12, 10))
grouped_percentage.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Percentage Distribution of Severity within each Category')
ax.set_xlabel('Category')
ax.set_ylabel('Percentage (%)')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


In [None]:
# Counts followed by one '<severity> (%)' column per severity level.
count_and_percentage = grouped_data.assign(
    **{col + ' (%)': grouped_percentage[col] for col in grouped_data.columns}
)

count_and_percentage


The table provides both the counts and percentages of accident severity within each accident category. Here's an insightful analysis based on these numbers:

1. **Severity in Major Categories**:
   - **Столкновение**: The largest category with a total of 896 incidents, distributed as 26.0% minor (233 incidents), 25.0% fatal (224 incidents), and 49.0% severe (439 incidents).
   - **Наезд на пешехода**: Comprising 336 incidents with a higher proportion of fatal (32.1% or 108 incidents) and severe (46.7% or 157 incidents) cases compared to minor ones (21.1% or 71 incidents).

2. **High-Risk Categories**:
   - **Съезд с дороги** and **Наезд на стоящее ТС** both show exceptionally high percentages of severe accidents, at 57.1% each (64 and 24 incidents, respectively). These categories, despite having fewer total incidents, indicate a significant risk of severe outcomes.

3. **Fatalities and Severe Injuries**:
   - **Наезд на велосипедиста**: Although it has the fewest incidents (15), it has the highest percentage of severe cases (73.3% or 11 incidents) and a substantial percentage of fatalities (20% or 3 incidents).
   - Categories like **Наезд на препятствие** also have high fatal (32.4% or 66 incidents) and severe (44.6% or 91 incidents) percentages, emphasizing the danger in these scenarios.

4. **Comparative Analysis for Safety Enhancements**:
   - Categories such as **Опрокидывание** and **Наезд на пешехода**, where severe and fatal outcomes are predominant, should be priorities for traffic safety measures, including better pedestrian crossings, clearer signage, and road design improvements.
   - **Столкновение**, being the most common type, requires broad-based interventions such as public awareness campaigns, enforcement of traffic rules, and perhaps technological aids like collision avoidance systems in vehicles.

In [None]:
import pandas as pd

# Parse 'datetime' (unparseable values become NaT) and derive the hour of day.
data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')
data['hour'] = data['datetime'].dt.hour

# Number of accidents per hour of the day.
hourly_accidents = data.groupby('hour').size()

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap is the supported way to look a colormap up by name.
color_map = plt.get_cmap('rainbow')
# One color per bar, evenly spaced across the colormap.
colors = color_map(np.linspace(0, 1, len(hourly_accidents)))

fig, ax = plt.subplots(figsize=(12, 8))
hourly_accidents.plot(kind='bar', ax=ax, color=colors)
ax.set_title('Number of Accidents by Hour of the Day')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Number of Accidents')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


The graph shows the distribution of accidents by hour of the day, providing a clear view of when accidents are most and least frequent. Here are some observations and insights based on this data:

1. **Peak Hours**:
   - There is a noticeable increase in the number of accidents during certain hours, particularly around 18:00 (6 PM). This suggests a peak in accident occurrences likely linked to evening rush hour when traffic density increases as people commute home from work or school.

2. **Morning Rush Hour**:
   - Another peak occurs around 8:00 AM, coinciding with morning rush hour. This period typically sees increased traffic flow as people begin their workday, leading to a higher likelihood of accidents.

3. **Late Night to Early Morning**:
   - The number of accidents significantly drops from late evening (after 22:00 or 10 PM) until early morning (around 6:00 AM). This reduction is likely due to decreased traffic volume during these hours. However, accidents that occur during these times might be more severe due to factors like reduced visibility and possibly higher speeds.

4. **Midday Activity**:
   - A moderate level of accidents persists from late morning through the afternoon, peaking slightly around noon. This could be related to increased vehicle and pedestrian activity during lunch hours and early dismissals from schools or shifts.

5. **Implications for Road Safety**:
   - **5.1** Traffic management and road safety measures could be intensified during the identified peak times to reduce accident rates. For instance, deploying more traffic officers and enhancing traffic signal effectiveness during these hours could help manage the flow and reduce collisions.
   - **5.2** Public awareness campaigns could focus on promoting safe driving practices during high-risk hours, especially emphasizing the evening rush hour.
   - **5.3** Consideration could be given to adjusting work or school start and end times in a staggered manner to help disperse peak hour traffic, potentially reducing the congestion that contributes to accidents.

These insights can help stakeholders in traffic and city planning to design and implement more effective measures tailored to the specific dynamics of road usage throughout the day, thereby enhancing overall road safety.

# Analyzing Relationship between Weather and Road Conditions

In [None]:
# Rows: weather; columns: road-conditions cluster; cells: record count.
weather_road_cross_tab = pd.crosstab(data['weather'], data['road_conditions_cluster_categories'])

# Order the weather rows from most to fewest records.
weather_totals = weather_road_cross_tab.sum(axis=1)
weather_road_cross_tab = weather_road_cross_tab.loc[weather_totals.sort_values(ascending=False).index]

fig, ax = plt.subplots(figsize=(15, 10))
weather_road_cross_tab.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Relationship between Weather and Road Conditions')
ax.set_xlabel('Weather Conditions')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# The same table with each count as a percentage of its weather row total.
weather_road_percentage = weather_road_cross_tab.div(weather_road_cross_tab.sum(axis=1), axis=0) * 100

fig, ax = plt.subplots(figsize=(15, 10))
weather_road_percentage.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Percentage Relationship between Weather and Road Conditions')
ax.set_xlabel('Weather Conditions')
ax.set_ylabel('Percentage (%)')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


In [None]:
# Counts and percentages side by side under a two-level column header.
weather_road_analysis = pd.concat(
    {'Count': weather_road_cross_tab, 'Percentage': weather_road_percentage},
    axis=1,
)
weather_road_analysis


# Analyzing Relationship between Weather, Road Conditions, and Light

In [None]:

# Fix the category orders once so every facet shares the same x positions and
# the same color for each road-condition level.
weather_order = data['weather'].value_counts().index.tolist()
road_condition_order = data['road_conditions_cluster_categories'].value_counts().index.tolist()

# One facet per light condition, stacked in a single column.
g = sns.FacetGrid(data, col="light", col_wrap=1, height=8, aspect=2, sharex=False)
# map_dataframe passes each facet its own subset of `data`, so x and hue are read
# from the same rows. Passing the full-data Series as `hue` to `g.map` did not
# give the facet's subset, and without `hue_order` colors could differ per facet.
g.map_dataframe(
    sns.countplot,
    x='weather',
    hue='road_conditions_cluster_categories',
    order=weather_order,
    hue_order=road_condition_order,
    palette='Set2',
)
g.add_legend(title='Road Conditions', bbox_to_anchor=(0.5, 1.2), loc='upper center', ncol=3)
g.set_xticklabels(rotation=45, ha='right')
g.set_axis_labels("Weather Conditions", "Count")
g.fig.subplots_adjust(top=0.85, hspace=0.3)
g.fig.suptitle('Relationship between Weather, Road Conditions, and Light', fontsize=16)

# Increase the figure size
g.fig.set_figwidth(20)
g.fig.set_figheight(30)

plt.show()

Prevalence of Accidents in Different Conditions:

It's evident that most accidents occur during Bright Daylight and Clear Weather conditions. The prevalence of accidents in these conditions could be due to higher traffic volumes during the day and in favorable weather.
Road Conditions:

Across all light conditions, the Normal road condition appears to be the most common during accidents. This could be indicative of other factors like human error or mechanical failures being more significant contributors to accidents than adverse road conditions.
Adverse Weather Conditions:

There's a noticeable number of accidents occurring under Raining weather conditions, particularly in Darkness with street lights on and Twilight or Dawn. This suggests that visibility and wet roads could be contributing factors to accidents during adverse weather conditions.
Impact of Light Conditions:

The change in the distribution of accidents across different light conditions, especially from Bright Daylight to Darkness with street lights on or Twilight or Dawn, indicates that light conditions could play a crucial role in road safety.
Correlation between Weather and Road Conditions:

The interaction between weather and road conditions in the occurrence of accidents is nuanced. For instance, adverse road conditions don't seem to correspond with a significant increase in accidents, suggesting that other factors might be at play.
Recommendations for Further Analysis:

The findings might warrant a deeper investigation into other contributing factors to accidents like traffic volumes, human factors (e.g., distracted or impaired driving), vehicle conditions, and roadway design.
Data-Driven Policy Recommendations:

Policymakers and urban planners might use this data to implement safety measures aimed at reducing accidents, especially during adverse weather or light conditions. For instance, improving street lighting, enhancing road markings, or launching weather-aware driving campaigns.

# Plotting correlations using different techniques

In [None]:
# Object-dtype columns left out of the pairwise heatmaps.
columns_to_exclude = ['region', 'point', 'address', 'parent_region', 'geometry', 'nearby', 'road_conditions','victim_violations', 'guilty_violations'
                      , 'hijri', 'vehicle_model', 'vehicle_brand']

categorical_columns = data.select_dtypes(include=['object']).columns

# errors='ignore' skips excluded names that are absent from the object-dtype
# columns instead of raising KeyError.
categorical_columns = categorical_columns.drop(columns_to_exclude, errors='ignore')


In [None]:
# Raw-count contingency heatmap for every unordered pair of categorical columns.
for position, row_column in enumerate(categorical_columns):
    for col_column in categorical_columns[position + 1:]:
        contingency_table = pd.crosstab(data[row_column], data[col_column])

        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(contingency_table, annot=True, fmt='d', cmap='coolwarm', ax=ax)
        ax.set_title(f'Heatmap of {row_column} vs {col_column}')
        plt.show()


In [None]:
# Row-normalized (percentage) contingency heatmap for every unordered pair of
# categorical columns; each row of a table sums to 100.
for position, row_column in enumerate(categorical_columns):
    for col_column in categorical_columns[position + 1:]:
        contingency_table = pd.crosstab(data[row_column], data[col_column], normalize='index') * 100

        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(contingency_table, annot=True, fmt='.2f', cmap='coolwarm',
                    cbar_kws={'label': 'Percentage (%)'}, ax=ax)
        ax.set_title(f'Percentage Heatmap of {row_column} vs {col_column}')
        ax.set_xlabel(col_column)
        ax.set_ylabel(row_column)
        plt.show()


## Generate custom heatmaps

In [None]:
# Hand-picked (row column, column column) pairs to cross-tabulate.
updated_combinations = [
    ('tags', 'severity'),
    ('tags', 'nearby_cluster_categories'),
    ('tags', 'guilty_violations_cluster_categories'),
    ('light', 'category'),
    ('light', 'weather'),
    ('light', 'dead_count'),
    ('light', 'injured_count'),
    ('light', 'victim_counts'),
    ('weather', 'severity'),
    ('weather', 'victim_counts'),
    ('weather', 'victim_categories'),
    ('weather', 'victim_health_status'),
    ('category', 'severity'),
    ('category', 'injured_count'),
    ('category', 'victim_counts'),
    ('category', 'victim_categories'),
    ('category', 'victim_violations_cluster_categories'),
    ('category', 'road_conditions_cluster_categories'),
    ('category', 'nearby_cluster_categories'),
    ('category', 'guilty_violations_cluster_categories'),
    ('severity', 'guilty_violations_cluster_categories'),
    ('dead_count', 'nearby_cluster_categories'),
    ('victim_role', 'road_conditions_cluster_categories')
]

# Build every contingency table first, keyed by its column pair.
updated_heatmaps_dict = {
    (column1, column2): pd.crosstab(data[column1], data[column2])
    for column1, column2 in updated_combinations
}

# Then draw one annotated count heatmap per table.
for (column1, column2), contingency_table in updated_heatmaps_dict.items():
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(contingency_table, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
    ax.set_title(f'Heatmap of {column1} vs {column2}')
    ax.set_xlabel(column2)
    ax.set_ylabel(column1)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
