In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

In [None]:
# Load the accident dataset from the project's CSV; later cells all read `df`.
df = pd.read_csv(filepath_or_buffer="dataset/cleaned_us_accident_data.csv")

In [None]:
# Clustering input: only the two accident start-coordinate columns.
location_features = ['start_lat', 'start_lng']
X_location = df.loc[:, location_features]

In [None]:
# Elbow search: fit one K-Means model per k in 1..10 on the coordinates and
# keep each fitted model's `inertia_` for the elbow plot.
k_values = range(1, 11)
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_location).inertia_
    for k in k_values
]

In [None]:
# Elbow curve: inertia plotted against the number of clusters that was fitted.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, inertia, marker='o')
ax.set(
    xlabel='Number of Clusters',
    ylabel='Inertia',
    title='Elbow Method for Optimal K',
)
plt.show()

In [None]:
# Final model with 4 clusters; the label assigned to each row during fitting
# is stored on the frame as the `cluster` column.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(X_location)
df['cluster'] = kmeans.labels_

In [None]:
# Davies-Bouldin index of the clustering, computed on the same coordinates the
# model was fitted on (`davies_bouldin_score` is imported in the notebook's
# import cell). Lower is better; as a rule of thumb, a value below 1 is
# usually read as a good clustering.
db_index = davies_bouldin_score(X_location, df['cluster'])
print("Davies-Bouldin Index:", db_index)

In [None]:
# Map-style view of the clustering: one point per accident at its start
# longitude/latitude, coloured by the `cluster` label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='start_lng',
    y='start_lat',
    hue='cluster',
    palette='viridis',
    ax=ax,
)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('High Risk Area Clustering')
plt.show()

In [None]:
# Severity distribution per cluster: one box per `cluster` label.
# seaborn and matplotlib are already imported in the notebook's first cell, so
# this cell no longer re-imports them.
plt.figure(figsize=(8, 5))
sns.boxplot(data=df, x='cluster', y='severity')
plt.xlabel('Cluster')
plt.ylabel('Accident Severity')
plt.title('Accident Severity Distribution Across Clusters')
plt.show()


In [None]:
# Compare average weather conditions across clusters: the mean of each weather
# column within each `cluster` label.
weather_features = ['temperature(f)', 'humidity(%)', 'pressure(in)', 'visibility(mi)', 'wind_speed(mph)']
weather_analysis = df.groupby('cluster')[weather_features].mean()

# Show the cluster-by-feature means as an annotated heatmap.
# fmt=".2f" replaces seaborn's default ".2g" annotation format, which keeps
# only two significant digits and therefore hides differences between
# clusters. pandas, matplotlib and seaborn come from the notebook's first cell.
plt.figure(figsize=(10, 6))
sns.heatmap(weather_analysis, annot=True, fmt=".2f", cmap="coolwarm")
plt.title('Average Weather Conditions by Cluster')
plt.show()


In [None]:
# Hour of day taken from each accident's start timestamp.
# NOTE(review): assumes every `start_time` value parses with pd.to_datetime's
# default inference - confirm against the CSV.
df['hour'] = pd.to_datetime(df['start_time']).dt.hour

# Plot accident frequency by hour for each cluster (stacked by cluster).
# discrete=True gives every integer hour its own unit-width bar centred on that
# hour; a fixed bins=24 splits the observed min..max range into 24 equal bins
# whose edges do not fall on whole hours.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='hour', hue='cluster', multiple='stack', discrete=True)
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Accidents')
plt.title('Accident Frequency by Hour for Each Cluster')
plt.show()


In [None]:
# Find top accident-prone roads per cluster.
# `count()` tallies the rows with a non-null `severity` for each
# (cluster, street) pair; the result column is named `accident_count` so the
# printed table is not mistaken for a severity value.
top_roads = (
    df.groupby(['cluster', 'street'])['severity']
    .count()
    .reset_index(name='accident_count')
    .sort_values(['cluster', 'accident_count'], ascending=[True, False])
)

# Show top 5 roads per cluster, iterating clusters in ascending label order
# (`unique()` alone yields them in order of first appearance in `df`).
for cluster in sorted(df['cluster'].unique()):
    print(f"Cluster {cluster} - Top 5 High-Risk Roads:")
    print(top_roads[top_roads['cluster'] == cluster].head(5))
    print("\n")
