In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Setting seaborn style
sns.set(style="whitegrid")

# Creating output directory for plots
if not os.path.exists('eda_plots'):
    os.makedirs('eda_plots')

# Loading the dataset
print("Loading dataset...")
df = pd.read_csv('accident.csv')

# Creating binary target variable: Is_Severe (1 if Number_of_Deaths > 0, else 0)
df['Is_Severe'] = (df['Number_of_Deaths'] > 0).astype(int)

# 1. Dataset Overview
print("\n=== Dataset Overview ===")
print("Shape:", df.shape)
print("\nData Types:\n", df.dtypes)
print("\nMissing Values:\n", df.isna().sum())
print("\nDuplicate Rows:", df.duplicated().sum())

# Checking unique values in categorical columns
print("\nUnique Values in Categorical Columns:")
for col in ['State', 'Reason', 'Road_Type', 'Weather_Conditions', 'Road_Conditions', 'Alcohol_Involved', 'Driver_Fatigue']:
    print(f"{col}: {df[col].nunique()} unique values")

# Summary statistics
print("\nSummary Statistics for Numeric Columns:\n", df.describe())

# 2. Univariate Analysis
# Numeric Variables
print("\n=== Univariate Analysis ===")
plt.figure(figsize=(12, 4))

# Number of Deaths
plt.subplot(1, 3, 1)
sns.histplot(df['Number_of_Deaths'], bins=10, kde=True)
plt.title('Distribution of Number of Deaths')
plt.xlabel('Number of Deaths')
plt.ylabel('Count')

# Number of Injuries
plt.subplot(1, 3, 2)
sns.histplot(df['Number_of_Injuries'], bins=10, kde=True)
plt.title('Distribution of Number of Injuries')
plt.xlabel('Number of Injuries')
plt.ylabel('Count')

# Speed Limit
plt.subplot(1, 3, 3)
sns.histplot(df['Speed_Limit'], bins=10, kde=True)
plt.title('Distribution of Speed Limit')
plt.xlabel('Speed Limit (km/h)')
plt.ylabel('Count')

plt.tight_layout()
plt.savefig('eda_plots/numeric_distributions.png')
plt.close()

# Categorical Variables
categorical_cols = ['State', 'Reason', 'Road_Type', 'Weather_Conditions', 'Road_Conditions', 'Alcohol_Involved', 'Driver_Fatigue']
for col in categorical_cols:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=col, order=df[col].value_counts().index)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'eda_plots/{col}_distribution.png')
    plt.close()

# 3. Bivariate Analysis
print("\n=== Bivariate Analysis ===")
# Speed Limit vs. Is_Severe
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='Is_Severe', y='Speed_Limit')
plt.title('Speed Limit vs. Accident Severity')
plt.xlabel('Is Severe (1 = Yes, 0 = No)')
plt.ylabel('Speed Limit (km/h)')
plt.savefig('eda_plots/speed_limit_vs_severity.png')
plt.close()

# Categorical Variables vs. Is_Severe
for col in categorical_cols:
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x=col, y='Is_Severe', order=df[col].value_counts().index)
    plt.title(f'Proportion of Severe Accidents by {col}')
    plt.xlabel(col)
    plt.ylabel('Proportion Severe')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'eda_plots/{col}_vs_severity.png')
    plt.close()

# Correlation Heatmap for Numeric Variables
plt.figure(figsize=(8, 6))
# Ensure only numeric columns are used for correlation
numeric_cols = ['Number_of_Deaths', 'Number_of_Injuries', 'Speed_Limit', 'Is_Severe']
# Remove any non-numeric columns like Road_Type
numeric_df = df[numeric_cols].select_dtypes(include=np.number)
corr = numeric_df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Numeric Variables')
plt.savefig('eda_plots/correlation_heatmap.png')
plt.close()

# 4. Insights
print("\n=== Key Insights ===")
# Most common reasons for accidents
top_reasons = df['Reason'].value_counts().head(3)
print("Top 3 Reasons for Accidents:\n", top_reasons)

# States with highest severe accidents
severe_by_state = df.groupby('State')['Is_Severe'].mean().sort_values(ascending=False).head(5)
print("\nTop 5 States by Proportion of Severe Accidents:\n", severe_by_state)

# Impact of Alcohol and Fatigue
alcohol_severe = df.groupby('Alcohol_Involved')['Is_Severe'].mean()
fatigue_severe = df.groupby('Driver_Fatigue')['Is_Severe'].mean()
print("\nProportion of Severe Accidents by Alcohol Involvement:\n", alcohol_severe)
print("\nProportion of Severe Accidents by Driver Fatigue:\n", fatigue_severe)

# Weather conditions and severity
weather_severe = df.groupby('Weather_Conditions')['Is_Severe'].mean().sort_values(ascending=False)
print("\nProportion of Severe Accidents by Weather Conditions:\n", weather_severe)

# Road conditions and severity
road_cond_severe = df.groupby('Road_Conditions')['Is_Severe'].mean().sort_values(ascending=False)
print("\nProportion of Severe Accidents by Road Conditions:\n", road_cond_severe)

print("\nEDA plots saved in 'eda_plots' directory.")

Loading dataset...

=== Dataset Overview ===
Shape: (300, 14)

Data Types:
 Accident_ID            object
State                  object
Date                   object
Time                   object
Reason                 object
Number_of_Deaths        int64
Number_of_Injuries     object
Road_Type              object
Weather_Conditions     object
Alcohol_Involved       object
Driver_Fatigue         object
Road_Conditions        object
Speed_Limit           float64
Is_Severe               int64
dtype: object

Missing Values:
 Accident_ID           0
State                 0
Date                  0
Time                  0
Reason                0
Number_of_Deaths      0
Number_of_Injuries    0
Road_Type             0
Weather_Conditions    0
Alcohol_Involved      0
Driver_Fatigue        0
Road_Conditions       0
Speed_Limit           1
Is_Severe             0
dtype: int64

Duplicate Rows: 0

Unique Values in Categorical Columns:
State: 28 unique values
Reason: 8 unique values
Road_Type: 3 uniq

In [None]:
#training

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# Load data
df = pd.read_csv("accident.csv")

# Drop unnecessary columns
df.drop(columns=['Accident_ID', 'Date', 'Time'], inplace=True)

# Ensure numeric columns are correct
df['Number_of_Deaths'] = pd.to_numeric(df['Number_of_Deaths'], errors='coerce')
df['Number_of_Injuries'] = pd.to_numeric(df['Number_of_Injuries'], errors='coerce')
df['Speed_Limit'] = pd.to_numeric(df['Speed_Limit'], errors='coerce')

# Drop rows with missing target
df = df.dropna(subset=['Reason'])

# Define features and target
X = df.drop("Reason", axis=1)
y = df["Reason"]

# Define numerical and categorical features
numerical_features = ["Number_of_Deaths", "Number_of_Injuries", "Speed_Limit"]
categorical_features = [col for col in X.columns if col not in numerical_features]

# Preprocessing
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features)
])

# Full pipeline
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(multi_class='multinomial', max_iter=1000))
])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
model_pipeline.fit(X_train, y_train)

# Save the model
joblib.dump(model_pipeline, "cause_model.pkl")
print("✅ Accident cause prediction model saved as 'cause_model.pkl'")


✅ Accident cause prediction model saved as 'cause_model.pkl'




In [None]:
#testing
!pip install streamlit
import streamlit as st
import pandas as pd
import joblib
import datetime

# Load the full pipeline (preprocessing + model)
model = joblib.load('cause_model.pkl')

st.set_page_config(page_title="Accident Cause Predictor", layout="centered")
st.title("🚗 Traffic Accident Cause Predictor")
st.markdown("Enter the accident details to predict the likely cause.")

# User Inputs
state = st.selectbox("State", ["State1", "State2", "State3"])  # Replace with real states
road_type = st.selectbox("Road Type", ["Highway", "Urban", "Rural"])
weather = st.selectbox("Weather Conditions", ["Clear", "Rainy", "Foggy", "Snowy"])
alcohol = st.selectbox("Alcohol Involved", ["Yes", "No"])
fatigue = st.selectbox("Driver Fatigue", ["Yes", "No"])
road_cond = st.selectbox("Road Conditions", ["Dry", "Wet", "Icy", "Gravel"])
speed_limit = st.number_input("Speed Limit (km/h)", min_value=20, max_value=150, step=5)
number_of_deaths = st.number_input("Number of Deaths", min_value=0, step=1)
number_of_injuries = st.number_input("Number of Injuries", min_value=0, step=1)

# Prepare input data (no need to encode/scale, the pipeline will handle it)
input_data = pd.DataFrame([{
    'State': state,
    'Road_Type': road_type,
    'Weather_Conditions': weather,
    'Alcohol_Involved': alcohol,
    'Driver_Fatigue': fatigue,
    'Road_Conditions': road_cond,
    'Speed_Limit': speed_limit,
    'Number_of_Deaths': number_of_deaths,
    'Number_of_Injuries': number_of_injuries
}])

if st.button("Predict Reason"):
    try:
        prediction = model.predict(input_data)[0]
        st.success(f"🎯 Predicted Accident Cause: **{prediction}**")
    except Exception as e:
        st.error(f"Prediction failed: {e}")


Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m103.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hIns

2025-05-07 11:10:08.497 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-05-07 11:10:08.508 Session state does not function when running a script without `streamlit run`
