# Synthetic data generation for School Bus Accident Prediction system

In [None]:
# Importing necessary libraries for data manipulation and numerical operations
import numpy as np
import pandas as pd

In [None]:
# Restoring pandas' default display limits for columns and rows.
# reset_option takes only the option name/pattern; the stray second positional
# argument (None) was dropped because it is not part of the documented call.
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')

In [None]:
# Number of synthetic records to generate
N = 5_000

# Seed NumPy's global random generator so repeated runs produce the same data
np.random.seed(42)

In [None]:
# Draw N vehicle speeds from a normal distribution (mean 100, standard deviation 10)
speeds = np.random.normal(loc=100, scale=10, size=N)

In [None]:
# Generating synthetic data for factors affecting accidents.
# The misspelled names `fatique` and `visibilty` are kept as-is because later
# cells reference them.
fatique = np.random.normal(50, 25, N)  # Driver fatigue levels (normal, mean 50, std 25; unbounded, may dip below 0)
# uniform() expects low <= high; the bounds were previously passed reversed,
# which NumPy documents as undefined behaviour.
traffic = np.random.uniform(15, 50, N)  # Traffic density levels in [15, 50)
visibilty = np.random.uniform(200, 600, N)  # Visibility levels in meters, in [200, 600)
brake_events = np.random.poisson(3, N)  # Number of brake events (Poisson, mean 3)
# randint's upper bound is exclusive, so 19 is needed for hour 18 (6 PM) to occur.
hour = np.random.randint(5, 19, N)  # Hour of the day (5 AM to 6 PM, inclusive)
weather = np.random.choice(['Clear', 'Rainy', 'Foggy'], size=N, p=[0.7, 0.2, 0.1])  # Weather conditions

In [None]:
# Adding noise and outliers to the synthetic data
speeds += np.random.normal(0, 5, N)  # Gaussian noise (std 5) on speeds
visibilty += np.random.normal(0, 50, N)  # Gaussian noise (std 50) on visibility

# Introducing outliers in speed data.
# replace=False makes the 50 indices distinct; sampling with replacement could
# pick the same row twice and leave fewer than 50 rows overwritten.
outliers = np.random.choice(N, 50, replace=False)
# One replacement value per selected index, drawn uniformly from [60, 80)
speeds[outliers] = np.random.uniform(60, 80, outliers.size)

In [None]:
# Collect the generated arrays under their column names, then build the DataFrame
column_data = {
    'Speed_kph': speeds,            # vehicle speeds
    'Fatigue_Level': fatique,       # driver fatigue levels
    'Traffic_density': traffic,     # traffic density levels
    'Visibility': visibilty,        # visibility levels
    'Brake_Events': brake_events,   # number of brake events
    'Hour': hour,                   # hour of the day
    'Weather_Condition': weather,   # weather conditions
}
df = pd.DataFrame(column_data)

In [None]:
# Display the DataFrame (a bare expression at the end of a notebook cell renders its value)
df

Unnamed: 0,Speed_kph,Fatigue_Level,Traffic_density,Visibility,Brake_Events,Hour,Weather_Condition
0,103.075573,39.406008,38.756069,449.494378,3,17,Clear
1,90.679946,38.664647,46.394328,374.665478,3,15,Rainy
2,106.698800,5.108921,47.486736,647.561176,4,7,Clear
3,117.274243,41.747745,46.700341,344.989013,2,9,Clear
4,101.875848,68.320727,29.599572,295.634523,3,5,Foggy
...,...,...,...,...,...,...,...
4995,103.952509,82.527552,15.336554,321.640638,1,9,Rainy
4996,105.143488,0.041376,27.243766,223.008389,4,5,Clear
4997,130.988208,32.367082,45.550825,459.276430,5,10,Rainy
4998,113.573906,62.394139,32.284047,310.371224,10,8,Clear


# Feature Engineering

In [None]:
# Derive additional feature columns from the raw columns of df (added in place)

# 1 when the speed exceeds 50 kph, else 0.
# NOTE(review): the generated speeds are centred near 100, so this flag is
# presumably 1 for almost every row; confirm the intended threshold.
df["overspeeding"] = df["Speed_kph"].gt(50).astype(int)

# Interaction terms: traffic density x speed, and speed x fatigue level
df["traffic_pressure"] = df["Traffic_density"].mul(df["Speed_kph"])
df["fatigue"] = df["Speed_kph"].mul(df["Fatigue_Level"])

# Inverse visibility, 1 / (Visibility + 1): lower visibility gives a larger value.
# The column name's spelling is preserved as-is.
df["visibiltiy_risk"] = df["Visibility"].add(1).rdiv(1)

# 1 when the hour falls in 8-10 or 16-18, else 0
df["rush_hour"] = df["Hour"].isin([*range(8, 11), *range(16, 19)]).astype(int)

# 1 when more than 8 brake events were recorded, else 0
df["harsh_brake"] = df["Brake_Events"].gt(8).astype(int)

# Numeric risk level assigned to each weather condition
weather_risk = {"Clear": 1, "Rainy": 3, "Foggy": 2}
df["weather_risk"] = df["Weather_Condition"].map(weather_risk)

In [None]:
# Weighted linear risk score:
#   0.30 * Speed_kph
# + 0.25 * fatigue          (the speed x fatigue interaction column, not Fatigue_Level)
# + 0.20 * Traffic_density
# + 0.15 * Visibility
# + 0.10 * Brake_Events
# NOTE(review): Visibility enters with a positive weight, so better visibility
# raises the score; confirm this is intended.
risk_score = (
    df["Speed_kph"].mul(0.30)
    .add(df["fatigue"].mul(0.25))
    .add(df["Traffic_density"].mul(0.2))
    .add(df["Visibility"].mul(0.15))
    .add(df["Brake_Events"].mul(0.1))
)

In [None]:
# Compressing the risk score into a probability between 0 and 1 using the sigmoid function.
# The sigmoid is written as 0.5 * (1 + tanh(x / 2)), which is algebraically equal
# to 1 / (1 + exp(-x)) but never overflows. The exp() form overflows to inf for
# strongly negative scores and emits a RuntimeWarning (presumably the stray
# warning line left in this notebook's output).
prob = 0.5 * (1.0 + np.tanh(0.5 * risk_score))

# Creating a binary feature for accident risk based on the probability threshold of 0.5
# (equivalent to risk_score > 0).
# NOTE(review): the score is neither centred nor scaled before the sigmoid, so
# the label may be heavily skewed toward 1; check the class balance.
df["accident_risk"] = (prob > 0.5).astype(int)

df

# Saving the DataFrame as CSV text without the index column.
# NOTE(review): the file name has no ".csv" extension; it is left unchanged in
# case other code reads the file by this exact name.
df.to_csv("School_bus_accident_data", index=False)

  result = getattr(ufunc, method)(*inputs, **kwargs)
