<a href="https://colab.research.google.com/github/Amirgh8080/Anlyzer/blob/main/Road_Accident_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Necessary Libraries

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

## Data Loading

In [3]:
# prompt: use kaggle api /content/kaggle.json to download the dataset :datasets/sobhanmoosavi/us-accidents

!pip install kaggle

!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d sobhanmoosavi/us-accidents
!unzip us-accidents.zip


cp: cannot stat '/content/kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents
License(s): CC-BY-NC-SA-4.0
Downloading us-accidents.zip to /content
100% 650M/653M [00:04<00:00, 174MB/s]
100% 653M/653M [00:04<00:00, 147MB/s]
Archive:  us-accidents.zip
  inflating: US_Accidents_March23.csv  


In [None]:
# Read the extracted accidents CSV into a DataFrame and preview its first rows
accidents_csv = '/content/US_Accidents_March23.csv'
df = pd.read_csv(accidents_csv)

df.head()

In [11]:
# Print the frame's index range, column names and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   Severity               int64  
 3   Start_Time             object 
 4   End_Time               object 
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object 
 11  Street                 object 
 12  City                   object 
 13  County                 object 
 14  State                  object 
 15  Zipcode                object 
 16  Country                object 
 17  Timezone               object 
 18  Airport_Code           object 
 19  Weather_Timestamp      object 
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)       

In [12]:
# Summary statistics for the numeric columns. The parentheses are required:
# the bare attribute `df.describe` only displays the bound method's repr.
df.describe()

In [14]:
# Parse the start timestamps; format='mixed' lets each value be parsed with
# its own format instead of forcing a single layout on the whole column.
start_times = pd.to_datetime(df['Start_Time'], format='mixed')
df['Start_Time'] = start_times

# Calendar day of each accident, then one row per day with the number of accidents
df['Date'] = start_times.dt.date
daily_accidents = (
    df.groupby('Date')
    .size()
    .reset_index(name='Accident_Count')
)


In [1]:
# Optional: draw the daily accident counts as a line chart
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(daily_accidents['Date'], daily_accidents['Accident_Count'], label='Daily Accident Count')
ax.set_xlabel('Date')
ax.set_ylabel('Accident Count')
ax.set_title('Daily Accident Count Over Time')
ax.legend()
plt.show()


NameError: name 'plt' is not defined

In [None]:
def create_lag_features(df, lag=7):
    """
    Add lagged copies of 'Accident_Count' as feature columns.

    For each i in 1..lag a column 'lag_i' is added holding the value of
    'Accident_Count' from i rows earlier. Lags are positional (row-based), so
    the frame should already be ordered the way the lags are meant to be read.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Accident_Count' column. It is not modified.
    lag : int, default 7
        Number of lag columns to create.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the lag columns appended, without the rows whose
        target or lag values are missing, and with a fresh 0..n-1 index.
    """
    df_new = df.copy()
    lag_names = []
    for i in range(1, lag + 1):
        name = f'lag_{i}'
        df_new[name] = df_new['Accident_Count'].shift(i)
        lag_names.append(name)

    # Only the target and the lag columns decide whether a row is usable.
    # A bare dropna() would also discard rows that merely have a missing
    # value in some unrelated column of the input frame.
    required = ['Accident_Count', *lag_names]
    return df_new.dropna(subset=required).reset_index(drop=True)

In [15]:
# Ensure the data is sorted by date
daily_accidents['Date'] = pd.to_datetime(daily_accidents['Date'])
daily_accidents = daily_accidents.sort_values('Date').reset_index(drop=True)

# Build the supervised-learning frame: the previous N_LAGS daily counts
# (lag_1 ... lag_N) are the features, the current day's count is the target.
N_LAGS = 7
TRAIN_FRACTION = 0.8  # first 80% of the rows train the models, the rest test them

lagged_accidents = create_lag_features(daily_accidents, lag=N_LAGS)
feature_columns = [f'lag_{i}' for i in range(1, N_LAGS + 1)]
X = lagged_accidents[feature_columns]
y = lagged_accidents['Accident_Count']

# Split by position rather than at random so every test row is later in time
# than every training row.
split_at = int(len(lagged_accidents) * TRAIN_FRACTION)
X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

# Scaled copies of the features for the SVR model. The scaler is fitted on the
# training rows only so that no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Random Forest (RF)

In [None]:
# Random forest regressor with a fixed seed, fitted on the training features
rf_params = dict(
    n_estimators=100,  # Default hyperparameters (tune if needed)
    random_state=42,
)
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

## Support Vector Machine (SVM)

In [None]:
# Support vector regressor, fitted on the scaled training features
svr_params = dict(
    kernel='rbf',  # RBF kernel for non-linear regression
    C=1.0,         # Default hyperparameters (tune if needed)
    epsilon=0.1,
)
svm_model = SVR(**svr_params)
svm_model.fit(X_train_scaled, y_train)

## Evaluate Models

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, is_svm=False):
    """
    Compute regression metrics for a fitted model on the train and test splits.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_train, X_test :
        Feature sets passed unchanged to ``model.predict``. Any preprocessing
        the model needs (for example scaling) must already be applied.
    y_train, y_test :
        True target values for the corresponding feature sets.
    is_svm : bool, default False
        Accepted for backward compatibility only. Predictions are computed
        the same way for every model, so the flag has no effect.

    Returns
    -------
    dict
        Keys 'MAE_<split>', 'MSE_<split>', 'RMSE_<split>' and 'R2_<split>'
        for <split> in ('train', 'test'), in that order.
    """
    metrics = {}
    splits = (
        ('train', X_train, y_train),
        ('test', X_test, y_test),
    )
    for split, features, y_true in splits:
        y_pred = model.predict(features)
        # MSE is computed once and reused for the RMSE entry.
        mse = mean_squared_error(y_true, y_pred)
        metrics[f'MAE_{split}'] = mean_absolute_error(y_true, y_pred)
        metrics[f'MSE_{split}'] = mse
        metrics[f'RMSE_{split}'] = np.sqrt(mse)
        metrics[f'R2_{split}'] = r2_score(y_true, y_pred)
    return metrics

# Score the random forest on the raw features and the SVR on the scaled ones
rf_metrics = evaluate_model(rf_model, X_train, X_test, y_train, y_test)
svm_metrics = evaluate_model(svm_model, X_train_scaled, X_test_scaled, y_train, y_test, is_svm=True)

# Print each model's metrics as a one-row table under its heading
report_sections = (
    ("Random Forest Metrics:", rf_metrics),
    ("\nSupport Vector Machine Metrics:", svm_metrics),
)
for heading, model_metrics in report_sections:
    print(heading)
    print(pd.DataFrame([model_metrics]))

## Results Interpretation