# Import Necessary Libraries

In [None]:
# Importing libraries for data processing, model training, evaluation, and visualization
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load Dataset Files

In [144]:
# Load the per-session Polidriving CSV exports found under the "alonso" folder.
# Every file follows the layout <ALONSO_DIR>/<session>/<session>_<suffix>.csv,
# so the shared directory prefix is kept in one place instead of being repeated.
# The path is relative to the notebook's working directory.
ALONSO_DIR = 'Human-Driving and AV IoT Data/Human Driver IoT Datasets/Polidriving Dataset/alonso'


def load_alonso_csv(session, suffix):
    """Read one CSV export of a recording session into a DataFrame.

    Parameters
    ----------
    session : str
        Name of the session folder; it is also the file-name prefix.
    suffix : str
        File-name suffix that identifies the export (e.g. 'lab', 'nor').

    Returns
    -------
    pandas.DataFrame
        Parsed contents of ``<ALONSO_DIR>/<session>/<session>_<suffix>.csv``.
    """
    return pd.read_csv(f'{ALONSO_DIR}/{session}/{session}_{suffix}.csv')


# Session 20240208_120000: four exports are loaded.
alonso_0208_lab = load_alonso_csv('20240208_120000', 'lab')
alonso_0208_nor = load_alonso_csv('20240208_120000', 'nor')
alonso_0208_ovs = load_alonso_csv('20240208_120000', 'ovs')
alonso_0208_res = load_alonso_csv('20240208_120000', 'res')

# Session 23241201_290300: only the 'lab' and 'nor' exports are loaded here.
alonso_1201_lab = load_alonso_csv('23241201_290300', 'lab')
alonso_1201_nor = load_alonso_csv('23241201_290300', 'nor')

# Concatenate and Preprocess the Data

In [169]:
# Build the modelling frame in a single chain:
#   1. stack the six loaded frames row-wise,
#   2. drop rows that are exact duplicates,
#   3. discard the 'Unnamed: 0' column,
#   4. reverse the risk_level scale so that higher values indicate greater risk.
# NOTE(review): duplicates are detected while 'Unnamed: 0' is still present, so
# rows that differ only in that column are kept — confirm this is intended.
# NOTE(review): `5 - risk_level` presumably assumes the raw labels run 1..4 —
# verify against the dataset documentation.
full_df = (
    pd.concat([
        alonso_0208_lab,
        alonso_0208_nor,
        alonso_0208_ovs,
        alonso_0208_res,
        alonso_1201_lab,
        alonso_1201_nor,
    ])
    .drop_duplicates()
    .drop(columns=['Unnamed: 0'])
    .assign(risk_level=lambda frame: 5 - frame['risk_level'])
)

# Split and Scale the Data

In [179]:
# Target is the (already reversed) risk level; every other column is a feature.
y = full_df['risk_level']
X = full_df.drop(columns=['risk_level'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler learns its statistics from the training
# rows only and is then applied unchanged to both partitions, so no information
# from the test set leaks into the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Multinomial Logistic Regression Model

In [None]:
# Train a logistic regression classifier for the risk level.
# `class_weight='balanced'` re-weights the classes inversely to their frequency.
# The `multi_class` argument is intentionally not passed: it is deprecated in
# recent scikit-learn releases, and with the 'lbfgs' solver a multinomial loss
# is already what gets used when the target has more than two classes.
logistic_model = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')
logistic_model.fit(X_train_scaled, y_train)

# Per-class probabilities for each test row; columns follow logistic_model.classes_.
y_pred_logistic = logistic_model.predict_proba(X_test_scaled)

# One score per class, from lowest to highest risk, used to collapse the
# probability vector into a single expected-risk value in [0, 1].
risk_values = np.array([0, 0.33, 0.67, 1])

# The weighting only makes sense if the model learned exactly one class per
# score; fail with a clear message instead of an opaque shape error otherwise.
if y_pred_logistic.shape[1] != risk_values.size:
    raise ValueError(
        f"Expected {risk_values.size} risk classes but the model was trained on "
        f"{y_pred_logistic.shape[1]}: {list(logistic_model.classes_)}"
    )

# Probability-weighted risk score for every test row.
y_pred_weighted = np.dot(y_pred_logistic, risk_values)

# Feature Importance Analysis

In [152]:
# Coefficient matrix of the fitted logistic regression model.
coefficients = logistic_model.coef_

# Importance of a feature = mean of the absolute coefficient values down each
# column of the matrix (axis 0).
importance = np.mean(np.absolute(coefficients), axis=0)

# Pair each importance value with its feature name ...
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importance})
# ... and rank from most to least influential.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Show the ranking.
print(importance_df)

               Feature  Importance
9           visibility   16.508738
5   engine_temperature    6.964229
0     observation_hour    6.254969
7           heart_rate    5.824795
12        design_speed    4.489696
1                speed    2.377940
8      current_weather    2.240212
13      accidents_time    1.511282
10       precipitation    1.340412
2                  rpm    1.078075
4    throttle_position    0.489985
11    accidents_onsite    0.381964
3         acceleration    0.325192
6    engine_load_value    0.144017
