In [0]:
# Databricks notebook: 03_ml_model_optional.py (Scikit-Learn Version)
# Works in Databricks CE / Serverless

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from pyspark.sql import functions as F

# Load Spark table
df_full = spark.table("madsc102.usaccidents_volume")

# Convert a small sample to Pandas
pdf = df_full.limit(5000).toPandas()

# Features to use
features = ["start_hour", "distance_miles", "duration_minutes", "is_weekend"]

X = pdf[features].fillna(0)
y = pdf["severity"].astype(int)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression model
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)

# Predictions
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Convert predictions to Spark DataFrame
result_pdf = pd.DataFrame({
    "actual_severity": y_test.iloc[:20].values,
    "predicted_severity": y_pred[:20]
})
result_sdf = spark.createDataFrame(result_pdf)

display(result_sdf)


Accuracy: 0.974

Classification Report:
               precision    recall  f1-score   support

           2       0.97      1.00      0.99       974
           4       0.00      0.00      0.00        26

    accuracy                           0.97      1000
   macro avg       0.49      0.50      0.49      1000
weighted avg       0.95      0.97      0.96      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


actual_severity,predicted_severity
2,2
2,2
2,2
2,2
2,2
2,2
2,2
2,2
2,2
2,2
