In [0]:
# Databricks notebook: 03_ml_model_optional.py
# Purpose: Train a simple ML model (RandomForestClassifier) to predict accident severity


from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


DATABASE = 'madsc102'
TABLE = 'usaccidents_volume'
FULL_TABLE = f"{DATABASE}.{TABLE}"


df = spark.table(FULL_TABLE)


# Select features for prediction
features = ['start_hour', 'distance_miles', 'duration_minutes', 'is_weekend']


# Handle nulls
for col in features:
 df = df.fillna({col: 0})


# Feature vector
assembler = VectorAssembler(inputCols=features, outputCol='features')
df_vec = assembler.transform(df)


# Prepare label
indexer = StringIndexer(inputCol='severity', outputCol='label')
df_final = indexer.fit(df_vec).transform(df_vec)


# Train/test split
train, test = df_final.randomSplit([0.8, 0.2], seed=42)


rf = RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=50)
model = rf.fit(train)


preds = model.transform(test)


eval_acc = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy')
print('Accuracy:', eval_acc.evaluate(preds))


# Show sample predictions
display(preds.select('severity', 'prediction', 'probability').limit(20))

Accuracy: 0.8049048207453833


severity,prediction,probability
3,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
3,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
3,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
3,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
2,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
3,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
2,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
3,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
3,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"
3,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.7116156048098043"",""0.27007873743692234"",""0.011190503853486501"",""0.00711515389978685""]}"


In [0]:
# Simulated job run
print("Starting ML Job...")
dbutils.notebook.run('/Workspace/Users/aleenaclarethomas20@gmail.com/notebooks/03_ml_model_optional', 60)
print("ML Job completed successfully!")


Starting ML Job...


[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-8102768085781944>, line 3[0m
[1;32m      1[0m [38;5;66;03m# Simulated job run[39;00m
[1;32m      2[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mStarting ML Job...[39m[38;5;124m"[39m)
[0;32m----> 3[0m dbutils[38;5;241m.[39mnotebook[38;5;241m.[39mrun([38;5;124m'[39m[38;5;124m/Workspace/Users/aleenaclarethomas20@gmail.com/notebooks/03_ml_model_optional[39m[38;5;124m'[39m, [38;5;241m60[39m)
[1;32m      4[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mML Job completed successfully![39m[38;5;124m"[39m)

File [0;32m/databricks/python_shell/lib/dbruntime/dbutils.py:327[0m, in [0;36mDBUtils.NotebookHandler.run[0;34m(self, path, timeout_seconds, arguments, *args, **kwargs)[0m
[1;32m    324[0m arguments_scala_map: [38;5;28mdict[39m[[38;5;28mstr[39m, [38