In [None]:
# Location of the CSV file that is read into a Spark DataFrame further down.
# Written as a raw string so backslashes (if any) are taken literally.
path = r"/home/mb/college/bda/BDA/datasets/TrafficTwoMonth.csv"

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, hour, minute, second
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import (
    StringIndexer, VectorAssembler, StandardScaler
)

# Create (or reuse) the Spark session for this job.
spark = (
    SparkSession.builder
    .appName("Traffic Time Series")
    .getOrCreate()
)
# Use the legacy time parser policy for the timestamp parsing done below.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Load the CSV with a header row and let Spark infer the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)

# Parse the "Time" column into a timestamp, then split it into separate
# hour / minute / second columns.
extracted_time = col("ExtractedTime")
df = df.withColumn("ExtractedTime", to_timestamp(col("Time"), "hh:mm:ss a"))
df = df.withColumn("Hour", hour(extracted_time))
df = df.withColumn("Minute", minute(extracted_time))
df = df.withColumn("Second", second(extracted_time))


# Pipeline stages, in execution order:
#   1. index the two string columns (day name -> "Day", target -> "label"),
#   2. assemble the numeric columns into a single feature vector,
#   3. scale that vector,
#   4. classify with logistic regression.
day_indexer = StringIndexer(inputCol="Day of the week", outputCol="Day")
label_indexer = StringIndexer(inputCol="Traffic Situation", outputCol="label")

feature_cols = [
    "Day",
    "CarCount",
    "BikeCount",
    "BusCount",
    "TruckCount",
    "Total",
    "Hour",
    "Minute",
    "Second",
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
# Each stage consumes the output column of the stage before it.
scaler = StandardScaler(
    inputCol=assembler.getOutputCol(),
    outputCol="scaled_features",
)
lr = LogisticRegression(
    featuresCol=scaler.getOutputCol(),
    labelCol=label_indexer.getOutputCol(),
)

stages = [day_indexer, label_indexer, assembler, scaler, lr]
pipeline = Pipeline(stages=stages)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

try:
    # Hold out 20% of the rows for evaluation. Fitting and scoring on the same
    # DataFrame reports training accuracy, which overstates how well the
    # classifier generalises. The fixed seed makes the split reproducible.
    train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

    # Fit every stage (indexers, scaler, classifier) on the training rows only.
    # NOTE(review): the indexers are fitted on the training split, so a
    # category that appears only in the held-out split would be rejected at
    # transform time — confirm every category is present in the training data.
    fitted_pipeline = pipeline.fit(train_df)

    # Predictions for rows the pipeline has not seen during fitting.
    predictions = fitted_pipeline.transform(test_df)
    predictions.show(truncate=False)

    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="accuracy",
    )
    accuracy = evaluator.evaluate(predictions)
    print(f"Accuracy: {accuracy:.2f}")
finally:
    # Always release the Spark session, even if fitting or evaluation fails.
    spark.stop()


+-----------+----+---------------+--------+---------+--------+----------+-----+-----------------+-------------------+----+------+------+---+-----+-------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+----------+
|Time       |Date|Day of the week|CarCount|BikeCount|BusCount|TruckCount|Total|Traffic Situation|ExtractedTime      |Hour|Minute|Second|Day|label|features                                   |scaled_features                                                                                                                                                 |rawPrediction                                                                 |probability                              