In [3]:
# INIT & LOAD
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Start (or reuse) the Spark session used by this notebook.
spark = SparkSession.builder.appName("PostEngagementClassifier").getOrCreate()

# Load the posts CSV: first row is the header, column types are inferred.
data_path = "/content/social_media_posts.csv"
df = spark.read.option("header", True).option("inferschema", True).csv(data_path)

# --- Label creation ---
# The median of `likes` is the engagement threshold. A relative error of 0.0
# asks Spark for the exact quantile rather than an approximation.
likes_quantiles = df.approxQuantile("likes", [0.5], 0.0)
if not likes_quantiles:
    # Fail with a clear message instead of a bare IndexError on `[0]`.
    raise ValueError(
        f"Cannot compute the median of 'likes' from {data_path}: "
        "no quantile was returned (is the dataset empty?)"
    )
median_likes = likes_quantiles[0]

# Binary label: 1 = high engagement (likes >= median), 0 = low engagement.
# NOTE(review): rows whose `likes` is null fall into the `otherwise(0)` branch
# and are labelled low engagement -- confirm that is intended.
df = df.withColumn(
    "engagement_label",
    when(col("likes") >= median_likes, 1).otherwise(0),
)

# --- Feature encoding ---
# Categorical columns are indexed and then one-hot encoded.
cat_features = ["post_type", "device_type", "location"]

# handleInvalid="keep": the indexers are fit on the training split only, so a
# category that appears only in the test split would otherwise make
# `model.transform(test_data)` raise. With "keep", unseen values are assigned
# an extra index instead of failing.
# The loop variable is `name` (not `col`) so it does not shadow the imported
# `pyspark.sql.functions.col`.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_idx", handleInvalid="keep")
    for name in cat_features
]
encoders = [
    OneHotEncoder(inputCol=name + "_idx", outputCol=name + "_vec")
    for name in cat_features
]

# Numeric columns are passed to the assembler unchanged.
numeric_features = ["comments"]

# Combine the encoded categorical vectors and numeric columns into `features`.
assembler_inputs = [name + "_vec" for name in cat_features] + numeric_features
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# --- Model pipeline ---
lr = LogisticRegression(labelCol="engagement_label", featuresCol="features")
pipeline = Pipeline(stages=indexers + encoders + [assembler, lr])

# --- Train / test split ---
# 80/20 split with a fixed seed so the split is repeatable.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# --- Evaluation ---
evaluator = BinaryClassificationEvaluator(
    labelCol="engagement_label",
    metricName="areaUnderROC",
)
roc_auc = evaluator.evaluate(predictions)
print(f"ROC AUC: {roc_auc: .4f}")

# Show predicted vs. actual labels for the first few test rows.
predictions.select(
    "post_type", "likes", "comments", "engagement_label", "prediction"
).show(10)

ROC AUC:  0.5571
+---------+-----+--------+----------------+----------+
|post_type|likes|comments|engagement_label|prediction|
+---------+-----+--------+----------------+----------+
|    story|   60|      86|               0|       1.0|
|    photo|  373|      76|               1|       1.0|
|   status|  177|      40|               0|       1.0|
|   status|  384|      54|               1|       1.0|
|     link|  432|      83|               1|       0.0|
|   status|   18|      16|               0|       0.0|
|    video|  225|      65|               0|       0.0|
|    video|  154|      39|               0|       0.0|
|    photo|  206|      80|               0|       1.0|
|   status|  117|      55|               0|       1.0|
+---------+-----+--------+----------------+----------+
only showing top 10 rows

