<a href="https://colab.research.google.com/github/EdiNel0407/us-ie-big-data-technologies/blob/main/postblock2/q4/postblock2_q4_1_pyspark_rf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PostBlock2 — Q4.1: Random Forest in **PySpark only**

This notebook builds a Random Forest classifier using **Spark ML** (no pandas/koalas).

In [1]:
!pip -q install pyspark==3.5.1

In [2]:

from pyspark.sql import SparkSession
# Configure the application name, then attach to an existing session if one
# is already running or start a fresh one otherwise.
session_builder = SparkSession.builder.appName("Q4.1_PySpark_RF")
spark = session_builder.getOrCreate()
print("Spark version:", spark.version)


Spark version: 3.5.1


In [3]:

from pyspark.sql import functions as F
# Synthetic dataset: n rows, each carrying four seeded random feature columns.
n = 5000
base = spark.range(start=0, end=n)

df = base
for feature_name, feature_seed in (("x1", 7), ("x2", 11), ("x3", 13), ("x4", 17)):
    df = df.withColumn(feature_name, F.rand(seed=feature_seed))

# Binary label: 1 when a weighted combination of the features plus seeded
# noise exceeds 0.8, otherwise 0. The "id" column produced by range() is
# dropped once the label has been derived.
weighted_sum = F.col("x1") * 0.8 + F.col("x2") * 0.6 - F.col("x3") * 0.7 + F.col("x4") * 0.2
noise = F.rand(seed=23) * 0.3
is_positive = (weighted_sum + noise) > 0.8
df = df.withColumn("label", is_positive.cast("int")).drop("id")

# Inspect the schema, the first rows, and the number of rows per label.
df.printSchema()
df.show(5)
label_counts = df.groupBy("label").count().orderBy("label")
label_counts.show()


root
 |-- x1: double (nullable = false)
 |-- x2: double (nullable = false)
 |-- x3: double (nullable = false)
 |-- x4: double (nullable = false)
 |-- label: integer (nullable = false)

+-------------------+-------------------+-------------------+-------------------+-----+
|                 x1|                 x2|                 x3|                 x4|label|
+-------------------+-------------------+-------------------+-------------------+-----+
|0.28188121484885875|0.03422639313807285| 0.4175019040792016| 0.9697474945375325|    0|
|0.45998819035056326|0.11682250456449328| 0.9899129399827472|0.07530606222259384|    0|
| 0.7690971666031955| 0.3252227809451943|0.48359971255227285|0.05242657018696939|    0|
| 0.6631414615273216| 0.4358851046481089| 0.6070081240379598| 0.5448652870760927|    0|
|  0.520550428327607|0.27831507107951303| 0.4013942674751203| 0.3298954444401243|    0|
+-------------------+-------------------+-------------------+-------------------+-----+
only showing top 5 rows

In [4]:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

# Hold out 20% of the rows for testing; the split is seeded for repeatability.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=123)

# Stage 1: pack the four feature columns into the single vector column that
# Spark ML estimators consume.
feature_cols = ["x1", "x2", "x3", "x4"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Stage 2: seeded random forest of 100 trees, each at most 8 levels deep.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=8,
    seed=42,
)

# Chain both stages and fit them on the training split only.
pipeline = Pipeline(stages=[assembler, rf])
model = pipeline.fit(train_df)


In [5]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score the held-out split with the fitted pipeline.
pred = model.transform(test_df)

# Overall accuracy of the predicted class against the true label.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
acc = accuracy_evaluator.evaluate(pred)
print(f"Test Accuracy: {acc:.4f}")

# Confusion table: row counts for every (label, prediction) pair.
confusion_counts = pred.groupBy("label", "prediction").count()
confusion_counts.orderBy("label", "prediction").show()


Test Accuracy: 0.9145
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|  669|
|    0|       1.0|   35|
|    1|       0.0|   51|
|    1|       1.0|  251|
+-----+----------+-----+



In [6]:

# Persist the fitted pipeline model, replacing any model already stored at
# the target path.
model_path = "/content/models/q4_1_pyspark_rf_model"
model_writer = model.write().overwrite()
model_writer.save(model_path)
print("Model saved to:", model_path)


Model saved to: /content/models/q4_1_pyspark_rf_model


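**Metric note:** Accuracy can be misleading under class imbalance. In the test split above, about 70% of the rows have label 0 (704 of 1,006), so a model that always predicts 0 would already reach roughly 0.70 accuracy. Also consider ROC-AUC/PR-AUC, F1, recall for the minority class, and balanced accuracy.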