In [8]:
# Q1: Build a classification model with Spark using a dataset of your choice.
# Classification using PySpark (Iris dataset)

In [9]:
# Install PySpark
!pip install pyspark

# Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
from sklearn.datasets import load_iris




In [10]:
# Obtain a Spark session named for this exercise
spark = (
    SparkSession.builder
    .appName("Iris_Classification")
    .getOrCreate()
)

# Load Iris from scikit-learn into pandas; the label goes in a "target" column
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Hand the pandas frame to Spark and preview the first five rows
spark_iris_df = spark.createDataFrame(iris_df)
spark_iris_df.show(n=5)


+-----------------+----------------+-----------------+----------------+------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
+-----------------+----------------+-----------------+----------------+------+
|              5.1|             3.5|              1.4|             0.2|     0|
|              4.9|             3.0|              1.4|             0.2|     0|
|              4.7|             3.2|              1.3|             0.2|     0|
|              4.6|             3.1|              1.5|             0.2|     0|
|              5.0|             3.6|              1.4|             0.2|     0|
+-----------------+----------------+-----------------+----------------+------+
only showing top 5 rows



In [11]:
# Assemble the Iris measurement columns into a single "features" vector column
vectorizer = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
iris_data = vectorizer.transform(spark_iris_df)

# Split data into training (75%) and test (25%) sets; seeded so the split repeats
train_data, test_data = iris_data.randomSplit([0.75, 0.25], seed=10)

# Train a Random Forest Classifier with 20 trees.
# The seed is passed explicitly, like the split above, so that the fitted
# forest (and the accuracy reported later) is reproducible across kernel
# restarts instead of depending on the estimator's default seed.
rf = RandomForestClassifier(
    labelCol="target",
    featuresCol="features",
    numTrees=20,
    seed=10,
)
rf_model = rf.fit(train_data)


In [12]:
# Score the held-out test rows with the fitted forest
rf_preds = rf_model.transform(test_data)

# Accuracy: compares the "prediction" column against the "target" label
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="target",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(rf_preds)
print(f"Random Forest Model Accuracy: {accuracy:.3f}")


Random Forest Model Accuracy: 0.976


In [13]:
# Q2: Build a clustering model with Spark using a dataset of your choice.
# Question 2: Clustering using PySpark (Mall Customers dataset)

In [14]:
# Already installed pyspark earlier
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import pandas as pd

# Obtain the Spark session for the clustering exercise (started only if needed)
spark = (
    SparkSession.builder
    .appName("Customer_Clustering")
    .getOrCreate()
)


In [15]:
# Sample mall-customer data, one tuple per customer: (CustID, Age, Income, Score)
customer_rows = [
    (1, 18, 15, 39),
    (2, 22, 18, 81),
    (3, 24, 20, 6),
    (4, 35, 25, 77),
    (5, 45, 30, 40),
]
data = pd.DataFrame(customer_rows, columns=["CustID", "Age", "Income", "Score"])

# Move the pandas frame into Spark and print every row
customer_df = spark.createDataFrame(data)
customer_df.show()


+------+---+------+-----+
|CustID|Age|Income|Score|
+------+---+------+-----+
|     1| 18|    15|   39|
|     2| 22|    18|   81|
|     3| 24|    20|    6|
|     4| 35|    25|   77|
|     5| 45|    30|   40|
+------+---+------+-----+



In [16]:
# Pack Age, Income and Score into one "features" vector per customer
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["Age", "Income", "Score"],
)
features_df = assembler.transform(customer_df)

# Fit a two-cluster KMeans model with a fixed seed
kmeans = KMeans(seed=15, k=2)
k_model = kmeans.fit(features_df)

# Add each customer's cluster index (the "prediction" column shown next)
clustered_df = k_model.transform(features_df)


In [17]:
# Show which cluster each customer ID was assigned to
cluster_assignments = clustered_df.select("CustID", "prediction")
cluster_assignments.show()


+------+----------+
|CustID|prediction|
+------+----------+
|     1|         0|
|     2|         1|
|     3|         0|
|     4|         1|
|     5|         0|
+------+----------+



In [18]:
# Q3: Build a recommendation engine with Spark using a dataset of your choice.
# Question 3: Recommendation engine using PySpark ALS

In [19]:
# Already installed pyspark earlier
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

# Obtain the Spark session for the recommendation exercise (existing one is reused)
spark = (
    SparkSession.builder
    .appName("ALS_Recommendation")
    .getOrCreate()
)


In [20]:
# Sample ratings, one tuple per rating: (user, item, score)
rating_rows = [
    (101, 201, 4.5),
    (101, 202, 5.0),
    (102, 201, 3.5),
    (103, 203, 4.0),
    (104, 204, 2.0),
]
ratings_data = pd.DataFrame(rating_rows, columns=["user", "item", "score"])

# Move the pandas frame into Spark and print every row
ratings_df = spark.createDataFrame(ratings_data)
ratings_df.show()


+----+----+-----+
|user|item|score|
+----+----+-----+
| 101| 201|  4.5|
| 101| 202|  5.0|
| 102| 201|  3.5|
| 103| 203|  4.0|
| 104| 204|  2.0|
+----+----+-----+



In [21]:
# Build ALS recommender on the (user, item, score) ratings.
# - nonnegative=True constrains the learned factors to be non-negative.
# - coldStartStrategy="drop" drops rows whose prediction would be NaN.
# - seed is set explicitly so the randomly initialised factors, and therefore
#   the RMSE reported afterwards, are reproducible from run to run.
als = ALS(
    userCol="user",
    itemCol="item",
    ratingCol="score",
    nonnegative=True,
    coldStartStrategy="drop",
    seed=15,
)
als_model = als.fit(ratings_df)


In [22]:
# Score the same ratings the model was fitted on (there is no held-out split),
# so the error below measures fit to known data rather than generalisation
predictions = als_model.transform(ratings_df)

# Root-mean-square error between the "score" label and the "prediction" column
evaluator = RegressionEvaluator(
    labelCol="score",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"ALS Model RMSE: {rmse:.3f}")


ALS Model RMSE: 0.065
