<a href="https://colab.research.google.com/github/Bhavanishwarya/Bhavanishwarya/blob/main/bda_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1.Build a classification model with spark with a dataset of your choice in python for big data analysis.

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("CustomerPurchaseClf").getOrCreate()

data = [
    (22, 39000, 10, 2, 0),
    (29, 52000, 23, 4, 0),
    (41, 78000, 38, 5, 1),
    (36, 64000, 27, 3, 1),
    (48, 90000, 33, 4, 1),
    (57, 110000, 46, 6, 1),
    (39, 68000, 22, 2, 0),
    (26, 45000, 17, 3, 0),
    (34, 58000, 21, 4, 1),
    (42, 72000, 32, 5, 1)
]

columns = ["age", "income", "time_on_site", "clicks", "label"]

df = spark.createDataFrame(data, schema=columns)

assembler = VectorAssembler(
    inputCols=["age", "income", "time_on_site", "clicks"],
    outputCol="assembled"
)
df_assembled = assembler.transform(df)

scaler = StandardScaler(inputCol="assembled", outputCol="features", withStd=True, withMean=False)
df_scaled = scaler.fit(df_assembled).transform(df_assembled)

final_df = df_scaled.select("features", "label")

train_data, test_data = final_df.randomSplit([0.75, 0.25], seed=7)

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=5)
model = rf.fit(train_data)

predictions = model.transform(test_data)
predictions.select("features", "label", "prediction").show()

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.2f}")

spark.stop()


+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[3.44221468449791...|    1|       0.0|
|[5.45017325045503...|    1|       1.0|
+--------------------+-----+----------+

Test Accuracy: 0.50


2.Build a clustering model with spark with a data set of your choice

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.appName("UserClusteringModel").getOrCreate()

data = [
    (22.0, 15.0, 39.0),
    (35.0, 16.0, 81.0),
    (19.0, 17.0, 6.0),
    (45.0, 18.0, 77.0),
    (33.0, 19.0, 40.0),
    (26.0, 20.0, 76.0),
    (21.0, 21.0, 6.0),
    (29.0, 22.0, 94.0),
    (24.0, 23.0, 3.0),
    (40.0, 24.0, 72.0),
    (38.0, 25.0, 13.0),
    (30.0, 26.0, 70.0),
    (27.0, 27.0, 14.0),
    (23.0, 28.0, 99.0),
    (34.0, 29.0, 15.0)
]

columns = ["user_age", "monthly_income", "engagement_score"]

df = spark.createDataFrame(data, schema=columns)

assembler = VectorAssembler(inputCols=columns, outputCol="features")
df_vector = assembler.transform(df).select("features")

kmeans = KMeans(k=4, seed=42)
model = kmeans.fit(df_vector)

clustered = model.transform(df_vector)
clustered.show(truncate=False)

print("Cluster Centers:")
for idx, center in enumerate(model.clusterCenters()):
    print(f"Cluster {idx}: {center}")

spark.stop()


+----------------+----------+
|features        |prediction|
+----------------+----------+
|[22.0,15.0,39.0]|2         |
|[35.0,16.0,81.0]|0         |
|[19.0,17.0,6.0] |1         |
|[45.0,18.0,77.0]|0         |
|[33.0,19.0,40.0]|2         |
|[26.0,20.0,76.0]|0         |
|[21.0,21.0,6.0] |1         |
|[29.0,22.0,94.0]|0         |
|[24.0,23.0,3.0] |1         |
|[40.0,24.0,72.0]|0         |
|[38.0,25.0,13.0]|3         |
|[30.0,26.0,70.0]|0         |
|[27.0,27.0,14.0]|3         |
|[23.0,28.0,99.0]|0         |
|[34.0,29.0,15.0]|3         |
+----------------+----------+

Cluster Centers:
Cluster 0: [32.57142857 22.         81.28571429]
Cluster 1: [21.33333333 20.33333333  5.        ]
Cluster 2: [27.5 17.  39.5]
Cluster 3: [33. 27. 14.]


3.Build a recommondation engine with spark with a dataset of your choice



In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName("UniqueBookRecommenderModel").getOrCreate()

data = [
    (10, 201, 5.0),
    (10, 202, 4.0),
    (10, 203, 3.0),
    (11, 201, 2.5),
    (11, 204, 4.5),
    (11, 205, 5.0),
    (12, 202, 3.5),
    (12, 203, 4.0),
    (12, 206, 4.5),
    (13, 204, 2.0),
    (13, 205, 3.0),
    (13, 206, 3.5),
    (14, 201, 4.0),
    (14, 203, 4.0),
    (14, 205, 2.5),
]

columns = ["userId", "itemId", "score"]

df = spark.createDataFrame(data, schema=columns)

train_df, test_df = df.randomSplit([0.75, 0.25], seed=99)

als = ALS(
    userCol="userId",
    itemCol="itemId",
    ratingCol="score",
    coldStartStrategy="drop",
    nonnegative=True,
    implicitPrefs=False,
    rank=8,
    maxIter=12,
    regParam=0.12
)

model = als.fit(train_df)

predictions = model.transform(test_df)
predictions.show()

evaluator = RegressionEvaluator(metricName="rmse", labelCol="score", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Test RMSE: {rmse:.2f}")

user_recommendations = model.recommendForAllUsers(2)
user_recommendations.show(truncate=False)

item_recommendations = model.recommendForAllItems(2)
item_recommendations.show(truncate=False)

spark.stop()


+------+------+-----+----------+
|userId|itemId|score|prediction|
+------+------+-----+----------+
|    12|   202|  3.5|  2.712491|
|    11|   204|  4.5| 1.8850932|
|    13|   205|  3.0| 2.4672604|
+------+------+-----+----------+

Test RMSE: 1.61
+------+------------------------------------+
|userId|recommendations                     |
+------+------------------------------------+
|10    |[{201, 4.719471}, {202, 3.935769}]  |
|11    |[{205, 4.7992153}, {206, 3.227822}] |
|12    |[{206, 4.3875184}, {203, 3.9231749}]|
|13    |[{206, 3.405077}, {203, 3.022925}]  |
|14    |[{206, 4.1035943}, {201, 3.9500735}]|
+------+------------------------------------+

+------+----------------------------------+
|itemId|recommendations                   |
+------+----------------------------------+
|201   |[{10, 4.719471}, {14, 3.9500735}] |
|202   |[{10, 3.935769}, {14, 2.927404}]  |
|203   |[{12, 3.9231749}, {14, 3.7470486}]|
|204   |[{12, 2.4670122}, {14, 2.2902136}]|
|205   |[{11, 4.7992153}, {12