<a href="https://colab.research.google.com/github/Akhil-kuma/akhil/blob/main/BigDataAnalytics_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# 1. Create (or reuse) the Spark session for this classification run
spark = (
    SparkSession.builder
    .appName("IrisClassification")
    .getOrCreate()
)

# 2. Read the Iris CSV with a header row and inferred column types.
# NOTE: despite the variable name, "/iris.csv" is a filesystem path rather than
# a URL, so the file has to exist at that location before this cell is run.
iris_url = "/iris.csv"
df = spark.read.csv(iris_url, header=True, inferSchema=True)

# 3. Hold out 30% of the rows for testing (split seeded with 42)
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

# 4. Pipeline stages: index the species string into a numeric "label",
#    pack the four measurements into a "features" vector, then classify.
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
indexer = StringIndexer(inputCol="species", outputCol="label")
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[indexer, assembler, lr])

# 5. Fit every stage on the training split, then score the held-out split
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# 6. Accuracy of the predicted class against the indexed label
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.2f}")

# 7. Display feature vector, true label and predicted label side by side
shown_cols = ["features", "label", "prediction"]
predictions.select(*shown_cols).show()

# 8. Shut the session down; DataFrames created above are unusable afterwards
spark.stop()


Test Accuracy: 0.88
+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[4.7,3.2,1.6,0.2]|  0.0|       0.0|
|[5.0,3.6,1.4,0.2]|  0.0|       0.0|
|[5.2,3.4,1.4,0.2]|  0.0|       0.0|
|[5.2,3.5,1.5,0.2]|  0.0|       0.0|
|[6.3,3.3,6.0,2.5]|  2.0|       2.0|
|[6.4,3.2,4.5,1.5]|  1.0|       2.0|
|[6.5,2.8,4.6,1.5]|  1.0|       1.0|
|[7.1,3.0,5.9,2.1]|  2.0|       2.0|
+-----------------+-----+----------+



In [None]:
# Imports used by this cell, gathered in one place
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

# Step 1: Create (or reuse) the Spark session for the clustering run
spark = (
    SparkSession.builder
    .appName("IrisClustering")
    .getOrCreate()
)

# Step 2: Read the dataset.
# NOTE: "/iris.csv" is a path starting at the filesystem root, not a path
# relative to the notebook's working directory.
df = spark.read.csv("/iris.csv", header=True, inferSchema=True)

# Step 3: Pack the four measurements into one "features" vector column and
# keep only that column (the species column is not passed to the clusterer)
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
)
data = assembler.transform(df).select("features")

# Step 4: Fit KMeans with k=3 (one cluster per species) and a fixed seed,
# then assign every row to a cluster
kmeans = KMeans(k=3, seed=42)
model = kmeans.fit(data)
predictions = model.transform(data)

# Step 5: Preview the first 10 cluster assignments
predictions.show(10)

# Step 6: Score the clustering with the evaluator's default settings
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score = {silhouette:.2f}")

# Step 7: Shut the session down
spark.stop()


+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|         1|
|[4.9,3.0,1.4,0.2]|         1|
|[4.7,3.2,1.3,0.2]|         1|
|[4.6,3.1,1.5,0.2]|         1|
|[5.0,3.6,1.4,0.2]|         1|
|[7.0,3.2,4.7,1.4]|         0|
|[6.4,3.2,4.5,1.5]|         0|
|[6.9,3.1,4.9,1.5]|         0|
|[5.5,2.3,4.0,1.3]|         2|
|[6.5,2.8,4.6,1.5]|         0|
+-----------------+----------+
only showing top 10 rows

Silhouette Score = 0.73


In [None]:
# Step 1: Imports used by this cell, gathered in one place
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

# Start Spark Session
spark = SparkSession.builder.appName("SimpleRecommendationEngine").getOrCreate()

# Step 2: Create Ratings Dataset
# Each tuple is (userId, movieId, rating): 4 users, 4 movies, 12 ratings.
data = [
    (1, 101, 5.0),
    (1, 102, 3.0),
    (1, 103, 2.5),
    (2, 101, 2.0),
    (2, 102, 2.5),
    (2, 103, 5.0),
    (2, 104, 4.0),
    (3, 101, 2.5),
    (3, 104, 4.5),
    (4, 102, 4.0),
    (4, 103, 3.0),
    (4, 104, 5.0),
]

columns = ["userId", "movieId", "rating"]
df = spark.createDataFrame(data, columns)

# Step 3: Show Sample Ratings
df.show()

# Step 4: Build ALS Model
# seed is fixed so that the factor initialisation -- and therefore the
# recommendations and RMSE below -- are the same on every Restart & Run All.
# Without it ALS draws a fresh random initialisation on each fit.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    coldStartStrategy="drop",
    seed=42,
)

model = als.fit(df)

# Step 5: Generate Top-2 Movie Recommendations for Each User
# The ranking covers all movies, so it can include ones the user already rated.
userRecs = model.recommendForAllUsers(2)
userRecs.show(truncate=False)

# Step 6: Optional - Evaluate Model
# NOTE: predictions are made on the same ratings the model was trained on, so
# this RMSE measures fit to the training data, not accuracy on unseen ratings.
predictions = model.transform(df)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Step 7: Stop Spark
spark.stop()


+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    101|   5.0|
|     1|    102|   3.0|
|     1|    103|   2.5|
|     2|    101|   2.0|
|     2|    102|   2.5|
|     2|    103|   5.0|
|     2|    104|   4.0|
|     3|    101|   2.5|
|     3|    104|   4.5|
|     4|    102|   4.0|
|     4|    103|   3.0|
|     4|    104|   5.0|
+------+-------+------+

+------+------------------------------------+
|userId|recommendations                     |
+------+------------------------------------+
|1     |[{101, 4.730415}, {104, 3.305429}]  |
|2     |[{103, 4.690695}, {104, 4.0107307}] |
|3     |[{104, 4.3910356}, {103, 3.423058}] |
|4     |[{104, 4.9122496}, {102, 3.8038363}]|
+------+------------------------------------+

Root Mean Squared Error (RMSE): 0.14
