# Q1

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RecommendationSystem").getOrCreate()
data_path = './customers-100.csv'  
df = spark.read.csv(data_path, header=True, inferSchema=True)

df.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/21 10:38:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|             Company|             City|             Country|             Phone 1|             Phone 2|               Email|Subscription Date|             Website|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|         Vega-Gentry|East Jimmychester|            Djibouti|          5153435776|    686-620-1820x944|     vmata@colon.com|     

# Q2

In [4]:
from pyspark.sql.functions import col, lit
from pyspark.sql import functions as F

interaction_data = df.select(
    col("Customer Id").alias("userId"),
    col("Website").alias("itemId"),
    (F.rand() * 5.0).alias("rating") 
)
(training_data, test_data) = interaction_data.randomSplit([0.8, 0.2])


# Q3


In [9]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, lit
from pyspark.sql import functions as F

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RecommendationSystem") \
    .getOrCreate()

# Load dataset
file_path = './customers-100.csv'  
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Generate interaction data (dummy example)
interaction_data = df.select(
    col("Customer Id").alias("userId"),
    col("Website").alias("itemId"),
    (F.rand() * 5.0).alias("rating")  # Replace with actual ratings in practice
)

# Index userId and itemId
user_indexer = StringIndexer(inputCol="userId", outputCol="userIndex")
item_indexer = StringIndexer(inputCol="itemId", outputCol="itemIndex")

interaction_data = user_indexer.fit(interaction_data).transform(interaction_data)
interaction_data = item_indexer.fit(interaction_data).transform(interaction_data)

# Prepare final data for ALS
final_data = interaction_data.select(
    col("userIndex").alias("userId"),
    col("itemIndex").alias("itemId"),
    col("rating")
)

# Split data into training and test sets
(training_data, test_data) = final_data.randomSplit([0.8, 0.2])

# Initialize ALS model
als = ALS(
    userCol="userId",
    itemCol="itemId",
    ratingCol="rating",
    rank=10,
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop"
)

# Train the ALS model
model = als.fit(training_data)

# Make predictions on the test set
predictions = model.transform(test_data)

# Check for non-empty DataFrames
if predictions.count() == 0:
    print("The predictions DataFrame is empty. Check the training and prediction steps.")
else:
    # Show some predictions
    predictions.show()

    # Print schema of predictions
    predictions.printSchema()

    # Evaluate model performance
    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="rating",
        predictionCol="prediction"
    )

    # Calculate RMSE
    rmse = evaluator.evaluate(predictions)
    print(f"Root Mean Squared Error (RMSE): {rmse}")

# Stop Spark session
spark.stop()


The predictions DataFrame is empty. Check the training and prediction steps.
