In [1]:
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
import pandas
from src.utils.commons import spark_session
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.evaluation import Evaluator
import dagshub
import mlflow
from pyspark.sql.types import FloatType
import pandas as pd

In [2]:
spark = spark_session()
train_data = spark.read.parquet('../Artifacts/FeatureStore/train_transformed_data').select('user_id_index','product_id_index','interaction_score')
test_data = spark.read.parquet('../Artifacts/FeatureStore/test_transformed_data').select('user_id_index','product_id_index','interaction_score')

[2025-02-16 19:27:48,768 ] 205 root - INFO - Creating spark session


In [7]:
train_data.show()

+-------------+----------------+-----------------+
|user_id_index|product_id_index|interaction_score|
+-------------+----------------+-----------------+
|       2475.0|            38.0|              3.0|
|       2475.0|           235.0|              2.5|
|       2475.0|            35.0|              3.0|
|       2475.0|           152.0|              3.0|
|       2475.0|           173.0|              8.5|
|       2475.0|           173.0|              8.5|
|       2475.0|           173.0|              8.5|
|       2475.0|           173.0|              8.5|
|       2475.0|          1040.0|              2.0|
|       2475.0|         13061.0|              2.0|
|       2475.0|           710.0|              1.0|
|       2475.0|            13.0|             20.5|
|       2475.0|            13.0|             20.5|
|       2475.0|            13.0|             20.5|
|       2475.0|            13.0|             20.5|
|       2475.0|            13.0|             20.5|
|       2475.0|            13.0

In [3]:
model = ALS(userCol='user_id_index',itemCol='product_id_index',ratingCol='interaction_score',implicitPrefs=True,coldStartStrategy='drop')


In [18]:
userRecs = model.recommendForAllUsers(numItems=100)

In [20]:
from pyspark.sql.functions import collect_list

true_labels = test_data.groupBy('user_id_index').agg(collect_list('product_id_index').alias('true_items'))

true_labels.show()

+-------------+--------------------+
|user_id_index|          true_items|
+-------------+--------------------+
|          0.0|[23575.0, 25999.0...|
|          7.0|[2006.0, 357.0, 1...|
|          8.0|[38972.0, 42450.0...|
|         18.0|[4009.0, 5190.0, ...|
|         29.0|[15606.0, 14515.0...|
|         35.0|[1334.0, 306.0, 4...|
|         42.0|[15407.0, 1048.0,...|
|         44.0|[4526.0, 3383.0, ...|
|         47.0|[7057.0, 823.0, 8...|
|         49.0|[3240.0, 3240.0, ...|
|         62.0|[3570.0, 420.0, 1...|
|         64.0|[35179.0, 35179.0...|
|         67.0|[672.0, 12384.0, ...|
|         69.0|[1656.0, 3125.0, ...|
|         70.0|[2781.0, 3185.0, ...|
|         75.0|[2226.0, 1115.0, ...|
|         80.0|[289.0, 5.0, 5.0,...|
|         86.0|[1517.0, 90.0, 98...|
|         88.0|[8326.0, 23983.0,...|
|         96.0|[929.0, 207.0, 71...|
+-------------+--------------------+
only showing top 20 rows



In [21]:
pred_user_items = userRecs.select(
    F.col("user_id_index"),
    F.expr(f"transform(recommendations, x -> x.product_id_index)").alias("pred_items")
)



In [22]:
evaluation_data = true_labels.join(pred_user_items, on="user_id_index")

In [23]:
def recall_at_k(true_items, pred_items, k):
    return len(set(true_items).intersection(set(pred_items[:k]))) / len(true_items)

recall_udf = F.udf(lambda true, pred: recall_at_k(true, pred, 100), "float")


In [10]:
def precision_at_k(true_items, pred_items, k):
    return len(set(true_items).intersection(set(pred_items[:k]))) / k

precision_udf = F.udf(lambda true, pred: precision_at_k(true, pred, 10), "float")


In [12]:
import math

In [None]:
print(true_labels.intersect(pred_user_items).count())

In [11]:
def ndcg_at_k(true_items, pred_items, k):
    dcg = sum([1 / (math.log2(idx + 2)) for idx, item in enumerate(pred_items[:k]) if item in true_items])
    idcg = sum([1 / (math.log2(idx + 2)) for idx in range(min(len(true_items), k))])
    return dcg / idcg if idcg > 0 else 0

ndcg_udf = F.udf(lambda true, pred: ndcg_at_k(true, pred, 10), "float")


In [24]:

evaluation_results = evaluation_data.withColumn("recall_at_k", recall_udf("true_items", "pred_items")) 


average_metrics = evaluation_results.select(
    F.avg("recall_at_k").alias("avg_recall_at_k")).show()


+-------------------+
|    avg_recall_at_k|
+-------------------+
|0.17384799768576814|
+-------------------+



In [3]:
dagshub.init(repo_owner='GIDDY269', repo_name='Product_recommendation_system', mlflow=True)

mlflow.set_tracking_uri('https://dagshub.com/GIDDY269/Product_recommendation_system.mlflow')



[2025-02-16 19:28:09,467 ] 1038 httpx - INFO - HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


[2025-02-16 19:28:09,501 ] 107 dagshub - INFO - Accessing as GIDDY269
[2025-02-16 19:28:10,517 ] 1038 httpx - INFO - HTTP Request: GET https://dagshub.com/api/v1/repos/GIDDY269/Product_recommendation_system "HTTP/1.1 200 OK"
[2025-02-16 19:28:11,470 ] 1038 httpx - INFO - HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


[2025-02-16 19:28:11,486 ] 107 dagshub - INFO - Initialized MLflow to track repo "GIDDY269/Product_recommendation_system"


[2025-02-16 19:28:11,490 ] 107 dagshub - INFO - Repository GIDDY269/Product_recommendation_system initialized!


In [4]:
# Custom Evaluator for Recall@K
class RecallAtKEvaluator(Evaluator):
    def __init__(self, k=10):
        super(RecallAtKEvaluator, self).__init__()
        self.k = k
    
    def _evaluate(self, dataset, model):
        """
        Compute Recall@K: 
        - true_items: items in the test set
        - pred_items: top K recommendations from the model
        """
        # Collect true items grouped by userId
        true_items_df = dataset.groupBy("user_Id_index").agg(F.collect_set("product_id_index").alias("true_items"))

        # Generate top K recommendations for each user
        user_recs = model.recommendForAllUsers(self.k)
        pred_items_df = user_recs.select(
            "user_id_index", F.col("recommendations.product_id_index").alias("pred_items")
        )

        # Join true and predicted items
        joined_df = true_items_df.join(pred_items_df, "user_id_index")

        # Calculate Recall@K for each user
        def recall_at_k(true_items, pred_items):
            if not true_items:  # Avoid division by zero
                return 0.0
            return len(set(true_items).intersection(set(pred_items))) / len(true_items)

        recall_udf = F.udf(recall_at_k, "float")
        recall_df = joined_df.withColumn("recall", recall_udf(F.col("true_items"), F.col("pred_items")))

        # Return the average Recall@K across all users
        return recall_df.select(F.avg("recall")).collect()[0][0]


    


In [5]:
import mlflow.artifacts
from pyspark.ml.tuning import ParamGridBuilder




ALS_model = ALS(userCol='user_id_index',itemCol='product_id_index',ratingCol='interaction_score',implicitPrefs=True,coldStartStrategy='drop')
recall_evaluator = RecallAtKEvaluator(k=100)

param_grid = ParamGridBuilder()\
            .addGrid(ALS.rank, [20,30,50,]) \
            .addGrid(ALS.maxIter,[5,10,15,20])\
            .addGrid(ALS.regParam,[0.01,0.001,0.1])\
            .addGrid(ALS.alpha,[40,60,100,200])\
            .build() 
# evaluate each parameter combination manually
best_params =None
best_recall = 0.0
with mlflow.start_run():
    for params in param_grid:
        params = {param.name: value for param, value in params.items()}
        print(F' checkingg this {params}')
        ALS_tuned = ALS_model.setParams(**params)
        model = ALS_tuned.fit(train_data)
        print('evaluating model')
        recall = recall_evaluator._evaluate(test_data,model)
        print(f'params : {params}, Recall@k: {recall}')
        mlflow.log_params(params=params)
        mlflow.log_metric('recall@k',recall)
        if recall > best_recall:
            best_recall = recall
            best_params = params
    # Print the best hyperparameters and corresponding Recall@K
    print(f"Best Params: {best_params}, Best Recall@K: {best_recall}")

 checkingg this {'rank': 20, 'maxIter': 5, 'regParam': 0.1, 'alpha': 100.0}
evaluating model
params : {'rank': 20, 'maxIter': 5, 'regParam': 0.1, 'alpha': 100.0}, Recall@k: 0.3069345388884487
 checkingg this {'rank': 20, 'maxIter': 5, 'regParam': 0.1, 'alpha': 200.0}
evaluating model
params : {'rank': 20, 'maxIter': 5, 'regParam': 0.1, 'alpha': 200.0}, Recall@k: 0.3058807267370318
🏃 View run exultant-deer-762 at: https://dagshub.com/GIDDY269/Product_recommendation_system.mlflow/#/experiments/0/runs/210c9f7acff74671a3038a59a8b1f41a
🧪 View experiment at: https://dagshub.com/GIDDY269/Product_recommendation_system.mlflow/#/experiments/0


RestException: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}