# Setup

In [None]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.sql.types import StructType, StructField, ArrayType, FloatType, IntegerType

In [None]:
# Colab-only: mount Google Drive so files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Class Question1

In [None]:
class Question1:
    """
    Concept analysis of a user-item rating table through a truncated SVD
    computed with Spark MLlib.

    Typical call order: ``load_data`` -> ``compute_svd`` -> any reporting
    method (``top_concepts``, ``user_conceptsID``, ``item_conceptsID``,
    ``concept_portion``, ``user_embeddings``).
    """

    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("Question1") \
            .getOrCreate()
        self.data = None
        self.U = None
        self.sigma = None
        self.V = None
        # Cached (user, sparse_vector) RDD that was fed to the SVD. Its keys are
        # the user ids in exactly the row order of U, so per-user results can be
        # labelled without rebuilding (and possibly reordering) the group-by.
        self._user_vectors = None

    def load_data(self, file_path):
        """
        Load data from CSV file and save to DataFrame.
        :param file_path: Path to CSV file (header row expected; column types are inferred).
        """
        self.data = self.spark.read.csv(file_path, header=True, inferSchema=True)

    def compute_svd(self, k):
        """
        Calculate the SVD of the user-item matrix with a number of k concepts.
        Stores the factors in ``self.U``, ``self.sigma`` and ``self.V``.
        :param k: Number of concepts to extract.
        """
        if self._user_vectors is not None:
            # Drop the cache left over from a previous decomposition.
            self._user_vectors.unpersist()

        # Cache so that every later pass over U sees the same rows in the same order.
        user_vectors = self.create_sparse_vectors().cache()
        row_matrix = RowMatrix(user_vectors.values())
        svd = row_matrix.computeSVD(k, computeU=True)

        self._user_vectors = user_vectors
        self.U = svd.U
        self.sigma = svd.s
        self.V = svd.V

    def create_sparse_vectors(self):
        """
        Represent each user in the data set as a sparse vector containing ratings.
        The vector index is the item id, so the vector length is ``max(item) + 1``.
        :return: RDD contains the pair (user, sparse_vector).
        :raises RuntimeError: if no data has been loaded.
        :raises ValueError: if the loaded data set has no rows.
        """
        if self.data is None:
            raise RuntimeError("load_data() must be called before create_sparse_vectors()")

        max_item = self.data.agg(F.max("item")).first()[0]
        if max_item is None:
            raise ValueError("Cannot build rating vectors from an empty data set")
        # Size by the largest id, not by the number of distinct ids: with gaps in
        # the id range a distinct count would be smaller than some indices.
        num_items = int(max_item) + 1

        sparse_vectors = self.data.groupBy("user").agg(
            F.collect_list(F.struct("item", "rating")).alias("ratings")
        ).rdd.map(
            lambda row: (
                row["user"],
                Vectors.sparse(
                    num_items,
                    {int(item["item"]): float(item["rating"]) for item in row["ratings"]}
                )
            )
        )

        return sparse_vectors

    def _require_svd(self):
        """Raise a clear error when a result is requested before compute_svd()."""
        if self.sigma is None:
            raise RuntimeError("compute_svd() must be called first")

    def top_concepts(self, num_concepts=32):
        """
        Find the concepts with the highest intensity (singular value).
        A concept id is the position of its singular value in ``self.sigma``.
        :param num_concepts: Maximum number of concepts to return (default 32).
        :return: list of pairs (concept_id, intensity), strongest first.
        :raises RuntimeError: if the SVD has not been computed.
        """
        self._require_svd()
        singular_values = self.sigma.toArray()
        ranked = sorted(enumerate(singular_values), key=lambda pair: pair[1], reverse=True)

        return ranked[:num_concepts]

    def user_conceptsID(self):
        """
        Find concept ID for each user based on matrix U: the index of the
        largest entry in the user's row of U.
        :return: DataFrame with columns 'user_id' and 'concept_id', ordered by user_id.
        :raises RuntimeError: if the SVD has not been computed.
        """
        self._require_svd()
        # Keys of the cached input RDD line up with U's rows; sorting the ids
        # before zipping would attach them to the wrong rows.
        user_ids = self._user_vectors.keys()
        # NOTE(review): the signed value is used, as before; confirm whether the
        # largest magnitude is wanted, since singular-vector signs are arbitrary.
        df_concept_user = self.U.rows.zip(user_ids).map(
            lambda pair: (pair[1], int(np.argmax(pair[0].toArray())))
        ).toDF(["user_id", "concept_id"])

        return df_concept_user.orderBy("user_id")

    def item_conceptsID(self):
        """
        Find concept ID for each item based on matrix V: the index of the
        largest entry in the item's row of V.
        :return: DataFrame with columns 'item_id' and 'concept_id', ordered by item_id.
        :raises RuntimeError: if the SVD has not been computed.
        """
        self._require_svd()
        # V has one row per vector index, and the vector index is the item id,
        # so look rows up by id rather than by rank among the distinct ids.
        dominant_concept = np.argmax(self.V.toArray(), axis=1)
        item_ids = self.data.select("item").distinct().rdd.map(lambda row: row[0])
        df_concept_item = item_ids.map(
            lambda item: (item, int(dominant_concept[int(item)]))
        ).toDF(["item_id", "concept_id"])

        return df_concept_item.orderBy("item_id")

    def concept_portion(self, df_concept_user, df_concept_item):
        """
        Calculate the proportion of each concept based on the number of users and number of items.

        :param df_concept_user: DataFrame containing columns 'user_id' and 'concept_id'.
        :param df_concept_item: DataFrame containing columns 'item_id' and 'concept_id'.
        :return: DataFrame with columns 'concept_id', 'user_portion', 'item_portion'.
        """
        user_concept_count = df_concept_user.groupBy("concept_id").count()
        total_users = df_concept_user.select("user_id").distinct().count()
        user_concept_portion = user_concept_count.withColumn(
            "user_portion", F.col("count") / total_users
        ).select("concept_id", "user_portion")

        item_concept_count = df_concept_item.groupBy("concept_id").count()
        total_items = df_concept_item.select("item_id").distinct().count()
        item_concept_portion = item_concept_count.withColumn(
            "item_portion", F.col("count") / total_items
        ).select("concept_id", "item_portion")

        # Outer join keeps concepts that appear on only one side; their missing
        # share is reported as 0 instead of null.
        df_concept_portion = user_concept_portion.join(
            item_concept_portion, on="concept_id", how="outer"
        )
        df_concept_portion = df_concept_portion.fillna({"user_portion": 0.0, "item_portion": 0.0})

        return df_concept_portion

    def user_embeddings(self):
        """
        Find embedding for each user: the user's row of U scaled by the singular values (U * Sigma).
        :return: DataFrame with columns 'user_id' and 'embedding'.
        :raises RuntimeError: if the SVD has not been computed.
        """
        self._require_svd()
        singular_values = self.sigma.toArray()
        # Multiplying by diag(sigma) is an element-wise scale of each row.
        embeddings = self.U.rows.map(
            lambda row: (row.toArray() * singular_values).tolist()
        )
        user_embeddings = self._user_vectors.keys().zip(embeddings)
        df_embedding_user = user_embeddings.toDF(["user_id", "embedding"])

        return df_embedding_user

# Init and load data

In [None]:
# Build the analysis object; its constructor gets or creates the Spark session.
question1 = Question1()

# Read the ratings CSV from the mounted Drive folder into question1.data.
question1.load_data("/content/drive/MyDrive/MMDS/EndtermProject/dataset/ratings2k.csv")

# Factorise the user-item matrix, keeping 32 concepts.
question1.compute_svd(32)

# Data description

In [None]:
# Preview the first rows of the loaded ratings table.
question1.data.show()

+-----+----+----+------+
|index|user|item|rating|
+-----+----+----+------+
|    0|  73|  52|   4.0|
|    1|  36| 239|   3.0|
|    2|  72|  26|   1.0|
|    3|  59| 430|   2.5|
|    4|  72| 284|   3.0|
|    5|  36| 277|   3.0|
|    6|  72| 426|   4.0|
|    7|  18| 163|   3.0|
|    8|  67|  93|   4.0|
|    9|  59|  22|   3.5|
|   10|   8| 174|   2.0|
|   11|   5| 149|   2.0|
|   12|  26| 322|   3.5|
|   13|   8| 416|   4.0|
|   14|  31|  25|   2.0|
|   15|  41|  83|   2.0|
|   16|  25| 321|   3.0|
|   17|  47| 193|   3.0|
|   18|   9| 455|   4.0|
|   19|  43| 216|   3.5|
+-----+----+----+------+
only showing top 20 rows



In [None]:
# Print the column names and the types inferred while reading the CSV.
question1.data.printSchema()

root
 |-- index: integer (nullable = true)
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: double (nullable = true)



In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column.
question1.data.describe().show()

+-------+-----------------+------------------+------------------+------------------+
|summary|            index|              user|              item|            rating|
+-------+-----------------+------------------+------------------+------------------+
|  count|             2365|              2365|              2365|              2365|
|   mean|           1182.0|38.002536997885834|221.79957716701904| 3.641860465116279|
| stddev|682.8610156295838| 23.50116086683459|130.38801426427398|1.0067781732668075|
|    min|                0|                 1|                 0|               0.5|
|    max|             2364|                75|               466|               5.0|
+-------+-----------------+------------------+------------------+------------------+



In [None]:
# Count nulls and NaNs for every column in one aggregation (a single Spark
# job) instead of running a separate filter + count per column and per check.
columns = question1.data.columns
null_exprs = [F.count(F.when(F.col(name).isNull(), 1)) for name in columns]
nan_exprs = [F.count(F.when(F.isnan(F.col(name)), 1)) for name in columns]
# The result row holds the null counts first, then the NaN counts, both in
# column order.
quality_counts = question1.data.agg(*null_exprs, *nan_exprs).first()
null_counts = quality_counts[:len(columns)]
nan_counts = quality_counts[len(columns):]

for name, null_count in zip(columns, null_counts):
    print(f"Number of null values in {name}: {null_count}")

for name, nan_count in zip(columns, nan_counts):
    print(f"Number of NaN values in {name}: {nan_count}")

Number of null values in index: 0
Number of null values in user: 0
Number of null values in item: 0
Number of null values in rating: 0
Number of NaN values in index: 0
Number of NaN values in user: 0
Number of NaN values in item: 0
Number of NaN values in rating: 0


# Sparse Vectors

In [None]:
# Build the per-user rating vectors: an RDD of (user, sparse_vector) pairs.
sparse_vectors = question1.create_sparse_vectors()

# Print the first 10 users; take() brings only those rows to the driver.
for user, sparse_vector in sparse_vectors.take(10):
    print(f"User {user}; Sparse Vector: {sparse_vector}")

User 31; Sparse Vector: (467,[7,8,11,25,26,28,29,34,36,60,67,68,77,79,81,83,89,93,102,118,140,144,155,159,161,163,164,176,186,190,191,193,199,200,204,211,214,216,225,236,248,251,258,262,272,288,299,301,318,319,320,321,322,323,324,326,328,333,345,346,373,374,379,386,392,406,413,422,429,432,440,441,444,449,459,462],[3.5,3.5,2.5,2.0,3.5,2.5,2.0,3.0,3.0,2.0,1.5,3.0,3.5,2.0,3.0,2.0,2.5,2.5,2.5,2.5,2.0,1.0,2.5,1.5,2.5,3.0,2.5,4.0,2.5,2.0,3.0,2.0,3.0,2.5,2.5,1.5,3.0,2.5,3.5,0.5,2.5,3.5,3.5,2.0,2.5,2.5,3.0,1.5,2.0,3.0,2.0,3.0,1.5,3.0,3.0,3.0,3.0,3.5,2.0,3.0,2.5,2.0,2.5,2.0,1.5,3.0,3.0,4.0,4.5,3.5,1.0,3.5,3.5,2.5,3.5,3.5])
User 65; Sparse Vector: (467,[176],[1.0])
User 53; Sparse Vector: (467,[288,324,333,430,432],[4.5,3.0,4.5,5.0,5.0])
User 34; Sparse Vector: (467,[19,67,163,212,213,249,269,375,413],[3.0,3.0,4.0,4.0,3.0,2.5,3.0,1.5,4.5])
User 28; Sparse Vector: (467,[25,140,144,161,174,216,301,335,356,376,379,440],[5.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0])
User 26; Sparse Vector: (467,

# Top 32 concepts

In [None]:
# List the strongest concepts as (concept_id, intensity) pairs, strongest first.
top_32_concepts = question1.top_concepts()
# `concept_id` rather than `id`, so the builtin id() is not shadowed for the
# rest of the notebook.
for concept_id, intensity in top_32_concepts:
    print(f"Concept ID: {concept_id}, Intensity: {intensity}")

Concept ID: 0, Intensity: 102.77098686001769
Concept ID: 1, Intensity: 42.851398949250914
Concept ID: 2, Intensity: 40.60506471210495
Concept ID: 3, Intensity: 38.20954833591901
Concept ID: 4, Intensity: 36.51932232110635
Concept ID: 5, Intensity: 33.150453293770234
Concept ID: 6, Intensity: 31.476196989518442
Concept ID: 7, Intensity: 30.34329799645251
Concept ID: 8, Intensity: 29.79436382958975
Concept ID: 9, Intensity: 28.254048263651864
Concept ID: 10, Intensity: 25.987583757513598
Concept ID: 11, Intensity: 25.685335833380904
Concept ID: 12, Intensity: 25.286333879973142
Concept ID: 13, Intensity: 23.81998163949409
Concept ID: 14, Intensity: 23.06122189466223
Concept ID: 15, Intensity: 22.943931946669185
Concept ID: 16, Intensity: 22.650141490195875
Concept ID: 17, Intensity: 21.794056272873576
Concept ID: 18, Intensity: 20.783343400040735
Concept ID: 19, Intensity: 20.314965071240643
Concept ID: 20, Intensity: 19.411638597084245
Concept ID: 21, Intensity: 19.061167679093476
Conce

# df_concept_user

In [None]:
# Assign one concept id to every user (from matrix U) and preview the result.
df_concept_user = question1.user_conceptsID()
df_concept_user.show()

+-------+----------+
|user_id|concept_id|
+-------+----------+
|      1|        29|
|      2|        29|
|      3|        23|
|      4|        27|
|      5|        29|
|      6|        25|
|      7|        18|
|      8|        31|
|      9|         4|
|     10|        19|
|     11|        18|
|     12|        30|
|     13|        31|
|     14|        29|
|     15|        29|
|     16|        28|
|     17|        28|
|     18|        24|
|     19|        20|
|     20|        30|
+-------+----------+
only showing top 20 rows



# df_concept_item

In [None]:
# Assign one concept id to every item (from matrix V) and preview the result.
df_concept_item = question1.item_conceptsID()
df_concept_item.show()

+-------+----------+
|item_id|concept_id|
+-------+----------+
|      0|        11|
|      1|        28|
|      2|        24|
|      3|        30|
|      4|        27|
|      5|        29|
|      6|        14|
|      7|         2|
|      8|        12|
|      9|        22|
|     10|         7|
|     11|         6|
|     12|        22|
|     13|        25|
|     14|         9|
|     15|        28|
|     16|         9|
|     17|         5|
|     18|        22|
|     19|        27|
+-------+----------+
only showing top 20 rows



# df_concept_portion

In [None]:
# Share of users and share of items attached to each concept id.
df_concept_portion = question1.concept_portion(df_concept_user, df_concept_item)
df_concept_portion.show()

+----------+--------------------+--------------------+
|concept_id|        user_portion|        item_portion|
+----------+--------------------+--------------------+
|         0|                 0.0|0.008565310492505354|
|         1|0.013513513513513514| 0.03854389721627409|
|         2|0.013513513513513514|0.055674518201284794|
|         3|                 0.0|0.017130620985010708|
|         4| 0.02702702702702703|0.057815845824411134|
|         5|                 0.0| 0.10492505353319058|
|         6|0.013513513513513514|0.017130620985010708|
|         7|0.013513513513513514|0.047109207708779445|
|         8|0.013513513513513514|0.042826552462526764|
|         9|0.013513513513513514|  0.0835117773019272|
|        10| 0.02702702702702703|0.057815845824411134|
|        11|0.013513513513513514|0.010706638115631691|
|        12|0.013513513513513514|0.023554603854389723|
|        13|0.013513513513513514|0.017130620985010708|
|        14| 0.02702702702702703|0.042826552462526764|
|        1

# df_embedding_user

In [None]:
# Per-user embedding vectors; truncate=False prints each embedding in full.
df_embedding_user = question1.user_embeddings()
df_embedding_user.show(truncate=False)

+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|embedding                                                                                                                                                                                                                                                                                                

In [None]:
# Export the user embeddings as JSON files under output_path.
output_path = "/content/output"
# Overwrite mode: with the default save mode the write raises as soon as the
# directory exists, so re-running this cell would fail.
df_embedding_user.write.mode("overwrite").json(output_path)