<a href="https://colab.research.google.com/github/DomizianoScarcelli/big-data-project/blob/main/user_based_CF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Config

# Configuration

In [1]:
#@title Download necessary libraries
!pip install pyspark
!pip install -U -q PyDrive 
!apt install openjdk-8-jdk-headless -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
openjdk-8-jdk-headless is already the newest version (8u372-ga~us1-0ubuntu1~20.04).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [2]:
#@title Imports
import gc
import math
import os
import sys
import time
from typing import Callable, List, Tuple

import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly

import pyspark
# NOTE: the star imports below shadow Python builtins such as `sum`, `min`, `max` and `round`
# with Spark column functions; use `math`/`numpy` helpers when working on plain Python numbers.
from pyspark.sql import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, LongType
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.ml.linalg import SparseVector, DenseVector

from tqdm.notebook import tqdm

from google.colab import drive

In [3]:
#@title Set up variables
# Java installation used by Spark (matches the openjdk-8 package installed above).
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"

# Google Drive mount point and the dataset folder inside it.
GDRIVE_DIR = "/content/drive"
GDRIVE_HOME_DIR = GDRIVE_DIR + "/MyDrive"
GDRIVE_DATA_DIR = GDRIVE_HOME_DIR + "/Big Data/datasets"

# Playlist datasets: the full one and two smaller slices.
DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_friendly_spotify_playlist_dataset")
LITTLE_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "little_slice")
SMALL_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "small_slice")
# Misspelled legacy name, kept as an alias because later cells still reference it.
SMALL_SLICE_FLIE = SMALL_SLICE_FILE

# Audio-feature datasets.
AUDIO_FEATURES_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_track_features")
LITTLE_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "little_slice_audio_features")
MICRO_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "micro_slice_audio_features")
SPLITTED_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "splitted_pyspark_track_features")

# Folder where intermediate DataFrames are cached between sessions.
SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")

RANDOM_SEED = 42 # for reproducibility

os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["PYSPARK_PYTHON"]="python"

In [4]:
#@title Create the session
# NOTE(review): spark.driver.maxResultSize (100G) is far larger than spark.driver.memory (12G),
# so the result-size limit never triggers before the driver runs out of memory — confirm this is intended.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '12G')
    .set('spark.driver.memory', '12G')
    .set('spark.driver.maxResultSize', '100G')
    .set("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .setAppName("PySparkTutorial")
    .setMaster("local[*]")
)

# Create the context.
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext in the same
# kernel raises an error. If a context already exists, it is returned as-is and `conf` is
# not applied to it; restart the runtime to change the configuration.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [5]:
# Mount Google Drive at GDRIVE_DIR so that the dataset paths defined above can be read.
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


## Setup ngrok

In [6]:
!pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
!ngrok authtoken 2NVN8kdoOnMVtlDGGWtwsbT5M3Q_2EJv2HE77FEXkz978Qtnq

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [48]:
from pyngrok import ngrok

# Open a ngrok tunnel on the port 4050 where Spark is running
# (the same value is set as 'spark.ui.port' when the Spark session is configured).
port = '4050'
public_url = ngrok.connect(port).public_url

In [49]:
print("To access the Spark Web UI console, please click on the following link to the ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))

To access the Spark Web UI console, please click on the following link to the ngrok tunnel "https://27b6-34-125-179-2.ngrok-free.app" -> "http://127.0.0.1:4050"


In [10]:
#@title Check if everything is ok
spark, sc._conf.getAll()


(<pyspark.sql.session.SparkSession at 0x7ff5fbb43f40>,
 [('spark.executor.extraJavaOptions',
   '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'),
  ('spark.app.name', 'PySparkTutorial'),
  ('spark.executor.id', 'driver'),
  ('sp

# Data acquisition

In [11]:
from pyspark.ml.linalg import VectorUDT
# Schema of a single track entry inside a playlist's "tracks" array (all fields nullable).
song_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("pos", IntegerType()),
        ("artist_name", StringType()),
        ("track_uri", StringType()),
        ("artist_uri", StringType()),
        ("track_name", StringType()),
        ("album_uri", StringType()),
        ("duration_ms", LongType()),
        ("album_name", StringType()),
    )
])

# Schema of a raw playlist: metadata plus the array of tracks (all fields nullable).
playlist_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("name", StringType()),
        ("collaborative", StringType()),
        ("pid", IntegerType()),
        ("modified_at", IntegerType()),
        ("num_tracks", IntegerType()),
        ("num_albums", IntegerType()),
        ("num_followers", IntegerType()),
        ("tracks", ArrayType(song_schema)),
        ("num_edits", IntegerType()),
        ("duration_ms", IntegerType()),
        ("num_artists", IntegerType()),
    )
])

# Same fields as playlist_schema, except that "tracks" holds an ML vector
# (the playlist's rating vector) instead of an array of track structs.
playlist_schema_mapped = StructType([
    StructField("tracks", VectorUDT(), True) if field.name == "tracks" else field
    for field in playlist_schema.fields
])

# Schema of one audio-features record (all fields nullable).
audio_features_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("danceability", FloatType()),
        ("energy", FloatType()),
        ("key", IntegerType()),
        ("loudness", FloatType()),
        ("mode", IntegerType()),
        ("speechiness", FloatType()),
        ("acousticness", FloatType()),
        ("instrumentalness", FloatType()),
        ("liveness", FloatType()),
        ("valence", FloatType()),
        ("tempo", FloatType()),
        ("type", StringType()),
        ("id", StringType()),
        ("uri", StringType()),
        ("track_href", StringType()),
        ("analysis_url", StringType()),
        ("duration_ms", LongType()),
        ("time_signature", IntegerType()),
    )
])




In [12]:
# Read the datasets as multi-line JSON with explicit schemas (no schema inference).
# playlist_df is the full dataset; slice_df is the smaller slice used by the rest of the notebook.
playlist_df = spark.read.schema(playlist_schema).json(DATASET_FILE, multiLine=True)
slice_df = spark.read.schema(playlist_schema).json(SMALL_SLICE_FLIE, multiLine=True)
# slice_df = spark.read.schema(playlist_schema).json(LITTLE_SLICE_FILE, multiLine=True)
audio_df = spark.read.schema(audio_features_schema).json(SPLITTED_SLICE_AUDIO_FEATURES, multiLine=True) #has less songs than expected

In [13]:
# slice_df.select("tracks").first()

In [14]:
# slice_df.show()

# User-Based Collaborative Filtering
Note: The users are the playlists, the items are the songs and the ratings are 0 if the song is not in the playlist, 1 otherwise.

We have to define a function $sim(u,v)$ that defines the similarity between two users based on their ratings.

We represent the ratings $r_u \in \mathbb{R}^n$ as the $n$ dimensional vector that represents the ratings of the user $u$, where $n$ is the number of total songs in the dataset.

As the similarity function we can use Jaccard similarity.
\begin{equation}
sim(u,v) = J(r_u, r_v) = \frac{|r_u \cap r_v|}{|r_u \cup r_v|}
\end{equation}

Jaccard similarity ignores rating values, which is not a problem here because the ratings are binary. With discrete-valued ratings we could use cosine similarity or, better, Pearson's correlation.

Once the similarity is defined, let $U^k$ denote the neighborhood of $u$ (the $k$ users most similar to $u$). We define the set of items rated by $u$'s neighborhood as

\begin{equation}
I^k = \{i \in I : \exists\, v \in U^k \text{ such that } r_{v,i} = 1\}
\end{equation}

The rating for the item $i$ to the user $u$ will just be $\mathbf{r_u[i]}$.

In [15]:
from functools import wraps
import time

DEBUG = True
IGNORE_TIMING = False


def timeit(func):
    """Decorator that reports how long each call to ``func`` takes.

    The wrapped function's return value is passed through untouched. After the
    call returns, the elapsed time (measured with ``time.perf_counter``) is
    printed, unless the module-level ``IGNORE_TIMING`` flag is truthy at call time.
    """
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        started = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        if IGNORE_TIMING:
            return outcome
        print(f'Function {func.__name__} Took {elapsed:.4f} seconds')
        return outcome
    return timeit_wrapper

In [16]:
# Locations (under SAVED_DFS_PATH) of the DataFrames cached between sessions.
RATING_VECTOR_FILE_PATH = os.path.join(SAVED_DFS_PATH, "playlist_rating_df.parquet")
# Playlists whose "tracks" column has been replaced by a rating vector (written/read as JSON below).
PLAYLIST_EMBEDDINGS = os.path.join(SAVED_DFS_PATH, "playlist_embeddings.json")
FULL_PLAYLIST_EMBEDDINGS = os.path.join(SAVED_DFS_PATH, "full_playlist_embeddings.parquet")

In [17]:
def dense_to_sparse(dense: DenseVector) -> SparseVector:
  """
  Converts a dense vector into a SparseVector of the same length that stores
  only the non-zero entries.
  """
  dense_values = np.array(dense)
  positions = np.nonzero(dense_values)[0]
  return SparseVector(len(dense), positions.tolist(), dense_values[positions].tolist())

In [18]:
def get_all_songs(playlist_df: DataFrame, set_in_playlist: bool = False) -> DataFrame:
    """
    Returns the distinct track URIs found in the "tracks" column of playlist_df,
    as a single-column ("track_uri") DataFrame.
    If set_in_playlist is True, a constant column "in_playlist" equal to 1 is added.
    """
    track_uris = explode("tracks.track_uri").alias("track_uri")
    unique_songs = playlist_df.select(track_uris).distinct()
    if not set_in_playlist:
        return unique_songs
    return unique_songs.withColumn("in_playlist", lit(1))
  
def get_songs_info(playlist_df: DataFrame, set_in_playlist: bool = False) -> DataFrame:
    """
    Returns the distinct track records (every field of a track except "pos")
    found in the "tracks" column of playlist_df.
    If set_in_playlist is True, a constant column "in_playlist" equal to 1 is added.
    """
    exploded_tracks = playlist_df.select(explode("tracks"))
    unique_songs = exploded_tracks.select("col.*").drop("pos").distinct()
    if not set_in_playlist:
        return unique_songs
    return unique_songs.withColumn("in_playlist", lit(1))

In [19]:
songs_info_df = get_songs_info(slice_df)
songs_info_df.createOrReplaceTempView("SONGS_INFO")

# Number the songs 1..N: `pos` is the index of the song inside every rating vector.
# The window is ordered by every column of the (distinct) song rows so that the numbering is
# total and deterministic. With a constant ordering key, Spark is free to number the rows
# differently on every action, which makes `pos` disagree between the track_uri -> pos mapping,
# the rating vectors and the final join on `pos`.
# NOTE: embeddings already cached on disk (PLAYLIST_EMBEDDINGS) were built with the numbering
# in effect when they were written; delete and regenerate them after changing this ordering.
songs_info_df = spark.sql("""
SELECT 
    row_number() OVER (
        ORDER BY track_uri, artist_uri, album_uri, track_name, album_name, artist_name, duration_ms
    ) as pos,
    *
FROM 
    SONGS_INFO
""")

songs_info_df = songs_info_df.sort("track_uri")

# Minimal (pos, track_uri) view used to build the track_uri -> pos mapping.
songs_df = songs_info_df.select("pos", "track_uri")

# Number of distinct songs; positions start at 1, so rating vectors have length RATING_VECTOR_LENGTH + 1.
RATING_VECTOR_LENGTH = songs_df.count()

In [20]:
# songs_info_df.show()
# songs_df.show()

We preprocess the dataframe to associate each track_uri with an integer that represents the position of the track in the rating vector. This avoids performing many joins when generating the rating vectors.

In [21]:
# def map_track_df_to_pos(playlist_df: DataFrame, mapping: DataFrame) -> List[DataFrame]:
#   songs_df_list = [get_all_songs(spark.createDataFrame([row])) for row in tqdm(slice_df.collect(), desc="Creating list of dataframes")]
#   track_uri_to_id = songs_df.select('track_uri', 'pos').rdd.collectAsMap()
#   track_uri_to_id_udf = udf(lambda x: track_uri_to_id.get(x), IntegerType())
#   songs_df_mapped_list = []

#   for df in tqdm(songs_df_list, desc="Mapping uris to pos"):
#       df = df.withColumn('pos', track_uri_to_id_udf(col('track_uri')))
#       songs_df_mapped_list.append(df)
  
#   return songs_df_mapped_list

# songs_df_mapped_list = map_track_df_to_pos(slice_df, songs_df)

In [22]:
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, ArrayType
from functools import reduce

# Driver-side dict {track_uri: pos}, collected once and captured by the UDF defined below.
track_uri_to_id = songs_df.select('track_uri', 'pos').rdd.collectAsMap() # TODO: Pass it as a parameter maybe?
#TODO: Since the .rdd is very slow, I can embed the position information of the track inside the track itself,
# So then I can just do pos_list.add(row.rating_position) in a few milliseconds. 
@timeit
def map_track_df_to_pos(playlist_df: DataFrame, mapping: DataFrame) -> DataFrame:
    """
    Returns a DataFrame containing the playlists, but the tracks are represented as a binary sparse vector.

    Args:
        playlist_df: playlists whose "tracks" column is an array of track structs.
        mapping: currently unused; positions are looked up in the module-level
            `track_uri_to_id` dictionary.

    Returns:
        playlist_df with "tracks" replaced by a SparseVector of length
        RATING_VECTOR_LENGTH + 1 that holds 1.0 at the position of every known track.
        Tracks that are missing from `track_uri_to_id` are skipped, and a null
        "tracks" value produces an all-zero vector.
    """

    def extract_vector(tracks):
      # `tracks or []` turns a null array into an empty playlist instead of crashing the UDF.
      positions = {track_uri_to_id.get(track.track_uri) for track in (tracks or [])}
      # Unknown tracks map to None, which can neither be sorted nor used as a vector index.
      positions.discard(None)
      ordered_positions = sorted(positions)
      return SparseVector(RATING_VECTOR_LENGTH + 1, ordered_positions, [1.0] * len(ordered_positions))

    map_track_uri_udf = udf(extract_vector, returnType=VectorUDT())

    # Apply the mapping UDF on the "tracks" column of the playlist dataframe
    mapped_df = playlist_df.withColumn('tracks', map_track_uri_udf(col('tracks')))

    return mapped_df

# Reuse the playlist embeddings saved by a previous session when they exist;
# otherwise compute them from slice_df and save them for next time.
if os.path.exists(PLAYLIST_EMBEDDINGS):
  mapped_slice_df = spark.read.schema(playlist_schema_mapped).json(PLAYLIST_EMBEDDINGS)
else:
  mapped_slice_df = map_track_df_to_pos(slice_df, songs_df)
  mapped_slice_df.write.json(PLAYLIST_EMBEDDINGS)

In [23]:
mapped_slice_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- collaborative: string (nullable = true)
 |-- pid: integer (nullable = true)
 |-- modified_at: integer (nullable = true)
 |-- num_tracks: integer (nullable = true)
 |-- num_albums: integer (nullable = true)
 |-- num_followers: integer (nullable = true)
 |-- tracks: vector (nullable = true)
 |-- num_edits: integer (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- num_artists: integer (nullable = true)



In [24]:
# sys.getsizeof is shallow: it reports the dict object itself, not the keys and values it references.
print("The size of the track_uri -> position mapping dictionary is {} bytes".format(sys.getsizeof(track_uri_to_id)))

The size of the track_uri -> position mapping dictionary is 20971608 bytes


In [25]:
# def row_to_sparse_vector(item: dict) -> SparseVector:
#     """
#     Because of json serialization, the SparseVector is converted into a Row(indices=..., values=...),
#     this function converts it back to a pyspark.SparseVector with length RATING_VECTOR_LENGHT+1 as default.
#     """
#     return SparseVector(RATING_VECTOR_LENGTH+1, item.indices, item.values)

# @udf(returnType=VectorUDT())
# def parse_sparse_vector(row):
#   return row_to_sparse_vector(row)

# #TODO: Uncomment this and remove the other calls to row_to_spare_vector
# # mapped_slice_df = mapped_slice_df.withColumnRenamed("rating_vector", "temp")\
#   # .withColumn("rating_vector", parse_sparse_vector(col("temp")))\
#   # .drop("temp").cache()

In [26]:
#@title Old way of creating the rating vector dataframe
# def _create_rating_df(playlist_row: Row, songs_df: DataFrame) -> DataFrame:
#   """
#   Creates a dataframe that represents the "ratings" for a playlist in the dataframe
#   """
#   playlist_row = spark.createDataFrame([playlist_row], playlist_schema)
#   playlist_uris = get_all_songs(playlist_row)

#   joined = songs_df.join(playlist_uris, on="track_uri", how="right")
#   return joined


# def _check_songs_ordering(playlist_row: DataFrame, songs_df: DataFrame) -> bool:
#   """
#   Returns a boolean that indicates if the ordering in the songs_df and rating_df is the same
#   """
#   playlist_row = spark.createDataFrame([playlist_row], playlist_schema)
#   playlist_uris = get_all_songs(playlist_row, True).withColumnRenamed("in_playlist", "isin")

#   joined = songs_df.join(playlist_uris, on="track_uri", how="right")
#   joined_left = songs_df.join(playlist_uris, on="track_uri", how="left").filter("isin == 1")
#   assert joined.collect() == joined_left.collect(), f"The order of songs_df is different from the order of rating_df!"

# # def _extract_rating_vector(rating_df: DataFrame) -> SparseVector:
# #   """
# #   Extracts the rating vectors for each playlist 
# #   """
# #   dense_vector = DenseVector([row.isin for row in rating_df.select("isin").collect()])
# #   return dense_to_sparse(dense_vector)

# def _extrac_sparse_rating_vector(rating_df: DataFrame) -> SparseVector:
#   indices = np.sort([row.pos for row in rating_df.collect()])
#   return SparseVector(RATING_VECTOR_LENGTH, indices, np.ones(indices.shape[0]) )

# def rating_vector_from_row(playlist_row: Row, songs_df: DataFrame):
#   """
#   Pipelines togheter create_rating_df and extract_rating_vector.
#   """
#   rating_df_1 = _create_rating_df(playlist_row, songs_df)
#   rating_vector_1 = _extrac_sparse_rating_vector(rating_df_1)
#   return rating_vector_1

# # t1 = time.time() 

# # rating_vector_1 = rating_vector_from_row(slice_df.first(), songs_df)

# # t2 = time.time()

# # t2 - t1, rating_vector_1, type(rating_vector_1)

In [27]:
def jaccard_similarity(vector_1: SparseVector, vector_2: SparseVector) -> float:
  """
  Computes the Jaccard Similarity between two sparse binary vectors.

  Only the `indices` of the vectors are used: the similarity is the number of
  indices shared by both vectors divided by the number of indices present in
  at least one of them.

  Returns 0.0 when both vectors are empty (the ratio is undefined in that case
  and would otherwise raise a ZeroDivisionError).
  """
  # Convert SparseVectors to sets
  set1 = set(vector_1.indices)
  set2 = set(vector_2.indices)

  union = len(set1 | set2)
  if union == 0:
    return 0.0

  return len(set1 & set2) / union

In [28]:
#@title Old way of creating the ratinf vector dataframe pt.2
# def create_rating_vectors_df(playlists_df: DataFrame) -> DataFrame:
#   rating_vectors = []

#   for playlist_row in tqdm(playlists_df.collect(), desc="Creating rating vectors"):
#     rating_vector = rating_vector_from_row(playlist_row, songs_df)
#     new_row = Row(playlist_id=playlist_row.pid, rating_vector=rating_vector)
#     rating_vectors.append([new_row])
#   return spark.createDataFrame(rating_vectors)

# if os.path.exists(RATING_VECTOR_FILE_PATH):
#   # rv_schema = StructType([StructField('playlist_id', LongType(), True), StructField('rating_vector', pyspark.ml.linalg.VectorUDT(), True)])
#   rating_vectors_df = spark.read.parquet(RATING_VECTOR_FILE_PATH)
#   rv_df = rating_vectors_df.select(col("_1.playlist_id").alias("playlist_id"), col("_1.rating_vector").alias("rating_vector"))
# else:
#   rating_vectors_df = create_rating_vectors_df(slice_df)
#   rating_vectors_df.write.parquet(RATING_VECTOR_FILE_PATH)

We now create a function that takes as input the playlist to continue and returns a DataFrame containing its similarity with every other playlist in the dataset.

In [29]:
# mapped_slice_df.first()

In [30]:
@timeit
def create_similarity_df(input_vector: DataFrame, rating_vectors_df: DataFrame, similarityFunction: Callable) -> DataFrame:
  """
  Pairs every row of rating_vectors_df with the row(s) of input_vector (cross join) and
  adds a "similarity" column computed as similarityFunction(input_vector, rating_vector).
  """
  score = udf(similarityFunction, returnType='double')
  paired = rating_vectors_df.crossJoin(input_vector)
  return paired.withColumn("similarity", score(paired["input_vector"], paired["rating_vector"]))

if DEBUG:
  rv_df = mapped_slice_df.withColumnRenamed("tracks", "rating_vector")
  # TODO: Just for test, we take the first playlist as the playlist to be continued 
  first_playlist_vector = rv_df.limit(1).select(col("rating_vector").alias("input_vector"))
  result_df = create_similarity_df(first_playlist_vector, rv_df, jaccard_similarity)

Function create_similarity_df Took 0.5372 seconds


Curse of dimensionality! We can see that each playlist is very dissimilar from each other playlist.

If we filter the playlists that have a strictly positive similarity with the input playlist, and order them by descending similarity, we can see that the name (that we assume is very informative for the content of the playlist) is very similar, meaning that the algorithm seems to work!

In [31]:
# result_df.filter("similarity > 0").orderBy(col("similarity").desc()).show()

Now, in order to suggest some songs to continue the input playlist, let's take the top $k$ most similar playlists.

In [32]:
@timeit
def get_top_k_results(playlist_pid: int, similarity_df: DataFrame, k: int = 20) -> DataFrame:
  """
  Returns the k rows of similarity_df with the highest "similarity".

  Args:
    playlist_pid: pid of the input playlist, which is excluded from the result.
      Pass None for a playlist that has no pid (e.g. one built from scratch):
      no row is excluded in that case. Without this check, comparing `pid`
      against a null value would match no row and the result would be empty.
    similarity_df: DataFrame with at least the "pid" and "similarity" columns.
    k: number of playlists to keep.
  """
  candidates = similarity_df
  if playlist_pid is not None:
    candidates = candidates.filter(col('pid') != playlist_pid)
  return candidates.orderBy(col("similarity").desc()).limit(k)

if DEBUG:
  first_playlist_pid = rv_df.limit(1).select("pid").first().pid
  top_k_results = get_top_k_results(first_playlist_pid, result_df)

Function get_top_k_results Took 0.0449 seconds


In [33]:
# top_k_results.show()

In [34]:
# from pyspark.ml.linalg import VectorUDT

# def add_sparse_vectors(accumulator: SparseVector, vector: SparseVector, weight: float) -> SparseVector:
#     accumulator_vec = accumulator.toArray() 
#     array_2 = vector.toArray() * weight

#     summed_array = accumulator_vec + array_2

#     values = [value for value in summed_array if value != 0]
#     sorted_indices = [index for index, value in enumerate(summed_array) if value != 0]
#     return SparseVector(accumulator_vec.size, sorted_indices, values)

# @udf(returnType=VectorUDT())
# def accumulate_sparse_vectors(accumulator, rating_vector, similarity):
#     summed_vector = add_sparse_vectors(accumulator, rating_vector, similarity)
#     return summed_vector

# df = top_k_results.withColumn('accumulated_vector', accumulate_sparse_vectors(top_k_results["rating_vector"], top_k_results["rating_vector"], top_k_results['similarity']))

We want to obtain a single embedding for all the top $k$ most similar playlists, which will be the rating vector. We can then pick the indices of the $n$ greatest values from this vector, and those will be the $n$ songs that we will recommend.

In order to aggregate the $k$ embeddings into a single one, I decided to take an average, weighted by the similarity value.

In [86]:
@timeit
def get_input_rating_vector(similarity_df: DataFrame) -> SparseVector:
  """
  Collects the "input_vector" value of the first row of similarity_df.
  """
  first_rows = similarity_df.limit(1).select("input_vector").collect()
  return first_rows[0]["input_vector"]

@timeit
def accumulate_top_k_results(top_k_results: DataFrame, input_vector: np.ndarray) -> DataFrame:
  """
  Aggregates the rating vectors of the most similar playlists into a single vector.

  The result is the average of the "rating_vector" values weighted by "similarity",
  with the entries of the songs already present in the input playlist set to zero.

  Args:
    top_k_results: DataFrame with the "rating_vector" and "similarity" columns.
    input_vector: dense binary rating vector of the input playlist; it must have
      the same length as the rating vectors.

  Returns:
    A single-row DataFrame with one column, "summed", holding the aggregated
    SparseVector. The vector is all zeros when top_k_results is empty or when
    every similarity is zero (the weighted average is undefined in both cases).
  """
  input_vector = np.asarray(input_vector)
  vector_size = int(input_vector.size)
  # Only the positions of the input songs are captured by the UDF, not the whole dense vector.
  already_in_playlist = np.nonzero(input_vector)[0]

  @udf(returnType=VectorUDT())
  def sum_vector(sparse_vectors, similarities):
    weights = np.asarray(similarities, dtype=float)
    total_weight = weights.sum()
    if len(sparse_vectors) == 0 or total_weight == 0:
      return SparseVector(vector_size, [], [])

    # Compute sum(vector * similarity) touching only the stored entries of each sparse
    # vector, instead of materialising a dense (k x vector_size) matrix.
    acc = np.zeros(vector_size)
    for vector, weight in zip(sparse_vectors, weights):
      if isinstance(vector, SparseVector):
        acc[vector.indices] += vector.values * weight
      else:
        acc += vector.toArray() * weight

    acc /= total_weight #Normalize the vector
    acc[already_in_playlist] = 0.0 #If a song is present in the input playlist, don't consider it
    nonzero = np.nonzero(acc)[0]
    return SparseVector(vector_size, nonzero.tolist(), acc[nonzero].tolist())

  return top_k_results.agg(sum_vector(collect_list('rating_vector'), collect_list("similarity")).alias('summed'))

## @timeit
# def accumulate_top_k_results_OLD(top_k_results: DataFrame, similarity_df: DataFrame) -> SparseVector:
#     rdd = top_k_results.rdd
#     accumulator: np.ndarray = rdd.map(lambda row: (row.rating_vector.toArray() * row.similarity)).reduce(lambda x, y: (x+y))
#     similarity_sum: float = rdd.map(lambda row: (row.similarity)).reduce(lambda x, y: x + y)

#     accumulator /= similarity_sum

#     #If a song is present in the input playlist, don't consider it
#     input_indexes: SparseVector = get_input_rating_vector(similarity_df).indices

#     values: List[float] = []
#     sorted_indices: List[int] = []
#     for index, value in enumerate(accumulator):
#       if value != 0 and index not in input_indexes:
#         values.append(value)
#         sorted_indices.append(index)
    
#     return SparseVector(len(accumulator), sorted_indices, values)

if DEBUG:
  # Dense copy of the input playlist's rating vector, used to exclude its songs from the aggregate.
  input_vector = get_input_rating_vector(result_df).toArray()
  accumulated_vector_df = accumulate_top_k_results(top_k_results, input_vector)

Function get_input_rating_vector Took 0.4929 seconds


NotImplementedError: ignored

In [82]:
@udf(returnType=ArrayType(
    StructType([
      StructField("pos", IntegerType(), False),
      StructField("confidence", FloatType(), False)
])))
def get_top_n_values(vector: SparseVector, n: int=10):
  """
  Returns up to n (pos, confidence) pairs with the highest values of `vector`,
  sorted by descending confidence (ties keep ascending position).

  Only non-zero entries are considered: a zero means that no neighbour contains the
  song (or that the song is already in the input playlist), so it is never
  recommended. Fewer than n pairs are returned when the vector has fewer than n
  non-zero entries. A null vector or a non-positive n yields an empty list.

  `n` can be supplied from Spark as a second column, e.g. lit(5).
  """
  if vector is None or n is None or n <= 0:
    return []
  dense = vector.toArray()
  candidates = np.nonzero(dense)[0]
  # Stable sort on the negated values: descending confidence, ascending position on ties.
  ranked = candidates[np.argsort(-dense[candidates], kind="stable")][:n]
  # Plain Python int/float are required for the conversion to the declared Spark types.
  return [(int(pos), float(dense[pos])) for pos in ranked]

if DEBUG:
  top_n_reccomendations = accumulated_vector_df.withColumn("top_n_recommendations", get_top_n_values(col("summed"))).select(explode("top_n_recommendations")).select("col.*")

In [69]:
# top_n_reccomendations.show()

In [70]:
# Schema of a recommended song: the track fields (without "pos") plus the confidence score.
prediction_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('artist_name', StringType()),
        ('track_uri', StringType()),
        ('artist_uri', StringType()),
        ('track_name', StringType()),
        ('album_uri', StringType()),
        ('duration_ms', LongType()),
        ('album_name', StringType()),
        ('confidence', FloatType()),
    )
])

In [71]:
import json
#TODO: For now this works, but it's very slow, and since this has to be executed online,
# consider to directly embed the song information inside the dataframe when computing the songs
# to recommend.
@timeit
def recommendation_song_info(recommendation: DataFrame, songs_info_df: DataFrame) -> DataFrame:
  """
  Enriches the recommended positions with the song information by joining
  `recommendation` with `songs_info_df` on the "pos" column.
  """
  enriched = recommendation.join(songs_info_df, on="pos")
  return enriched

if DEBUG:
  songs_info = recommendation_song_info(top_n_reccomendations, songs_info_df)

Function recommendation_song_info Took 0.0249 seconds


### Putting it all together
We now define a single function that takes a playlist as input and recommends $n$ songs.

In [84]:
#TODO: this now takes a playlist and extracts its PID, if a playlist is built from scratch the PID shouldn't be defined
# A solution would be to pass the playlist row with the PID = Nan and then have a condition when extracting the PID. If Nan, ignore it
@timeit
def user_based_recommendation(playlist: DataFrame, 
                              mapped_slice_df: DataFrame, 
                              similarity_function: Callable, 
                              n:int = 50,
                              k: int = 20) -> DataFrame:
  """
  Recommends songs to continue `playlist` with user-based collaborative filtering.

  Args:
    playlist: single-row DataFrame holding the playlist to continue (raw playlist schema).
    mapped_slice_df: playlists whose "tracks" column is already a rating vector.
    similarity_function: function of two rating vectors returning their similarity.
    n: maximum number of songs to recommend.
    k: number of most similar playlists used to build the recommendation.

  Returns:
    A DataFrame with the recommended songs: "pos", "confidence" and the song information.
  """
  rv_df = mapped_slice_df.withColumnRenamed("tracks", "rating_vector") #TODO: Parse the rv_df before and then remove this
  
  #TODO: define the songs_df as input to the function
  playlist_vector = map_track_df_to_pos(playlist, songs_df).select("tracks").withColumnRenamed("tracks", "input_vector")
  # Use the similarity function chosen by the caller (it was previously ignored).
  similarity_df = create_similarity_df(playlist_vector, rv_df, similarity_function)
  top_k_results = get_top_k_results(playlist.first().pid, similarity_df, k=k)
  input_vector = playlist_vector.select("input_vector").first()[0].toArray()
  accumulated_vector_df = accumulate_top_k_results(top_k_results, input_vector)

  # `n` is forwarded to the UDF as a literal column; without it the UDF always used its default of 10.
  top_n_indices = accumulated_vector_df\
                  .withColumn("top_n_recommendations", get_top_n_values(col("summed"), lit(n)))\
                  .select(explode("top_n_recommendations"))\
                  .select("col.*")
  recommended_songs_info = recommendation_song_info(top_n_indices, songs_info_df)

  return recommended_songs_info

if DEBUG:
  #Collect and createDataFrame because operations on limit(1) take as long as the entire slice_df, don't know why
  playlist = spark.createDataFrame(slice_df.filter("pid == 1010").limit(1).collect())
  final_recommendation = user_based_recommendation(playlist, mapped_slice_df, jaccard_similarity, n=5)

## Performance Evaluation

In [73]:
from sklearn.model_selection import train_test_split as sklearn_split

def train_test_split(playlist: Row) -> Tuple[Row, Row]:
    """
    Splits the tracks of a playlist into a train part and a test part.

    Args:
        playlist: a playlist row; its `tracks` field is the list of tracks to split.

    Returns:
        (playlist_train, playlist_test): two copies of `playlist` that differ only in
        their `tracks` field. All the other fields (including `num_tracks`) are
        copied unchanged from the input row.

    A playlist with fewer than two tracks cannot be split by scikit-learn (it raises a
    ValueError because one side would be empty): all its tracks go to the train part
    and the test part is empty.
    """
    tracks = list(playlist.tracks or [])
    if len(tracks) < 2:
        train_rows, test_rows = tracks, []
    else:
        # Fixed seed: re-running the split yields the same partition.
        train_rows, test_rows = sklearn_split(tracks, random_state=RANDOM_SEED)

    def with_tracks(rows):
        # Copy every field of the playlist, replacing only the tracks.
        fields = playlist.asDict()
        fields["tracks"] = rows
        return Row(**fields)

    return with_tracks(train_rows), with_tracks(test_rows)

Let's divide the whole playlist dataset into train and test splits

In [78]:
from pyspark.sql.functions import udf, struct
import shutil

def divide_whole_dataset(playlist_df: DataFrame) -> Tuple[DataFrame, DataFrame]:
  """
  Applies train_test_split to every playlist of playlist_df.

  Returns (train_df, test_df): two DataFrames with the same columns as playlist_df,
  holding respectively the train part and the test part of each playlist.
  """
  # The UDF receives the whole row as a struct and returns [train_row, test_row].
  row_type = StructType(playlist_df.schema.fields)
  split_udf = udf(train_test_split, returnType=ArrayType(row_type))
  whole_row = struct(*playlist_df.columns)

  halves = playlist_df.withColumn("divided", split_udf(whole_row)).select(
      col('divided').getItem(0).alias('train'),
      col('divided').getItem(1).alias('test'),
  )

  # Expand each struct back into regular columns.
  return halves.select("train.*"), halves.select("test.*")

TRAIN_DF_PATH = os.path.join(SAVED_DFS_PATH, "train_df.json")
TEST_DF_PATH = os.path.join(SAVED_DFS_PATH, "test_df.json")

# The saved split is reused only when both halves are present; otherwise it is rebuilt.
if not (os.path.exists(TRAIN_DF_PATH) and os.path.exists(TEST_DF_PATH)):
  # Remove a leftover half first, in order to avoid [PATH_ALREADY_EXISTS] errors.
  for stale_path in (TRAIN_DF_PATH, TEST_DF_PATH):
    if os.path.exists(stale_path):
      shutil.rmtree(stale_path)

  train_df, test_df = divide_whole_dataset(slice_df)
  train_df.write.json(TRAIN_DF_PATH)
  test_df.write.json(TEST_DF_PATH)
else:
  train_df = spark.read.schema(playlist_schema).json(TRAIN_DF_PATH)
  test_df = spark.read.schema(playlist_schema).json(TEST_DF_PATH)

In [None]:
# TODO: Super bug! Sometimes there there are duplicate playlists in the training set!!
#TODO: Doing this, there are 97,000 playlists in the train_df, and not 100,000 WHAT

# Fixed this somehow rerunning the code and overwriting the parquet files, IDK what happened kekw
# # train_df, test_df = divide_whole_dataset(slice_df)
# train_df.write.mode("overwrite").parquet(TRAIN_DF_PATH)
# test_df.write.mode("overwrite").parquet(TEST_DF_PATH)

In [79]:
from pyspark.sql import functions as F

@timeit
def precision_at_k(recommendations, ground_truth, num_of_recommendations):
    """
    Calculates precision at k for the recommendations.

    Args:
        recommendations: DataFrame of recommended songs with a "track_uri" column.
        ground_truth: DataFrame of the held-out songs with a "track_uri" column.
        num_of_recommendations: the k used as denominator.

    Returns:
        The number of recommended songs that belong to the ground truth divided by
        num_of_recommendations, or 0.0 when num_of_recommendations is 0.
    """
    if not num_of_recommendations:
        return 0.0
    # A track that appears several times in the ground truth must count as a single hit.
    relevant_uris = ground_truth.select("track_uri").distinct()
    recommended_relevant_tracks = recommendations.join(relevant_uris, "track_uri").count() #this can be top_n_results.join in order to be more performant
    precision = recommended_relevant_tracks / float(num_of_recommendations)
    return precision

# TODO: validate against a reference nDCG implementation.
@timeit
def normalized_discounted_gain(recommendations: DataFrame, ground_truth: DataFrame) -> float:
    """
    Computes the normalized discounted cumulative gain (nDCG) of the recommendations.

    Args:
        recommendations: DataFrame with the "track_uri" and "confidence" columns; the
            ranking is given by descending confidence.
        ground_truth: DataFrame of the held-out songs with a "track_uri" column.

    Returns:
        DCG of the ranked recommendations (gain 1 for a song in the ground truth, 0
        otherwise) divided by the DCG of an ideal ranking of the same length.
        0.0 when there are no recommendations or no ground-truth songs.
    """
    # Both DataFrames are collected: their contents must be plain Python values here.
    ranked_rows = recommendations.orderBy(F.desc("confidence")).select("track_uri").collect()
    ranked_uris = [row.track_uri for row in ranked_rows]
    relevant_uris = {row.track_uri for row in ground_truth.select("track_uri").distinct().collect()}

    def dcg(scores) -> float:
        # math.* is used on purpose: `F.log2` builds a Spark Column and the name `sum`
        # is shadowed by the star import of pyspark.sql.functions.
        return math.fsum(score / math.log2(rank + 2) for rank, score in enumerate(scores))

    gains = [1 if uri in relevant_uris else 0 for uri in ranked_uris]
    # The ideal ranking places every relevant song (at most as many as were recommended) first.
    max_dcg = dcg([1] * min(len(relevant_uris), len(ranked_uris)))
    ndcg = dcg(gains) / max_dcg if max_dcg != 0 else 0.0

    return ndcg

In [80]:
@timeit
def evaluate(pid: int) -> Tuple[DataFrame, DataFrame, DataFrame, float]:
    """
    Evaluates the user-based recommender on a single playlist.

    The playlist's rows are taken from `train_df` and `test_df`; the tracks of the
    test rows form the ground truth, and as many recommendations are requested as
    there are ground-truth tracks.

    Args:
        pid: id of the playlist to evaluate.

    Returns:
        (playlist_train, playlist_test, recommendations, precision)
    """
    playlist_train = train_df.filter(f"pid == {pid}")
    playlist_test = test_df.filter(f"pid == {pid}")
    # One row per held-out track, with the track struct's fields as columns.
    ground_truth = playlist_test.select(explode("tracks")).select("col.*")
    num_of_recommendations = ground_truth.count()
    recommendations = user_based_recommendation(playlist_train, 
                                                mapped_slice_df, 
                                                jaccard_similarity, 
                                                n=num_of_recommendations,
                                                k = 10)
    precision = precision_at_k(recommendations, ground_truth, num_of_recommendations)
    return playlist_train, playlist_test, recommendations, precision

In [85]:
EVALUATION_RESULTS_PATH = os.path.join(GDRIVE_DATA_DIR, "evaluation_results")
def perform_evaluation(sampling_fraction: float = 1000 / 100_000, seed: int = 42):
  """
  Runs `evaluate` on a random sample of the training playlists.

  Args:
      sampling_fraction: fraction of `train_df` rows to sample (without replacement).
      seed: seed passed to the sampler.

  Returns:
      The list of tuples returned by `evaluate`, one per sampled playlist.

  The (pid, precision) pairs are written as JSON to EVALUATION_RESULTS_PATH. The
  file is written even if the loop is interrupted, so a partial run is not lost.
  """
  import json  # local import so the cell does not depend on an import made elsewhere

  # Only the pid is needed on the driver, so drop the other columns before collecting.
  sampled_pids = train_df.sample(False, sampling_fraction, seed=seed).select("pid").collect()
  results = []
  metrics = []
  try:
    for row in tqdm(sampled_pids, desc="Performing evaluation"):
      pid = row['pid']
      result = evaluate(pid)
      results.append(result)
      # The evaluate tuples hold DataFrames, which are not JSON serializable;
      # persist only the pid and the precision (last element of the tuple).
      metrics.append({"pid": pid, "precision": float(result[-1])})
  finally:
    with open(EVALUATION_RESULTS_PATH, "w") as f:
      json.dump(metrics, f)
  return results

perform_evaluation()

Performing evaluation:   0%|          | 0/1024 [00:00<?, ?it/s]

Function map_track_df_to_pos Took 0.4519 seconds
Function create_similarity_df Took 0.0609 seconds
Function get_top_k_results Took 0.0205 seconds
Function accumulate_top_k_results Took 0.0622 seconds
Function recommendation_song_info Took 0.0119 seconds
Function user_based_recommendation Took 2.0639 seconds
Function precision_at_k Took 218.0364 seconds
Function evaluate Took 222.7419 seconds
Function map_track_df_to_pos Took 0.4973 seconds
Function create_similarity_df Took 0.0458 seconds
Function get_top_k_results Took 0.0190 seconds
Function accumulate_top_k_results Took 0.0696 seconds
Function recommendation_song_info Took 0.0121 seconds
Function user_based_recommendation Took 1.4726 seconds
Function precision_at_k Took 232.6897 seconds
Function evaluate Took 236.8716 seconds
Function map_track_df_to_pos Took 0.4920 seconds
Function create_similarity_df Took 0.0386 seconds


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


Function get_top_k_results Took 0.0226 seconds


KeyboardInterrupt: ignored

In [None]:
# Evaluate a single playlist (pid 3001) and keep every intermediate result for inspection.
pl_train, pl_test, reccomendations, precision = evaluate(3001)

In [None]:
# Print the train rows and then the test rows of the evaluated playlist.
pl_train.show(), pl_test.show()

In [None]:
# Expand the `tracks` array of each half into one row per track and print both.
pl_train.select(explode("tracks")).select("col.*").show(),\
pl_test.select(explode("tracks")).select("col.*").show()

In [None]:
# Show the recommendations sorted by descending confidence.
# TODO: Confidence has some strange values; check whether they are correct.
reccomendations.orderBy(col("confidence").desc()).show()

In [None]:
# Display the precision obtained for the playlist evaluated above.
precision

# Fighting against the curse of dimensionality: Matrix Factorization

We want to define a $d$-dimensional vector $\mathbf{x}_u \in \mathbb{R}^d$ that represents the user $u$, and a vector $\mathbf{w}_i \in \mathbb{R}^d$ that represents the item $i$.

We then can estimate the rating of user $u$ for the item $i$ by computing
\begin{equation}
\hat{r}_{u, i}=\mathbf{x}_u^T \cdot \mathbf{w}_i=\sum_{j=1}^d x_{u, j} w_{i, j}
\end{equation}
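Or, in matrix notation, for $m$ users and $n$ items (the rows of $X$ are the vectors $\mathbf{x}_u$ and the rows of $W$ are the vectors $\mathbf{w}_i$),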

\begin{equation}
\underbrace{R}_{m \times n} =
\underbrace{X}_{m \times d}
\underbrace{W^T}_{d \times n}
\end{equation}

### How to learn $X$ and $W$
The matrix $R$ is partially known and filled with the observations inside the dataset $\mathcal{D}$. In order to learn the latent factor representations $X$ and $W$, we minimize the following loss function:
\begin{equation}
L(X, W)=\sum_{(u, i) \in \mathcal{D}}\underbrace{\left(r_{u, i}-\mathbf{x}_u^T \cdot \mathbf{w}_i\right)^2}_{\text{squared error term}}+\underbrace{\lambda\left(\sum_{u \in \mathcal{D}}\left\|\mathbf{x}_u\right\|^2+\sum_{i \in \mathcal{D}}\left\|\mathbf{w}_i\right\|^2\right)}_{\text{regularization term}}
\end{equation}

We can then minimize the loss using Stochastic Gradient Descent or Alternating Least Squares.

# Matrix Factorization
Generate a matrix $Y$ in which each column represents a playlist and each row represents a song; the $(i, j)$ entry is 1 if playlist $j$ contains song $i$, and 0 otherwise.

In [None]:
# Deliberate stop: raising here keeps a "Run all" from executing the
# matrix-factorization cells below.
raise ValueError()

In [None]:
import pyspark.sql.functions as f
from pyspark.sql.functions import explode
# Raise the Spark SQL `pivotMaxValues` setting to one million.
# NOTE(review): no pivot is visible in the cells below — confirm this is still needed.
spark.conf.set("spark.sql.pivotMaxValues", 1000000)

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import expr


In [None]:
from pyspark.sql.functions import explode
import random
# One (pid, track_uri) row per playlist entry, each tagged with a constant rating of 1.
tracks_df = (
    slice_df
    .select("pid", explode("tracks").alias("track"))
    .select("pid", "track.track_uri")
    .withColumn("rating", lit(1))
)
# tracks_df = tracks_df.withColumn("rating", (rand() * 10 + 1).cast("integer"))

In [None]:
# Preview the (pid, track_uri, rating) rows.
tracks_df.show()

In [None]:
# # Explode the tracks array column into multiple rows
# # tracks_df = slice_df.select("pid", explode("tracks").alias("track"))
# # tracks_df = slice_df.select("pid", "tracks", "tracks")
# tracks_df = slice_df.select("pid", explode("tracks").alias("track")).select("pid", "track.track_uri", "track.pos")

# # Select relevant columns and add a rating column with value 1
# playlist_track_df = tracks_df.withColumn("rating", lit(1))

# # Get distinct track_uri values and join with playlist_track_df
# all_tracks_df = slice_df.select(explode("tracks").alias("track")).select("track.track_uri").distinct()
# all_playlists_df = slice_df.select("pid").distinct()

# all_against_all = all_tracks_df.join(all_playlists_df).distinct()

# from pyspark.sql.functions import when, col

# # playlist_track_rating_df = playlist_track_df.join(all_against_all, ["pid", "track_uri"], "left_outer") \
# #     .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1))

# playlist_track_rating_df = all_against_all.join(playlist_track_df, ["pid", "track_uri"], "left_outer") \
#     .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1)) \
#     .drop("pos")


In [None]:
# Give every track an integer `song_id`: its dense rank in the ordering of track_uri.
# NOTE(review): the window has no partitionBy, so Spark presumably ranks all rows in a
# single partition — confirm this scales to the full dataset.
playlist_track_rating_df = tracks_df.withColumn("song_id", dense_rank().over(Window.orderBy("track_uri")))

In [None]:
# Preview the rows with the newly assigned song_id, without truncating the URIs.
playlist_track_rating_df.show(truncate=False)

In [None]:
# ALS estimator: playlists (pid) act as users and songs (song_id) as items.
# coldStartStrategy="drop" removes rows whose prediction would be NaN.
# A fixed seed makes the random factor initialization reproducible, in line with
# the seeded randomSplit used for the train/test split.
# NOTE(review): every rating is the constant 1, so this looks like implicit
# feedback — consider implicitPrefs=True; left unchanged because the RMSE
# evaluation below depends on the explicit-rating setup.
als = ALS(userCol="pid", itemCol="song_id", ratingCol="rating", nonnegative=True,
          coldStartStrategy="drop", seed=42)

In [None]:
from typing import Optional, Tuple
import random

def train_test_split(df: DataFrame, split_ratio: float, seed: Optional[int] = None) -> Tuple[DataFrame, DataFrame]:
  """
  Splits `df` by playlist, so that all rows of a pid end up on the same side.

  Args:
      df: DataFrame with a `pid` column.
      split_ratio: fraction of the distinct pids assigned to the training side.
      seed: seed for the shuffle; None gives a different split on every call.

  Returns:
      (train_df, test_df), both filtered views of `df`.

  Raises:
      ValueError: if `split_ratio` is outside [0, 1].
  """
  if not 0.0 <= split_ratio <= 1.0:
    raise ValueError(f"split_ratio must be in [0, 1], got {split_ratio}")
  # Sort before shuffling: the order returned by collect() is not guaranteed, so
  # without it the same seed could still produce different splits.
  distinct_pids = sorted(
      row[0] for row in df.select("pid").distinct().collect() if row[0] is not None
  )
  # A private Random instance keeps the global `random` state untouched.
  random.Random(seed).shuffle(distinct_pids)
  split_index = int(len(distinct_pids) * split_ratio)
  train_pids = distinct_pids[:split_index]
  test_pids = distinct_pids[split_index:]
  train_part = df.filter(col("pid").isin(train_pids))
  test_part = df.filter(col("pid").isin(test_pids))
  return train_part, test_part



In [None]:
# Row-level 80/20 random split with a fixed seed. Rows of the same playlist can land
# on both sides, unlike the pid-level train_test_split defined above.
training, test = playlist_track_rating_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Fit the ALS estimator on the training split.
model = als.fit(training)

In [None]:
# Apply the fitted model to the held-out split.
predictions = model.transform(test)

In [None]:
# Preview the predictions for the held-out rows.
predictions.show()

In [None]:
# Root-mean-square error between the true `rating` and the model's `prediction`.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

In [None]:
# (number of non-NaN predictions, number of NaN predictions).
# isnan() tests the float value directly instead of comparing the numeric column
# with the string "NaN". Unlike that comparison, ~isnan() also counts null predictions.
predictions.filter(~isnan(col("prediction"))).count(), predictions.filter(isnan(col("prediction"))).count()

In [None]:
# Display the RMSE computed on the held-out split.
rmse

In [None]:
# Take one playlist id and ask the model for its top-10 songs.
# NOTE(review): limit(1) without an ordering presumably picks an arbitrary pid —
# confirm whether a specific playlist is intended.
subset = playlist_track_rating_df.select("pid").distinct().limit(1)
subUserRecs = model.recommendForUserSubset(subset, 10)

In [None]:
# Show which playlist id was selected.
subset.show()

In [None]:
# Show the recommendations for the selected playlist without truncating them.
subUserRecs.show(truncate=False)

In [None]:
def song_name_from_id(song_id: int, reverse_lookup: DataFrame) -> str:
  """Unimplemented stub: currently returns None regardless of its arguments.

  TODO: presumably meant to resolve `song_id` to a track name through
  `reverse_lookup` — confirm the intended schema and implement.
  """
  return
  
def interpretRecommendation(recommended_result: DataFrame) -> str:
  """Unimplemented stub: currently returns None regardless of its argument.

  TODO: presumably meant to turn a recommendation result into readable text —
  confirm the intended output and implement.
  """
  return

In [None]:
# Top-1 recommendation for every playlist, sorted by the `recommendations` column.
userRecs = model.recommendForAllUsers(1).orderBy("recommendations")
userRecs.show(truncate=False)
# Number of rows in the recommendation table.
userRecs.count()

In [None]:
# List the track names of playlist 1710, one row per track.
slice_df.filter(col("pid") == 1710).select(explode("tracks.track_name")).show()

In [None]:
# Look up the track_uri rows that were assigned song_id 588 and fetch the first one.
track_uris = playlist_track_rating_df.filter(col("song_id") == 588).select("track_uri")
track_uris.first()