<a href="https://colab.research.google.com/github/DomizianoScarcelli/big-data-project/blob/nn-model/user_based_CF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Config

# Configuration

In [1]:
import os
def is_running_on_colab():
    return "COLAB_GPU" in os.environ

LOCAL = not is_running_on_colab()

In [2]:
#@title Download necessary libraries
if not LOCAL:
    !pip install pyspark -qq
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

The operation couldn’t be completed. Unable to locate a Java Runtime that supports apt.
Please visit http://www.java.com for information on installing Java.



In [3]:
#@title Imports
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, LongType
from pyspark import SparkContext, SparkConf
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT

from tqdm.notebook import tqdm
import time
import gc

if not LOCAL:
    from google.colab import drive

from typing import Tuple
from functools import reduce
import pickle


In [4]:
#@title Set up variables
if not LOCAL:
    JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
    GDRIVE_DIR = "/content/drive"
    GDRIVE_HOME_DIR = GDRIVE_DIR + "/MyDrive"
    GDRIVE_DATA_DIR = GDRIVE_HOME_DIR + "/Big Data/datasets"
    DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_friendly_spotify_playlist_dataset")
    AUDIO_FEATURES_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_track_features")
    LITTLE_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "little_slice")
    SMALL_SLICE_FLIE = os.path.join(GDRIVE_DATA_DIR, "small_slice")
    LITTLE_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "little_slice_audio_features")
    MICRO_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "micro_slice_audio_features")
    SPLITTED_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "splitted_pyspark_track_features")
    SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")
    SAVED_MODELS = os.path.join(GDRIVE_DATA_DIR, "saved_models")
else:
    GDRIVE_DATA_DIR = os.path.abspath("./data")
    SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")
    SAVED_MODELS = os.path.join(GDRIVE_DATA_DIR, "saved_models")
    DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "full_dataset")
    SMALL_SLICE_FLIE = os.path.join(GDRIVE_DATA_DIR, "small_slice")
    JAVA_HOME = "/opt/homebrew/opt/openjdk"
RANDOM_SEED = 42 # for reproducibility
os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["PYSPARK_PYTHON"]="python"

In [5]:
#@title Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '12G').\
                set('spark.driver.memory', '12G').\
                set('spark.driver.maxResultSize', '100G').\
                set("spark.executor.extraJavaOptions", "-XX:+UseG1GC").\
                setAppName("PySparkTutorial").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/22 19:35:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
if not LOCAL:
    drive.mount(GDRIVE_DIR, force_remount=True)

In [11]:
#@title Check if everything is ok
spark, sc._conf.getAll()


(<pyspark.sql.session.SparkSession at 0x147f4d3f0>,
 [('spark.executor.extraJavaOptions',
   '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'),
  ('spark.app.name', 'PySparkTutorial'),
  ('spark.driver.host', '192.168.1.175'),
  

# Data acquisition

In [12]:
from pyspark.ml.linalg import VectorUDT
song_schema = StructType([
    StructField("pos", IntegerType(), True),
    StructField("artist_name", StringType(), True),
    StructField("track_uri", StringType(), True),
    StructField("artist_uri", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("album_uri", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("album_name", StringType(), True)
])

playlist_schema = StructType([
    StructField("name", StringType(), True),
    StructField("collaborative", StringType(), True),
    StructField("pid", IntegerType(), True),
    StructField("modified_at", IntegerType(), True),
    StructField("num_tracks", IntegerType(), True),
    StructField("num_albums", IntegerType(), True),
    StructField("num_followers", IntegerType(), True),
    StructField("tracks", ArrayType(song_schema), True),
    StructField("num_edits", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("num_artists", IntegerType(), True),
])

playlist_schema_mapped = StructType([
    StructField("name", StringType(), True),
    StructField("collaborative", StringType(), True),
    StructField("pid", IntegerType(), True),
    StructField("modified_at", IntegerType(), True),
    StructField("num_tracks", IntegerType(), True),
    StructField("num_albums", IntegerType(), True),
    StructField("num_followers", IntegerType(), True),
    StructField("tracks", VectorUDT(), True),
    StructField("num_edits", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("num_artists", IntegerType(), True),
])

audio_features_schema = StructType([
    StructField("danceability", FloatType(), True),
    StructField("energy", FloatType(), True),
    StructField("key", IntegerType(), True),
    StructField("loudness", FloatType(), True),
    StructField("mode", IntegerType(), True),
    StructField("speechiness", FloatType(), True),
    StructField("acousticness", FloatType(), True),
    StructField("instrumentalness", FloatType(), True),
    StructField("liveness", FloatType(), True),
    StructField("valence", FloatType(), True),
    StructField("tempo", FloatType(), True),
    StructField("type", StringType(), True),
    StructField("id", StringType(), True),
    StructField("uri", StringType(), True),
    StructField("track_href", StringType(), True),
    StructField("analysis_url", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("time_signature", IntegerType(), True)
])




In [13]:
# playlist_df = spark.read.schema(playlist_schema).json(DATASET_FILE, multiLine=True)
slice_df = spark.read.schema(playlist_schema).json(SMALL_SLICE_FLIE, multiLine=True)
# slice_df = spark.read.schema(playlist_schema).json(LITTLE_SLICE_FILE, multiLine=True)

# User-Based Collaborative Filtering
Note: The users are the playlists, the items are the songs and the ratings are 0 if the song is not in the playlist, 1 otherwise.

We have to define a function $sim(u,v)$ that defines the similarity between two users based on their ratings.

We represent the ratings $r_u \in \mathbb{R}^n$ as the $n$ dimensional vector that represents the ratings of the user $u$, where $n$ is the number of total songs in the dataset.

As the similarity function we can use Jaccard similarity.
\begin{equation}
sim(u,v) = J(r_u, r_v) = \frac{|r_u \cap r_v|}{|r_u \cup r_v|}
\end{equation}

Jaccard similarity ignores rating values, but we don't care here since the ratings are binary. In case of discrete value ratings we can use cosine similarity, or better pearson's correlation.

Done that, and defined as ${U^k}$ the neighborhood of $u$ ($k$ most similar users to $u$), we define the set of items rated by $u$'s neighborhood as

\begin{equation}
I^k = \{i \in I : \mathbf{r_{u,i}} \downarrow \land u \in U^k\}
\end{equation}

The rating for the item $i$ to the user $u$ will just be $\mathbf{r_u[i]}$.

In [14]:
from functools import wraps
import time

DEBUG = True
IGNORE_TIMING = True
def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        total_time = end_time - start_time
        if not IGNORE_TIMING:
          print(f'Function {func.__name__} Took {total_time:.4f} seconds')
        return result
    return timeit_wrapper

In [15]:
NUM_PLAYLISTS = 100_000
SONGS_EMBEDDINGS_PATH = os.path.join(SAVED_DFS_PATH, f"songs_embeddings-{NUM_PLAYLISTS}.json")
SONGS_INFO_DF = os.path.join(SAVED_DFS_PATH, f"songs_info_df-{NUM_PLAYLISTS}.json") #TODO: Little bug this is songs_df, meaning it hasn't got any info, but we don't actually care.
RATING_VECTOR_LENGTH_PATH = os.path.join(SAVED_DFS_PATH, f"songs_vector_length-{NUM_PLAYLISTS}.txt")
with open(RATING_VECTOR_LENGTH_PATH, "r") as f:
  RATING_VECTOR_LENGTH = int(f.read())

songs_embeddings = spark.read.schema(playlist_schema_mapped).json(SONGS_EMBEDDINGS_PATH)
song_pos_mapping = spark.read.json(SONGS_INFO_DF)

                                                                                

23/06/22 19:35:55 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [16]:
song_pos_mapping.show(), songs_embeddings.show()

+---+--------------------+
|pos|           track_uri|
+---+--------------------+
|  0|spotify:track:1mr...|
|  1|spotify:track:1Uv...|
|  2|spotify:track:4WR...|
|  3|spotify:track:7B6...|
|  4|spotify:track:2Gy...|
|  5|spotify:track:7AO...|
|  6|spotify:track:48Z...|
|  7|spotify:track:1Um...|
|  8|spotify:track:7MO...|
|  9|spotify:track:27P...|
| 10|spotify:track:6lt...|
| 11|spotify:track:1yz...|
| 12|spotify:track:5Mz...|
| 13|spotify:track:3BU...|
| 14|spotify:track:4Cl...|
| 15|spotify:track:2dN...|
| 16|spotify:track:341...|
| 17|spotify:track:7ja...|
| 18|spotify:track:4eQ...|
| 19|spotify:track:6fy...|
+---+--------------------+
only showing top 20 rows

+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|         name|collaborative| pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+-------------+-------------+----+-----------+----

(None, None)

Preprocessing the dataframe in order to associate to each track_uri an integer, that will represent the position of the track in the rating_vector. This is useful in order to avoid doing a lot of joins when generating the rating_vectors.

In [17]:
track_uri_to_id = song_pos_mapping.select('track_uri', 'pos').rdd.collectAsMap()
def map_track_df_to_pos(playlist_df: DataFrame) -> DataFrame:
    """
    Returns a DataFrames containing the playlists, but the tracks are represented as a binary sparse vector.
    """

    @F.udf(returnType=VectorUDT())
    def extract_vector(tracks):
      pos_list = set()

      def reduce_fn(pos_list, row):
          pos_list.add(track_uri_to_id.get(row.track_uri))
          return pos_list
      
      pos_list = reduce(reduce_fn, tracks, pos_list)
      
      return SparseVector(RATING_VECTOR_LENGTH, sorted(list(pos_list)), [1 for _ in pos_list])

    # Apply the mapping UDF on the "tracks" column of the slice_df dataframe
    mapped_df = playlist_df.withColumn('tracks', extract_vector(F.col('tracks')))

    return mapped_df

                                                                                

In [18]:
import sys
print("The size of the track_uri -> position mapping dictionary is {} MB".format(sys.getsizeof(track_uri_to_id) / 1_000_000))

The size of the track_uri -> position mapping dictionary is 20.971608 MB


In [19]:
# def row_to_sparse_vector(item: dict) -> SparseVector:
#     """
#     Because of json serialization, the SparseVector is converted into a Row(indices=..., values=...),
#     this function converts it back to a pyspark.SparseVector with length RATING_VECTOR_LENGHT+1 as default.
#     """
#     return SparseVector(RATING_VECTOR_LENGTH+1, item.indices, item.values)

# @udf(returnType=VectorUDT())
# def parse_sparse_vector(row):
#   return row_to_sparse_vector(row)

# #TODO: Uncomment this and remove the other calls to row_to_spare_vector
# # mapped_slice_df = mapped_slice_df.withColumnRenamed("rating_vector", "temp")\
#   # .withColumn("rating_vector", parse_sparse_vector(col("temp")))\
#   # .drop("temp").cache()

In [20]:
def jaccard_similarity(vector_1: SparseVector, vector_2: SparseVector) -> float:
  """
  Computes the Jaccard Similarity between two sparse binary vectors
  """
  # Convert SparseVectors to sets
  set1 = set(vector_1.indices)
  set2 = set(vector_2.indices)

  # Calculate the intersection and union of the sets
  intersection = len(set1.intersection(set2))
  union = len(set1.union(set2))

  # Calculate the similarity
  similarity = intersection / union

  return similarity

Creating a function that gets in input the playlist to continue, and returns a Dataframe that indicates its similarity with each other playlist in the dataset.

In [21]:
from typing import Callable
@timeit
def create_similarity_df(input_vector: DataFrame, rating_vectors_df: DataFrame, similarityFunction: Callable) -> DataFrame:  
  input_vector_cached = input_vector.cache()
  input_vector = input_vector.first()[0]
  
  @F.udf(returnType=FloatType())
  def compute_similarity(vector1):
    return jaccard_similarity(vector1, input_vector)

  rv_df_input = rating_vectors_df
  result_df = rv_df_input.withColumn("similarity", compute_similarity(rv_df_input.rating_vector))

  input_vector_cached.unpersist()
  
  return result_df

if DEBUG:
  rv_df = songs_embeddings.withColumnRenamed("tracks", "rating_vector")
  # Just to show, we take the first playlist as the playlist to be continued 
  first_playlist_vector = rv_df.limit(1).select("rating_vector").withColumnRenamed("rating_vector","input_vector")
  result_df = create_similarity_df(first_playlist_vector, rv_df, jaccard_similarity)
  result_df.cache()
  result_df.orderBy(F.col("similarity").desc()).show()



+---------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+----------+
|           name|collaborative|  pid|modified_at|num_tracks|num_albums|num_followers|       rating_vector|num_edits|duration_ms|num_artists|similarity|
+---------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+----------+
|         disney|        false| 1000| 1457827200|       189|        16|            1|(681805,[126,1903...|        4|   31428282|         65|       1.0|
|         Disney|        false|34420| 1430784000|       179|        15|            1|(681805,[254,1903...|        3|   25533248|         54|0.22635135|
|        Disney✨|        false|12990| 1450915200|       113|        32|            1|(681805,[1790,190...|        3|   20665774|         64|0.18775511|
|         Disney|        false| 3323| 1391817600|       133|        20|            1|(68

                                                                                

Curse of dimensionality! We can see that each playlist is very dissimilar from each other playlist.

If we filter the playlists that have a strictly positive similarity with the input playlist, and order them by descending similarity, we can see that the name (that we assume is very informative for the content of the playlist) is very similar, meaning that the algorithm seems to work!

In [22]:
# result_df.filter("similarity > 0").orderBy(col("similarity").desc()).show()

Now, in order to suggest some songs to continuate the input playlist, let's take the $k$ top most similar playlists

In [23]:
@timeit
def get_top_k_results(playlist_pid: int, similarity_df: DataFrame, k: int = 20) -> DataFrame:
  return similarity_df.filter((F.col("similarity") > 0) & (F.col("pid") != playlist_pid)).orderBy(F.col("similarity").desc()).limit(k)

if DEBUG:
  first_playlist_pid = rv_df.limit(1).select("pid").first().pid
  top_k_results = get_top_k_results(first_playlist_pid, result_df)
  top_k_results.cache()
  top_k_results.show()

+---------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+----------+
|           name|collaborative|  pid|modified_at|num_tracks|num_albums|num_followers|       rating_vector|num_edits|duration_ms|num_artists|similarity|
+---------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+----------+
|         Disney|        false|34420| 1430784000|       179|        15|            1|(681805,[254,1903...|        3|   25533248|         54|0.22635135|
|        Disney✨|        false|12990| 1450915200|       113|        32|            1|(681805,[1790,190...|        3|   20665774|         64|0.18775511|
|         Disney|        false| 3323| 1391817600|       133|        20|            1|(681805,[4132,494...|        6|   23208277|         64|0.16911764|
|         DISNEY|        false|44565| 1506470400|       179|        23|            1|(68

We want to obtain a single embedding for all the $K$ top most similar playlists, that will be the rating vector. We can then pick the indices of the $n$ top greatest values form this vector, and those will be the $n$ songs that we will reccomend.

In order to aggregate the $k$ embeddings into a single one, I decided to take an average, weighted by the similarity value.

In [24]:
@timeit
def get_input_rating_vector(similarity_df: DataFrame) -> SparseVector:
  return similarity_df.limit(1).select("input_vector").collect()[0].input_vector

@timeit
def accumulate_top_k_results(top_k_results: DataFrame, input_vector: np.ndarray) -> DataFrame:

  @F.udf(returnType=VectorUDT())
  def sum_vector(sparse_vectors, similarities):
    similarities = np.array(similarities)
    sparse_vectors = np.array(sparse_vectors)
    acc = np.dot(sparse_vectors.T, similarities) #Compute the sum(vector * similarity) for each vector and similarity
    acc /= similarities.sum() #Normalize the vector
    acc -= (input_vector * acc) #If a song is present in the input playlist, don't consider it
    return SparseVector(acc.size, np.nonzero(acc)[0], acc[np.nonzero(acc)])

  return top_k_results.agg(sum_vector(F.collect_list('rating_vector'), F.collect_list("similarity")).alias('summed'))

if DEBUG:
  t1 = time.time()
  input_vector = first_playlist_vector.first()[0]
  accumulated_vector_df = accumulate_top_k_results(top_k_results, input_vector)
  accumulated_vector_df.cache()
  accumulated_vector_df.show()
  t2 = time.time()
  print(t2-t1)

[Stage 12:>                                                         (0 + 1) / 1]

+--------------------+
|              summed|
+--------------------+
|(681805,[83,125,2...|
+--------------------+

28.124406099319458


                                                                                

In [25]:
@F.udf(returnType=ArrayType(
    StructType([
      StructField("pos", IntegerType(), False),
      StructField("confidence", FloatType(), False)
])))
def get_top_n_values(vector: SparseVector, n: int=10):
  sorted_elements = vector.toArray().tolist()
  top_n_indices = sorted(range(len(sorted_elements)), key=lambda i: sorted_elements[i], reverse=True)[:n]
  return [(index, sorted_elements[index]) for index in top_n_indices]

if DEBUG:
  t1 = time.time()
  top_n_reccomendations = accumulated_vector_df.withColumn("top_n_recommendations", get_top_n_values(F.col("summed"))).select(F.explode("top_n_recommendations")).select("col.*")
  top_n_reccomendations.show()
  t2 = time.time()
  print(t2-t1)

+------+----------+
|   pos|confidence|
+------+----------+
|505626|0.86660594|
|338046| 0.8451381|
|256245| 0.8245531|
|669595|0.81081593|
|174330|0.78816617|
|592258| 0.7713628|
|589100| 0.7507684|
|170221|0.73229903|
| 87563| 0.7185418|
|585110|0.71125495|
+------+----------+

0.3676290512084961


In [26]:
import json
#TODO: For now this works, but it's very slow, and since this has to be executed online,
# consider to directly embed the song information inside the dataframe when computing the songs
# to recommend.
@timeit
def recommendation_song_info(recommendation: DataFrame, songs_info_df: DataFrame) -> DataFrame:
  return recommendation.join(songs_info_df, "pos")

if DEBUG:
  t1 = time.time()
  songs_info = recommendation_song_info(top_n_reccomendations, song_pos_mapping)
  songs_info.show()
  t2 = time.time()
  print(t2-t1)

+------+----------+--------------------+
|   pos|confidence|           track_uri|
+------+----------+--------------------+
| 87563| 0.7185418|spotify:track:0qc...|
|170221|0.73229903|spotify:track:6P3...|
|174330|0.78816617|spotify:track:0qx...|
|256245| 0.8245531|spotify:track:28U...|
|338046| 0.8451381|spotify:track:5k3...|
|505626|0.86660594|spotify:track:1OY...|
|585110|0.71125495|spotify:track:2yi...|
|589100| 0.7507684|spotify:track:70b...|
|592258| 0.7713628|spotify:track:2AI...|
|669595|0.81081593|spotify:track:0HU...|
+------+----------+--------------------+

1.0036489963531494


### Putting it all togheter
We now define a single function that will get a playlist in input and will reccomend $n$ songs.

In [27]:
#TODO: this now takes a playlist and extracts its PID, if a playlist is built from scratch the PID shouldn't be defined
# A solution would be to pass the playlist row with the PID = Nan and then have a condition when extracting the PID. If Nan, ignore it
@timeit
def user_based_recommendation(playlist: DataFrame, 
                              mapped_slice_df: DataFrame, 
                              similarity_function: Callable, 
                              n:int = 50,
                              k: int = 20) -> DataFrame:
                              
  rv_df = mapped_slice_df.withColumnRenamed("tracks", "rating_vector").cache() #TODO: Parse the rv_df before and then remove this
  #TODO: Try not to use this map_track_df_to_pos
  playlist_vector = map_track_df_to_pos(playlist).select("tracks").withColumnRenamed("tracks", "input_vector").cache()
  similarity_df = create_similarity_df(playlist_vector, rv_df, jaccard_similarity).cache()
  top_k_results = get_top_k_results(playlist.first().pid, similarity_df, k=k).cache()
  input_vector = playlist_vector.select("input_vector").first()[0].toArray()
  accumulated_vector_df = accumulate_top_k_results(top_k_results, input_vector).cache()
  top_n_indices = accumulated_vector_df\
                  .withColumn("top_n_recommendations", get_top_n_values(F.col("summed")))\
                  .select(F.explode("top_n_recommendations"))\
                  .select("col.*").cache()
  #TODO: songs_df because it's faster, but it doesn't get all the info.
  recommended_songs_info = recommendation_song_info(top_n_indices, song_pos_mapping).cache() 

  playlist_vector.unpersist()
  similarity_df.unpersist()
  top_k_results.unpersist()
  accumulated_vector_df.unpersist()
  top_n_indices.unpersist()
  return recommended_songs_info
  

if DEBUG:
  #Collect and createDataFrame because operations on limit(1) take as long as the entire slice_df, don't know why
  playlist = spark.createDataFrame(slice_df.filter("pid == 1010").limit(1).collect())
  final_recommendation = user_based_recommendation(playlist, songs_embeddings, jaccard_similarity, n=5)

                                                                                

Creating rating vectors: 0.3389158248901367


23/06/22 19:36:45 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 2.3518710136413574
Getting top_k_results: 2.7671151161193848
Getting the playlist's input vector: 1.2081599235534668
Getting the top n indices: 0.06534123420715332
Getting the reccomended_songs_info: 0.020317792892456055


In [28]:
final_recommendation.show()

                                                                                

+------+----------+--------------------+
|   pos|confidence|           track_uri|
+------+----------+--------------------+
|   402|0.37936306|spotify:track:78W...|
|255794|0.44066477|spotify:track:1Je...|
|333588|0.45380837|spotify:track:3zB...|
|416552|0.44611415|spotify:track:2Pp...|
|417324|0.35038823|spotify:track:0XU...|
|424672|0.44969997|spotify:track:5dN...|
|497574|0.34576124|spotify:track:6uQ...|
|500339| 0.4509231|spotify:track:5Rs...|
|591490|0.45051354|spotify:track:6cb...|
|669578| 0.4977322|spotify:track:4o6...|
+------+----------+--------------------+



## Performance Evaluation

In [29]:
TRAIN_DF_PATH = os.path.join(SAVED_DFS_PATH, f"train_df-{NUM_PLAYLISTS}.json")
TEST_DF_PATH = os.path.join(SAVED_DFS_PATH, f"test_df-{NUM_PLAYLISTS}.json")
train_df = spark.read.schema(playlist_schema).json(TRAIN_DF_PATH)
test_df = spark.read.schema(playlist_schema).json(TEST_DF_PATH)

In [30]:
# train_df.count(), test_df.count()

                                                                                

(100000, 100000)

In [31]:
# train_df.show(), test_df.show()

+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|         name|collaborative| pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|       disney|        false|1000| 1457827200|       189|        16|            1|[{31, Daughters o...|        4|   31428282|         65|
|Indie Electro|        false|1001| 1417824000|       165|        18|            2|[{117, Boards of ...|        2|   38241566|          8|
|  jack & jack|        false|1002| 1465430400|        17|        14|            1|[{14, Jack & Jack...|        3|    3549358|          3|
|        vibes|        false|1003| 1498435200|       225|       195|            2|[{119, PREP, spot...|       91|   51242585|        157|
|        Indie|        false|1004|

(None, None)

In [32]:
from pyspark.sql import functions as F

@timeit
def precision_at_k(recommendations, ground_truth, num_of_recommendations) -> float:
    """
    Calculates precision at k for the recommendations.
    """
    recommended_relevant_tracks = recommendations.join(ground_truth, "track_uri").cache()
    reccomended_relevant_tracks_count = recommended_relevant_tracks.count() #this can be top_n_results.join in order to be more performant
    recommended_relevant_tracks.unpersist()
    precision = reccomended_relevant_tracks_count / float(num_of_recommendations)

    return precision


import math
def normalized_discounted_cumulative_gain(recommendations: DataFrame, ground_truth: DataFrame, num_of_recommendations: int) -> float:
  recommendations = recommendations.orderBy(F.col("confidence").desc())
  recommendations_list = recommendations.collect()
  cumulative_gain = 0

  intersection = recommendations.join(ground_truth, "track_uri").count()
  if intersection == 0: return 0

  ideal_cumulative_gain = 1 + np.array([(1 / math.log(i, 2)) for i in range(2, 2+intersection)]).sum() #TODO: replace this with sum([])
  for index, row in enumerate(recommendations_list):
    i = index + 1
    is_rel = ground_truth.filter(F.col("track_uri").isin(row.track_uri)).count() > 0
    rel = 1 if is_rel else 0
    if i == 1:
      cumulative_gain += rel
    else:
      cumulative_gain += (rel / math.log(i, 2))
  return cumulative_gain / ideal_cumulative_gain

In [34]:
@timeit
def evaluate(pid: int) -> Tuple[DataFrame, float]:
    playlist_train = train_df.filter(f"pid == {pid}").cache()
    playlist_test = test_df.filter(f"pid == {pid}").cache()
    ground_truth = playlist_test.select(F.explode("tracks")).select("col.*").cache()
    #TODO: This can be remove by inserting the number of songs when creating the train and test df
    num_of_recommendations = ground_truth.count()
    recommendations = user_based_recommendation(playlist_train, 
                                                songs_embeddings, 
                                                jaccard_similarity, 
                                                n=num_of_recommendations,
                                                k = 10).cache()
    precision = precision_at_k(recommendations, ground_truth, num_of_recommendations)
    gain = normalized_discounted_cumulative_gain(recommendations, ground_truth, num_of_recommendations)

    playlist_train.unpersist()
    playlist_test.unpersist()
    ground_truth.unpersist()
    return playlist_train, playlist_test, ground_truth, recommendations, precision, gain

In [35]:
train, test, gt, rec, prec, gain = evaluate(47264)
prec, gain

23/06/22 19:37:25 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.4294600486755371


23/06/22 19:37:25 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 3.1668317317962646
Getting top_k_results: 3.326906681060791


                                                                                

Getting the playlist's input vector: 2.7075142860412598
Getting the top n indices: 0.130018949508667
Getting the reccomended_songs_info: 0.0507967472076416


23/06/22 19:37:32 WARN CacheManager: Asked to cache already cached data.
                                                                                

Total time: 28.260274171829224


(0.14814814814814814, 0.49528491839179634)

In [36]:
train.show(), test.show(), rec.show(), prec

+--------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|          name|collaborative|  pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+--------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|all the way up|        false|47264| 1484784000|       106|        91|            8|[{73, Kevin Gates...|       45|   23079971|         71|
+--------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+

+--------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|          name|collaborative|  pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+--------------+---

(None, None, None, 0.14814814814814814)

In [40]:
EVALUATION_RESULTS_PATH = os.path.join(GDRIVE_DATA_DIR, "UB_evaluation_results_FINAL")
def perform_evaluation():
  SAMPLING_FRACTION = 0.01
  sampled_playlists = train_df.sample(False, SAMPLING_FRACTION, seed=42).cache()

  results = []
  for index, row in enumerate(tqdm(sampled_playlists.collect(), desc="Performing evaluation")):
      CHECKPOINT_RESULTS = os.path.join(GDRIVE_DATA_DIR, f"UB_evaluation_results_check_{index}")
      pid = row['pid']
      train, test, gt, rec, prec, gain = evaluate(pid)
      results.append((prec, gain))
      if index % 10 == 0:
         with open(CHECKPOINT_RESULTS, "w") as f:
            json.dump(results, f)
  with open(EVALUATION_RESULTS_PATH, "w") as f:
    json.dump(results, f)
  
  sampled_playlists.unpersist()
  return results


results = perform_evaluation()

                                                                                

Performing evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

23/06/22 19:45:04 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.5889122486114502


23/06/22 19:45:04 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 1.9946949481964111
Getting top_k_results: 2.081190824508667


                                                                                

Getting the playlist's input vector: 1.7408912181854248
Getting the top n indices: 0.14092803001403809
Getting the reccomended_songs_info: 0.01810479164123535


23/06/22 19:45:08 WARN CacheManager: Asked to cache already cached data.
                                                                                

Total time: 22.711556911468506


23/06/22 19:45:27 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.5471527576446533


23/06/22 19:45:27 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 1.2961370944976807
Getting top_k_results: 1.3294332027435303


                                                                                

Getting the playlist's input vector: 1.0208487510681152
Getting the top n indices: 0.05796313285827637
Getting the reccomended_songs_info: 0.010725021362304688


23/06/22 19:45:30 WARN CacheManager: Asked to cache already cached data.
                                                                                

Total time: 19.990849018096924


23/06/22 19:45:47 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.37842369079589844


23/06/22 19:45:48 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 1.2348601818084717
Getting top_k_results: 1.2761881351470947


                                                                                

Getting the playlist's input vector: 1.0355291366577148
Getting the top n indices: 0.06665396690368652
Getting the reccomended_songs_info: 0.019361019134521484


23/06/22 19:45:50 WARN CacheManager: Asked to cache already cached data.
                                                                                

Total time: 23.85125207901001


23/06/22 19:46:12 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.5056109428405762


23/06/22 19:46:12 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 4.174385070800781
Getting top_k_results: 4.268749952316284


                                                                                

Getting the playlist's input vector: 3.195374011993408
Getting the top n indices: 0.19579100608825684
Getting the reccomended_songs_info: 0.03510904312133789


23/06/22 19:46:20 WARN CacheManager: Asked to cache already cached data.
                                                                                

Total time: 28.603249073028564


23/06/22 19:46:42 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.49161386489868164


23/06/22 19:46:42 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 4.09291410446167
Getting top_k_results: 4.203598976135254


                                                                                

Getting the playlist's input vector: 3.019865036010742
Getting the top n indices: 0.24603486061096191
Getting the reccomended_songs_info: 0.025109052658081055


23/06/22 19:46:50 WARN CacheManager: Asked to cache already cached data.
                                                                                

Total time: 25.962573051452637


23/06/22 19:47:08 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.38521599769592285


23/06/22 19:47:08 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 3.154144048690796
Getting top_k_results: 3.22196102142334


                                                                                

Getting the playlist's input vector: 2.4859459400177
Getting the top n indices: 0.10286116600036621
Getting the reccomended_songs_info: 0.016600847244262695


23/06/22 19:47:14 WARN CacheManager: Asked to cache already cached data.
                                                                                

Total time: 24.216420650482178


23/06/22 19:47:33 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.4311859607696533


23/06/22 19:47:33 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 4.868103981018066
Getting top_k_results: 4.940645933151245


23/06/22 19:47:40 WARN CacheManager: Asked to cache already cached data.        


Getting the playlist's input vector: 2.320175886154175
Getting the top n indices: 0.05518221855163574
Getting the reccomended_songs_info: 0.011857032775878906


                                                                                

Total time: 27.15517282485962


23/06/22 19:48:00 WARN CacheManager: Asked to cache already cached data.


Creating rating vectors: 0.4046962261199951


23/06/22 19:48:01 WARN CacheManager: Asked to cache already cached data.
                                                                                

Computing similarity_df: 3.424952983856201
Getting top_k_results: 3.4794199466705322


                                                                                

Getting the playlist's input vector: 2.6219921112060547
Getting the top n indices: 0.1208488941192627
Getting the reccomended_songs_info: 0.01769280433654785


23/06/22 19:48:07 WARN CacheManager: Asked to cache already cached data.
                                                                                

Total time: 26.437735319137573


In [41]:
avg_prec = np.array(results).mean()
avg_prec, avg_gain = 0, 0
for prec, gain in results:
  avg_prec += prec
  avg_gain += gain 
tot = len(results)
avg_prec /= tot
avg_gain /= tot
avg_prec, avg_gain

(0.14186958874458874, 0.37676438321859024)

# Fighting against the curse of dimensionality: Matrix Factorization

We want to define $\mathbf{x}_u \in \mathbb{R}^d$ $d$-dimensional vector that represents the user $u$, and $\mathbf{w}_i \in \mathbb{R}^d$ vector that represent the item $i$.

We then can estimate the rating of user $u$ for the item $i$ by computing
\begin{equation}
\hat{r}_{u, i}=\mathbf{x}_u^T \cdot \mathbf{w}_i=\sum_{j=1}^d x_{u, j} w_{j, i}
\end{equation}
Or, in matrix notation,

\begin{equation}
\underbrace{R}_{m \times n} =
\underbrace{X}_{m \times d}
\underbrace{W^T}_{d \times n}
\end{equation}

### How to learn $X$ and $W$
The matrix $R$ is partially known and filled with the observations inside the dataset $\mathcal{D}$. In order to learn the latent factor representations $X$ and $W$, we minimize the following loss function:
\begin{equation}
L(X, W)=\sum_{(u, i) \in \mathcal{D}}\underbrace{\left(r_{u, i}-\mathbf{x}_u^T \cdot \mathbf{w}_i\right)^2}_{\text{squared error term}}+\underbrace{\lambda\left(\sum_{u \in \mathcal{D}}\left\|\mathbf{x}_u\right\|^2+\sum_{i \in \mathcal{D}}\left\|\mathbf{w}_i\right\|^2\right)}_{\text{regularization term}}
\end{equation}

We can then minimize the loss using Stochastic Gradient Descent or Alternating Least Squares.

# Matrix Factorization
Generate a matrix Y where each column represent a playlist and each row represent a song, the (i,j) entry will be 1 if the playlist contains the song, 0 otherwise.

In [None]:
# Throw error in order to not execute the following code
raise ValueError()

In [None]:
import pyspark.sql.functions as f
from pyspark.sql.functions import explode
spark.conf.set("spark.sql.pivotMaxValues", 1000000)

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import expr


In [None]:
from pyspark.sql.functions import explode
import random
tracks_df = slice_df.select("pid", explode("tracks").alias("track")).select("pid", "track.track_uri")
tracks_df = tracks_df.withColumn("rating", lit(1))
# tracks_df = tracks_df.withColumn("rating", (rand() * 10 + 1).cast("integer"))

In [None]:
tracks_df.show()

In [None]:
# # Explode the tracks array column into multiple rows
# # tracks_df = slice_df.select("pid", explode("tracks").alias("track"))
# # tracks_df = slice_df.select("pid", "tracks", "tracks")
# tracks_df = slice_df.select("pid", explode("tracks").alias("track")).select("pid", "track.track_uri", "track.pos")

# # Select relevant columns and add a rating column with value 1
# playlist_track_df = tracks_df.withColumn("rating", lit(1))

# # Get distinct track_uri values and join with playlist_track_df
# all_tracks_df = slice_df.select(explode("tracks").alias("track")).select("track.track_uri").distinct()
# all_playlists_df = slice_df.select("pid").distinct()

# all_against_all = all_tracks_df.join(all_playlists_df).distinct()

# from pyspark.sql.functions import when, col

# # playlist_track_rating_df = playlist_track_df.join(all_against_all, ["pid", "track_uri"], "left_outer") \
# #     .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1))

# playlist_track_rating_df = all_against_all.join(playlist_track_df, ["pid", "track_uri"], "left_outer") \
#     .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1)) \
#     .drop("pos")


In [None]:
playlist_track_rating_df = tracks_df.withColumn("song_id", dense_rank().over(Window.orderBy("track_uri")))

In [None]:
playlist_track_rating_df.show(truncate=False)

In [None]:
als = ALS(userCol="pid", itemCol="song_id", ratingCol="rating", nonnegative=True, coldStartStrategy="drop")

In [None]:
from typing import Tuple
import random

def train_test_split(df: DataFrame, split_ratio: float, seed: Optional[int] = None) -> Tuple[DataFrame, DataFrame]:
  random.seed(seed)
  distinct_pids = df.select("pid").distinct().rdd.map(lambda x: x[0]).collect()
  random.shuffle(distinct_pids)
  split_index = int(len(distinct_pids) * split_ratio)
  train_pids = distinct_pids[:split_index]
  test_pids = distinct_pids[split_index:]
  train_df = df.filter(col("pid").isin(train_pids))
  test_df = df.filter(col("pid").isin(test_pids))
  return train_df, test_df



In [None]:
training, test = playlist_track_rating_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
model = als.fit(training)

In [None]:
predictions = model.transform(test)

In [None]:
predictions.show()

In [None]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

In [None]:
predictions.filter(col("prediction") != "NaN").count(), predictions.filter(col("prediction") == "NaN").count()

In [None]:
rmse

In [None]:
subset = playlist_track_rating_df.select("pid").distinct().limit(1)
subUserRecs = model.recommendForUserSubset(subset, 10)

In [None]:
subset.show()

In [None]:
subUserRecs.show(truncate=False)

In [None]:
def song_name_from_id(song_id: int, reverse_lookup: DataFrame) -> str:
  return 
  
def interpretRecommendation(recommended_result: DataFrame) -> str:
  return

In [None]:
userRecs = model.recommendForAllUsers(1).orderBy("recommendations")
userRecs.show(truncate=False)
userRecs.count()

In [None]:
slice_df.filter(col("pid") == 1710).select(explode("tracks.track_name")).show()

In [None]:
track_uris = playlist_track_rating_df.filter(col("song_id") == 588).select("track_uri")
track_uris.first()