<a href="https://colab.research.google.com/github/DomizianoScarcelli/big-data-project/blob/item-based-cf/item_based_CF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuration

Here we configure the environment. Since I alternated from Google Colab to Local development, I define a LOCAL variable that allows me to know in which environment I am. 

In [8]:
import os
def is_running_on_colab():
    return "COLAB_GPU" in os.environ

LOCAL = not is_running_on_colab()

In [9]:
#@title Download necessary libraries
if not LOCAL:
    !pip install pyspark -qq
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

In [10]:
#@title Imports
import numpy as np
import json
import sys
import math

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, LongType
from pyspark import SparkConf
from pyspark.ml.linalg import SparseVector, VectorUDT
from pyspark.ml.feature import MinHashLSH, MinHashLSHModel

from tqdm.notebook import tqdm
import time

if not LOCAL:
    from google.colab import drive

from typing import Tuple, Dict, List
from functools import reduce
from pprint import pprint

In [11]:
#@title Set up variables
if not LOCAL:
    JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
    GDRIVE_DIR = "/content/drive"
    GDRIVE_HOME_DIR = GDRIVE_DIR + "/MyDrive"
    GDRIVE_DATA_DIR = GDRIVE_HOME_DIR + "/Big Data/datasets"
    DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_friendly_spotify_playlist_dataset")
    AUDIO_FEATURES_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_track_features")
    LITTLE_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "little_slice")
    SMALL_SLICE_FLIE = os.path.join(GDRIVE_DATA_DIR, "small_slice")
    LITTLE_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "little_slice_audio_features")
    MICRO_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "micro_slice_audio_features")
    SPLITTED_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "splitted_pyspark_track_features")
    SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")
    SAVED_MODELS = os.path.join(GDRIVE_DATA_DIR, "saved_models")
    EVALUATION_FOLDER = os.path.join(GDRIVE_DATA_DIR, "evaluation", "item_base_evaluation")
else:
    GDRIVE_DATA_DIR = os.path.abspath("./data")
    SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")
    SAVED_MODELS = os.path.join(GDRIVE_DATA_DIR, "saved_models")
    DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "full_dataset")
    SMALL_SLICE_FLIE = os.path.join(GDRIVE_DATA_DIR, "small_slice")
    EVALUATION_FOLDER = os.path.join(os.path.abspath("./evaluation"), "item_base_evaluation")
    
    JAVA_HOME = "/opt/homebrew/opt/openjdk"
RANDOM_SEED = 42 # for reproducibility
os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["PYSPARK_PYTHON"]="python"

In [12]:
#@title Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '12G').\
                set('spark.driver.memory', '12G').\
                set('spark.driver.maxResultSize', '100G').\
                set("spark.executor.extraJavaOptions", "-XX:+UseG1GC").\
                setAppName("PySparkTutorial").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

23/06/30 19:12:49 WARN Utils: Your hostname, MacBook-Air-di-Domiziano.local resolves to a loopback address: 127.0.0.1; using 192.168.1.175 instead (on interface en0)
23/06/30 19:12:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/30 19:12:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/30 19:12:49 WARN Utils: Service 'SparkUI' could not bind on port 4050. Attempting port 4051.
23/06/30 19:12:49 WARN Utils: Service 'SparkUI' could not bind on port 4051. Attempting port 4052.


In [13]:
if not LOCAL:
    drive.mount(GDRIVE_DIR, force_remount=True)

In [14]:
#@title Check if everything is ok
spark, sc._conf.getAll()

(<pyspark.sql.session.SparkSession at 0x14481beb0>,
 [('spark.executor.extraJavaOptions',
   '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'),
  ('spark.app.name', 'PySparkTutorial'),
  ('spark.driver.host', '192.168.1.175'),
  

# Load DataFrame

Define the `DataFrame` schemas and load the primary `DataFrame` containing the 100K playlists. 

In [15]:
song_schema = StructType([
    StructField("pos", IntegerType(), True),
    StructField("artist_name", StringType(), True),
    StructField("track_uri", StringType(), True),
    StructField("artist_uri", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("album_uri", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("album_name", StringType(), True)
])

playlist_schema = StructType([
    StructField("name", StringType(), True),
    StructField("collaborative", StringType(), True),
    StructField("pid", IntegerType(), True),
    StructField("modified_at", IntegerType(), True),
    StructField("num_tracks", IntegerType(), True),
    StructField("num_albums", IntegerType(), True),
    StructField("num_followers", IntegerType(), True),
    StructField("tracks", ArrayType(song_schema), True),
    StructField("num_edits", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("num_artists", IntegerType(), True),
])

playlist_schema_mapped = StructType([
    StructField("name", StringType(), True),
    StructField("collaborative", StringType(), True),
    StructField("pid", IntegerType(), True),
    StructField("modified_at", IntegerType(), True),
    StructField("num_tracks", IntegerType(), True),
    StructField("num_albums", IntegerType(), True),
    StructField("num_followers", IntegerType(), True),
    StructField("tracks", VectorUDT(), True),
    StructField("num_edits", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("num_artists", IntegerType(), True),
])

In [16]:
slice_df = spark.read.schema(playlist_schema).json(SMALL_SLICE_FLIE, multiLine=True)

# Item-Based Collaborative Filtering

Item-Based Collaboartive Filtering is the "transpose" approach to user-based CF. This time we won't consider the users' feature vectors, but the items'.
An item's vector representation $\mathbf{r}_i$ is the vector that embeds the information about where the item apperas in the user preferences. Since we are dealing with songs and playlists, it will model in which playlist each song appears.

Let $m$ be the number of users, $n$ the number of playlists, then $\mathbf{r}_i \in \mathbb{R}^m$ and $\mathbf{R} \in \mathbb{R}^{m \times n}$.

In order to make a prediction, we take the set of items $I_u$ rated by the user $u$, and we compute the set $I^k_u$ of the top-$k$ most similar items to $i$ rated by $u$, for each item $i \in I_u$. Once done that, we average the $k$ rating vectors weighting them by their respective similarity.

In [21]:
DEBUG = True #If True, execute code that helps to debug the code

Let's load the playlist embedding encodings, and the `DataFrame` that maps each playlist to a position.

In [18]:
NUM_PLAYLISTS = 100_000
SONGS_INFO_DF = os.path.join(SAVED_DFS_PATH, f"songs_info_df-{NUM_PLAYLISTS}.json")
songs_df = spark.read.json(SONGS_INFO_DF)

playlist_map_schema = StructType([
    StructField("track_uri", StringType(), True),
    StructField("embedding", VectorUDT(), True)
])
PLAYLIST_MAP_PATH = os.path.join(SAVED_DFS_PATH, f"playlist_map-{NUM_PLAYLISTS}.json")
playlist_map = spark.read.schema(playlist_map_schema).json(PLAYLIST_MAP_PATH)

                                                                                

In [19]:
songs_df.show()

+---+--------------------+
|pos|           track_uri|
+---+--------------------+
|  0|spotify:track:1mr...|
|  1|spotify:track:1Uv...|
|  2|spotify:track:4WR...|
|  3|spotify:track:7B6...|
|  4|spotify:track:2Gy...|
|  5|spotify:track:7AO...|
|  6|spotify:track:48Z...|
|  7|spotify:track:1Um...|
|  8|spotify:track:7MO...|
|  9|spotify:track:27P...|
| 10|spotify:track:6lt...|
| 11|spotify:track:1yz...|
| 12|spotify:track:5Mz...|
| 13|spotify:track:3BU...|
| 14|spotify:track:4Cl...|
| 15|spotify:track:2dN...|
| 16|spotify:track:341...|
| 17|spotify:track:7ja...|
| 18|spotify:track:4eQ...|
| 19|spotify:track:6fy...|
+---+--------------------+
only showing top 20 rows



In [20]:
playlist_map.show()

+--------------------+--------------------+
|           track_uri|           embedding|
+--------------------+--------------------+
|spotify:track:001...|(100001,[7109,250...|
|spotify:track:001...|(100001,[80892],[...|
|spotify:track:001...|(100001,[50317,63...|
|spotify:track:003...|(100001,[7021],[1...|
|spotify:track:003...|(100001,[52415],[...|
|spotify:track:004...|(100001,[44691,88...|
|spotify:track:004...|(100001,[78197],[...|
|spotify:track:004...|(100001,[4159],[1...|
|spotify:track:005...|(100001,[85373],[...|
|spotify:track:005...|(100001,[5737,265...|
|spotify:track:006...|(100001,[40014],[...|
|spotify:track:006...|(100001,[53178],[...|
|spotify:track:007...|(100001,[5765],[1...|
|spotify:track:007...|(100001,[28490,29...|
|spotify:track:007...|(100001,[24445],[...|
|spotify:track:008...|(100001,[3054,151...|
|spotify:track:009...|(100001,[12483,24...|
|spotify:track:009...|(100001,[63483],[...|
|spotify:track:00A...|(100001,[75865],[...|
|spotify:track:00B...|(100001,[6

23/06/30 19:13:03 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


For simplicity, I will load the training and testing set here.

In [22]:
TRAIN_DF_PATH = os.path.join(SAVED_DFS_PATH, f"train_df-{NUM_PLAYLISTS}.json")
TEST_DF_PATH = os.path.join(SAVED_DFS_PATH, f"test_df-{NUM_PLAYLISTS}.json")

train_df = spark.read.schema(playlist_schema).json(TRAIN_DF_PATH)
test_df = spark.read.schema(playlist_schema).json(TEST_DF_PATH)

The full similarity computation between the songs in the playlist and each other song is intractable, we can approximate the nearest neighbour search by clustering the similar tracks in buckets using a Locally Sensitive Hashing algorithm, implemented using pyspark's `MinHashLSHModel`, which uses Jaccard Similarity as similarity function under the hood.

I decided to use Jaccard Similarity since I don't need the information about the single values in the vector, but just which indexes are in each vector.

The number of hash tables is an hyperparameter, and it was chosen experimentally. A lower `NUM_HASH_TABLES` makes the algorithm faster, but less accurate. On the other hand, a higher value will improve the accuracy, compromising the speed performances.

In [23]:
NUM_HASH_TABLES = 20
LSH_MODEL_PATH = os.path.join(SAVED_DFS_PATH, f"lsh_model-{NUM_HASH_TABLES}-{NUM_PLAYLISTS}.pickle")
if os.path.exists(LSH_MODEL_PATH):
  model = MinHashLSHModel.load(LSH_MODEL_PATH)
else:
  mh = MinHashLSH(inputCol="embedding", outputCol="hashes", numHashTables=NUM_HASH_TABLES)
  model = mh.fit(playlist_map)
  model.save(LSH_MODEL_PATH)

model.transform(playlist_map).show(truncate=False)

+------------------------------------+---------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|track_uri                           |embedding                                                                        |hashes                                                                                                                                                                                                                                                                                                                                    |
+------------------------------------+----------------------------------------------------------

Once we extract, for each song $s \in S$ (where $S$ is the collection of songs in the input playlist), the list of $k$ nearest neighbours of $s$, we will have a collection of $|S|$ DataFrames. Since $k$ will be a small value, we know for a fact that these DataFrames will be small, and so we can convert them to python dictionaries in order to further improve the performances when aggregating them into a single DataFrame. 

The aggreagation works in this way:
- Given a collection of dictionaries $D$ of the type `{track_uri: str: distance: int}`
- Create a new dictionary $d_{\text{agg}}$, and put as keys each `track_uri` that appears in $D$, then as value:
  - if: `track_uri` appears only once, the list `[distance]` of the distance associated
  - else: the list `[distance1, distance2, ...]` of distances associated with the `track_uri` in each place where `track_uri` appears in $D$.
- Map to each `track_uri` in $d_{\text{agg}}$ the mean of the values inside of associated value.


In [31]:
def df_to_dict(df: DataFrame) -> DataFrame:
  """
  Given a DataFrame that includes the track_uri and the distance to the input track, it converts
  it to a python dictionary.
  """
  return df.select("track_uri", "distCol").rdd.collectAsMap()

def merge_dicts(d1: Dict[str, float], d2: Dict[str, float]) -> Dict[str, float]:
  """
  Aggregates the values of the d2 dataframe into the d1 dataframe, which is treated like an accumulator.
  The function has to be used inside of a reduce().
  """
  for key, value in d2.items():
    if key in d1:
      if type(d1[key]) is float:
        d1[key] = [d1[key]]
      d1[key] += [value]
    else:
      d1[key] = [value]
  return d1

if DEBUG:
  key = playlist_map.first()[1]
  neigh = model.approxNearestNeighbors(playlist_map, key, 10).cache()
  merged_df_2 = merge_dicts(df_to_dict(neigh), df_to_dict(neigh))
  pprint(merged_df_2)



{'spotify:track:001BVhvaZTf2icV88rU3DA': [0.0, 0.0],
 'spotify:track:1iO2inxYIzmPnMuDFfU1Rl': [0.8571428571428572,
                                          0.8571428571428572],
 'spotify:track:2EhgEpfn3U0lmpryqDujwt': [0.8571428571428572,
                                          0.8571428571428572],
 'spotify:track:3DeMqzJj9477nCcyTXl3Ye': [0.8571428571428572,
                                          0.8571428571428572],
 'spotify:track:3Ff2kaO1uxXjd9HkHfMw4h': [0.8571428571428572,
                                          0.8571428571428572],
 'spotify:track:3lkFKOOQRp1AqWk2PPAW6B': [0.8571428571428572,
                                          0.8571428571428572],
 'spotify:track:4OEHuq3q8kjkPS1jKI96JP': [0.8571428571428572,
                                          0.8571428571428572],
 'spotify:track:50r5hQgwJ61tCwEL1maGsG': [0.8571428571428572,
                                          0.8571428571428572],
 'spotify:track:558Km9MuklF6yKJDVVjIli': [0.8571428571428572,
          

                                                                                

Let's define the function that actually extracts the nearest neighbours for each song, and creates the accumulated python dictionary.

In [27]:
def extract_similar_songs(playlist_tracks: DataFrame, playlist_map, model, k=10, disable_pbar=False) -> Dict[str, float]:
  """
  Given the a DataFrame containing the input playlist's tracks, the LSH model and the DataFrame that maps
  each playlist to a position, gets the approximate k-nearest-neighbours for each track, and aggregates the 
  results into a python dictionary.
  """
  aggregate_df = None
  tracks_embedding = playlist_map.join(F.broadcast(playlist_tracks), "track_uri").select("track_uri", "embedding")
  transformed_tracks_embeddings = model.transform(tracks_embedding).cache()
  k_neighs = []

  for row in tqdm(tracks_embedding.collect(), desc='Extracting k-neighbors', disable=disable_pbar):
    k_neigh = model.approxNearestNeighbors(playlist_map, row["embedding"], k).cache()
    k_neighs.append(df_to_dict(k_neigh))

  aggregate_df = reduce(merge_dicts, k_neighs)

  transformed_tracks_embeddings.unpersist()
  return aggregate_df

if DEBUG:
  # first_playlist = train_df.limit(1).select(F.explode("tracks")).select("col.*").distinct()
  first_playlist = train_df.filter("pid == 1005").select(F.explode("tracks")).select("col.*").distinct()
  recommendations = extract_similar_songs(first_playlist, playlist_map, model, k=10)
  pprint(recommendations)

                                                                                

Extracting k-neighbors:   0%|          | 0/21 [00:00<?, ?it/s]



{'spotify:track:4Hj5yNHgu2dyrnzRQN7Gld': 0.0, 'spotify:track:1jNyxG5S2P9gztbfAnrq85': 0.84, 'spotify:track:34yAAS72vZUXvsQPHGZexE': 0.8571428571428572, 'spotify:track:3nVDOYBJpdCkRR6r1DbZum': 0.8761467889908257, 'spotify:track:7yQ2NWAIeDcIEauyVG5lck': 0.8840579710144928, 'spotify:track:0GS2K1xle0UpcAWa3u4BnG': 0.8846153846153846, 'spotify:track:366DLDHAnvMnHP5ECYt4N7': 0.8901098901098901, 'spotify:track:7sdqtgsO9zxvKdrrVDpaNu': 0.9027777777777778, 'spotify:track:45oGS3iHeT2cvkihSP7KzK': 0.904109589041096, 'spotify:track:2TLlFL6dpycifLqKtTE7UZ': 0.9146341463414634, 'spotify:track:53mrVsi49rLHIaKBiSvElG': [0.0], 'spotify:track:7F9vK8hNFMml4GtHsaXui6': [0.914187643020595, 0.9158485273492286, 0.8746543778801843, 0.8666666666666667, 0.920099875156055, 0.0], 'spotify:track:0ofbQMrRDsUaVKq2mGLEAb': [0.9173878835562549, 0.8796736913664174, 0.8557457212713937, 0.919732441471572, 0.841748304446119], 'spotify:track:0CokSRCu5hZgPxcZBaEzVE': [0.9246231155778895, 0.0, 0.8666666666666667, 0.896081771

                                                                                

In [30]:
if DEBUG:
    pprint(recommendations)

{'spotify:track:08QmhAybC6VHeG8gah84qA': [0.941747572815534],
 'spotify:track:0CokSRCu5hZgPxcZBaEzVE': [0.9246231155778895,
                                          0.0,
                                          0.8666666666666667,
                                          0.8960817717206133],
 'spotify:track:0Fv5N0cHBsl4bzCbollCAS': [0.0],
 'spotify:track:0GS2K1xle0UpcAWa3u4BnG': 0.8846153846153846,
 'spotify:track:0NiXXAI876aGImAd6rTj8w': [0.0],
 'spotify:track:0Qh38w01QRXK6KHIv0e3hb': [0.0],
 'spotify:track:0SGkqnVQo9KPytSri1H6cF': [0.7495854063018241],
 'spotify:track:0UioblV1x795s55Ur58c6c': [0.7365648050579557],
 'spotify:track:0VgkVdmE4gld66l8iyGjgx': [0.8052325581395349],
 'spotify:track:0XLOf9LhyazPX9Ld8jPiUq': [0.0, 0.8746543778801843],
 'spotify:track:0bXFIF7iL17TYLyx8JHziM': [0.0],
 'spotify:track:0iCMHSwNGZBqns7Ko54yVZ': [0.9436619718309859],
 'spotify:track:0jdny0dhgjUwoIp5GkqEaA': [0.8956692913385826],
 'spotify:track:0ofbQMrRDsUaVKq2mGLEAb': [0.9173878835562549,
      

We can see that the `reccomendations` python dictionary obtained from converting the DataFrame is actually very small, and occupies only few KB of memory. 

In [32]:
if DEBUG:
  print(f"The reccomendation dictionary is {sys.getsizeof(recommendations) / 1_000} KB")

The reccomendation dictionary is 4.696 KB


Once we have computed the aggreageted python dictionary, we can average the similarity values inside of the dictionary's values, and obtain a pyspark DataFrame back.

In [33]:
def aggregate_recommendations(recommendations: Dict[str, float | List[float]]) -> DataFrame:
  """
  Given the python dictionary of aggreagated recommendations, it converts it into a pyspark DataFrame.
  """
  aggregated = {}
  for key, value in recommendations.items():
    if type(value) is list:
      aggregated[key] = sum(x for x in value) / len(value)
    else:
      aggregated[key] = value
      
  recommendations_schema = StructType([
      StructField("track_uri", StringType(), True),
      StructField("distance", FloatType(), True)
  ])

  recommendations_df = spark.createDataFrame(data=aggregated.items(), schema=recommendations_schema)

  return recommendations_df

if DEBUG:
  k = 40
  recommendations_df = aggregate_recommendations(recommendations)
  recommendations_df.show(truncate=False)
  first_playlist.show(truncate=False)
  print(recommendations_df.count(), first_playlist.count())

+------------------------------------+----------+
|track_uri                           |distance  |
+------------------------------------+----------+
|spotify:track:4Hj5yNHgu2dyrnzRQN7Gld|0.0       |
|spotify:track:1jNyxG5S2P9gztbfAnrq85|0.84      |
|spotify:track:34yAAS72vZUXvsQPHGZexE|0.85714287|
|spotify:track:3nVDOYBJpdCkRR6r1DbZum|0.8761468 |
|spotify:track:7yQ2NWAIeDcIEauyVG5lck|0.884058  |
|spotify:track:0GS2K1xle0UpcAWa3u4BnG|0.88461536|
|spotify:track:366DLDHAnvMnHP5ECYt4N7|0.8901099 |
|spotify:track:7sdqtgsO9zxvKdrrVDpaNu|0.9027778 |
|spotify:track:45oGS3iHeT2cvkihSP7KzK|0.9041096 |
|spotify:track:2TLlFL6dpycifLqKtTE7UZ|0.91463417|
|spotify:track:53mrVsi49rLHIaKBiSvElG|0.0       |
|spotify:track:7F9vK8hNFMml4GtHsaXui6|0.74857616|
|spotify:track:0ofbQMrRDsUaVKq2mGLEAb|0.8828576 |
|spotify:track:0CokSRCu5hZgPxcZBaEzVE|0.6718429 |
|spotify:track:2ekn2ttSfGqwhhate0LSR0|0.8810708 |
|spotify:track:2771LMNxwf62FTAdpJMQfM|0.8064928 |
|spotify:track:6EpRaXYhGOB3fj4V2uDkMJ|0.7445807 |


                                                                                

+---+---------------+------------------------------------+-------------------------------------+-----------------------------------------------------+------------------------------------+-----------+-----------------------------------------------------+
|pos|artist_name    |track_uri                           |artist_uri                           |track_name                                           |album_uri                           |duration_ms|album_name                                           |
+---+---------------+------------------------------------+-------------------------------------+-----------------------------------------------------+------------------------------------+-----------+-----------------------------------------------------+
|18 |Aminé          |spotify:track:4Hj5yNHgu2dyrnzRQN7Gld|spotify:artist:3Gm5F95VdRxW3mqCn8RPBJ|Yellow                                               |spotify:album:3lajefIuUk4SfzqVBSJy8p|180000     |Good For You                           

                                                                                

Here I define a function that, given the input playlist tracks DataFrame and the DataFrame of recommended tracks, removes from the latter the songs that are already in the playlist, in order to not recommend them.

In [34]:
def remove_existing_tracks(playlist_tracks: DataFrame, recommendations_df: DataFrame) -> DataFrame:
  """
  Given a DataFrame of input playlist's tracks, and a DataFrame of recommendations, removes from the
  recommendations the tracks that already appear in the input playlist's tracks.
  """
  playlist_tracks = playlist_tracks.select("track_uri").cache()
  playlist_tracks_compatible = playlist_tracks.join(F.broadcast(recommendations_df), on="track_uri")
  playlist_tracks.unpersist()
  return recommendations_df.exceptAll(playlist_tracks_compatible)

if DEBUG:
  clean_df = remove_existing_tracks(first_playlist, recommendations_df)
  clean_df.show()

+--------------------+----------+
|           track_uri|  distance|
+--------------------+----------+
|spotify:track:34y...|0.85714287|
|spotify:track:1jN...|      0.84|
|spotify:track:45o...| 0.9041096|
|spotify:track:3nV...| 0.8761468|
|spotify:track:7yQ...|  0.884058|
|spotify:track:0GS...|0.88461536|
|spotify:track:7sd...| 0.9027778|
|spotify:track:366...| 0.8901099|
|spotify:track:0of...| 0.8828576|
|spotify:track:2TL...|0.91463417|
|spotify:track:2ek...| 0.8810708|
|spotify:track:277...| 0.8064928|
|spotify:track:59J...|  0.811459|
|spotify:track:4xB...|0.93406594|
|spotify:track:2fQ...| 0.8339222|
|spotify:track:7hC...| 0.9221902|
|spotify:track:1nX...|0.83590376|
|spotify:track:1WI...|0.81096107|
|spotify:track:1sC...|0.88184625|
|spotify:track:1ND...|0.90926903|
+--------------------+----------+
only showing top 20 rows



Putting all togheter:

In [35]:
def item_based_recommendation(playlist: DataFrame,playlist_map: DataFrame, model: MinHashLSHModel, n=50):
  """
  Given a DataFrame with a single element in it that includes the details of the playlist and the vector
  representation, it performs the recommendation pipeline and return the DataFrame containing the n
  recommended songs that continuate the playlist.
  """
  playlist_songs = playlist.select(F.explode("tracks")).select("col.*")
  recommendations = extract_similar_songs(playlist_songs, playlist_map, model, k=20, disable_pbar=True)
  recommendations_df = aggregate_recommendations(recommendations)
  recommendations_df = remove_existing_tracks(playlist_songs, recommendations_df)
  return recommendations_df.orderBy(F.col("distance").asc()).limit(n).cache()

if DEBUG:
  playlist = train_df.filter("pid == 2005")
  result = item_based_recommendation(playlist, playlist_map, model)
  result.show()



+--------------------+----------+
|           track_uri|  distance|
+--------------------+----------+
|spotify:track:2t1...|0.64285713|
|spotify:track:2AO...| 0.6969697|
|spotify:track:2gh...|       0.7|
|spotify:track:5IF...|0.71900827|
|spotify:track:3t1...| 0.7605634|
|spotify:track:2ru...|0.77286583|
|spotify:track:5Qf...| 0.7910448|
|spotify:track:1Vu...| 0.8076923|
|spotify:track:6jL...|0.81005585|
|spotify:track:6gR...|0.81060606|
|spotify:track:5Cd...| 0.8152174|
|spotify:track:1ak...|0.81595093|
|spotify:track:4ej...| 0.8244275|
|spotify:track:2wm...|0.82828283|
|spotify:track:6oQ...|0.82961684|
|spotify:track:2BZ...|0.83035713|
|spotify:track:7qi...| 0.8330721|
|spotify:track:4I3...| 0.8359133|
|spotify:track:0b0...|0.83709276|
|spotify:track:2Qv...| 0.8372093|
+--------------------+----------+
only showing top 20 rows



                                                                                

# Performance Evaluation

We are ready for performance evaluation. Let's define the metrics, which, as described also in the other notebooks, will be R-Precision and Normalized Discounted Cumulative Gain. 

In [23]:
def r_prec(recommendations, ground_truth, num_of_recommendations) -> float:
    """
    Calculates R-Precision for the recommendations.
    """ 
    recommended_relevant_tracks = recommendations.join(ground_truth, "track_uri").cache()
    reccomended_relevant_tracks_count = recommended_relevant_tracks.count() #this can be top_n_results.join in order to be more performant
    recommended_relevant_tracks.unpersist()
    precision = reccomended_relevant_tracks_count / float(num_of_recommendations)

    return precision

def normalized_discounted_cumulative_gain(recommendations: DataFrame, ground_truth: DataFrame, num_of_recommendations: int) -> float:
  """
  Calculates the Normalized Discounted Cumulative Gain between the DataFrame of recommendations and the DataFrame of ground truth.
  """
  recommendations = recommendations.orderBy(F.col("distance").asc())
  recommendations_list = recommendations.collect()
  cumulative_gain = 0

  intersection = recommendations.join(ground_truth, "track_uri").count()
  if intersection == 0: return 0

  ideal_cumulative_gain = 1 + sum((1 / math.log(i, 2)) for i in range(2, 2+intersection))
  for index, row in enumerate(recommendations_list):
    i = index + 1
    is_rel = ground_truth.filter(F.col("track_uri").isin(row.track_uri)).count() > 0
    rel = 1 if is_rel else 0
    if i == 1:
      cumulative_gain += rel
    else:
      cumulative_gain += (rel / math.log(i, 2))
  return cumulative_gain / ideal_cumulative_gain

This is a function that computes the evaluation metrics on a single playlist.

In [24]:
def evaluate(pid: int, playlist_map) -> Tuple[DataFrame, float]:
    t1 = time.time()

    playlist_train = train_df.filter(f"pid == {pid}").cache()
    playlist_test = test_df.filter(f"pid == {pid}").cache()
    ground_truth = playlist_test.select(F.explode("tracks")).select("col.*").cache()
    num_of_recommendations = ground_truth.count()
    recommendations = item_based_recommendation(playlist_train, playlist_map, model, n=num_of_recommendations).cache()

    precision = r_prec(recommendations, ground_truth, num_of_recommendations)
    gain = normalized_discounted_cumulative_gain(recommendations, ground_truth, num_of_recommendations)

    t2 = time.time()
    print(f"Total time: {t2-t1}")

    playlist_train.unpersist()
    playlist_test.unpersist()
    ground_truth.unpersist()
    recommendations.unpersist()

    return playlist_train, playlist_test, ground_truth, recommendations, precision, gain


if DEBUG:
  train, test, gt, rec, prec, gain  = evaluate(1005, playlist_map)
  train.show(), test.show(), gt.show(), rec.show(truncate=False)
  print(f"Precision: {prec}, Gain: {gain}")

Let's define a function that samples 1000 playlists from the 100K playlists in the train set, and use them to evaluate the model. Since the evaluation take a long time, I used checkpointing every 10 playlist evaluated.

In [None]:
LAST_CHECKPOINT_INDEX = 240
EVALUATION_RESULTS_PATH = os.path.join(EVALUATION_FOLDER, f'{NUM_HASH_TABLES}_hash_tables', "IB_evaluation_results_FINAL")

def perform_evaluation():
  SAMPLING_FRACTION = 0.01
  sampled_playlists = train_df.sample(False, SAMPLING_FRACTION, seed=42).cache()

  transformed_playlist_map = model.transform(playlist_map).cache()
  results = []
  for index, row in enumerate(tqdm(sampled_playlists.collect(), desc="Performing evaluation")):
      if index <= LAST_CHECKPOINT_INDEX: continue 

      CHECKPOINT_RESULTS = os.path.join(EVALUATION_FOLDER, f'{NUM_HASH_TABLES}_hash_tables', f"IB_evaluation_results_check_{index}")
      pid = row['pid']
      train, test, gt, rec, prec, gain = evaluate(pid,transformed_playlist_map)
      print((prec, gain))
      results.append((prec, gain))
      if index % 10 == 0:
         with open(CHECKPOINT_RESULTS, "w") as f:
            json.dump(results, f)
  
  sampled_playlists.unpersist() 
  with open(EVALUATION_RESULTS_PATH, "w") as f:
    json.dump(results, f)
  return results

results = perform_evaluation()

Let's load the `results`, average them and see how the model performed.

In [12]:
EVALUATION_FOLDER = os.path.join(EVALUATION_FOLDER, f'{NUM_HASH_TABLES}_hash_tables')
results = []
for file in os.listdir(EVALUATION_FOLDER):
    if file == ".DS_Store": continue
    with open(os.path.join(EVALUATION_FOLDER, file), "r") as f:
        file_results = json.load(f)
        results.extend(file_results)

avg_prec, avg_gain = 0, 0
for prec, gain in results:
  avg_prec += prec
  avg_gain += gain 
tot = len(results)
avg_prec /= tot
avg_gain /= tot
avg_prec, avg_gain

(0.08970312126944059, 0.2610040737759509)

# All against all full computation (Not used)

Since computing the k nearest neighbors is super slow, I can pre-compute them offline and store them. This will require like a week.

NOTE: I didn't use this piece of code inside of the project because it required too much time, but I put it here to demonstrate how it would work. Once the DataFrame that maps each song to its $k$ nearest neighbours, the neighbours of a given song can be efficiently gotten with a lookup.

In [36]:
mh = MinHashLSH(inputCol="embedding", outputCol="hashes", numHashTables=NUM_HASH_TABLES)
model = mh.fit(playlist_map)

In [None]:
def compute_all_k_neighbors(playlist_map: DataFrame, model) -> DataFrame:
    result = []
    transformed_playlist_map = model.transform(playlist_map).cache()
    for index, row in enumerate(tqdm(playlist_map.collect(), desc="Computing k-neighbors")):
        k_neighs = model.approxNearestNeighbors(transformed_playlist_map, row.embedding, 10).select("track_uri", F.col("distCol").alias("similarity"))
        result.append((row.track_uri, k_neighs.collect()))

    k_neighs_schema = StructType([
        StructField("track_uri", StringType(), nullable=True),
        StructField("distCol", FloatType(), nullable=True)
    ])

    schema = StructType([
        StructField("track_uri", StringType(), nullable=True),
        StructField("k_neighs", ArrayType(k_neighs_schema), nullable=True)
    ])

    result_df = spark.createDataFrame(result, schema)
    transformed_playlist_map.unpersist()

    return result_df

result_df = compute_all_k_neighbors(playlist_map, model)

In [None]:
K_NEIGHBOURS_PATH = os.path.join(GDRIVE_DATA_DIR,"saved_models", f"k_neighbours-{NUM_PLAYLISTS}.parquet")
result_df.write.parquet(K_NEIGHBOURS_PATH)