<a href="https://colab.research.google.com/github/DomizianoScarcelli/big-data-project/blob/main/PlaylistReccomender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Config

# Configuration

In [1]:
#@title Download necessary libraries
!pip install pyspark
!pip install -U -q PyDrive 
!apt install openjdk-8-jdk-headless -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
openjdk-8-jdk-headless is already the newest version (8u362-ga-0ubuntu1~20.04.1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [2]:
#@title Imports
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly

import pyspark
from pyspark.sql import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, LongType
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.ml.linalg import SparseVector, DenseVector

from tqdm.notebook import tqdm
import time
import gc

from google.colab import drive

In [3]:
#@title Set up variables
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
GDRIVE_DIR = "/content/drive"
GDRIVE_HOME_DIR = GDRIVE_DIR + "/MyDrive"
GDRIVE_DATA_DIR = GDRIVE_HOME_DIR + "/Big Data/datasets"
DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_friendly_spotify_playlist_dataset")
AUDIO_FEATURES_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_track_features")
LITTLE_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "little_slice")
LITTLE_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "little_slice_audio_features")
MICRO_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "micro_slice_audio_features")
SPLITTED_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "splitted_pyspark_track_features")
SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")
RANDOM_SEED = 42 # for reproducibility
os.environ["JAVA_HOME"] = JAVA_HOME

In [4]:
#@title Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '12G').\
                set('spark.driver.memory', '12G').\
                set('spark.driver.maxResultSize', '100G').\
                set("spark.executor.extraJavaOptions", "-XX:+UseG1GC").\
                setAppName("PySparkTutorial").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [5]:
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


## Setup ngrok

In [6]:
!pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
!ngrok authtoken 2NVN8kdoOnMVtlDGGWtwsbT5M3Q_2EJv2HE77FEXkz978Qtnq

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [8]:
from pyngrok import ngrok

# Open a ngrok tunnel on the port 4050 where Spark is running
port = '4050'
public_url = ngrok.connect(port).public_url



In [9]:
print("To access the Spark Web UI console, please click on the following link to the ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))

To access the Spark Web UI console, please click on the following link to the ngrok tunnel "https://5d2d-35-185-57-204.ngrok-free.app" -> "http://127.0.0.1:4050"


In [10]:
#@title Check if everything is ok
spark, sc._conf.getAll()


(<pyspark.sql.session.SparkSession at 0x7f2f89b5e050>,
 [('spark.executor.extraJavaOptions',
   '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'),
  ('spark.app.name', 'PySparkTutorial'),
  ('spark.app.id', 'local-1683814269817')

# Data acquisition

In [11]:

song_schema = StructType([
    StructField("pos", IntegerType(), True),
    StructField("artist_name", StringType(), True),
    StructField("track_uri", StringType(), True),
    StructField("artist_uri", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("album_uri", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("album_name", StringType(), True)
])

playlist_schema = StructType([
    StructField("name", StringType(), True),
    StructField("collaborative", StringType(), True),
    StructField("pid", IntegerType(), True),
    StructField("modified_at", IntegerType(), True),
    StructField("num_tracks", IntegerType(), True),
    StructField("num_albums", IntegerType(), True),
    StructField("num_followers", IntegerType(), True),
    StructField("tracks", ArrayType(song_schema), True),
    StructField("num_edits", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("num_artists", IntegerType(), True),
])

audio_features_schema = StructType([
    StructField("danceability", FloatType(), True),
    StructField("energy", FloatType(), True),
    StructField("key", IntegerType(), True),
    StructField("loudness", FloatType(), True),
    StructField("mode", IntegerType(), True),
    StructField("speechiness", FloatType(), True),
    StructField("acousticness", FloatType(), True),
    StructField("instrumentalness", FloatType(), True),
    StructField("liveness", FloatType(), True),
    StructField("valence", FloatType(), True),
    StructField("tempo", FloatType(), True),
    StructField("type", StringType(), True),
    StructField("id", StringType(), True),
    StructField("uri", StringType(), True),
    StructField("track_href", StringType(), True),
    StructField("analysis_url", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("time_signature", IntegerType(), True)
])


In [12]:
playlist_df = spark.read.schema(playlist_schema).json(DATASET_FILE, multiLine=True)
sampled_df = playlist_df.limit(100000)
small_df = playlist_df.limit(1000)
micro_df = playlist_df.limit(100)
slice_df = spark.read.schema(playlist_schema).json(LITTLE_SLICE_FILE, multiLine=True)
audio_df = spark.read.schema(audio_features_schema).json(SPLITTED_SLICE_AUDIO_FEATURES, multiLine=True) #has less songs than expected

In [13]:
# slice_df.select("tracks").first()

In [14]:
slice_df.show()

+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|         name|collaborative| pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|       disney|        false|1000| 1457827200|       189|        16|            1|[{0, Original Bro...|        4|   31428282|         65|
|Indie Electro|        false|1001| 1417824000|       165|        18|            2|[{0, The Octopus ...|        2|   38241566|          8|
|  jack & jack|        false|1002| 1465430400|        17|        14|            1|[{0, Jack & Jack,...|        3|    3549358|          3|
|        vibes|        false|1003| 1498435200|       225|       195|            2|[{0, LANY, spotif...|       91|   51242585|        157|
|        Indie|        false|1004|

# User-Based Collaborative Filtering
Note: The users are the playlists, the items are the songs and the ratings are 0 if the song is not in the playlist, 1 otherwise.

We have to define a function $sim(u,v)$ that defines the similarity between two users based on their ratings.

We represent the ratings $r_u \in \mathbb{R}^n$ as the $n$ dimensional vector that represents the ratings of the user $u$, where $n$ is the number of total songs in the dataset.

As the similarity function we can use Jaccard similarity.
\begin{equation}
sim(u,v) = J(r_u, r_v) = \frac{|r_u \cap r_v|}{|r_u \cup r_v|}
\end{equation}

Jaccard similarity ignores rating values, but we don't care here since the ratings are binary. In case of discrete value ratings we can use cosine similarity, or better pearson's correlation.

Done that, and defined as ${U^k}$ the neighborhood of $u$ ($k$ most similar users to $u$), we define the set of items rated by $u$'s neighborhood as

\begin{equation}
I^k = \{i \in I : \mathbf{r_{u,i}} \downarrow \land u \in U^k\}
\end{equation}

The rating for the item $i$ to the user $u$ will just be $\mathbf{r_u[i]}$.

In [15]:
RATING_VECTOR_FILE_PATH = os.path.join(SAVED_DFS_PATH, "ml_2_playlist_rating_df.parquet")

In [16]:
def dense_to_sparse(dense: DenseVector) -> SparseVector:
  nonzero_indices = np.nonzero(np.array(dense))[0]
  nonzero_values = np.array(dense)[nonzero_indices]
  sparse_vector = SparseVector(len(dense), nonzero_indices.tolist(), nonzero_values.tolist())
  return sparse_vector

In [252]:
def get_all_songs(playlist_df: DataFrame, set_in_playlist: bool = False) -> DataFrame:
   all_songs = playlist_df.select(explode("tracks.track_uri").alias("track_uri")).distinct()
   if set_in_playlist:
     all_songs = all_songs.withColumn("in_playlist", lit(1))
   return all_songs
  
def get_songs_info(playlist_df: DataFrame, set_in_playlist: bool = False) -> DataFrame:
   all_songs = playlist_df.select(explode("tracks")).select("col.*").drop("pos").distinct()
   if set_in_playlist:
     all_songs = all_songs.withColumn("in_playlist", lit(1))
   return all_songs

songs_df = get_all_songs(slice_df)
songs_df = songs_df.withColumn("pos", monotonically_increasing_id())
# songs_df.show()

songs_info_df = get_songs_info(slice_df)
# songs_info_df = songs_info_df.withColumn("pos", monotonically_increasing_id())

songs_info_df = songs_info_df.join(songs_df, "track_uri", "left")

RATING_VECTOR_LENGTH = songs_df.count()

In [254]:
songs_info_df.filter("pos == 30001").show(truncate=False)

+------------------------------------+----------------+-------------------------------------+------------------------+------------------------------------+-----------+-----------+-----+
|track_uri                           |artist_name     |artist_uri                           |track_name              |album_uri                           |duration_ms|album_name |pos  |
+------------------------------------+----------------+-------------------------------------+------------------------+------------------------------------+-----------+-----------+-----+
|spotify:track:11ST7a8iwPZZSRz4g0PgnP|Michael Kiwanuka|spotify:artist:0bzfPKdbXL5ezYW2z3UGQj|Love & Hate - Radio Edit|spotify:album:1IhwCsddYddLs41f6QQDvP|203866     |Love & Hate|30001|
+------------------------------------+----------------+-------------------------------------+------------------------+------------------------------------+-----------+-----------+-----+



In [255]:
songs_info_df.count(), songs_df.count()

(36175, 36175)

In [18]:
def _create_rating_df(playlist_row: Row, songs_df: DataFrame) -> DataFrame:
  """
  Creates a dataframe that represents the "ratings" for a playlist in the dataframe
  """
  playlist_row = spark.createDataFrame([playlist_row], playlist_schema)
  playlist_uris = get_all_songs(playlist_row)

  joined = songs_df.join(playlist_uris, on="track_uri", how="right")
  return joined


def _check_songs_ordering(playlist_row: DataFrame, songs_df: DataFrame) -> bool:
  """
  Returns a boolean that indicates if the ordering in the songs_df and rating_df is the same
  """
  playlist_row = spark.createDataFrame([playlist_row], playlist_schema)
  playlist_uris = get_all_songs(playlist_row, True).withColumnRenamed("in_playlist", "isin")

  joined = songs_df.join(playlist_uris, on="track_uri", how="right")
  joined_left = songs_df.join(playlist_uris, on="track_uri", how="left").filter("isin == 1")
  assert joined.collect() == joined_left.collect(), f"The order of songs_df is different from the order of rating_df!"

# def _extract_rating_vector(rating_df: DataFrame) -> SparseVector:
#   """
#   Extracts the rating vectors for each playlist 
#   """
#   dense_vector = DenseVector([row.isin for row in rating_df.select("isin").collect()])
#   return dense_to_sparse(dense_vector)

def _extrac_sparse_rating_vector(rating_df: DataFrame) -> SparseVector:
  indices = np.sort([row.pos for row in rating_df.collect()])
  return SparseVector(RATING_VECTOR_LENGTH, indices, np.ones(indices.shape[0]) )

def rating_vector_from_row(playlist_row: Row, songs_df: DataFrame):
  """
  Pipelines togheter create_rating_df and extract_rating_vector.
  """
  rating_df_1 = _create_rating_df(playlist_row, songs_df)
  rating_vector_1 = _extrac_sparse_rating_vector(rating_df_1)
  return rating_vector_1

t1 = time.time() 
rating_vector_1 = rating_vector_from_row(slice_df.first(), songs_df)

t2 = time.time()

t2 - t1, rating_vector_1, type(rating_vector_1)

(5.642845153808594,
 SparseVector(36175, {0: 1.0, 153: 1.0, 545: 1.0, 727: 1.0, 910: 1.0, 1095: 1.0, 1414: 1.0, 1763: 1.0, 1764: 1.0, 2129: 1.0, 2677: 1.0, 2678: 1.0, 2880: 1.0, 3389: 1.0, 3559: 1.0, 3726: 1.0, 3913: 1.0, 3914: 1.0, 4102: 1.0, 4283: 1.0, 4646: 1.0, 4647: 1.0, 4824: 1.0, 4993: 1.0, 5512: 1.0, 5684: 1.0, 5685: 1.0, 5864: 1.0, 6048: 1.0, 6248: 1.0, 6249: 1.0, 6423: 1.0, 6424: 1.0, 6770: 1.0, 6927: 1.0, 7128: 1.0, 7282: 1.0, 7460: 1.0, 7461: 1.0, 7462: 1.0, 7463: 1.0, 7638: 1.0, 7789: 1.0, 7790: 1.0, 7965: 1.0, 7966: 1.0, 7967: 1.0, 8125: 1.0, 8126: 1.0, 8309: 1.0, 8686: 1.0, 8687: 1.0, 8852: 1.0, 9050: 1.0, 9051: 1.0, 9215: 1.0, 9385: 1.0, 9386: 1.0, 9738: 1.0, 9924: 1.0, 9925: 1.0, 9926: 1.0, 10287: 1.0, 10510: 1.0, 10691: 1.0, 10692: 1.0, 10885: 1.0, 11072: 1.0, 11073: 1.0, 11448: 1.0, 11649: 1.0, 11650: 1.0, 12217: 1.0, 13494: 1.0, 13674: 1.0, 13675: 1.0, 13676: 1.0, 13677: 1.0, 13857: 1.0, 14223: 1.0, 14224: 1.0, 14419: 1.0, 14795: 1.0, 14796: 1.0, 14797: 1.0, 14798: 

In [19]:
def jaccard_similarity(vector_1: SparseVector, vector_2: SparseVector) -> float:
  # Convert SparseVectors to sets
  set1 = set(vector_1.indices)
  set2 = set(vector_2.indices)

  # Calculate the intersection and union of the sets
  intersection = len(set1.intersection(set2))
  union = len(set1.union(set2))

  # Calculate the similarity
  similarity = intersection / union

  return similarity

In [20]:
def create_rating_vectors_df(playlists_df: DataFrame) -> DataFrame:
  rating_vectors = []

  for playlist_row in tqdm(playlists_df.collect(), desc="Creating rating vectors"):
    rating_vector = rating_vector_from_row(playlist_row, songs_df)
    new_row = Row(playlist_id=playlist_row.pid, rating_vector=rating_vector)
    rating_vectors.append([new_row])
  return spark.createDataFrame(rating_vectors)

if os.path.exists(RATING_VECTOR_FILE_PATH):
  # rv_schema = StructType([StructField('playlist_id', LongType(), True), StructField('rating_vector', pyspark.ml.linalg.VectorUDT(), True)])
  rating_vectors_df = spark.read.parquet(RATING_VECTOR_FILE_PATH)
  rv_df = rating_vectors_df.select(col("_1.playlist_id").alias("playlist_id"), col("_1.rating_vector").alias("rating_vector"))
else:
  rating_vectors_df = create_rating_vectors_df(slice_df)
  rating_vectors_df.write.parquet(RATING_VECTOR_FILE_PATH)

In [21]:
def create_similarity_df(input_vector: DataFrame, rating_vectors_df: DataFrame, similarityFunction: Callable) -> DataFrame:
  rv_df_input = rating_vectors_df.crossJoin(input_vector)
  similarity_udf = udf(similarityFunction, returnType='double')
  result_df = rv_df_input.withColumn("similarity", similarity_udf(rv_df_input["input_vector"], rv_df_input["rating_vector"]))
  return result_df

first_playlist_vector = rv_df.limit(1).select("rating_vector").withColumnRenamed("rating_vector","input_vector")
result_df = create_similarity_df(first_playlist_vector, rv_df, jaccard_similarity)

Curse of dimensionality! We can see that each playlist is very dissimilar from each other playlist.

In [22]:
rv_df.show()

+-----------+--------------------+
|playlist_id|       rating_vector|
+-----------+--------------------+
|       1000|(36175,[0,153,545...|
|       1001|(36175,[336,728,9...|
|       1002|(36175,[1947,4103...|
|       1003|(36175,[154,155,3...|
|       1004|(36175,[1,2,156,1...|
|       1005|(36175,[338,3048,...|
|       1006|(36175,[547,1417,...|
|       1007|(36175,[339,913,9...|
|       1008|(36175,[158,730,1...|
|       1009|(36175,[4650,4998...|
|       1010|(36175,[3,159,160...|
|       1011|(36175,[161,162,5...|
|       1012|(36175,[549,1950,...|
|       1013|(36175,[340,1612,...|
|       1014|(36175,[163,341,1...|
|       1015|(36175,[164,732,1...|
|       1016|(36175,[4,165,342...|
|       1017|(36175,[550,916,1...|
|       1018|(36175,[551,736,1...|
|       1019|(36175,[917,4107,...|
+-----------+--------------------+
only showing top 20 rows



In [75]:
first_playlist_pid = rv_df.limit(1).select("playlist_id")
K = 20
top_k_results = result_df.filter( (col('playlist_id') != 1000)).orderBy(col("similarity").desc()).limit(K)
top_k_results.show()

Making a prediction

In [185]:
# from pyspark.ml.linalg import VectorUDT

# def add_sparse_vectors(accumulator: SparseVector, vector: SparseVector, weight: float) -> SparseVector:
#     accumulator_vec = accumulator.toArray() 
#     array_2 = vector.toArray() * weight

#     summed_array = accumulator_vec + array_2

#     values = [value for value in summed_array if value != 0]
#     sorted_indices = [index for index, value in enumerate(summed_array) if value != 0]
#     return SparseVector(accumulator_vec.size, sorted_indices, values)

# @udf(returnType=VectorUDT())
# def accumulate_sparse_vectors(accumulator, rating_vector, similarity):
#     summed_vector = add_sparse_vectors(accumulator, rating_vector, similarity)
#     return summed_vector

# df = top_k_results.withColumn('accumulated_vector', accumulate_sparse_vectors(top_k_results["rating_vector"], top_k_results["rating_vector"], top_k_results['similarity']))

In [190]:
def accumulate_top_k_results(top_k_results: DataFrame):
  accumulator = np.zeros(RATING_VECTOR_LENGTH)
  top_k = top_k_results.collect()
  for row in top_k:
    accumulator += row.rating_vector.toArray() * row.similarity

  values = [value for value in accumulator if value != 0]
  sorted_indices = [index for index, value in enumerate(accumulator) if value != 0]
  return SparseVector(accumulator.size, sorted_indices, values)

accumulated_vector = accumulate_top_k_results(top_k_results)

In [186]:
# @udf(returnType="int")
# def compute_vector_len(rating_vector):
#     return len(rating_vector.indices)
# final_df = df.withColumn('accumulated_vector_len', compute_vector_len(df["rating_vector"]))

In [187]:
# accumulated_vector = final_df.orderBy(col("accumulated_vector_len").desc()).first().accumulated_vector

In [198]:
def get_top_n_values(vector: SparseVector, n:int) -> List[int]:
  elements = list(enumerate(vector.toArray()))
  sorted_elements = sorted(elements, key=lambda x: x[1], reverse=True)
  top_n_indices = [(index, confidence) for index, confidence in sorted_elements[:n]]
  return top_n_indices

top_n_reccomendations = get_top_n_values(accumulated_vector, 10)

In [199]:
top_n_reccomendations

[(23144, 0.38746786105535674),
 (35366, 0.336245434026654),
 (34013, 0.32934889670085665),
 (21002, 0.31602170502756466),
 (6786, 0.31053805066966406),
 (16204, 0.295313691695305),
 (33729, 0.2889794179645733),
 (14625, 0.28850952772202343),
 (30031, 0.28850952772202343),
 (2129, 0.284991026833473)]

In [256]:
def song_info_from_index(index: int, confidence: float) -> Row:
  song_info = songs_info_df.filter(f"pos == {index}").withColumn("confidence", lit(confidence)).first()
  return song_info

songs_info = [song_info_from_index(index, confidence) for index, confidence in top_n_reccomendations]

In [257]:
reccomendations = spark.createDataFrame(songs_info)

In [265]:
slice_df.filter(f"pid == {first_playlist_pid.first().playlist_id}").show()

+------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|  name|collaborative| pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|disney|        false|1000| 1457827200|       189|        16|            1|[{0, Original Bro...|        4|   31428282|         65|
+------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+



In [258]:
reccomendations.show()

+--------------------+----------------+--------------------+--------------------+--------------------+-----------+--------------------+-----+-------------------+
|           track_uri|     artist_name|          artist_uri|          track_name|           album_uri|duration_ms|          album_name|  pos|         confidence|
+--------------------+----------------+--------------------+--------------------+--------------------+-----------+--------------------+-----+-------------------+
|spotify:track:7tU...|     Jodi Benson|spotify:artist:4m...|Part of Your Worl...|spotify:album:4aA...|     195493|      Little Mermaid|23144|0.38746786105535674|
|spotify:track:3IP...|    Kristen Bell|spotify:artist:2k...|Love Is an Open Door|spotify:album:7lZ...|     124733|              Frozen|35366|  0.336245434026654|
|spotify:track:6bt...| Angela Lansbury|spotify:artist:0L...|Be Our Guest - Fr...|spotify:album:3O5...|     224733|Beauty and the Beast|34013|0.32934889670085665|
|spotify:track:5VI...|     L

#Item-Based Collaborative Filtering


# Fighting against the curse of dimensionality: Matrix Factorization

We want to define $\mathbf{x}_u \in \mathbb{R}^d$ $d$-dimensional vector that represents the user $u$, and $\mathbf{w}_i \in \mathbb{R}^d$ vector that represent the item $i$.

We then can estimate the rating of user $u$ for the item $i$ by computing
\begin{equation}
\hat{r}_{u, i}=\mathbf{x}_u^T \cdot \mathbf{w}_i=\sum_{j=1}^d x_{u, j} w_{j, i}
\end{equation}
Or, in matrix notation,

\begin{equation}
\underbrace{R}_{m \times n} =
\underbrace{X}_{m \times d}
\underbrace{W^T}_{d \times n}
\end{equation}

### How to learn $X$ and $W$
The matrix $R$ is partially known and filled with the observations inside the dataset $\mathcal{D}$. In order to learn the latent factor representations $X$ and $W$, we minimize the following loss function:
\begin{equation}
L(X, W)=\sum_{(u, i) \in \mathcal{D}}\underbrace{\left(r_{u, i}-\mathbf{x}_u^T \cdot \mathbf{w}_i\right)^2}_{\text{squared error term}}+\underbrace{\lambda\left(\sum_{u \in \mathcal{D}}\left\|\mathbf{x}_u\right\|^2+\sum_{i \in \mathcal{D}}\left\|\mathbf{w}_i\right\|^2\right)}_{\text{regularization term}}
\end{equation}

We can then minimize the loss using Stochastic Gradient Descent or Alternating Least Squares.

# Matrix Factorization
Generate a matrix Y where each column represent a playlist and each row represent a song, the (i,j) entry will be 1 if the playlist contains the song, 0 otherwise.

In [26]:
# Throw error in order to not execute the following code
raise ValueError()

ValueError: ignored

In [27]:
import pyspark.sql.functions as f
from pyspark.sql.functions import explode
spark.conf.set("spark.sql.pivotMaxValues", 1000000)

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
# Get a DataFrame of only the relevant columns from the playlist schema
# playlists = slice_df.select("pid", "tracks.track_uri")
# playlists = playlists.select("pid", explode("track_uri").alias("song_uri"))
# playlists = playlists.withColumn("song_id", dense_rank().over(Window.orderBy("song_uri")))
# plaulists = playlists.withColumn("rating", lit(1))
# playlists.count()

from pyspark.sql.functions import expr


In [28]:
from pyspark.sql.functions import explode
import random
tracks_df = slice_df.select("pid", explode("tracks").alias("track")).select("pid", "track.track_uri")
tracks_df = tracks_df.withColumn("rating", lit(1))
# tracks_df = tracks_df.withColumn("rating", (rand() * 10 + 1).cast("integer"))

In [29]:
tracks_df.show()

+----+--------------------+------+
| pid|           track_uri|rating|
+----+--------------------+------+
|1000|spotify:track:5Ib...|     1|
|1000|spotify:track:6rK...|     1|
|1000|spotify:track:6Jl...|     1|
|1000|spotify:track:0Xh...|     1|
|1000|spotify:track:0AB...|     1|
|1000|spotify:track:1iV...|     1|
|1000|spotify:track:0yL...|     1|
|1000|spotify:track:2jx...|     1|
|1000|spotify:track:5eH...|     1|
|1000|spotify:track:0hf...|     1|
|1000|spotify:track:5q6...|     1|
|1000|spotify:track:4Mi...|     1|
|1000|spotify:track:2JE...|     1|
|1000|spotify:track:4AO...|     1|
|1000|spotify:track:5Tw...|     1|
|1000|spotify:track:2dc...|     1|
|1000|spotify:track:07I...|     1|
|1000|spotify:track:5Kw...|     1|
|1000|spotify:track:3ze...|     1|
|1000|spotify:track:3Yi...|     1|
+----+--------------------+------+
only showing top 20 rows



In [None]:
# # Explode the tracks array column into multiple rows
# # tracks_df = slice_df.select("pid", explode("tracks").alias("track"))
# # tracks_df = slice_df.select("pid", "tracks", "tracks")
# tracks_df = slice_df.select("pid", explode("tracks").alias("track")).select("pid", "track.track_uri", "track.pos")

# # Select relevant columns and add a rating column with value 1
# playlist_track_df = tracks_df.withColumn("rating", lit(1))

# # Get distinct track_uri values and join with playlist_track_df
# all_tracks_df = slice_df.select(explode("tracks").alias("track")).select("track.track_uri").distinct()
# all_playlists_df = slice_df.select("pid").distinct()

# all_against_all = all_tracks_df.join(all_playlists_df).distinct()

# from pyspark.sql.functions import when, col

# # playlist_track_rating_df = playlist_track_df.join(all_against_all, ["pid", "track_uri"], "left_outer") \
# #     .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1))

# playlist_track_rating_df = all_against_all.join(playlist_track_df, ["pid", "track_uri"], "left_outer") \
#     .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1)) \
#     .drop("pos")


In [30]:
playlist_track_rating_df = tracks_df.withColumn("song_id", dense_rank().over(Window.orderBy("track_uri")))

In [31]:
playlist_track_rating_df.show(truncate=False)

+----+------------------------------------+------+-------+
|pid |track_uri                           |rating|song_id|
+----+------------------------------------+------+-------+
|1289|spotify:track:000xQL6tZNLJzIrtIgxqSl|1     |1      |
|1011|spotify:track:000xQL6tZNLJzIrtIgxqSl|1     |1      |
|1028|spotify:track:000xQL6tZNLJzIrtIgxqSl|1     |1      |
|1511|spotify:track:000xQL6tZNLJzIrtIgxqSl|1     |1      |
|1627|spotify:track:000xQL6tZNLJzIrtIgxqSl|1     |1      |
|1775|spotify:track:000xQL6tZNLJzIrtIgxqSl|1     |1      |
|1814|spotify:track:000xQL6tZNLJzIrtIgxqSl|1     |1      |
|1000|spotify:track:002PgfoyfrOGiKch4EW8Wm|1     |2      |
|1294|spotify:track:004skCQeDn1iLntSom0rRr|1     |3      |
|1923|spotify:track:004skCQeDn1iLntSom0rRr|1     |3      |
|1487|spotify:track:005CGalYNgMNZcvWMIFeK8|1     |4      |
|1225|spotify:track:005X0FmdtkM1kiutosXLTR|1     |5      |
|1144|spotify:track:009NFK4GeY7xzHCJPrPKdz|1     |6      |
|1070|spotify:track:00AxNl4D4jHL2AEf1W55j5|1     |7     

In [39]:
als = ALS(userCol="pid", itemCol="song_id", ratingCol="rating", nonnegative=True, coldStartStrategy="drop")

In [33]:
from typing import Tuple
import random

def train_test_split(df: DataFrame, split_ratio: float, seed: Optional[int] = None) -> Tuple[DataFrame, DataFrame]:
  random.seed(seed)
  distinct_pids = df.select("pid").distinct().rdd.map(lambda x: x[0]).collect()
  random.shuffle(distinct_pids)
  split_index = int(len(distinct_pids) * split_ratio)
  train_pids = distinct_pids[:split_index]
  test_pids = distinct_pids[split_index:]
  train_df = df.filter(col("pid").isin(train_pids))
  test_df = df.filter(col("pid").isin(test_pids))
  return train_df, test_df



In [40]:
training, test = playlist_track_rating_df.randomSplit([0.8, 0.2], seed=42)

In [41]:
model = als.fit(training)

In [42]:
predictions = model.transform(test)

In [43]:
predictions.show()

+----+--------------------+------+-------+----------+
| pid|           track_uri|rating|song_id|prediction|
+----+--------------------+------+-------+----------+
|1003|spotify:track:7vV...|     1|  35820| 0.8823883|
|1036|spotify:track:44A...|     1|  18911|0.90074456|
|1054|spotify:track:63M...|     1|  28088|  0.900115|
|1070|spotify:track:5lA...|     1|  26708| 0.9001825|
|1072|spotify:track:4S1...|     1|  20683| 0.9001505|
|1143|spotify:track:6yM...|     1|  32414| 0.9000766|
|1177|spotify:track:44A...|     1|  18911| 0.9001289|
|1255|spotify:track:4S1...|     1|  20683|  0.899677|
|1271|spotify:track:6bc...|     1|  30654| 0.8723924|
|1272|spotify:track:44A...|     1|  18911| 0.9001565|
|1288|spotify:track:1Q3...|     1|   6620|0.89832044|
|1289|spotify:track:3JB...|     1|  15447| 0.9000847|
|1289|spotify:track:63M...|     1|  28088|0.89990747|
|1292|spotify:track:5lA...|     1|  26708| 0.8997205|
|1318|spotify:track:3nF...|     1|  17679| 0.9010409|
|1330|spotify:track:4kn...| 

In [44]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

In [None]:
predictions.filter(col("prediction") != "NaN").count(), predictions.filter(col("prediction") == "NaN").count()

In [45]:
rmse

0.1095725395656702

In [67]:
subset = playlist_track_rating_df.select("pid").distinct().limit(1)
subUserRecs = model.recommendForUserSubset(subset, 10)

In [68]:
subset.show()

+----+
| pid|
+----+
|1000|
+----+



In [69]:
subUserRecs.show(truncate=False)

+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pid |recommendations                                                                                                                                                                                             |
+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1000|[{16060, 0.94707656}, {9901, 0.92970276}, {3485, 0.92970276}, {8221, 0.9084438}, {29454, 0.9058341}, {19852, 0.90229636}, {14336, 0.9021202}, {29633, 0.90173846}, {29431, 0.90173846}, {29007, 0.90173846}]|
+----+--------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
def song_name_from_id(song_id: int, reverse_lookup: DataFrame) -> str:
  return 
  
def interpretRecommendation(recommended_result: DataFrame) -> str:
  return

In [None]:
userRecs = model.recommendForAllUsers(1).orderBy("recommendations")
userRecs.show(truncate=False)
userRecs.count()

In [None]:
slice_df.filter(col("pid") == 1710).select(explode("tracks.track_name")).show()

In [None]:
track_uris = playlist_track_rating_df.filter(col("song_id") == 588).select("track_uri")
track_uris.first()