<a href="https://colab.research.google.com/github/DomizianoScarcelli/big-data-project/blob/main/PlaylistReccomender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Config

# Configuration

In [2]:
#@title Download necessary libraries
!pip install pyspark
!pip install -U -q PyDrive 
!apt install openjdk-8-jdk-headless -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
openjdk-8-jdk-headless is already the newest version (8u362-ga-0ubuntu1~20.04.1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [165]:
#@title Imports
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly

import pyspark
from pyspark.sql import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, LongType
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.ml.linalg import SparseVector, DenseVector

from tqdm.notebook import tqdm
import time
import gc

from google.colab import drive

In [4]:
#@title Set up variables
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
GDRIVE_DIR = "/content/drive"
GDRIVE_HOME_DIR = GDRIVE_DIR + "/MyDrive"
GDRIVE_DATA_DIR = GDRIVE_HOME_DIR + "/Big Data/datasets"
DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_friendly_spotify_playlist_dataset")
AUDIO_FEATURES_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_track_features")
LITTLE_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "little_slice")
LITTLE_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "little_slice_audio_features")
MICRO_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "micro_slice_audio_features")
SPLITTED_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "splitted_pyspark_track_features")
SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")
RANDOM_SEED = 42 # for reproducibility
os.environ["JAVA_HOME"] = JAVA_HOME

In [5]:
#@title Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '12G').\
                set('spark.driver.memory', '12G').\
                set('spark.driver.maxResultSize', '100G').\
                set("spark.executor.extraJavaOptions", "-XX:+UseG1GC").\
                setAppName("PySparkTutorial").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [175]:
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


## Setup ngrok

In [7]:
!pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok
  Downloading pyngrok-6.0.0.tar.gz (681 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m681.2/681.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-6.0.0-py3-none-any.whl size=19867 sha256=8075d7e08e473c6ad76acdd31b0f915b304b5d46f8be80c837fbdce95113838e
  Stored in directory: /root/.cache/pip/wheels/5c/42/78/0c3d438d7f5730451a25f7ac6cbf4391759d22a67576ed7c2c
Successfully built pyngrok
Installing collected packages: pyngrok
Successfully installed pyngrok-6.0.0


In [8]:
!ngrok authtoken 2NVN8kdoOnMVtlDGGWtwsbT5M3Q_2EJv2HE77FEXkz978Qtnq

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [9]:
from pyngrok import ngrok

# Open a ngrok tunnel on the port 4050 where Spark is running
port = '4050'
public_url = ngrok.connect(port).public_url



In [10]:
print("To access the Spark Web UI console, please click on the following link to the ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))

To access the Spark Web UI console, please click on the following link to the ngrok tunnel "https://e6af-35-185-57-204.ngrok-free.app" -> "http://127.0.0.1:4050"


In [11]:
#@title Check if everything is ok
spark, sc._conf.getAll()


(<pyspark.sql.session.SparkSession at 0x7f54ec76e3e0>,
 [('spark.executor.extraJavaOptions',
   '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'),
  ('spark.app.name', 'PySparkTutorial'),
  ('spark.app.startTime', '1683794269002'

# Data acquisition

In [12]:

song_schema = StructType([
    StructField("pos", IntegerType(), True),
    StructField("artist_name", StringType(), True),
    StructField("track_uri", StringType(), True),
    StructField("artist_uri", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("album_uri", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("album_name", StringType(), True)
])

playlist_schema = StructType([
    StructField("name", StringType(), True),
    StructField("collaborative", StringType(), True),
    StructField("pid", IntegerType(), True),
    StructField("modified_at", IntegerType(), True),
    StructField("num_tracks", IntegerType(), True),
    StructField("num_albums", IntegerType(), True),
    StructField("num_followers", IntegerType(), True),
    StructField("tracks", ArrayType(song_schema), True),
    StructField("num_edits", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("num_artists", IntegerType(), True),
])

audio_features_schema = StructType([
    StructField("danceability", FloatType(), True),
    StructField("energy", FloatType(), True),
    StructField("key", IntegerType(), True),
    StructField("loudness", FloatType(), True),
    StructField("mode", IntegerType(), True),
    StructField("speechiness", FloatType(), True),
    StructField("acousticness", FloatType(), True),
    StructField("instrumentalness", FloatType(), True),
    StructField("liveness", FloatType(), True),
    StructField("valence", FloatType(), True),
    StructField("tempo", FloatType(), True),
    StructField("type", StringType(), True),
    StructField("id", StringType(), True),
    StructField("uri", StringType(), True),
    StructField("track_href", StringType(), True),
    StructField("analysis_url", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("time_signature", IntegerType(), True)
])


In [13]:
playlist_df = spark.read.schema(playlist_schema).json(DATASET_FILE, multiLine=True)
sampled_df = playlist_df.limit(100000)
small_df = playlist_df.limit(1000)
micro_df = playlist_df.limit(100)
slice_df = spark.read.schema(playlist_schema).json(LITTLE_SLICE_FILE, multiLine=True)
audio_df = spark.read.schema(audio_features_schema).json(SPLITTED_SLICE_AUDIO_FEATURES, multiLine=True) #has less songs than expected

In [14]:
# slice_df.select("tracks").first()

In [15]:
slice_df.show()

+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|         name|collaborative| pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|       disney|        false|1000| 1457827200|       189|        16|            1|[{0, Original Bro...|        4|   31428282|         65|
|Indie Electro|        false|1001| 1417824000|       165|        18|            2|[{0, The Octopus ...|        2|   38241566|          8|
|  jack & jack|        false|1002| 1465430400|        17|        14|            1|[{0, Jack & Jack,...|        3|    3549358|          3|
|        vibes|        false|1003| 1498435200|       225|       195|            2|[{0, LANY, spotif...|       91|   51242585|        157|
|        Indie|        false|1004|

# User-Based Collaborative Filtering
Note: The users are the playlists, the items are the songs and the ratings are 0 if the song is not in the playlist, 1 otherwise.

We have to define a function $sim(u,v)$ that defines the similarity between two users based on their ratings.

We represent the ratings $r_u \in \mathbb{R}^n$ as the $n$ dimensional vector that represents the ratings of the user $u$, where $n$ is the number of total songs in the dataset.

As the similarity function we can use Jaccard similarity.
\begin{equation}
sim(u,v) = J(r_u, r_v) = \frac{|r_u \cap r_v|}{|r_u \cup r_v|}
\end{equation}

Jaccard similarity ignores rating values, but we don't care here since the ratings are binary. In case of discrete value ratings we can use cosine similarity, or better pearson's correlation.

Done that, and defined as ${U^k}$ the neighborhood of $u$ ($k$ most similar users to $u$), we define the set of items rated by $u$'s neighborhood as

\begin{equation}
I^k = \{i \in I : \mathbf{r_{u,i}} \downarrow \land u \in U^k\}
\end{equation}

The rating for the item $i$ to the user $u$ will just be $\mathbf{r_u[i]}$.

In [174]:
RATING_VECTOR_FILE_PATH = os.path.join(SAVED_DFS_PATH, "ml_2_playlist_rating_df.parquet")

In [167]:
def dense_to_sparse(dense: DenseVector) -> SparseVector:
  nonzero_indices = np.nonzero(np.array(dense))[0]
  nonzero_values = np.array(dense)[nonzero_indices]
  sparse_vector = SparseVector(len(dense), nonzero_indices.tolist(), nonzero_values.tolist())
  return sparse_vector

In [168]:
def get_all_songs(playlist_df: DataFrame, set_in_playlist: bool = False) -> DataFrame:
   all_songs = playlist_df.select(explode("tracks.track_uri").alias("uri")).distinct()
   if set_in_playlist:
     all_songs = all_songs.withColumn("in_playlist", lit(1))
   return all_songs
  
songs_df = get_all_songs(slice_df)
songs_df = songs_df.withColumn("pos", monotonically_increasing_id())
songs_df.show()

RATING_VECTOR_LENGTH = songs_df.count()

+--------------------+---+
|                 uri|pos|
+--------------------+---+
|spotify:track:2Y6...|  0|
|spotify:track:5v7...|  1|
|spotify:track:5C5...|  2|
|spotify:track:5WK...|  3|
|spotify:track:0Ry...|  4|
|spotify:track:5N5...|  5|
|spotify:track:4wq...|  6|
|spotify:track:2cI...|  7|
|spotify:track:6EZ...|  8|
|spotify:track:1Oc...|  9|
|spotify:track:27A...| 10|
|spotify:track:2Og...| 11|
|spotify:track:4hY...| 12|
|spotify:track:27P...| 13|
|spotify:track:3sZ...| 14|
|spotify:track:2x7...| 15|
|spotify:track:5ud...| 16|
|spotify:track:3Ms...| 17|
|spotify:track:1jj...| 18|
|spotify:track:1mr...| 19|
+--------------------+---+
only showing top 20 rows



In [171]:
def _create_rating_df(playlist_row: Row, songs_df: DataFrame) -> DataFrame:
  """
  Creates a dataframe that represents the "ratings" for a playlist in the dataframe
  """
  playlist_row = spark.createDataFrame([playlist_row], playlist_schema)
  playlist_uris = get_all_songs(playlist_row)

  joined = songs_df.join(playlist_uris, on="uri", how="right")
  return joined


def _check_songs_ordering(playlist_row: DataFrame, songs_df: DataFrame) -> bool:
  """
  Returns a boolean that indicates if the ordering in the songs_df and rating_df is the same
  """
  playlist_row = spark.createDataFrame([playlist_row], playlist_schema)
  playlist_uris = get_all_songs(playlist_row, True).withColumnRenamed("in_playlist", "isin")

  joined = songs_df.join(playlist_uris, on="uri", how="right")
  joined_left = songs_df.join(playlist_uris, on="uri", how="left").filter("isin == 1")
  assert joined.collect() == joined_left.collect(), f"The order of songs_df is different from the order of rating_df!"

# def _extract_rating_vector(rating_df: DataFrame) -> SparseVector:
#   """
#   Extracts the rating vectors for each playlist 
#   """
#   dense_vector = DenseVector([row.isin for row in rating_df.select("isin").collect()])
#   return dense_to_sparse(dense_vector)

def _extrac_sparse_rating_vector(rating_df: DataFrame) -> SparseVector:
  indices = np.sort([row.pos for row in rating_df.collect()])
  return SparseVector(RATING_VECTOR_LENGTH, indices, np.ones(indices.shape[0]) )

def rating_vector_from_row(playlist_row: Row, songs_df: DataFrame):
  """
  Pipelines togheter create_rating_df and extract_rating_vector.
  """
  rating_df_1 = _create_rating_df(playlist_row, songs_df)
  rating_vector_1 = _extrac_sparse_rating_vector(rating_df_1)
  return rating_vector_1

t1 = time.time() 
rating_vector_1 = rating_vector_from_row(slice_df.first(), songs_df)

t2 = time.time()

t2 - t1, rating_vector_1, type(rating_vector_1)

(1.8183293342590332,
 SparseVector(36175, {0: 1.0, 153: 1.0, 545: 1.0, 727: 1.0, 910: 1.0, 1095: 1.0, 1414: 1.0, 1763: 1.0, 1764: 1.0, 2129: 1.0, 2677: 1.0, 2678: 1.0, 2880: 1.0, 3389: 1.0, 3559: 1.0, 3726: 1.0, 3913: 1.0, 3914: 1.0, 4102: 1.0, 4283: 1.0, 4646: 1.0, 4647: 1.0, 4824: 1.0, 4993: 1.0, 5512: 1.0, 5684: 1.0, 5685: 1.0, 5864: 1.0, 6048: 1.0, 6248: 1.0, 6249: 1.0, 6423: 1.0, 6424: 1.0, 6770: 1.0, 6927: 1.0, 7128: 1.0, 7282: 1.0, 7460: 1.0, 7461: 1.0, 7462: 1.0, 7463: 1.0, 7638: 1.0, 7789: 1.0, 7790: 1.0, 7965: 1.0, 7966: 1.0, 7967: 1.0, 8125: 1.0, 8126: 1.0, 8309: 1.0, 8686: 1.0, 8687: 1.0, 8852: 1.0, 9050: 1.0, 9051: 1.0, 9215: 1.0, 9385: 1.0, 9386: 1.0, 9738: 1.0, 9924: 1.0, 9925: 1.0, 9926: 1.0, 10287: 1.0, 10510: 1.0, 10691: 1.0, 10692: 1.0, 10885: 1.0, 11072: 1.0, 11073: 1.0, 11448: 1.0, 11649: 1.0, 11650: 1.0, 12217: 1.0, 13494: 1.0, 13674: 1.0, 13675: 1.0, 13676: 1.0, 13677: 1.0, 13857: 1.0, 14223: 1.0, 14224: 1.0, 14419: 1.0, 14795: 1.0, 14796: 1.0, 14797: 1.0, 14798:

In [172]:
def jaccard_similarity(vector_1: SparseVector, vector_2: SparseVector) -> float:
  # Convert SparseVectors to sets
  set1 = set(vector_1.indices)
  set2 = set(vector_2.indices)

  # Calculate the intersection and union of the sets
  intersection = len(set1.intersection(set2))
  union = len(set1.union(set2))

  # Calculate the similarity
  similarity = intersection / union

  return similarity

In [182]:
def create_rating_vectors_df(playlists_df: DataFrame) -> DataFrame:
  rating_vectors = []

  for playlist_row in tqdm(playlists_df.collect(), desc="Creating rating vectors"):
    rating_vector = rating_vector_from_row(playlist_row, songs_df)
    new_row = Row(playlist_id=playlist_row.pid, rating_vector=rating_vector)
    rating_vectors.append([new_row])
  return spark.createDataFrame(rating_vectors)

if os.path.exists(RATING_VECTOR_FILE_PATH):
  # rv_schema = StructType([StructField('playlist_id', LongType(), True), StructField('rating_vector', pyspark.ml.linalg.VectorUDT(), True)])
  rating_vectors_df = spark.read.parquet(RATING_VECTOR_FILE_PATH)
  rv_df = rating_vectors_df.select(col("_1.playlist_id").alias("playlist_id"), col("_1.rating_vector").alias("rating_vector"))
else:
  rating_vectors_df = create_rating_vectors_df(slice_df)
  rating_vectors_df.write.parquet(RATING_VECTOR_FILE_PATH)

In [184]:
def create_similarity_df(input_vector: DataFrame, rating_vectors_df: DataFrame, similarityFunction: Callable) -> DataFrame:
  rv_df_input = rating_vectors_df.crossJoin(input_vector)
  similarity_udf = udf(similarityFunction, returnType='double')
  result_df = rv_df_input.withColumn("similarity", similarity_udf(rv_df_input["input_vector"], rv_df_input["rating_vector"]))
  return result_df

first_playlist_vector = rv_df.limit(1).select("rating_vector").withColumnRenamed("rating_vector","input_vector")
result_df = create_similarity_df(first_playlist_vector, rv_df, jaccard_similarity)

Curse of dimensionality! We can see that each playlist is very dissimilar from each other playlist.

In [185]:
rv_df.show()

+-----------+--------------------+
|playlist_id|       rating_vector|
+-----------+--------------------+
|       1000|(36175,[0,153,545...|
|       1001|(36175,[336,728,9...|
|       1002|(36175,[1947,4103...|
|       1003|(36175,[154,155,3...|
|       1004|(36175,[1,2,156,1...|
|       1005|(36175,[338,3048,...|
|       1006|(36175,[547,1417,...|
|       1007|(36175,[339,913,9...|
|       1008|(36175,[158,730,1...|
|       1009|(36175,[4650,4998...|
|       1010|(36175,[3,159,160...|
|       1011|(36175,[161,162,5...|
|       1012|(36175,[549,1950,...|
|       1013|(36175,[340,1612,...|
|       1014|(36175,[163,341,1...|
|       1015|(36175,[164,732,1...|
|       1016|(36175,[4,165,342...|
|       1017|(36175,[550,916,1...|
|       1018|(36175,[551,736,1...|
|       1019|(36175,[917,4107,...|
+-----------+--------------------+
only showing top 20 rows



In [186]:
result_df.filter("similarity > 0").show()

+-----------+--------------------+--------------------+--------------------+
|playlist_id|       rating_vector|        input_vector|          similarity|
+-----------+--------------------+--------------------+--------------------+
|       1000|(36175,[0,153,545...|(36175,[0,153,545...|                 1.0|
|       1048|(36175,[2129,4119...|(36175,[0,153,545...| 0.03286384976525822|
|       1056|(36175,[176,177,7...|(36175,[0,153,545...|0.009836065573770493|
|       1190|(36175,[917,3578,...|(36175,[0,153,545...|0.004761904761904762|
|       1219|(36175,[217,392,5...|(36175,[0,153,545...|0.005917159763313609|
|       1259|(36175,[601,3726,...|(36175,[0,153,545...|0.024752475247524754|
|       1260|(36175,[1476,2380...|(36175,[0,153,545...|0.004784688995215311|
|       1301|(36175,[60,61,983...|(36175,[0,153,545...|            0.078125|
|       1323|(36175,[59,214,37...|(36175,[0,153,545...|0.003095975232198...|
|       1364|(36175,[570,3464,...|(36175,[0,153,545...|0.017699115044247787|

Trying dimensionality reduction with PCA

In [188]:
from pyspark.ml.feature import PCA
pca = PCA(k=10, inputCol="rating_vector")
pca.setOutputCol("pca_features")
model = pca.fit(rv_df)
model.getK()

Py4JJavaError: ignored

#Item-Based Collaborative Filtering


# Fighting against the curse of dimensionality: Matrix Factorization

We want to define $\mathbf{x}_u \in \mathbb{R}^d$ $d$-dimensional vector that represents the user $u$, and $\mathbf{w}_i \in \mathbb{R}^d$ vector that represent the item $i$.

We then can estimate the rating of user $u$ for the item $i$ by computing
\begin{equation}
\hat{r}_{u, i}=\mathbf{x}_u^T \cdot \mathbf{w}_i=\sum_{j=1}^d x_{u, j} w_{j, i}
\end{equation}
Or, in matrix notation,

\begin{equation}
\underbrace{R}_{m \times n} =
\underbrace{X}_{m \times d}
\underbrace{W^T}_{d \times n}
\end{equation}

### How to learn $X$ and $W$
The matrix $R$ is partially known and filled with the observations inside the dataset $\mathcal{D}$. In order to learn the latent factor representations $X$ and $W$, we minimize the following loss function:
\begin{equation}
L(X, W)=\sum_{(u, i) \in \mathcal{D}}\underbrace{\left(r_{u, i}-\mathbf{x}_u^T \cdot \mathbf{w}_i\right)^2}_{\text{squared error term}}+\underbrace{\lambda\left(\sum_{u \in \mathcal{D}}\left\|\mathbf{x}_u\right\|^2+\sum_{i \in \mathcal{D}}\left\|\mathbf{w}_i\right\|^2\right)}_{\text{regularization term}}
\end{equation}

We can then minimize the loss using Stochastic Gradient Descent or Alternating Least Squares.

In [183]:
type(rv_df.first().rating_vector)

pyspark.ml.linalg.SparseVector

In [156]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["rating_vector"], outputCol="features")
df_assembled = assembler.transform(rv_df)

IllegalArgumentException: ignored

# Matrix Factorization
Generate a matrix Y where each column represent a playlist and each row represent a song, the (i,j) entry will be 1 if the playlist contains the song, 0 otherwise.

In [None]:
import torch
import pyspark.sql.functions as f
from pyspark.sql.functions import explode
spark.conf.set("spark.sql.pivotMaxValues", 1000000)

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
# Get a DataFrame of only the relevant columns from the playlist schema
# playlists = slice_df.select("pid", "tracks.track_uri")
# playlists = playlists.select("pid", explode("track_uri").alias("song_uri"))
# playlists = playlists.withColumn("song_id", dense_rank().over(Window.orderBy("song_uri")))
# plaulists = playlists.withColumn("rating", lit(1))
# playlists.count()

from pyspark.sql.functions import expr


In [None]:
from pyspark.sql.functions import explode
import random
tracks_df = slice_df.select("pid", explode("tracks").alias("track")).select("pid", "track.track_uri")
tracks_df = tracks_df.withColumn("rating", lit(1))
# tracks_df = tracks_df.withColumn("rating", (rand() * 10 + 1).cast("integer"))

In [None]:
tracks_df.show()

In [None]:


# Explode the tracks array column into multiple rows
# tracks_df = slice_df.select("pid", explode("tracks").alias("track"))
# tracks_df = slice_df.select("pid", "tracks", "tracks")
tracks_df = slice_df.select("pid", explode("tracks").alias("track")).select("pid", "track.track_uri", "track.pos")

# Select relevant columns and add a rating column with value 1
playlist_track_df = tracks_df.withColumn("rating", lit(1))

# Get distinct track_uri values and join with playlist_track_df
all_tracks_df = slice_df.select(explode("tracks").alias("track")).select("track.track_uri").distinct()
all_playlists_df = slice_df.select("pid").distinct()

all_against_all = all_tracks_df.join(all_playlists_df).distinct()

from pyspark.sql.functions import when, col

# playlist_track_rating_df = playlist_track_df.join(all_against_all, ["pid", "track_uri"], "left_outer") \
#     .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1))

playlist_track_rating_df = all_against_all.join(playlist_track_df, ["pid", "track_uri"], "left_outer") \
    .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1)) \
    .drop("pos")


In [None]:
playlist_track_rating_df = tracks_df.withColumn("song_id", dense_rank().over(Window.orderBy("track_uri")))

In [None]:
playlist_track_rating_df.show()

In [None]:
playlist_track_rating_df.groupBy("rating").count().show()

In [None]:
als = ALS(userCol="pid", itemCol="song_id", ratingCol="rating", nonnegative=True)

In [None]:
training, test = playlist_track_rating_df.randomSplit([0.8, 0.2])
model = als.fit(training)

In [None]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

In [None]:
predictions.show()

In [None]:
predictions.filter(col("prediction") != "NaN").count(), predictions.filter(col("prediction") == "NaN").count()

In [None]:
rmse

In [None]:
userRecs = model.recommendForAllUsers(10).orderBy("recommendations")
userRecs.first()

In [None]:
slice_df.filter(col("pid") == 1710).select(explode("tracks.track_name")).show()

In [None]:
track_uris = playlist_track_rating_df.filter(col("song_id") == 588).select("track_uri")
track_uris.first()