<a href="https://colab.research.google.com/github/DomizianoScarcelli/big-data-project/blob/main/PlaylistReccomender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Config

# Configuration

In [None]:
#@title Download necessary libraries
# Colab-only setup: install the Python packages and the Java 8 runtime
# that the JAVA_HOME setting later in this notebook points to.
!pip install pyspark
!pip install -U -q PyDrive 
!apt install openjdk-8-jdk-headless -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
openjdk-8-jdk-headless is already the newest version (8u362-ga-0ubuntu1~20.04.1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [None]:
#@title Imports
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly

import pyspark
from pyspark.sql import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, LongType
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from google.colab import drive

In [None]:
#@title Set up variables
# Java runtime that Spark should use; exported right away so it is in the
# environment before any JVM is started.
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["JAVA_HOME"] = JAVA_HOME

# Google Drive mount point and the folder holding every dataset.
GDRIVE_DIR = "/content/drive"
GDRIVE_HOME_DIR = f"{GDRIVE_DIR}/MyDrive"
GDRIVE_DATA_DIR = f"{GDRIVE_HOME_DIR}/Big Data/datasets"


def _data_path(name: str) -> str:
    """Return the path of the dataset called `name` inside GDRIVE_DATA_DIR."""
    return os.path.join(GDRIVE_DATA_DIR, name)


# Playlist datasets.
DATASET_FILE = _data_path("pyspark_friendly_spotify_playlist_dataset")
LITTLE_SLICE_FILE = _data_path("little_slice")

# Audio-feature datasets.
AUDIO_FEATURES_FILE = _data_path("pyspark_track_features")
LITTLE_SLICE_AUDIO_FEATURES = _data_path("little_slice_audio_features")
MICRO_SLICE_AUDIO_FEATURES = _data_path("micro_slice_audio_features")
SPLITTED_SLICE_AUDIO_FEATURES = _data_path("splitted_pyspark_track_features")

# Location for persisted intermediate DataFrames.
SAVED_DFS_PATH = _data_path("saved_dfs")

RANDOM_SEED = 42  # for reproducibility

In [None]:
#@title Create the session
# Spark configuration for a single-machine (local) run on Colab.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '5G')
    .set('spark.driver.memory', '12G')
    .set('spark.driver.maxResultSize', '100G')
    .set("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .setAppName("PySparkTutorial")
    .setMaster("local[*]")
)

# Create the context, or reuse the running one. Calling the SparkContext
# constructor directly raises ValueError when this cell is executed a second
# time, because only one SparkContext may be active per process.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

ValueError: ignored

In [None]:
# Mount Google Drive at GDRIVE_DIR; force_remount=True remounts even if it is
# already mounted.
drive.mount(GDRIVE_DIR, force_remount=True)

## Setup ngrok

In [None]:
# pyngrok is the Python wrapper used below to open the ngrok tunnel.
!pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# The ngrok authtoken is a credential and must never be hard-coded in the
# notebook. Read it from the NGROK_AUTHTOKEN environment variable, or prompt
# for it interactively when the variable is not set.
# NOTE(review): the token that was previously committed in this cell is public
# and should be revoked from the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
from pyngrok import ngrok

# Open a ngrok tunnel on the port 4050 where Spark is running
# (this must match the 'spark.ui.port' value used when creating the session).
port = '4050'
# Public URL of the tunnel; printed in the next cell.
public_url = ngrok.connect(port).public_url



In [None]:
# Show the public tunnel URL and the local address it forwards to.
print("To access the Spark Web UI console, please click on the following link to the ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))

To access the Spark Web UI console, please click on the following link to the ngrok tunnel "https://6f1b-34-90-174-193.ngrok-free.app" -> "http://127.0.0.1:4050"


In [None]:
#@title Check if everything is ok
# Display the active session together with its effective configuration,
# using the public getConf() accessor instead of the private _conf attribute.
spark, sc.getConf().getAll()


(<pyspark.sql.session.SparkSession at 0x7f8eb509c340>,
 [('spark.executor.extraJavaOptions',
   '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'),
  ('spark.app.name', 'PySparkTutorial'),
  ('spark.app.submitTime', '1683481593332

# Data acquisition

In [None]:

def _nullable_struct(fields):
    """Build a StructType from (name, data type) pairs, every field nullable."""
    return StructType([StructField(name, data_type, True) for name, data_type in fields])


# One track entry inside a playlist's `tracks` array.
song_schema = _nullable_struct([
    ("pos", IntegerType()),
    ("artist_name", StringType()),
    ("track_uri", StringType()),
    ("artist_uri", StringType()),
    ("track_name", StringType()),
    ("album_uri", StringType()),
    ("duration_ms", LongType()),
    ("album_name", StringType()),
])

# One playlist, with its tracks nested as an array of song_schema structs.
playlist_schema = _nullable_struct([
    ("name", StringType()),
    ("collaborative", StringType()),
    ("pid", IntegerType()),
    ("modified_at", IntegerType()),
    ("num_tracks", IntegerType()),
    ("num_albums", IntegerType()),
    ("num_followers", IntegerType()),
    ("tracks", ArrayType(song_schema)),
    ("num_edits", IntegerType()),
    ("duration_ms", IntegerType()),
    ("num_artists", IntegerType()),
])

# Audio-feature record of a single track.
audio_features_schema = _nullable_struct([
    ("danceability", FloatType()),
    ("energy", FloatType()),
    ("key", IntegerType()),
    ("loudness", FloatType()),
    ("mode", IntegerType()),
    ("speechiness", FloatType()),
    ("acousticness", FloatType()),
    ("instrumentalness", FloatType()),
    ("liveness", FloatType()),
    ("valence", FloatType()),
    ("tempo", FloatType()),
    ("type", StringType()),
    ("id", StringType()),
    ("uri", StringType()),
    ("track_href", StringType()),
    ("analysis_url", StringType()),
    ("duration_ms", LongType()),
    ("time_signature", IntegerType()),
])


In [None]:
def _read_json(schema: StructType, path: str) -> DataFrame:
    """Read the multi-line JSON dataset at `path` using the given schema."""
    return spark.read.schema(schema).json(path, multiLine=True)


# Datasets read from Drive.
playlist_df = _read_json(playlist_schema, DATASET_FILE)
slice_df = _read_json(playlist_schema, LITTLE_SLICE_FILE)
audio_df = _read_json(audio_features_schema, SPLITTED_SLICE_AUDIO_FEATURES)  # has less songs than expected

# Progressively smaller prefixes of the full playlist dataset.
sampled_df = playlist_df.limit(100000)
small_df = playlist_df.limit(1000)
micro_df = playlist_df.limit(100)

In [None]:
# slice_df.select("tracks").first()

In [None]:
# Preview the first rows of the playlist slice.
slice_df.show()

+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|         name|collaborative| pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|       disney|        false|1000| 1457827200|       189|        16|            1|[{0, Original Bro...|        4|   31428282|         65|
|Indie Electro|        false|1001| 1417824000|       165|        18|            2|[{0, The Octopus ...|        2|   38241566|          8|
|  jack & jack|        false|1002| 1465430400|        17|        14|            1|[{0, Jack & Jack,...|        3|    3549358|          3|
|        vibes|        false|1003| 1498435200|       225|       195|            2|[{0, LANY, spotif...|       91|   51242585|        157|
|        Indie|        false|1004|

# User-Based Collaborative Filtering
Note: The users are the playlists, the items are the songs and the ratings are 0 if the song is not in the playlist, 1 otherwise.

We have to define a function $sim(u,v)$ that defines the similarity between two users based on their ratings.

We represent the ratings of user $u$ as the $n$-dimensional vector $r_u \in \mathbb{R}^n$, where $n$ is the total number of songs in the dataset.

As the similarity function we can use Jaccard similarity.
\begin{equation}
sim(u,v) = J(r_u, r_v) = \frac{|r_u \cap r_v|}{|r_u \cup r_v|}
\end{equation}

Jaccard similarity ignores rating values, but that does not matter here because the ratings are binary. With discrete-valued ratings we could use cosine similarity or, better, Pearson's correlation.

Having done that, and denoting by ${U^k}$ the neighborhood of $u$ (the $k$ users most similar to $u$), we define the set of items rated by $u$'s neighborhood as

\begin{equation}
I^k = \{i \in I : \mathbf{r_{v,i}} \downarrow \land v \in U^k\}
\end{equation}

The rating for the item $i$ to the user $u$ will just be $\mathbf{r_u[i]}$.

In [None]:
from pyspark.mllib.linalg import SparseVector, DenseVector

def dense_to_sparse(dense: DenseVector) -> SparseVector:
  """
  Converts a dense vector into a sparse vector of the same length that stores
  only the non-zero entries of the input.
  """
  values = np.array(dense)
  positions = np.nonzero(values)[0]
  return SparseVector(len(dense), positions.tolist(), values[positions].tolist())

In [None]:
def get_all_songs(playlist_df: DataFrame, set_in_playlist: bool) -> DataFrame:
   """
   Returns one row per distinct track uri found in the playlists' `tracks`
   arrays, with a constant `in_playlist` column (1 if set_in_playlist else 0).
   """
   flag = 1 if set_in_playlist else 0
   unique_uris = playlist_df.select(explode("tracks.track_uri").alias("uri")).distinct()
   return unique_uris.withColumn("in_playlist", lit(flag))

# All distinct songs of the slice, each flagged as "not in playlist" (0).
songs_df = get_all_songs(slice_df, False)
# Call show(): without the parentheses the cell only displays the bound method.
songs_df.show()

<bound method DataFrame.show of DataFrame[uri: string, in_playlist: int]>

In [None]:
def create_rating_df(playlist_row: Row, songs_df: DataFrame) -> DataFrame:
    """
    Builds the "ratings" dataframe of a single playlist: one row per song in
    songs_df, with `isin` equal to 1 when the playlist contains the song and
    0 otherwise.
    """
    single_playlist_df = spark.createDataFrame([playlist_row], playlist_schema)
    membership_df = get_all_songs(single_playlist_df, True).withColumnRenamed("in_playlist", "isin")

    # The left join keeps every song of songs_df; songs missing from the
    # playlist get a null `isin`, which fillna turns into 0.
    return (
        songs_df
        .join(membership_df, on="uri", how="left")
        .drop('in_playlist')
        .fillna(0)
    )

def check_songs_ordering(songs_df: DataFrame, rating_df: DataFrame) -> bool:
  """
  Checks that the `uri` column of songs_df and rating_df lists the same songs
  in the same order.

  Returns True when the orderings match, as the annotation promises (the
  function used to fall through and return None).
  Raises AssertionError when the orderings differ.
  """
  songs_uris = [row.uri for row in songs_df.select("uri").collect()]
  rating_uris = [row.uri for row in rating_df.select("uri").collect()]
  assert songs_uris == rating_uris, "The order of songs_df is different from the order of rating_df!"
  return True

def extract_rating_vector(rating_df: DataFrame) -> SparseVector:
  """
  Collects the `isin` column of a rating dataframe and returns it as a sparse
  rating vector.
  """
  flags = [row.isin for row in rating_df.select("isin").collect()]
  return dense_to_sparse(DenseVector(flags))

def rating_vector_from_row(playlist_row: Row, songs_df: DataFrame):
  """
  Returns the sparse rating vector of a single playlist row over songs_df.
  """
  return extract_rating_vector(create_rating_df(playlist_row, songs_df))


# first_playlist = spark.createDataFrame([slice_df.first()], playlist_schema)
# Build the rating dataframe for the first playlist of the slice.
rating_df_1 = create_rating_df(slice_df.first(), songs_df)

# Raises AssertionError if rating_df_1 does not list the songs in the same
# order as songs_df.
check_songs_ordering(songs_df, rating_df_1) # Test

rating_vector_1 = extract_rating_vector(rating_df_1)
# The dense array is computed but not displayed: only the last expression of
# the cell (gc.collect() below) is shown as output.
rating_vector_1.toArray()
import gc
gc.collect()

14

In [None]:
from tqdm.notebook import tqdm
import gc


def jaccard_similarity(vector_1: SparseVector, vector_2: SparseVector) -> float:
  """
  Jaccard similarity between two binary rating vectors:
  |A intersection B| / |A union B|, where A and B are the sets of positions
  holding a non-zero value in vector_1 and vector_2 respectively.

  The previous implementation returned norm_1(vector_1) / norm_2(vector_2),
  which is not the Jaccard index.

  Returns a numpy float64 (a float subclass), so callers that use `.item()`
  on the result keep working. Two empty vectors give 0.0 rather than a
  division by zero.
  """
  # Keep only positions whose stored value is really non-zero, in case a
  # sparse vector carries explicit zeros.
  items_1 = np.asarray(vector_1.indices)[np.asarray(vector_1.values) != 0]
  items_2 = np.asarray(vector_2.indices)[np.asarray(vector_2.values) != 0]

  union_size = np.union1d(items_1, items_2).size
  if union_size == 0:
    return np.float64(0.0)
  intersection_size = np.intersect1d(items_1, items_2).size
  return np.float64(intersection_size / union_size)

def _rating_vector_from_tracks(playlist_row: Row, song_index: dict) -> SparseVector:
  """Binary rating vector of one playlist over the songs listed in song_index."""
  positions = sorted({
      song_index[track.track_uri]
      for track in (playlist_row.tracks or [])
      if track.track_uri is not None and track.track_uri in song_index
  })
  return SparseVector(len(song_index), positions, [1.0] * len(positions))


def create_similarity_df(main_playlist_row: Row, playlists_df: DataFrame, songs_df: DataFrame) -> DataFrame:
  """
  Computes the similarity between main_playlist_row and every playlist in
  playlists_df.

  Returns a dataframe with columns `other_pid` (int) and `similarity` (float),
  one row per playlist of playlists_df.

  The song -> position mapping is collected once from songs_df and every
  rating vector is then built locally from the row's `tracks`. This replaces
  one Spark createDataFrame + join + collect per playlist, and gives every
  vector the same song positions instead of relying on the row order of a join.
  """
  song_index = {row.uri: position for position, row in enumerate(songs_df.select("uri").collect())}
  main_vector = _rating_vector_from_tracks(main_playlist_row, song_index)

  result = []
  for playlist_row in tqdm(playlists_df.collect(), desc="Creating similarity dataframe"):
    other_vector = _rating_vector_from_tracks(playlist_row, song_index)
    # float() accepts both numpy scalars and plain Python floats, so this does
    # not depend on the exact return type of jaccard_similarity.
    similarity = float(jaccard_similarity(main_vector, other_vector))
    result.append(Row(other_pid=playlist_row.pid, similarity=similarity))

  schema = StructType([ StructField("other_pid", IntegerType(), True), StructField("similarity", FloatType(), True) ])
  return spark.createDataFrame(result, schema)

# Similarity of the first playlist of the slice against every playlist of the
# slice (itself included).
simil_df = create_similarity_df(slice_df.first(), slice_df, songs_df)

Creating similarity dataframe:   0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
def create_rating_vectors_df(playlists_df: DataFrame) -> DataFrame:
  """
  Builds a dataframe with one row per playlist of playlists_df, holding the
  columns `playlist_id` and `rating_vector` (sparse binary rating vector).

  Fixes over the previous version:
  - the song universe is taken from the playlists_df argument, not from the
    global slice_df;
  - each Row is appended directly; wrapping it in a list produced a dataframe
    with a single nested struct column.
  """
  rating_vectors = []
  songs_df = get_all_songs(playlists_df, False)

  for playlist_row in tqdm(playlists_df.collect(), desc="Creating rating vectors"):
    rating_df = create_rating_df(playlist_row, songs_df) #TODO: change it to the one liner
    rating_vector = extract_rating_vector(rating_df)
    rating_vectors.append(Row(playlist_id=playlist_row.pid, rating_vector=rating_vector))
    # Release the per-playlist dataframe before the next iteration.
    del rating_df
    del rating_vector
    gc.collect()
  return spark.createDataFrame(rating_vectors)

# Rating vectors for every playlist of the slice.
rating_vectors_df = create_rating_vectors_df(slice_df)


# Item-Based Collaborative Filtering


# Matrix Factorization
Generate a matrix $Y$ where each column represents a playlist and each row represents a song; the $(i,j)$ entry is 1 if playlist $j$ contains song $i$, and 0 otherwise.

In [None]:
import torch
import pyspark.sql.functions as f
from pyspark.sql.functions import explode
# Raise Spark's limit on the number of distinct values allowed in a pivot.
spark.conf.set("spark.sql.pivotMaxValues", 1000000)

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
# Get a DataFrame of only the relevant columns from the playlist schema
# playlists = slice_df.select("pid", "tracks.track_uri")
# playlists = playlists.select("pid", explode("track_uri").alias("song_uri"))
# playlists = playlists.withColumn("song_id", dense_rank().over(Window.orderBy("song_uri")))
# plaulists = playlists.withColumn("rating", lit(1))
# playlists.count()

from pyspark.sql.functions import expr


KeyboardInterrupt: ignored

In [None]:
from pyspark.sql.functions import explode
import random
# One row per (playlist, track) occurrence, each with a constant rating of 1.
tracks_df = (
    slice_df
    .select("pid", explode("tracks").alias("track"))
    .select("pid", "track.track_uri")
    .withColumn("rating", lit(1))
)
# tracks_df = tracks_df.withColumn("rating", (rand() * 10 + 1).cast("integer"))

In [None]:
# Preview the exploded (pid, track_uri, rating) rows.
tracks_df.show()

In [None]:


from pyspark.sql.functions import when, col

# Explode the tracks array column into one row per (playlist, track)
# occurrence. `pos` is kept only as a marker that the track really appears in
# the playlist once the outer join below has been applied.
tracks_df = slice_df.select("pid", explode("tracks").alias("track")).select("pid", "track.track_uri", "track.pos")

# Observed interactions, tagged with rating 1.
playlist_track_df = tracks_df.withColumn("rating", lit(1))

# Distinct tracks and distinct playlists of the slice.
all_tracks_df = slice_df.select(explode("tracks").alias("track")).select("track.track_uri").distinct()
all_playlists_df = slice_df.select("pid").distinct()

# Every (track, playlist) combination. crossJoin states the cartesian product
# explicitly instead of relying on a join without a condition. Both inputs are
# already distinct, so the product contains no duplicate pairs and needs no
# further distinct().
all_against_all = all_tracks_df.crossJoin(all_playlists_df)

# Rating is 1 where the pair was observed (pos is not null) and 0 otherwise.
playlist_track_rating_df = (
    all_against_all
    .join(playlist_track_df, ["pid", "track_uri"], "left_outer")
    .withColumn("rating", when(col("pos").isNull(), 0).otherwise(1))
    .drop("pos")
)


In [None]:
# Give every track a dense integer id, since ALS needs numeric item ids.
# The column is added to playlist_track_rating_df itself: rebuilding the frame
# from tracks_df discarded the `rating` column that the following cells
# (groupBy("rating") and ALS with ratingCol="rating") require.
# NOTE: a window without partitioning moves all rows to a single partition.
playlist_track_rating_df = playlist_track_rating_df.withColumn("song_id", dense_rank().over(Window.orderBy("track_uri")))

In [None]:
# Preview the rating rows that will be fed to ALS.
playlist_track_rating_df.show()

In [None]:
# Number of rows for each rating value.
playlist_track_rating_df.groupBy("rating").count().show()

In [None]:
# ALS with playlists as users and songs as items.
# coldStartStrategy="drop" removes test rows whose playlist or song was not
# seen during training; with the default ("nan") their predictions are NaN,
# which makes the RMSE computed later NaN as well.
# The seed makes the factorization reproducible.
als = ALS(userCol="pid", itemCol="song_id", ratingCol="rating", nonnegative=True,
          coldStartStrategy="drop", seed=RANDOM_SEED)

In [None]:
# 80/20 train/test split, seeded with RANDOM_SEED so that repeated runs use
# the same partitioning.
training, test = playlist_track_rating_df.randomSplit([0.8, 0.2], seed=RANDOM_SEED)
model = als.fit(training)

In [None]:
# Root-mean-square error between the true `rating` and the model `prediction`
# on the held-out split.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

In [None]:
# Preview the predictions on the test split.
predictions.show()

In [None]:
# (number of non-NaN predictions, number of NaN predictions)
predictions.filter(col("prediction") != "NaN").count(), predictions.filter(col("prediction") == "NaN").count()

In [None]:
# Display the RMSE computed on the test split.
rmse

In [None]:
# Top 10 recommended songs for every playlist (the ALS "users"), ordered by
# the `recommendations` column; display the first row.
userRecs = model.recommendForAllUsers(10).orderBy("recommendations")
userRecs.first()

In [None]:
# List the track names of the playlist with pid 1710.
slice_df.filter(col("pid") == 1710).select(explode("tracks.track_name")).show()

In [None]:
# Look up the track uri behind song_id 588 and display the first match.
track_uris = playlist_track_rating_df.filter(col("song_id") == 588).select("track_uri")
track_uris.first()