<a href="https://colab.research.google.com/github/DomizianoScarcelli/big-data-project/blob/dev/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuration

Here I configure the environment. Since I alternated from Google Colab to Local development, I define a LOCAL variable that allows me to know in which environment I am. 

In [1]:
import os
def is_running_on_colab():
    return "COLAB_GPU" in os.environ

LOCAL = not is_running_on_colab()

In [2]:
#@title Download necessary libraries
if not LOCAL:
    !pip install pyspark -qq
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

In [3]:
#@title Imports
import requests

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, LongType
from pyspark import SparkConf
from pyspark.ml.linalg import SparseVector, VectorUDT

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split as sklearn_split
import shutil
if not LOCAL:
    from google.colab import drive

from typing import Tuple
from functools import reduce

In [4]:
#@title Set up variables
if not LOCAL:
    JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
    GDRIVE_DIR = "/content/drive"
    GDRIVE_HOME_DIR = GDRIVE_DIR + "/MyDrive"
    GDRIVE_DATA_DIR = GDRIVE_HOME_DIR + "/Big Data/datasets"
    DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_friendly_spotify_playlist_dataset")
    AUDIO_FEATURES_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_track_features")
    LITTLE_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "little_slice")
    SMALL_SLICE_FLIE = os.path.join(GDRIVE_DATA_DIR, "small_slice")
    LITTLE_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "little_slice_audio_features")
    MICRO_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "micro_slice_audio_features")
    SPLITTED_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "splitted_pyspark_track_features")
    SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")
    SAVED_MODELS = os.path.join(GDRIVE_DATA_DIR, "saved_models")
    PRUNED_DF_PATH = os.path.join(GDRIVE_DATA_DIR, "pruned_df_100k")
else:
    GDRIVE_DATA_DIR = os.path.abspath("./data")
    SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")
    SAVED_MODELS = os.path.join(GDRIVE_DATA_DIR, "saved_models")
    DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "full_dataset")
    SMALL_SLICE_FLIE = os.path.join(GDRIVE_DATA_DIR, "small_slice")
    PRUNED_DF_PATH = os.path.join(GDRIVE_DATA_DIR, "pruned_df_100k")
    JAVA_HOME = "/opt/homebrew/opt/openjdk"
RANDOM_SEED = 42 # for reproducibility
os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["PYSPARK_PYTHON"]="python"

In [5]:
#@title Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '12G').\
                set('spark.driver.memory', '12G').\
                set('spark.driver.maxResultSize', '100G').\
                set("spark.executor.extraJavaOptions", "-XX:+UseG1GC").\
                setAppName("PySparkTutorial").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/29 11:26:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
spark = SparkSession.builder.getOrCreate()

In [7]:
if not LOCAL:
    drive.mount(GDRIVE_DIR, force_remount=True)

In [8]:
#@title Check if everything is ok
spark, sc._conf.getAll()

(<pyspark.sql.session.SparkSession at 0x12f8dfc10>,
 [('spark.app.id', 'local-1688030804255'),
  ('spark.executor.extraJavaOptions',
   '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'),
  ('spark.app.name', 'PySparkTutorial'),
 

# Load DataFrame

Define the `DataFrame` schemas and load the primary `DataFrame` containing the 100K playlists. 

In [9]:
song_schema = StructType([
    StructField("pos", IntegerType(), True),
    StructField("artist_name", StringType(), True),
    StructField("track_uri", StringType(), True),
    StructField("artist_uri", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("album_uri", StringType(), True),
    StructField("duration_ms", LongType(), True),
    StructField("album_name", StringType(), True)
])

playlist_schema = StructType([
    StructField("name", StringType(), True),
    StructField("collaborative", StringType(), True),
    StructField("pid", IntegerType(), True),
    StructField("modified_at", IntegerType(), True),
    StructField("num_tracks", IntegerType(), True),
    StructField("num_albums", IntegerType(), True),
    StructField("num_followers", IntegerType(), True),
    StructField("tracks", ArrayType(song_schema), True),
    StructField("num_edits", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("num_artists", IntegerType(), True),
])

playlist_schema_mapped = StructType([
    StructField("name", StringType(), True),
    StructField("collaborative", StringType(), True),
    StructField("pid", IntegerType(), True),
    StructField("modified_at", IntegerType(), True),
    StructField("num_tracks", IntegerType(), True),
    StructField("num_albums", IntegerType(), True),
    StructField("num_followers", IntegerType(), True),
    StructField("tracks", VectorUDT(), True),
    StructField("num_edits", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("num_artists", IntegerType(), True),
])

In [10]:
slice_df = spark.read.schema(playlist_schema).json(SMALL_SLICE_FLIE, multiLine=True)
NUM_PLAYLISTS = slice_df.count()

                                                                                

In [11]:
slice_df.show()

+--------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|          name|collaborative|  pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+--------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|         Ratch|        false|45000| 1508976000|        88|        70|            1|[{0, Beyoncé, spo...|       50|   20047039|         48|
|  slow it down|        false|45001| 1505952000|        80|        77|            1|[{0, Twinbed, spo...|       20|   20365984|         65|
|    Phat Beats|        false|45002| 1466640000|        24|        15|            5|[{0, Baths, spoti...|       16|    5127143|         14|
|           ✌🏽|        false|45003| 1509148800|        77|        63|            3|[{0, Owl City, sp...|       50|   17201663|         54|
|          💘💘|       

Since we will produce many files, each one optimized for each technique, let's define here all the different paths

In [12]:
# The DF used for train (80% of the original) (playlist are different)
TRAIN_DF_PATH = os.path.join(SAVED_DFS_PATH, f"train_df-{NUM_PLAYLISTS}.json")
# The DF used for testing (20% of the original) (playlist are different)
TEST_DF_PATH = os.path.join(SAVED_DFS_PATH, f"test_df-{NUM_PLAYLISTS}.json")

# The DF used for train in the NN model (can be filtered or not)
NN_TRAIN_DF_PATH = os.path.join(SAVED_DFS_PATH, f"nn_train_df-{NUM_PLAYLISTS}.json")
# The DF used for testing in the NN model (can be filtered or not)
NN_TEST_DF_PATH = os.path.join(SAVED_DFS_PATH, f"nn_test_df-{NUM_PLAYLISTS}.json")
# The partition in train test of the NN test set. (Same playlists, different songs)
NN_TEST_DF_TRAIN_PATH = os.path.join(SAVED_DFS_PATH, f"nn_test_df-train-{NUM_PLAYLISTS}.json")
NN_TEST_DF_TEST_PATH = os.path.join(SAVED_DFS_PATH, f"nn_test_df-test-{NUM_PLAYLISTS}.json")

NN_EVAL_PATH = os.path.join(SAVED_DFS_PATH, f"nn_eval_df-{NUM_PLAYLISTS}.json")
NN_EVAL_TRAIN_PATH = os.path.join(SAVED_DFS_PATH, f"nn_eval_df-train-{NUM_PLAYLISTS}.json")
NN_EVAL_TEST_PATH = os.path.join(SAVED_DFS_PATH, f"nn_eval_df-test-{NUM_PLAYLISTS}.json")
# New one:
ARTISTS_EMBEDDINGS_TRAIN = os.path.join(SAVED_DFS_PATH, f"nn_artists_embeddings-train-{NUM_PLAYLISTS}.json")
ARTISTS_EMBEDDINGS_TEST_TRAIN = os.path.join(SAVED_DFS_PATH, f"nn_artists_embeddings-test-train-{NUM_PLAYLISTS}.json")
ARTISTS_EMBEDDINGS_TEST_TEST = os.path.join(SAVED_DFS_PATH, f"nn_artists_embeddings-test-test{NUM_PLAYLISTS}.json")

ARTISTS_EMBEDDINGS_EVAL = os.path.join(SAVED_DFS_PATH, f"nn_artists_embeddings-eval-{NUM_PLAYLISTS}.json")
ARTISTS_EMBEDDINGS_EVAL_TRAIN = os.path.join(SAVED_DFS_PATH, f"nn_artists_embeddings-eval-train-{NUM_PLAYLISTS}.json")
ARTISTS_EMBEDDINGS_EVAL_TEST = os.path.join(SAVED_DFS_PATH, f"nn_artists_embeddings-eval-test{NUM_PLAYLISTS}.json")

# The length of the artist vector length (Artist vectors are only used in the NN model)
ARTIST_VECTOR_LENGTH_PATH = os.path.join(SAVED_DFS_PATH, f"nn_artist_vector_length-{NUM_PLAYLISTS}.txt")

SONGS_VECTOR_LENGTH_PATH = os.path.join(SAVED_DFS_PATH, f"songs_vector_length-{NUM_PLAYLISTS}.txt")
# This may be filtered or not
FILTERED_SONGS_VECTOR_LENGTH_PATH = os.path.join(SAVED_DFS_PATH, f"nn_songs_vector_length-{NUM_PLAYLISTS}.txt")

SONGS_EMBEDDINGS_PATH = os.path.join(SAVED_DFS_PATH, f"songs_embeddings-{NUM_PLAYLISTS}.json")
SONGS_EMBEDDINGS_TRAIN = os.path.join(SAVED_DFS_PATH, f"songs_embeddings-train-{NUM_PLAYLISTS}.json")
SONGS_EMBEDDINGS_TEST = os.path.join(SAVED_DFS_PATH, f"songs_embeddings-test-{NUM_PLAYLISTS}.json")

NN_SONGS_EMBEDDINGS_TRAIN = os.path.join(SAVED_DFS_PATH, f"nn_songs_embeddings-train-{NUM_PLAYLISTS}.json")
NN_SONGS_EMBEDDINGS_TEST_TRAIN = os.path.join(SAVED_DFS_PATH, f"nn_songs_embeddings-test-train-{NUM_PLAYLISTS}.json")
NN_SONGS_EMBEDDINGS_TEST_TEST = os.path.join(SAVED_DFS_PATH, f"nn_songs_embeddings-test-test-{NUM_PLAYLISTS}.json")

NN_SONGS_EMBEDDINGS_EVAL = os.path.join(SAVED_DFS_PATH, f"nn_songs_embeddings-eval-{NUM_PLAYLISTS}.json") #TODO: The logic to produce this still has to be coded.
NN_SONGS_EMBEDDINGS_EVAL_TRAIN = os.path.join(SAVED_DFS_PATH, f"nn_songs_embeddings-eval-train-{NUM_PLAYLISTS}.json")
NN_SONGS_EMBEDDINGS_EVAL_TEST = os.path.join(SAVED_DFS_PATH, f"nn_songs_embeddings-eval-test-{NUM_PLAYLISTS}.json")

SONGS_INFO_DF = os.path.join(SAVED_DFS_PATH, f"songs_info_df-{NUM_PLAYLISTS}.json")
FILTERED_SONGS_INFO_DF = os.path.join(SAVED_DFS_PATH, f"nn_songs_info_df-{NUM_PLAYLISTS}.json")

# Dataset Train-Test split

## Simple Train-Test split
For the user-based and item-based collaborative filtering, the train-test split is done by splitting the songs inside each playlist with a ration of 80% train and 20% test. This means that the playlists are the same for the train and test, but the songs inside are different. In this way we can use the train test to recommend the songs, and use the test set to evaluate the results.

In [13]:


def train_test_split(playlist: Row) -> Tuple[Row, Row]:
    train_rows, test_rows = sklearn_split(playlist.tracks, random_state=42)

    playlist_train =  Row(
            name=playlist.name,
            collaborative=playlist.collaborative,
            pid=playlist.pid,
            modified_at=playlist.modified_at,
            num_tracks=playlist.num_tracks,
            num_albums=playlist.num_albums,
            num_followers=playlist.num_followers,
            tracks=train_rows,
            num_edits=playlist.num_edits,
            duration_ms=playlist.duration_ms,
            num_artists=playlist.num_artists,
        )

    playlist_test = Row(
            name=playlist.name,
            collaborative=playlist.collaborative,
            pid=playlist.pid,
            modified_at=playlist.modified_at,
            num_tracks=playlist.num_tracks,
            num_albums=playlist.num_albums,
            num_followers=playlist.num_followers,
            tracks=test_rows,
            num_edits=playlist.num_edits,
            duration_ms=playlist.duration_ms,
            num_artists=playlist.num_artists,
        )
    
    return playlist_train, playlist_test

In [14]:
def divide_whole_dataset(playlist_df: DataFrame) -> Tuple[DataFrame, DataFrame]:
  train_test_split_udf = F.udf(train_test_split, returnType=ArrayType(StructType(playlist_df.schema.fields)))
  divided_df = playlist_df.withColumn("divided", train_test_split_udf(F.struct(*playlist_df.columns)))
  train_test_df = divided_df.select(F.col('divided').getItem(0).alias('train'), F.col('divided').getItem(1).alias('test'))

  train_df = train_test_df.select("train.*")
  test_df = train_test_df.select("test.*")
  return train_df, test_df

In [15]:
OVERRIDE = False
#If True, it computes the dataframe and save them on the disk, overwriting the old ones
#Otherwise, it loads the dataframes from the disk, if they exist, otherwise it computes it.

In [22]:
if os.path.exists(TRAIN_DF_PATH) and os.path.exists(TEST_DF_PATH) and not OVERRIDE:
  train_df = spark.read.schema(playlist_schema).json(TRAIN_DF_PATH)
  test_df = spark.read.schema(playlist_schema).json(TEST_DF_PATH)
else:
  # In order to avoid [PATH_ALREADY_EXISTS] errors. 
  if os.path.exists(TRAIN_DF_PATH):
    shutil.rmtree(TRAIN_DF_PATH)
  if os.path.exists(TEST_DF_PATH):
    shutil.rmtree(TEST_DF_PATH)

  train_df, test_df = divide_whole_dataset(slice_df)
  train_df, test_df = train_df.cache(), test_df.cache()
  train_df.write.mode("overwrite").json(TRAIN_DF_PATH)
  test_df.write.mode("overwrite").json(TEST_DF_PATH)

## Neural Network Train-Test split
Regarding the Neural Network approach, we cannot use the same train-test split. This because we need a training test that contains some playlists in order to train the model, and then we need a test set with different playlists in order to make the performance evaluation. The test set will also be split with the approach above, meaning some songs will be removes in order to evaluate the recommendations. This approach is needed in order to not evaluate the model with playlists that were in the training set.

### Remove playlist with unpopular songs

Before going into the train and test split, let's take the whole 1M playlist dataset, and remove all the playlists that contain a song that appears less than 10 times in the whole dataset, in order for the Neural Network to better learn the patters.

In [16]:
PRUNE = False

In [17]:
PRUNED_DF_PATH_100K = os.path.join(GDRIVE_DATA_DIR, "pruned_df_100k")
if PRUNE:
    FULL_DATASET_PATH = os.path.join(GDRIVE_DATA_DIR, "full_dataset")
    full_df = spark.read.schema(playlist_schema).json(FULL_DATASET_PATH)

In [18]:
def prune_dataset(df: DataFrame, min_song_count: int) -> DataFrame:
    track_counts = full_df.withColumn("track_uri", F.explode("tracks.track_uri")) \
        .groupBy("track_uri") \
        .agg(F.count("*").alias("count"))

    unpopular_songs = full_df.withColumn("track_uri", F.explode("tracks.track_uri")) \
        .join(track_counts, "track_uri") \
        .filter(f"count <= {min_song_count}") \
        .groupBy("pid") \
        .agg(F.count("*").alias("num_unpopular_songs"))

    result_df = full_df.join(unpopular_songs, "pid", "left")
    result_df = result_df.fillna(0)
    result_df = result_df.filter("num_unpopular_songs == 0")
    return result_df

if PRUNE:
    pruned_df = prune_dataset(full_df, 10)
    tot = pruned_df.count()
    frac = 100_000 / tot
    pruned_df.sample(False,frac, 42).limit(100_000).write.mode("overwrite").json(PRUNED_DF_PATH_100K)

### Train Test split

In [19]:
from typing import List
def nn_train_test_split(playlists: DataFrame, split: List[float], seed: int = 42,) -> Tuple[DataFrame, DataFrame]:
    train_playlists, test_playlists = playlists.randomSplit(split, seed)
    return train_playlists, test_playlists

In [20]:
if not OVERRIDE and os.path.exists(NN_TRAIN_DF_PATH) and os.path.exists(NN_TEST_DF_TRAIN_PATH) and os.path.exists(NN_TEST_DF_TEST_PATH) and os.path.exists(NN_TEST_DF_PATH):
  nn_train_df = spark.read.schema(playlist_schema).json(NN_TRAIN_DF_PATH)
  nn_test_df = spark.read.schema(playlist_schema).json(NN_TEST_DF_PATH)
  nn_test_train_df = spark.read.schema(playlist_schema).json(NN_TEST_DF_TRAIN_PATH)
  nn_test_test_df = spark.read.schema(playlist_schema).json(NN_TEST_DF_TEST_PATH)

  nn_eval_df = spark.read.schema(playlist_schema).json(NN_SONGS_EMBEDDINGS_EVAL)
  nn_eval_train_df = spark.read.schema(playlist_schema).json(NN_SONGS_EMBEDDINGS_EVAL_TRAIN)
  nn_eval_test_df = spark.read.schema(playlist_schema).json(NN_SONGS_EMBEDDINGS_EVAL_TEST)
else:
  # In order to avoid [PATH_ALREADY_EXISTS] errors. 
  if os.path.exists(NN_TRAIN_DF_PATH):
    shutil.rmtree(NN_TRAIN_DF_PATH)
  if os.path.exists(NN_TEST_DF_PATH):
    shutil.rmtree(NN_TEST_DF_PATH)
  if os.path.exists(NN_TEST_DF_TRAIN_PATH):
    shutil.rmtree(NN_TEST_DF_TRAIN_PATH)
  if os.path.exists(NN_TEST_DF_TEST_PATH):
    shutil.rmtree(NN_TEST_DF_TEST_PATH)

  nn_train_no_eval_df, nn_test_df = nn_train_test_split(slice_df, split=[0.99, 0.01], seed=0)
  nn_test_train_df, nn_test_test_df = divide_whole_dataset(nn_test_df)
  nn_train_df, nn_eval_df = nn_train_test_split(nn_train_no_eval_df, split=[0.995, 0.005], seed=0)
  nn_eval_train_df, nn_eval_test_df = divide_whole_dataset(nn_eval_df)
  test_train_df, test_test_df = nn_test_train_df.cache(), nn_test_test_df.cache()
  

  nn_eval_df.write.mode("overwrite").json(NN_EVAL_PATH)
  nn_eval_train_df.write.mode("overwrite").json(NN_EVAL_TRAIN_PATH)
  nn_eval_test_df.write.mode("overwrite").json(NN_EVAL_TEST_PATH)

  nn_train_df.write.mode("overwrite").json(NN_TRAIN_DF_PATH)
  nn_test_df.write.mode("overwrite").json(NN_TEST_DF_PATH)
  nn_test_train_df.write.mode("overwrite").json(NN_TEST_DF_TRAIN_PATH)
  nn_test_test_df.write.mode("overwrite").json(NN_TEST_DF_TEST_PATH)


23/06/29 11:26:57 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [23]:
train_df.show(), test_df.show()

+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|         name|collaborative| pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+-------------+-------------+----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|       disney|        false|1000| 1457827200|       189|        16|            1|[{31, Daughters o...|        4|   31428282|         65|
|Indie Electro|        false|1001| 1417824000|       165|        18|            2|[{117, Boards of ...|        2|   38241566|          8|
|  jack & jack|        false|1002| 1465430400|        17|        14|            1|[{14, Jack & Jack...|        3|    3549358|          3|
|        vibes|        false|1003| 1498435200|       225|       195|            2|[{119, PREP, spot...|       91|   51242585|        157|
|        Indie|        false|1004|

(None, None)

# Data Preparation

## Get the artists vector

In [24]:
#Filter rare should filter songs before the split, this implementation is wrong.
def get_all_artists(playlist_df: DataFrame, filter_rare: bool = False) -> DataFrame:
  """
  Given a playlist dataframe, returns the dataframe containing the unique list of artist_uri that are present in all the playlists.
  """
  if filter_rare:
    all_songs = playlist_df.select(F.explode("tracks.artist_uri").alias("artist_uri")) \
            .groupBy("artist_uri") \
            .agg(F.count("*").alias("count")) \
            .filter(F.col("count") >= 3)
  else:
    all_songs = playlist_df.select(F.explode("tracks.artist_uri").alias("artist_uri")).distinct()
  return all_songs

def create_artists_pos_mapping(playlist_df: DataFrame, filter_rare: bool = False) -> Tuple[DataFrame, int]:
  """
  Given the dataframe of artists_uri, it returns a mapping pos -> artist_uri in order to map each artist_uri to a position inside the embedding vector
  """
  artists_df = get_all_artists(playlist_df, filter_rare)
  artists_df.createOrReplaceTempView("ARTISTS")
  # Creates a POS column that maps each row to a integer from 0 to n, where n is the number of rows.
  artists_df: DataFrame = spark.sql("""
  SELECT 
      row_number() OVER (
          PARTITION BY '' 
          ORDER BY '' 
      ) as pos,
      *
  FROM 
      ARTISTS
  """)
  
  # UDF function fo subtract 1, since the pos starts from 1, but I want it to start from 0
  sub_udf = F.udf(lambda x: x-1, returnType=IntegerType())

  artists_df = artists_df.withColumnRenamed("pos", "old_pos").withColumn("pos", sub_udf(F.col("old_pos"))).drop("old_pos")
  artists_df = artists_df.sort("artist_uri")

  ARTIST_VECTOR_LENGTH = artists_df.count()

  return artists_df, ARTIST_VECTOR_LENGTH


def create_artists_vector(playlist_df: DataFrame, artist_uri_to_id: dict, vector_length) -> DataFrame:
    """
    Returns a DataFrames containing the playlists, but the tracks are represented as a binary sparse vector.
    """

    @F.udf(returnType=VectorUDT())
    def extract_vector(tracks):
      pos_list = set()

      def reduce_fn(pos_list, row):
        if row.artist_uri in artist_uri_to_id:
          pos_list.add(artist_uri_to_id.get(row.artist_uri))
        return pos_list
      
      pos_list = reduce(reduce_fn, tracks, pos_list)
      
      return SparseVector(vector_length, sorted(list(pos_list)), [1 for _ in pos_list])

    # Apply the mapping UDF on the "tracks" column of the slice_df dataframe
    mapped_df = playlist_df.withColumn('tracks', extract_vector(F.col('tracks')))

    return mapped_df

def artist_pipeline(playlist_df: DataFrame, save_name: str) -> DataFrame:
  if os.path.exists(save_name) and not OVERRIDE:
    artists_slice_df = spark.read.schema(playlist_schema_mapped).json(save_name)
    return artists_slice_df
  
  artists_slice_df = create_artists_vector(playlist_df,
                                           artist_uri_to_id, 
                                           FILTERED_ARTIST_VECTOR_LENGTH).cache()
  
  artists_slice_df.write.mode("overwrite").json(save_name)
  
  return artists_slice_df

#I've not implemented the code to not "filter rare" the artist because I don't need it.
filtered_artist_mapping, FILTERED_ARTIST_VECTOR_LENGTH = create_artists_pos_mapping(slice_df, filter_rare=False) #Should be True, but false for now
with open(ARTIST_VECTOR_LENGTH_PATH, "w") as f:
  f.write('%d' % FILTERED_ARTIST_VECTOR_LENGTH)

artist_uri_to_id = filtered_artist_mapping.select('artist_uri', 'pos').rdd.collectAsMap()

artist_slice_df_train = artist_pipeline(nn_train_df, save_name=ARTISTS_EMBEDDINGS_TRAIN)
artist_slice_df_test_train = artist_pipeline(nn_test_train_df, save_name=ARTISTS_EMBEDDINGS_TEST_TRAIN)
artist_slice_df_test_test = artist_pipeline(nn_test_test_df, save_name=ARTISTS_EMBEDDINGS_TEST_TEST)

artist_slice_df_eval = artist_pipeline(nn_eval_df, save_name=ARTISTS_EMBEDDINGS_EVAL)
artist_slice_df_eval_train = artist_pipeline(nn_eval_train_df, save_name=ARTISTS_EMBEDDINGS_EVAL_TRAIN)
artist_slice_df_eval_test = artist_pipeline(nn_eval_test_df, save_name=ARTISTS_EMBEDDINGS_EVAL_TEST)

ERROR:root:KeyboardInterrupt while sending command.                (0 + 8) / 30]
Traceback (most recent call last):
  File "/Users/dov/miniconda3/envs/dl/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/dov/miniconda3/envs/dl/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/dov/miniconda3/envs/dl/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

KeyboardInterrupt: 

                                                                                

In [27]:
filtered_artist_mapping.orderBy("pos").show()

[Stage 49:>                                                         (0 + 1) / 1]

+--------------------+---+
|          artist_uri|pos|
+--------------------+---+
|spotify:artist:1v...|  0|
|spotify:artist:6i...|  1|
|spotify:artist:1X...|  2|
|spotify:artist:73...|  3|
|spotify:artist:1H...|  4|
|spotify:artist:6P...|  5|
|spotify:artist:2q...|  6|
|spotify:artist:2o...|  7|
|spotify:artist:5l...|  8|
|spotify:artist:6t...|  9|
|spotify:artist:0A...| 10|
|spotify:artist:7c...| 11|
|spotify:artist:0c...| 12|
|spotify:artist:7i...| 13|
|spotify:artist:6L...| 14|
|spotify:artist:5I...| 15|
|spotify:artist:34...| 16|
|spotify:artist:47...| 17|
|spotify:artist:1l...| 18|
|spotify:artist:0k...| 19|
+--------------------+---+
only showing top 20 rows



                                                                                

## Get the tracks vector

In [30]:
def get_all_songs(playlist_df: DataFrame, filter_rare: bool = False) -> DataFrame:
  """
  Given a playlist dataframe, returns the dataframe containing the unique list of track_uri that are present in all the playlists.
  """
  if filter_rare:
    all_songs = playlist_df.select(F.explode("tracks.track_uri").alias("track_uri")) \
            .groupBy("track_uri") \
            .agg(F.count("*").alias("count")) \
            .filter(F.col("count") >= 5)
  else:
    all_songs = playlist_df.select(F.explode("tracks.track_uri").alias("track_uri")).distinct()
  return all_songs

def create_songs_pos_mapping(playlist_df: DataFrame, filter_rare: bool = False) -> Tuple[DataFrame, int]:
  """
  Given the dataframe of tracks_uris, it returns a mapping pos -> track_uri in order to map each track_uri to a position inside the embedding vector
  """
  songs_df = get_all_songs(playlist_df, filter_rare)
  songs_df.createOrReplaceTempView("SONGS_INFO")

  songs_df = spark.sql("""
  SELECT 
      row_number() OVER (
          PARTITION BY '' 
          ORDER BY '' 
      ) as pos,
      *
  FROM 
      SONGS_INFO
  """)
  
  sub_udf = F.udf(lambda x: x-1, returnType=IntegerType())
  songs_df = songs_df.withColumnRenamed("pos", "old_pos").withColumn("pos", sub_udf(F.col("old_pos"))).drop("old_pos")

  RATING_VECTOR_LENGTH = songs_df.count()

  return songs_df, RATING_VECTOR_LENGTH


def map_track_df_to_pos(playlist_df: DataFrame, track_uri_to_id: dict, vector_length: int) -> DataFrame:
    """
    Returns a DataFrames containing the playlists, but the tracks are represented as a binary sparse vector.
    """

    @F.udf(returnType=VectorUDT())
    def extract_vector(tracks):
      pos_list = set()

      def reduce_fn(pos_list, row):
        if row.track_uri in track_uri_to_id:
          pos_list.add(track_uri_to_id.get(row.track_uri))
        return pos_list
      
      pos_list = reduce(reduce_fn, tracks, pos_list)
      
      return SparseVector(vector_length, sorted(list(pos_list)), [1 for _ in pos_list])

    # Apply the mapping UDF on the "tracks" column of the slice_df dataframe
    mapped_df = playlist_df.withColumn('tracks', extract_vector(F.col('tracks')))

    return mapped_df

def track_pipeline(playlist_df: DataFrame, save_name: str, filter_rare: bool = False) -> DataFrame:
  if os.path.exists(save_name) and not OVERRIDE:
    song_embeddings_df = spark.read.schema(playlist_schema_mapped).json(save_name)
    return song_embeddings_df

  track_mapping = track_uri_to_id if not filter_rare else filtered_track_uri_to_id
  vector_length = SONG_VECTOR_LENGTH if not filter_rare else FILTERED_SONG_VECTOR_LENGTH
  song_embeddings_df = map_track_df_to_pos(playlist_df, 
                                       track_mapping, 
                                       vector_length).cache()

  song_embeddings_df.write.mode("overwrite").json(save_name)
  
  return song_embeddings_df


song_mapping, SONG_VECTOR_LENGTH = create_songs_pos_mapping(slice_df)
filtered_song_mapping, FILTERED_SONG_VECTOR_LENGTH = create_songs_pos_mapping(slice_df, filter_rare=False)

with open(SONGS_VECTOR_LENGTH_PATH, "w") as f:
  f.write('%d' % SONG_VECTOR_LENGTH)
with open(FILTERED_SONGS_VECTOR_LENGTH_PATH, "w") as f:
  f.write('%d' % FILTERED_SONG_VECTOR_LENGTH)

track_uri_to_id = song_mapping.select('track_uri', 'pos').rdd.collectAsMap()
filtered_track_uri_to_id = filtered_song_mapping.select('track_uri', 'pos').rdd.collectAsMap()

song_mapping.write.mode("overwrite").json(SONGS_INFO_DF)
filtered_song_mapping.write.mode("overwrite").json(FILTERED_SONGS_INFO_DF)

songs_embeddings = track_pipeline(slice_df, save_name=SONGS_EMBEDDINGS_PATH)
songs_slice_df_train = track_pipeline(train_df, save_name=SONGS_EMBEDDINGS_TRAIN)
songs_slice_df_test = track_pipeline(test_df, save_name=SONGS_EMBEDDINGS_TEST)

nn_songs_slice_df_train = track_pipeline(nn_train_df, filter_rare=False, save_name=NN_SONGS_EMBEDDINGS_TRAIN)
nn_songs_slice_df_test_train = track_pipeline(nn_test_train_df, filter_rare=False, save_name=NN_SONGS_EMBEDDINGS_TEST_TRAIN) #This should be True, but for now is False
nn_songs_slice_df_test_test = track_pipeline(nn_test_test_df, filter_rare=False, save_name=NN_SONGS_EMBEDDINGS_TEST_TEST)

nn_songs_slice_df_eval = track_pipeline(nn_eval_df, filter_rare=False, save_name=NN_SONGS_EMBEDDINGS_EVAL)
nn_songs_slice_df_eval_train = track_pipeline(nn_eval_train_df, filter_rare=False, save_name=NN_SONGS_EMBEDDINGS_EVAL_TRAIN) #This should be True, but for now is False
nn_songs_slice_df_eval_test = track_pipeline(nn_eval_test_df, filter_rare=False, save_name=NN_SONGS_EMBEDDINGS_EVAL_TEST)

                                                                                

In [None]:
song_mapping.show(), song_mapping.count()

                                                                                

+--------------------+---+
|           track_uri|pos|
+--------------------+---+
|spotify:track:1mr...|  0|
|spotify:track:1Uv...|  1|
|spotify:track:4WR...|  2|
|spotify:track:7B6...|  3|
|spotify:track:2Gy...|  4|
|spotify:track:7AO...|  5|
|spotify:track:48Z...|  6|
|spotify:track:1Um...|  7|
|spotify:track:7MO...|  8|
|spotify:track:27P...|  9|
|spotify:track:6lt...| 10|
|spotify:track:1yz...| 11|
|spotify:track:5Mz...| 12|
|spotify:track:3BU...| 13|
|spotify:track:4Cl...| 14|
|spotify:track:2dN...| 15|
|spotify:track:341...| 16|
|spotify:track:7ja...| 17|
|spotify:track:4eQ...| 18|
|spotify:track:6fy...| 19|
+--------------------+---+
only showing top 20 rows



                                                                                

(None, 681805)

## Get the Item-based dataframe

In [None]:
def create_playlists_pos_mapping(playlist_df: DataFrame) -> Tuple[DataFrame, int]:
  """
  Returns the dataframe that maps each playlist pid to a position, and the total number of playlists in the dataframe.
  """
  playlist_df.createOrReplaceTempView("PLAYLISTS_INFO")

  playlist_df = spark.sql("""
  SELECT 
      row_number() OVER (
          PARTITION BY '' 
          ORDER BY '' 
      ) as pos,
      *
  FROM 
      PLAYLISTS_INFO
  """)
  
  sub_udf = F.udf(lambda x: x-1, returnType=IntegerType())
  playlist_df = playlist_df.withColumnRenamed("pos", "old_pos").withColumn("pos", sub_udf(F.col("old_pos"))).drop("old_pos")
  playlist_df = playlist_df.select("pid", "pos").sort("pid").cache()
  PLAYLIST_VECTOR_LENGTH = playlist_df.count()
  return playlist_df, PLAYLIST_VECTOR_LENGTH

playlist_map, PLAYLIST_VECTOR_LENGTH = create_playlists_pos_mapping(train_df)

                                                                                

In [None]:
playlist_pid_to_id = playlist_map.rdd.collectAsMap()

def create_playlist_vector(playlist_df: DataFrame) -> DataFrame:
    """
    Returns a DataFrames containing the track uris mapped to a list of playlist. The list of playlists is represented as a binary sparse vector.
    """
    exploded_df = playlist_df.select("tracks", "pid").withColumn("track_uri", F.explode("tracks.track_uri"))
    new_df = exploded_df.groupBy("track_uri").agg(F.collect_list("pid").alias("pid"))

    @F.udf(returnType=VectorUDT())
    def extract_vector(row):
      pos_list = set(playlist_pid_to_id.get(pid) for pid in row)
      
      return SparseVector(NUM_PLAYLISTS, sorted(list(pos_list)), [1 for _ in pos_list])

    # Apply the mapping UDF on the "tracks" column of the slice_df dataframe
    mapped_df = new_df.withColumn('embedding', extract_vector(F.col('pid'))).drop("pid")

    return mapped_df

playlist_mapped = create_playlist_vector(slice_df)

In [None]:
PLAYLIST_MAP_PATH = os.path.join(SAVED_DFS_PATH, f"playlist_map-{NUM_PLAYLISTS}.json")
PID_TO_ID_PATH = os.path.join(SAVED_DFS_PATH, f"playlist_pid_to_id-{NUM_PLAYLISTS}.json")
playlist_mapped.write.mode("overwrite").json(PLAYLIST_MAP_PATH)

                                                                                

# Upload the songs into the Database for the Webapp

In [None]:
def get_all_songs_w_info(playlist_df: DataFrame) -> DataFrame:
  """
  Given a playlist dataframe, returns the dataframe containing the unique list of songs with their relative info withing the entire playlist dataset.
  """
  all_songs = playlist_df.select(F.explode("tracks")).select('col.*').drop("pos").distinct()
  return all_songs

23/06/23 03:12:55 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:
all_songs_w_info = get_all_songs_w_info(slice_df).collect()

                                                                                

In [None]:
BASE_URL = "http://localhost:3004"
CREATE_SONG = f"{BASE_URL}/create-songs-batch"

BATCH_SIZE = 5000
for index in tqdm(range(0, len(all_songs_w_info), BATCH_SIZE)):
  start_index = index + BATCH_SIZE
  end_index = min(len(all_songs_w_info)-1, start_index + BATCH_SIZE)
  songs = all_songs_w_info[start_index:end_index]
  data = []
  for i, row in enumerate(songs):
    body = {
      "id": start_index + i,
      "name": str(row.track_name),
      "artist": str(row.artist_name),
      "album": str(row.album_name),
      "duration": int(row.duration_ms),
      "song_uri": row.track_uri,
      "album_uri": row.album_uri,
      "song_artist_concat": f"{row.track_name} {row.artist_name}"
    }
    data.append(body)
  requests.post(CREATE_SONG, json=data)

  0%|          | 0/137 [00:00<?, ?it/s]