#Instalando Dependencias

Instalando Spark

In [None]:
import requests
import subprocess
import os
import re
import socket
import shutil
import time
import sys

def run(cmd):
    """Run a shell command and return the last line of its output.

    stderr is merged into stdout, so the returned line may come from either
    stream. The line is also printed with a check-mark prefix.

    Args:
        cmd: Command passed to the shell (``shell=True``).

    Returns:
        The last line of the stripped output, ``""`` when the command
        printed nothing, or ``None`` when the command exited with a
        non-zero status (the failure is reported on stdout).
    """
    try:
        # Run the command and capture stdout (stderr is redirected to stdout)
        completed = subprocess.run(
            cmd,
            shell=True,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        # The command returned a non-zero exit code
        print(f"Command failed with return code {e.returncode}")
        print("stdout:", e.stdout)
        return None

    # A silent command produces no lines at all; indexing [-1] would raise
    output_lines = completed.stdout.strip().splitlines()
    stdout_result = output_lines[-1] if output_lines else ""
    print(f'✅ {stdout_result}')
    return stdout_result

def is_java_installed():
    """Look up the ``java`` executable on the PATH.

    Returns:
        The path reported by ``shutil.which`` (truthy), or ``None`` when no
        ``java`` executable is found.
    """
    java_executable = shutil.which("java")
    return java_executable

def install_java():
    """Install a headless OpenJDK JRE with ``apt`` and set ``JAVA_HOME``.

    ``JAVA_HOME`` is exported in ``os.environ`` before the installation is
    attempted. A failing ``apt`` command is reported on stdout; no exception
    is propagated.
    """
    # Other packages that can be used instead (JAVA_HOME must then be
    # changed to the matching directory):
    #   'openjdk-11-jre-headless', 'default-jre',
    #   'openjdk-17-jre-headless', 'openjdk-18-jre-headless'
    java_version = 'openjdk-19-jre-headless'
    # The path must not carry surrounding whitespace (the previous value
    # started with a stray space).
    os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-19-openjdk-amd64'
    print(f"Java not found. Installing {java_version} ... (this might take a while)")
    try:
        cmd = f"apt install -y {java_version}"
        # Output is captured only to keep the notebook cell quiet
        subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
        print(f'✅ Done installing Java {java_version}')
    except subprocess.CalledProcessError as e:
        # The command returned a non-zero exit code
        print(f"Command failed with return code {e.returncode}")
        print("stdout:", e.stdout)

# Step 0: make sure a Java runtime is available before Spark is set up
print("\n0️⃣   Install Java if not available")
if not is_java_installed():
    install_java()
else:
    print("✅ Java is already installed.")

print("\n1️⃣   Download and install Hadoop and Spark")
# URL for downloading Hadoop and Spark
SPARK_VERSION = "3.5.1"
HADOOP_SPARK_URL = "https://dlcdn.apache.org/spark/spark-" + SPARK_VERSION + \
                   "/spark-" + SPARK_VERSION + "-bin-hadoop3.tgz"

# Probe the URL before downloading. The timeout keeps the cell from hanging
# on an unresponsive mirror, and network errors are reported instead of
# aborting the whole cell (the wget step below reports its own failure).
try:
    r = requests.head(HADOOP_SPARK_URL, timeout=30)
    url_found = 200 <= r.status_code < 400
except requests.RequestException as e:
    print(f"⚠️ Could not reach {HADOOP_SPARK_URL}: {e}")
    url_found = False

if url_found:
    print(f'✅ {HADOOP_SPARK_URL} was found')
else:
    SPARK_CDN = "https://dlcdn.apache.org/spark/"
    print(f'⚠️ {HADOOP_SPARK_URL} was NOT found. \nCheck for available Spark versions in {SPARK_CDN}')

# set some environment variables
# SPARK_HOME is <cwd>/<archive name without its extension>
os.environ['SPARK_HOME'] = os.path.join(os.getcwd(), os.path.splitext(os.path.basename(HADOOP_SPARK_URL))[0])
# prepend Spark's bin and sbin directories to PATH (sbin ends up first)
os.environ['PATH'] = os.pathsep.join([os.path.join(os.environ['SPARK_HOME'], 'bin'), os.environ['PATH']])
os.environ['PATH'] = os.pathsep.join([os.path.join(os.environ['SPARK_HOME'], 'sbin'), os.environ['PATH']])

# download Spark
# using --no-clobber option will prevent wget from downloading file if already present
# shell command: wget --no-clobber $HADOOP_SPARK_URL
cmd = f"wget --no-clobber {HADOOP_SPARK_URL}"
run(cmd)

# uncompress
try:
    # Skip extraction when the target folder (archive name without its last
    # extension) already exists; otherwise untar the downloaded archive.
    # Raw string: the sed expression contains a backslash (\.) that must reach
    # the shell untouched and is not a valid Python escape sequence.
    cmd = r"([ -d $(basename {0}|sed 's/\.[^.]*$//') ] && echo -n 'Folder already exists') || (tar xzf $(basename {0}) && echo 'Uncompressed Spark distribution')"
    subprocess_output = subprocess.run(cmd.format(HADOOP_SPARK_URL), shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    # Access stdout (stderr redirected to stdout)
    stdout_result = subprocess_output.stdout
    print(f'✅ {stdout_result}')

except subprocess.CalledProcessError as e:
    # The command returned a non-zero exit code
    print(f"Command failed with return code {e.returncode}")
    print("stdout:", e.stdout)


print("\n2️⃣   Start Spark engine")
# (re)start the master: stop it first in case it's already running
# shell commands: $SPARK_HOME/sbin/stop-master.sh
#                 $SPARK_HOME/sbin/start-master.sh
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'stop-master.sh')
run(cmd)
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'start-master.sh')
# keep the last output line of start-master.sh; the following step parses it
out = run(cmd)

# start one worker (first stop it in case it's already running)
# shell commands: $SPARK_HOME/sbin/stop-worker.sh
#                 $SPARK_HOME/sbin/start-worker.sh spark://${HOSTNAME}:7077
# The command is a plain string like every other call to run(): with
# shell=True, a list would hand any extra items to the shell itself rather
# than to the script.
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'stop-worker.sh')
run(cmd)
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'start-worker.sh') + ' ' + 'spark://'+socket.gethostname()+':7077'
run(cmd)

print("\n3️⃣   Start Master Web UI")
# Get the master UI's port number.
# The subprocess that's starting the master with start-master.sh might not
# have assigned the port yet at this point, so the log file is polled a few
# times (see `attempts`), sleeping 2 seconds between tries.

# `out` is the last output line of start-master.sh ("... logging to <file>");
# it is None when that command failed.
master_log = out.partition("logging to")[2].strip() if out else ""
print("Search for port number in log file {}".format(master_log))
attempts = 10
search_pattern = r"Successfully started service 'MasterUI' on port (\d+)"
found = None
webUIport = None
if master_log:
    for i in range(attempts):
        try:
            with open(master_log) as log:
                found = re.search(search_pattern, log.read())
        except OSError:
            # the log file may not have been created yet; retry
            found = None
        if found:
            webUIport = found.group(1)
            print(f"✅ Master UI is available at localhost:{webUIport} (attempt nr. {i})")
            break
        time.sleep(2)  # need to try until port information is found in the logfile
if not found:
    print("Could not find port for Master Web UI\n")

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # imported unconditionally on Colab: later cells use `output` as well
    from google.colab import output
    if webUIport is not None:
        # serve the Web UI on Colab
        print("Click on the link below to open the Spark Web UI 🚀")
        output.serve_kernel_port_as_window(webUIport)

print("\n4️⃣   Start history server")
# start history server
# shell command: mkdir -p /tmp/spark-events
# shell command: $SPARK_HOME/sbin/start-history-server.sh
spark_events_dir = os.path.join('/tmp', 'spark-events')
# same semantics as `mkdir -p`: no error if the directory already exists
os.makedirs(spark_events_dir, exist_ok=True)
# stop a possibly running instance first, then start a fresh one
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'stop-history-server.sh')
run(cmd)
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'start-history-server.sh')
run(cmd)

if IN_COLAB:
    # serve the History Server
    print("Click on the link below to open the Spark History Server Web UI 🚀")
    output.serve_kernel_port_as_window(18080)


0️⃣   Install Java if not available
✅ Java is already installed.

1️⃣   Download and install Hadoop and Spark
✅ https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz was found
✅ File ‘spark-3.5.1-bin-hadoop3.tgz’ already there; not retrieving.
✅ Folder already exists

2️⃣   Start Spark engine
✅ stopping org.apache.spark.deploy.master.Master
✅ starting org.apache.spark.deploy.master.Master, logging to /content/spark-3.5.1-bin-hadoop3/logs/spark--org.apache.spark.deploy.master.Master-1-20866ca1a172.out
✅ stopping org.apache.spark.deploy.worker.Worker
✅ starting org.apache.spark.deploy.worker.Worker, logging to /content/spark-3.5.1-bin-hadoop3/logs/spark--org.apache.spark.deploy.worker.Worker-1-20866ca1a172.out

3️⃣   Start Master Web UI
Search for port number in log file /content/spark-3.5.1-bin-hadoop3/logs/spark--org.apache.spark.deploy.master.Master-1-20866ca1a172.out
✅ Master UI is available at localhost:8081 (attempt nr. 4)
Click on the link below to open the Spark Web UI 🚀

<IPython.core.display.Javascript object>


4️⃣   Start history server
✅ stopping org.apache.spark.deploy.history.HistoryServer
✅ starting org.apache.spark.deploy.history.HistoryServer, logging to /content/spark-3.5.1-bin-hadoop3/logs/spark--org.apache.spark.deploy.history.HistoryServer-1-20866ca1a172.out
Click on the link below to open the Spark History Server Web UI 🚀


<IPython.core.display.Javascript object>

In [None]:
pip install opendatasets



In [None]:
pip install pyspark



#Preprocesamiento

In [None]:
import opendatasets as od
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
import os
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Fetch the Kaggle "spotify-playlists" dataset with opendatasets
dataset_url = "https://www.kaggle.com/datasets/andrewmvd/spotify-playlists"
od.download(dataset_url)

Skipping, found downloaded files in "./spotify-playlists" (use force=True to force download)


In [None]:
# Set the Spark master URL and other Spark settings
#os.environ['PYSPARK_SUBMIT_ARGS'] = '--master local[*] --executor-memory 4G --num-executors 4 pyspark-shell'
conf = SparkConf(loadDefaults=True)
conf.setMaster("local").setAppName("sptifyApp")
# getOrCreate reuses an already running context, so re-running this cell does
# not fail the way calling the SparkContext constructor a second time would
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Obtain the SparkSession through the builder's getOrCreate()
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [None]:
# Set the session's spark.sql.pivotMaxValues option
spark.conf.set(
    "spark.sql.pivotMaxValues",
    2_200_000,
)

In [None]:
# Bare expression: the notebook renders the session object as the cell output
spark

##Limpieza de datos

In [None]:
# Load the raw CSV (first line is the header) and persist it as parquet
raw_df = spark.read.option("header", "true").csv("spotify-playlists//spotify_dataset.csv")
raw_df.write.mode('overwrite').parquet('df.parquet')

# Continue from the parquet copy: drop rows with nulls and exact duplicates
df = spark.read.parquet('df.parquet')
df = df.dropna().drop_duplicates()

# Strip spaces and double quotes from the column names
clean_names = [name.replace(' ', '').replace('"', '') for name in df.columns]
df = df.toDF(*clean_names)


df.head()


Row(user_id='0dec62b08bd5ef26e79851d532f8ef82', artistname='Salif Keita', trackname='Bolon', playlistname='Dive Bar')

##Creando dimensiones separadas para playlists, artistas, canciones y usuarios, reconstruyendo la matriz de reproducciones con los indices de dichas dimensiones.

In [None]:
# Dimension tables, keyed by the name of the column they were built from
dims = {}


def df_dim(df, input_col):
    """Build the dimension table for ``input_col`` and store it in ``dims``.

    The table holds the distinct values of the column plus an
    ``<input_col>_index`` column, numbered with ``row_number`` over a window
    ordered by the column itself.
    """
    index_col = f"{input_col}_index"
    ordered_window = Window.orderBy(input_col)
    distinct_values = df.select(input_col).distinct()
    dims[input_col] = distinct_values.withColumn(index_col, fn.row_number().over(ordered_window))


# One dimension table per column of the cleaned DataFrame
for col_name in df.columns:
    df_dim(df, col_name)

for dim in dims.values():
    print(dim.head(10))

# Replace every original column by its index: join against the dimension
# table (value column renamed to <name>_base to avoid a name clash), then
# drop both value columns so only <name>_index remains.
newdf = df
for col_name in df.columns:
    base_name = col_name + '_base'
    lookup = dims[col_name].withColumnRenamed(col_name, base_name)
    joined = newdf.join(lookup, fn.col(col_name) == fn.col(base_name))
    newdf = joined.drop(col_name).drop(base_name)

print(newdf.head(10))

[Row(user_id='00055176fea33f6e027cd3302289378b', user_id_index=1), Row(user_id='0007f3dd09c91198371454c608d47f22', user_id_index=2), Row(user_id='000b0f32b5739f052b9d40fcc5c41079', user_id_index=3), Row(user_id='000c11a16c89aa4b14b328080f5954ee', user_id_index=4), Row(user_id='00123e0f544dee3ab006aa7f1e5725a7', user_id_index=5), Row(user_id='00139e9cb50fb309549e1561b476226d', user_id_index=6), Row(user_id='00152c870313100559aad7b097d9c1f5', user_id_index=7), Row(user_id='00154ec9dd1acd4ebfb521629dcb3948', user_id_index=8), Row(user_id='001599a07cb8ef5f114a9fcf4e0e2757', user_id_index=9), Row(user_id='0019363a0d57e94d39988c31eeb8d015', user_id_index=10)]
[Row(artistname=' Dolce', artistname_index=1), Row(artistname=' OneVoice', artistname_index=2), Row(artistname='!!!', artistname_index=3), Row(artistname='!!! (Chk Chk Chk)', artistname_index=4), Row(artistname='!!! Chk Chik Chick', artistname_index=5), Row(artistname='!ATTENTION!', artistname_index=6), Row(artistname='!DELADAP', artist

##Matriz de reproducciones según el artista y normalización de los datos.

In [None]:
# Number of rows per (user, artist) pair, stored as "reproductions"
user_artist_groups = newdf.groupBy("user_id_index", "artistname_index")
counts_df = user_artist_groups.agg(fn.count("*").alias("reproductions"))

In [None]:
# Compute both bounds in a single aggregation, so counts_df is evaluated
# once instead of running one full job for the max and another for the min
bounds = counts_df.agg(
    fn.max("reproductions").alias("max_reproduction"),
    fn.min("reproductions").alias("min_reproduction"),
).first()
max_reproduction = bounds["max_reproduction"]
min_reproduction = bounds["min_reproduction"]

# Min-max scaling to [0, 1]. When every count is identical the range is zero
# and the formula is undefined, so every row is mapped to 0.0 instead.
# NOTE(review): counts equal to the minimum map to 0.0; the ALS model further
# down uses implicitPrefs=True, where a zero rating may be treated as "no
# preference" — confirm this is intended.
reproduction_range = max_reproduction - min_reproduction
if reproduction_range == 0:
    normalized_value = fn.lit(0.0)
else:
    normalized_value = (fn.col("reproductions") - min_reproduction) / reproduction_range
normalized_pl_counts_df = counts_df.withColumn("normalized_reproduction", normalized_value)
normalized_pl_counts_df.head(10)

[Row(user_id_index=1, artistname_index=2499, reproductions=10, normalized_reproduction=0.0026905829596412557),
 Row(user_id_index=1, artistname_index=5070, reproductions=1, normalized_reproduction=0.0),
 Row(user_id_index=1, artistname_index=11192, reproductions=8, normalized_reproduction=0.0020926756352765323),
 Row(user_id_index=1, artistname_index=20733, reproductions=1, normalized_reproduction=0.0),
 Row(user_id_index=1, artistname_index=21137, reproductions=2, normalized_reproduction=0.00029895366218236175),
 Row(user_id_index=1, artistname_index=26209, reproductions=1, normalized_reproduction=0.0),
 Row(user_id_index=1, artistname_index=32666, reproductions=1, normalized_reproduction=0.0),
 Row(user_id_index=1, artistname_index=37366, reproductions=1, normalized_reproduction=0.0),
 Row(user_id_index=1, artistname_index=47981, reproductions=1, normalized_reproduction=0.0),
 Row(user_id_index=1, artistname_index=85883, reproductions=3, normalized_reproduction=0.0005979073243647235)

#Sistema de recomendación de artistas

In [None]:
# Random 80/20 split of the normalized (user, artist) rows
training, test = normalized_pl_counts_df.randomSplit([0.8, 0.2])

# ALS model trained on the normalized play counts
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="user_id_index",
    itemCol="artistname_index",
    ratingCol="normalized_reproduction",
    coldStartStrategy="drop",
    implicitPrefs=True,
)
model = als.fit(training)

# RMSE between the model's predictions and the normalized counts of the test split
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="normalized_reproduction",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

print("Root Mean Squared Error (RMSE) = " + str(rmse))

# Top 10 recommendations per user
userRecs = model.recommendForAllUsers(10)
userRecs.head(10)

Root Mean Squared Error (RMSE) = 0.12806876585364213


[Row(user_id_index=1, recommendations=[Row(artistname_index=163412, rating=0.14453428983688354), Row(artistname_index=135095, rating=0.1390984207391739), Row(artistname_index=112985, rating=0.12903881072998047), Row(artistname_index=51082, rating=0.12721602618694305), Row(artistname_index=210588, rating=0.11653812974691391), Row(artistname_index=76809, rating=0.11591964215040207), Row(artistname_index=243033, rating=0.11045467108488083), Row(artistname_index=190699, rating=0.10443546622991562), Row(artistname_index=80113, rating=0.10440066456794739), Row(artistname_index=188137, rating=0.10249216109514236)]),
 Row(user_id_index=2, recommendations=[Row(artistname_index=112985, rating=0.07352685183286667), Row(artistname_index=177187, rating=0.0678005963563919), Row(artistname_index=89915, rating=0.06556925922632217), Row(artistname_index=51082, rating=0.061343226581811905), Row(artistname_index=101031, rating=0.05874035134911537), Row(artistname_index=85883, rating=0.05599679425358772),

In [None]:
# Round-trip the test split through parquet and continue from the stored copy
# (presumably to avoid recomputing the split's lineage — TODO confirm)
test_writer = test.write.mode('overwrite')
test_writer.parquet('test_data.parquet')

test = spark.read.parquet('test_data.parquet')

In [None]:
# 5% sample of the test split, without replacement, with a fixed seed
sample_options = dict(withReplacement=False, fraction=0.05, seed=42)
sampled_test = test.sample(**sample_options)

In [None]:
# Distinct users of the sampled test set, persisted to parquet
distinct_users = sampled_test.select('user_id_index').distinct()
distinct_users.write.mode('overwrite').parquet('user_ids.parquet')

# Read them back and collect the ids into a plain Python list
user_ids = spark.read.parquet('user_ids.parquet')
user_ids_list = [row[0] for row in user_ids.collect()]

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, expr, lit
from pyspark.sql import DataFrame
import numpy as np


# Recommend the top K artists for a given user
def recommend_top_k_artists(user_id_index, model, data, k=10):
    """Return the ``k`` artist indices with the highest prediction for a user.

    The candidates are the distinct ``artistname_index`` values found in
    ``data``; each is paired with ``user_id_index`` and scored with
    ``model.transform``.

    Returns:
        A list of at most ``k`` artist indices, best prediction first.
    """
    candidates = (
        data.select('artistname_index')
        .distinct()
        .withColumn('user_id_index', lit(user_id_index))
    )
    scored = model.transform(candidates)
    best_rows = scored.orderBy('prediction', ascending=False).limit(k).collect()
    return [row['artistname_index'] for row in best_rows]

# Compute Precision@K
def precision_at_k(recommended_items, relevant_items, k):
    """Return the fraction of the top-``k`` recommendations that are relevant.

    Args:
        recommended_items: Recommended item ids, best first.
        relevant_items: Item ids considered relevant for the user.
        k: Cut-off; only the first ``k`` recommendations are considered.

    Returns:
        A float in ``[0, 1]``: distinct relevant items among the first ``k``
        recommendations, divided by ``k``.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    # Truncate to the cut-off so a longer recommendation list cannot push the
    # score above 1
    top_k = set(list(recommended_items)[:k])
    return len(top_k & set(relevant_items)) / k

# Compute average precision (the per-user term of MAP)
def average_precision(actual, predicted):
    """Return the average precision of ``predicted`` against ``actual``.

    Each relevant item counts once, at its first position in ``predicted``.
    The sum of precisions at the hit positions is divided by
    ``min(len(actual), len(predicted))``.

    Args:
        actual: Relevant item ids (hashable).
        predicted: Recommended item ids, best first.

    Returns:
        The average precision, or ``0.0`` when either input is empty.
    """
    # An empty `predicted` would otherwise divide by zero below
    if not actual or not predicted:
        return 0.0

    relevant = set(actual)
    seen = set()  # items already scored, so duplicates are not counted twice
    num_hits = 0.0
    score = 0.0
    for rank, item in enumerate(predicted, start=1):
        if item in relevant and item not in seen:
            num_hits += 1.0
            score += num_hits / rank
        seen.add(item)
    return score / min(len(actual), len(predicted))

def mean_average_precision(actual_items, predicted_items):
    """Return the mean of ``average_precision`` over paired item lists.

    ``actual_items`` and ``predicted_items`` are walked in lockstep with
    ``zip``, so surplus entries of the longer one are ignored.
    """
    per_user_scores = []
    for actual, predicted in zip(actual_items, predicted_items):
        per_user_scores.append(average_precision(actual, predicted))
    return np.mean(per_user_scores)

# Functions to compute NDCG
def dcg_at_k(r, k):
    """Return the discounted cumulative gain of the first ``k`` relevances.

    Uses the gain ``2**rel - 1`` and the discount ``log2(position + 1)``
    with 1-based positions.

    Args:
        r: Relevance scores in ranked order.
        k: Number of leading scores to consider.

    Returns:
        The DCG value, or ``0.0`` when there are no scores to consider.
    """
    # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is the
    # documented replacement
    gains = np.asarray(r, dtype=float)[:k]
    if not gains.size:
        return 0.0
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum(np.divide(np.power(2, gains) - 1, discounts))

def ndcg_at_k(r, k):
    """Return ``dcg_at_k(r, k)`` normalized by the DCG of the ideal ordering.

    The ideal ordering is ``r`` sorted in descending order. Returns ``0.0``
    when the ideal DCG is zero.
    """
    ideal_order = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_order, k)
    return dcg_at_k(r, k) / ideal_dcg if ideal_dcg else 0.0

def normalized_discounted_cumulative_gain(actual, predicted, k):
    """Return NDCG@k using binary relevance.

    Each predicted item gets relevance 1 when it appears in ``actual`` and 0
    otherwise; the resulting list is scored with ``ndcg_at_k``.
    """
    relevance = [int(item in actual) for item in predicted]
    return ndcg_at_k(relevance, k)

# Evaluate the model
def evaluate_model(test_data, model, k=10):
    """Compute mean Precision@k, MAP and NDCG@k over the users in ``test_data``.

    For every distinct user the relevant items are the ``artistname_index``
    values of that user's rows in ``test_data``; recommendations come from
    ``recommend_top_k_artists``. Users with fewer than 25 relevant items are
    skipped.

    Args:
        test_data: DataFrame with ``user_id_index`` and ``artistname_index``.
        model: Fitted model passed on to ``recommend_top_k_artists``.
        k: Cut-off for all three metrics.

    Returns:
        A tuple ``(avg_precision_at_k, avg_map, avg_ndcg)``. All three are
        ``nan`` when no user has enough relevant items to be evaluated.
    """
    user_ids = test_data.select('user_id_index').distinct().rdd.flatMap(lambda x: x).collect()
    avg_map = 0.0
    avg_ndcg = 0.0
    precision_scores = []
    count = 0

    # NOTE: each user triggers its own filter/collect plus a model.transform
    # inside recommend_top_k_artists
    for user_id in user_ids:
        relevant_items = test_data.filter(col('user_id_index') == user_id).select('artistname_index').rdd.flatMap(lambda x: x).collect()

        # Skip users with few artists (cold start)
        if len(relevant_items) < 25:
            continue

        recommended_items = recommend_top_k_artists(user_id, model, test_data, k)

        # Precision@K
        precision = precision_at_k(recommended_items, relevant_items, k)
        precision_scores.append(precision)

        # Average precision for the current user
        map_score = average_precision(relevant_items, recommended_items[:k])
        avg_map += map_score

        # NDCG for the current user
        ndcg_score = normalized_discounted_cumulative_gain(relevant_items, recommended_items, k)
        avg_ndcg += ndcg_score

        count += 1

    # No user was evaluated: the averages are undefined; report nan instead
    # of dividing by zero
    if count == 0:
        undefined = float('nan')
        return undefined, undefined, undefined

    # Average the metrics over all evaluated users
    avg_precisionAtK = np.mean(precision_scores)
    avg_map /= count
    avg_ndcg /= count

    return avg_precisionAtK, avg_map, avg_ndcg

# Evaluate the model on the sampled test set and report the ranking metrics
metrics = evaluate_model(sampled_test, model, k=10)
avg_precisionAtK, avg_map, avg_ndcg = metrics
print(f"Precision@10: {avg_precisionAtK:.4f}")
print(f"Mean Average Precision (MAP)@10: {avg_map:.4f}")
print(f"Normalized Discounted Cumulative Gain (NDCG)@10: {avg_ndcg:.4f}")

10 recomendaciones para un usuario

In [None]:
user_id_index = 5914  # Example user

# One row per recommendation of the selected user
user_recs = userRecs.filter(fn.col("user_id_index") == user_id_index)
exploded = user_recs.select(fn.explode("recommendations").alias("recommendation"))
scored_artists = exploded.select(
    fn.col("recommendation.artistname_index").alias("artistname_index"),
    fn.col("recommendation.rating").alias("recommendation_score"),
)
# Attach the artist names and sort by descending score
named_recs = dims['artistname'].join(scored_artists, 'artistname_index')
playlist_df = named_recs.orderBy("recommendation_score", ascending=False)
playlist_df.show()

+----------------+---------------+--------------------+
|artistname_index|     artistname|recommendation_score|
+----------------+---------------+--------------------+
|          157985|        Madonna|          0.02455879|
|           63174|    David Bowie|         0.023608776|
|          210588|        Rihanna|         0.022057204|
|          169467|Michael Jackson|          0.02189978|
|           28600|        Beyoncé|         0.021125346|
|          164255|    Marvin Gaye|         0.020584796|
|          236202|  Stevie Wonder|         0.020487295|
|          143141|      Lady Gaga|         0.018550886|
|           36605| Britney Spears|         0.017046666|
|           17755|Aretha Franklin|         0.016739469|
+----------------+---------------+--------------------+



Los artistas que ya ha reproducido el usuario.

In [None]:
# Play counts of the selected user, with artist names, highest count first
user_counts = counts_df.filter(fn.col("user_id_index") == user_id_index).distinct()
named_counts = user_counts.join(dims['artistname'], 'artistname_index')
usr_reps = named_counts.orderBy('reproductions', ascending=False).select('artistname', 'reproductions')

usr_reps.show()

+-------------+-------------+
|   artistname|reproductions|
+-------------+-------------+
|      Madonna|          290|
|Talking Heads|          132|
| Lana Del Rey|           12|
|   Theme Park|            5|
|  Hybrid Funk|            2|
|     Smokeman|            1|
+-------------+-------------+



In [None]:
# Remove the artists the user has already played

print('Recomendaciones de artistas para el usuario:\n')
# left_anti keeps only the recommendations with no matching artist in usr_reps
same_artist = playlist_df["artistname"] == usr_reps["artistname"]
playlist_df.join(usr_reps, same_artist, "left_anti").show()

Recomendaciones de artistas para el usuario:

+----------------+---------------+--------------------+
|artistname_index|     artistname|recommendation_score|
+----------------+---------------+--------------------+
|           63174|    David Bowie|         0.023608776|
|          210588|        Rihanna|         0.022057204|
|          169467|Michael Jackson|          0.02189978|
|           28600|        Beyoncé|         0.021125346|
|          164255|    Marvin Gaye|         0.020584796|
|          236202|  Stevie Wonder|         0.020487295|
|          143141|      Lady Gaga|         0.018550886|
|           36605| Britney Spears|         0.017046666|
|           17755|Aretha Franklin|         0.016739469|
+----------------+---------------+--------------------+

