#Instalando Dependencias

Instalando Spark

In [1]:
import requests
import subprocess
import os
import re
import socket
import shutil
import time
import sys

def run(cmd):
    # run a shell command
    try:
        # Run the command and capture stdout and stderr
        subprocess_output = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
        # Access stdout (stderr redirected to stdout)
        stdout_result = subprocess_output.stdout.strip().splitlines()[-1]
        # Process the results as needed
        print(f'✅ {stdout_result}')
        return stdout_result
    except subprocess.CalledProcessError as e:
        # Handle the error if the command returns a non-zero exit code
        print(f"Command failed with return code {e.returncode}")
        print("stdout:", e.stdout)

def is_java_installed():
    return shutil.which("java")

def install_java():
    # Uncomment and modify the desired version
    # java_version= 'openjdk-11-jre-headless'
    # java_version= 'default-jre'
    # java_version= 'openjdk-17-jre-headless'
    # java_version= 'openjdk-18-jre-headless'
    java_version= 'openjdk-19-jre-headless'
    os.environ['JAVA_HOME'] = ' /usr/lib/jvm/java-19-openjdk-amd64'
    print(f"Java not found. Installing {java_version} ... (this might take a while)")
    try:
        cmd = f"apt install -y {java_version}"
        subprocess_output = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
        stdout_result = subprocess_output.stdout
        # Process the results as needed
        print(f'✅ Done installing Java {java_version}')
    except subprocess.CalledProcessError as e:
        # Handle the error if the command returns a non-zero exit code
        print(f"Command failed with return code {e.returncode}")
        print("stdout:", e.stdout)

print("\n0️⃣   Install Java if not available")
if is_java_installed():
    print("✅ Java is already installed.")
else:
    install_java()

print("\n1️⃣   Download and install Hadoop and Spark")
# URL for downloading Hadoop and Spark
SPARK_VERSION = "3.5.1"
HADOOP_SPARK_URL = "https://dlcdn.apache.org/spark/spark-" + SPARK_VERSION + \
                   "/spark-" + SPARK_VERSION + "-bin-hadoop3.tgz"
r = requests.head(HADOOP_SPARK_URL)
if r.status_code >= 200 and r.status_code < 400:
    print(f'✅ {HADOOP_SPARK_URL} was found')
else:
    SPARK_CDN = "https://dlcdn.apache.org/spark/"
    print(f'⚠️ {HADOOP_SPARK_URL} was NOT found. \nCheck for available Spark versions in {SPARK_CDN}')

# set some environment variables
os.environ['SPARK_HOME'] = os.path.join(os.getcwd(), os.path.splitext(os.path.basename(HADOOP_SPARK_URL))[0])
os.environ['PATH'] = ':'.join([os.path.join(os.environ['SPARK_HOME'], 'bin'), os.environ['PATH']])
os.environ['PATH'] = ':'.join([os.path.join(os.environ['SPARK_HOME'], 'sbin'), os.environ['PATH']])

# download Spark
# using --no-clobber option will prevent wget from downloading file if already present
# shell command: wget --no-clobber $HADOOP_SPARK_URL
cmd = f"wget --no-clobber {HADOOP_SPARK_URL}"
run(cmd)

# uncompress
try:
    # Run the command and capture stdout and stderr
    cmd = "([ -d $(basename {0}|sed 's/\.[^.]*$//') ] && echo -n 'Folder already exists') || (tar xzf $(basename {0}) && echo 'Uncompressed Spark distribution')"
    subprocess_output = subprocess.run(cmd.format(HADOOP_SPARK_URL), shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    # Access stdout (stderr redirected to stdout)
    stdout_result = subprocess_output.stdout
    # Process the results as needed
    print(f'✅ {stdout_result}')

except subprocess.CalledProcessError as e:
    # Handle the error if the command returns a non-zero exit code
    print(f"Command failed with return code {e.returncode}")
    print("stdout:", e.stdout)


print("\n2️⃣   Start Spark engine")
# start master
# shell command: $SPARK_HOME/sbin/start-master.sh
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'stop-master.sh')
run(cmd)
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'start-master.sh')
out = run(cmd)

# start one worker (first stop it in case it's already running)
# shell command: $SPARK_HOME/sbin/start-worker.sh spark://${HOSTNAME}:7077
cmd = [os.path.join(os.environ['SPARK_HOME'], 'sbin', 'stop-worker.sh')]
run(cmd)
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'start-worker.sh') + ' ' + 'spark://'+socket.gethostname()+':7077'
run(cmd)

print("\n3️⃣   Start Master Web UI")
# get master UI's port number
# the subprocess that's starting the master with start-master.sh
# might still not be ready with assigning the port number at this point
# therefore we check the logfile a few times (attempts=5) to see if the port
# has been assigned. This might take 1-2 seconds.

master_log = out.partition("logging to")[2].strip()
print("Search for port number in log file {}".format(master_log))
attempts = 10
search_pattern = "Successfully started service 'MasterUI' on port (\d+)"
found = False
for i in range(attempts):
  if not found:
   with open(master_log) as log:
      found = re.search(search_pattern, log.read())
      if found:
          webUIport = found.group(1)
          print(f"✅ Master UI is available at localhost:{webUIport} (attempt nr. {i})")
          break
      else:
          time.sleep(2) # need to try until port information is found in the logfile
          i+=1
if not found:
  print("Could not find port for Master Web UI\n")

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # serve the Web UI on Colab
    print("Click on the link below to open the Spark Web UI 🚀")
    from google.colab import output
    output.serve_kernel_port_as_window(webUIport)

print("\n4️⃣   Start history server")
# start history server
# shell command: mkdir -p /tmp/spark-events
# shell command: $SPARK_HOME/sbin/start-history-server.sh
spark_events_dir = os.path.join('/tmp', 'spark-events')
if not os.path.exists(spark_events_dir):
    os.mkdir(spark_events_dir)
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'stop-history-server.sh')
run(cmd)
cmd = os.path.join(os.environ['SPARK_HOME'], 'sbin', 'start-history-server.sh')
run(cmd)

if IN_COLAB:
    # serve the History Server
    print("Click on the link below to open the Spark History Server Web UI 🚀")
    output.serve_kernel_port_as_window(18080)


0️⃣   Install Java if not available
✅ Java is already installed.

1️⃣   Download and install Hadoop and Spark
✅ https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz was found
✅ 2024-05-15 23:07:12 (74.5 MB/s) - ‘spark-3.5.1-bin-hadoop3.tgz’ saved [400446614/400446614]
✅ Uncompressed Spark distribution


2️⃣   Start Spark engine
✅ no org.apache.spark.deploy.master.Master to stop
✅ starting org.apache.spark.deploy.master.Master, logging to /content/spark-3.5.1-bin-hadoop3/logs/spark--org.apache.spark.deploy.master.Master-1-4c283c4b9498.out
✅ no org.apache.spark.deploy.worker.Worker to stop
✅ starting org.apache.spark.deploy.worker.Worker, logging to /content/spark-3.5.1-bin-hadoop3/logs/spark--org.apache.spark.deploy.worker.Worker-1-4c283c4b9498.out

3️⃣   Start Master Web UI
Search for port number in log file /content/spark-3.5.1-bin-hadoop3/logs/spark--org.apache.spark.deploy.master.Master-1-4c283c4b9498.out
✅ Master UI is available at localhost:8081 (attempt nr. 4)


<IPython.core.display.Javascript object>


4️⃣   Start history server
✅ no org.apache.spark.deploy.history.HistoryServer to stop
✅ starting org.apache.spark.deploy.history.HistoryServer, logging to /content/spark-3.5.1-bin-hadoop3/logs/spark--org.apache.spark.deploy.history.HistoryServer-1-4c283c4b9498.out
Click on the link below to open the Spark History Server Web UI 🚀


<IPython.core.display.Javascript object>

In [2]:
pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [3]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=c2d0b4fdcf75c68dc9e4889253972bfd90718b6d53718e13eb8475fb782024fd
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


#Preprocesamiento

In [4]:
import opendatasets as od
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
import os
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
od.download(
	"https://www.kaggle.com/datasets/andrewmvd/spotify-playlists")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: a01794892tecmx
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/andrewmvd/spotify-playlists
Downloading spotify-playlists.zip to ./spotify-playlists


100%|██████████| 183M/183M [00:01<00:00, 109MB/s]





In [6]:
# Set the Spark master URL and other Spark settings
#os.environ['PYSPARK_SUBMIT_ARGS'] = '--master local[*] --executor-memory 4G --num-executors 4 pyspark-shell'
conf = SparkConf(loadDefaults=True)
conf.setMaster("local").setAppName("sptifyApp")
sc = SparkContext(conf=conf)

In [7]:
spark = SparkSession.builder.getOrCreate()

In [8]:
spark.conf.set("spark.sql.pivotMaxValues", 2200000)

In [9]:
spark

##Limpieza de datos

In [10]:
df = spark.read.option("header", "true").csv("spotify-playlists//spotify_dataset.csv")

df = df.dropna()

df = df.drop_duplicates()

df = df.toDF(*[col.replace(' ', '').replace('"', '') for col in df.columns])


df.head()


Row(user_id='9cc0cfd4d7d7885102480dd99e7a90d6', artistname='Elvis Costello', trackname='(The Angels Wanna Wear My) Red Shoes', playlistname='HARD ROCK 2010')

##Creando dimensiones separadas para playlists, artistas, canciones y usuarios, reconstruyendo la matriz de reproducciones con los indices de dichas dimensiones.

In [11]:
dims={}

def df_dim(df, input_col):
    windowSpec = Window.orderBy(input_col)
    dims[input_col]=df.select(input_col).distinct().withColumn(f"{input_col}_index", fn.row_number().over(windowSpec))

for col_name in df.columns:
    df_dim(df,col_name)

for dim in dims.values():
    print(dim.head(10))

newdf=df

for i in range(0, len(df.columns)):
    col_name = df.columns[i]
    newdf=newdf.join(dims[col_name].withColumnRenamed(col_name, col_name+'_base'), fn.col(col_name)==fn.col(col_name+'_base')).drop(col_name).drop(col_name+'_base')

print(newdf.head(10))

[Row(user_id='00055176fea33f6e027cd3302289378b', user_id_index=1), Row(user_id='0007f3dd09c91198371454c608d47f22', user_id_index=2), Row(user_id='000b0f32b5739f052b9d40fcc5c41079', user_id_index=3), Row(user_id='000c11a16c89aa4b14b328080f5954ee', user_id_index=4), Row(user_id='00123e0f544dee3ab006aa7f1e5725a7', user_id_index=5), Row(user_id='00139e9cb50fb309549e1561b476226d', user_id_index=6), Row(user_id='00152c870313100559aad7b097d9c1f5', user_id_index=7), Row(user_id='00154ec9dd1acd4ebfb521629dcb3948', user_id_index=8), Row(user_id='001599a07cb8ef5f114a9fcf4e0e2757', user_id_index=9), Row(user_id='0019363a0d57e94d39988c31eeb8d015', user_id_index=10)]
[Row(artistname=' Dolce', artistname_index=1), Row(artistname=' OneVoice', artistname_index=2), Row(artistname='!!!', artistname_index=3), Row(artistname='!!! (Chk Chk Chk)', artistname_index=4), Row(artistname='!!! Chk Chik Chick', artistname_index=5), Row(artistname='!ATTENTION!', artistname_index=6), Row(artistname='!DELADAP', artist

##Matriz de reporducciones según el artista y normalización de los datos.

In [12]:
counts_df = newdf.groupBy("user_id_index", "artistname_index").agg(fn.count("*").alias("reproductions"))

In [13]:
max_reproduction = counts_df.agg({"reproductions": "max"}).collect()[0][0]
min_reproduction = counts_df.agg({"reproductions": "min"}).collect()[0][0]

normalized_pl_counts_df = counts_df.withColumn("normalized_reproduction", (fn.col("reproductions") - min_reproduction) / (max_reproduction - min_reproduction))
normalized_pl_counts_df.head(10)

[Row(user_id_index=11738, artistname_index=212990, reproductions=24, normalized_reproduction=0.0068759342301943195),
 Row(user_id_index=1036, artistname_index=152785, reproductions=4, normalized_reproduction=0.0008968609865470852),
 Row(user_id_index=2695, artistname_index=118440, reproductions=20, normalized_reproduction=0.005680119581464873),
 Row(user_id_index=7942, artistname_index=45951, reproductions=236, normalized_reproduction=0.07025411061285501),
 Row(user_id_index=4281, artistname_index=258892, reproductions=3, normalized_reproduction=0.0005979073243647235),
 Row(user_id_index=6765, artistname_index=81653, reproductions=38, normalized_reproduction=0.011061285500747383),
 Row(user_id_index=5278, artistname_index=125652, reproductions=42, normalized_reproduction=0.012257100149476832),
 Row(user_id_index=15667, artistname_index=198257, reproductions=44, normalized_reproduction=0.012855007473841554),
 Row(user_id_index=5381, artistname_index=91263, reproductions=61, normalized_r

#Sistema de recomendación de artistas

In [14]:
(training, test) = normalized_pl_counts_df.randomSplit([0.8, 0.2])

als = ALS(maxIter=5, regParam=0.01, userCol="user_id_index", itemCol="artistname_index", ratingCol="normalized_reproduction", coldStartStrategy="drop",implicitPrefs=True)
model = als.fit(training)

predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="normalized_reproduction", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

print("Root Mean Squared Error (RMSE) = " + str(rmse))

#Top 10 recomendaciones por usuario
userRecs = model.recommendForAllUsers(10)
userRecs.head(10)

Root Mean Squared Error (RMSE) = 0.12709003382739936


[Row(user_id_index=1, recommendations=[Row(artistname_index=163412, rating=0.13188980519771576), Row(artistname_index=76809, rating=0.10977599024772644), Row(artistname_index=112985, rating=0.10864003002643585), Row(artistname_index=37366, rating=0.10733484476804733), Row(artistname_index=51082, rating=0.09390018880367279), Row(artistname_index=135095, rating=0.0905061587691307), Row(artistname_index=243033, rating=0.08670885115861893), Row(artistname_index=190699, rating=0.08421992510557175), Row(artistname_index=188137, rating=0.08420371264219284), Row(artistname_index=85883, rating=0.08340750634670258)]),
 Row(user_id_index=2, recommendations=[Row(artistname_index=51082, rating=0.05577825382351875), Row(artistname_index=76809, rating=0.05316533148288727), Row(artistname_index=149394, rating=0.05300728231668472), Row(artistname_index=126554, rating=0.04858596622943878), Row(artistname_index=101031, rating=0.047018200159072876), Row(artistname_index=89915, rating=0.04597727581858635),

10 recomendaiones para un usuario

In [20]:
user_id_index = 5914  # Usuario de ejemplo

playlist_df = userRecs.filter(fn.col("user_id_index") == user_id_index).select(fn.explode("recommendations").alias("recommendation"))
playlist_df = dims['artistname'].join(playlist_df.select(fn.col("recommendation.artistname_index").alias("artistname_index"), fn.col("recommendation.rating").alias("recommendation_score")),'artistname_index')\
              .orderBy("recommendation_score", ascending=False)
playlist_df.show()

+----------------+---------------+--------------------+
|artistname_index|     artistname|recommendation_score|
+----------------+---------------+--------------------+
|          157985|        Madonna|          0.02455879|
|           63174|    David Bowie|         0.023608776|
|          210588|        Rihanna|         0.022057204|
|          169467|Michael Jackson|          0.02189978|
|           28600|        Beyoncé|         0.021125346|
|          164255|    Marvin Gaye|         0.020584796|
|          236202|  Stevie Wonder|         0.020487295|
|          143141|      Lady Gaga|         0.018550886|
|           36605| Britney Spears|         0.017046666|
|           17755|Aretha Franklin|         0.016739469|
+----------------+---------------+--------------------+



Los artistas que ya ha reproducido el usuario.

In [18]:
usr_reps = counts_df.filter(fn.col("user_id_index") == user_id_index).distinct()\
    .join(dims['artistname'],'artistname_index').orderBy('reproductions', ascending=False).select('artistname','reproductions')

usr_reps.show()

+-------------+-------------+
|   artistname|reproductions|
+-------------+-------------+
|      Madonna|          290|
|Talking Heads|          132|
| Lana Del Rey|           12|
|   Theme Park|            5|
|  Hybrid Funk|            2|
|     Smokeman|            1|
+-------------+-------------+



In [22]:
#Eliminando los artistas que el usuario ya ha reproducido

print('Recomendaciones de artistas para el usuario:\n')
playlist_df.join(usr_reps, playlist_df["artistname"] == usr_reps["artistname"], "left_anti").show()

Recomendaciones de artistas para el usuario:

+----------------+---------------+--------------------+
|artistname_index|     artistname|recommendation_score|
+----------------+---------------+--------------------+
|           63174|    David Bowie|         0.023608776|
|          210588|        Rihanna|         0.022057204|
|          169467|Michael Jackson|          0.02189978|
|           28600|        Beyoncé|         0.021125346|
|          164255|    Marvin Gaye|         0.020584796|
|          236202|  Stevie Wonder|         0.020487295|
|          143141|      Lady Gaga|         0.018550886|
|           36605| Britney Spears|         0.017046666|
|           17755|Aretha Franklin|         0.016739469|
+----------------+---------------+--------------------+

