# <font color='blue'>Data Science Academy</font>
# <font color='blue'>Big Data Real-Time Analytics com Python e Spark</font>

## <font color='blue'>Mini-Projeto 7</font>

### <font color='blue'>Sistema de Recomendação em Tempo Real com Machine Learning, PySpark, Spark Streaming e Kafka</font>

![title](imagens/MP7.png)

In [1]:
# Versão da Linguagem Python
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.9.13


In [2]:
# Para atualizar um pacote, execute o comando abaixo no terminal ou prompt de comando:
# pip install -U nome_pacote

# Para instalar a versão exata de um pacote, execute o comando abaixo no terminal ou prompt de comando:
#!pip install nome_pacote==versão_desejada

# Depois de instalar ou atualizar o pacote, reinicie o jupyter notebook.

# Instala o pacote watermark. 
# Esse pacote é usado para gravar as versões de outros pacotes usados neste jupyter notebook.
#!pip install -q -U watermark

In [3]:
# Imports
import os
import time
import random
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Normalizer, StandardScaler

In [4]:
# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark -a "Data Science Academy" --iversions

Author: Data Science Academy

sys    : 3.9.13 (main, Aug 25 2022, 23:26:10) 
[GCC 11.2.0]
pyspark: 3.3.1



In [5]:
# Endereço do servidor Kafka
SERVER = 'localhost:9092'

In [6]:
# Nome do tópico
TOPIC = "dsaminiprojeto7"

In [7]:
# Conectores do Spark para o Apache Kafka
spark_jars =  ("{},{},{},{},{}".format(os.getcwd() + "/jars/spark-sql-kafka-0-10_2.12-3.2.1.jar",  
                                       os.getcwd() + "/jars/kafka-clients-2.1.1.jar", 
                                       os.getcwd() + "/jars/spark-streaming-kafka-0-10-assembly_2.12-3.3.2.jar", 
                                       os.getcwd() + "/jars/commons-pool2-2.8.0.jar",  
                                       os.getcwd() + "/jars/spark-token-provider-kafka-0-10_2.12-3.1.2.jar"))

In [8]:
# Inicializa sessão Spark
spark = SparkSession \
        .builder \
        .config("spark.jars", spark_jars) \
        .appName("Mini-Projeto7") \
        .getOrCreate()

23/05/16 23:24:17 WARN Utils: Your hostname, DataScience resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/05/16 23:24:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/05/16 23:24:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [9]:
spark.sparkContext.setLogLevel("ERROR")

In [10]:
# Usamos o Spark Streaming para leitura do streaming de dados do Kafka e salvamos em um dataframe
df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", SERVER) \
        .option("subscribe", TOPIC) \
        .option("startingOffsets", "latest") \
        .load()

In [11]:
# Selecionamos a coluna timestamp como string e salvamos em um novo dataframe
df1 = df.selectExpr("CAST(value AS STRING)", "timestamp") 

In [12]:
# Definimos o schema com o nome de cada coluna e o tipo de dado
def_schema = "order_id INT, id STRING, name STRING, popularity INT, duration_ms DOUBLE, " \
             + "artists STRING, id_artists STRING, release_date STRING, " \
             + "danceability DOUBLE,energy DOUBLE, key INT, loudness DOUBLE, " \
             + "mode INT,speechiness DOUBLE," \
             + "acousticness DOUBLE, instrumentalness DOUBLE, liveness DOUBLE, " \
             + "valence DOUBLE, tempo DOUBLE, time_signature DOUBLE"

In [13]:
# Selecionamos o streaming de dados de acordo com o schema e salvamos em um novo dataframe
df2 = df1.select(from_csv(col("value"), def_schema).alias("song"), "timestamp")

In [14]:
# Criamos uma view(tabela) na memória do Spark e visualizamos o schema
df3 = df2.select("song.*", "timestamp")  
df3.createOrReplaceTempView("df3_View");
df3.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- artists: string (nullable = true)
 |-- id_artists: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [15]:
# Selecionamos os dados com as músicas do stream
musicas_stream = spark.sql("SELECT * FROM df3_View")

In [None]:
# Não podemos visualizar ainda, pois temos que gerar o stream do Spark Streaming
# musicas_stream.show()

In [16]:
# Criamos o stream de dados no Spark Streaming
musicas_stream_spark = musicas_stream \
        .writeStream \
        .trigger(processingTime = '5 seconds') \
        .outputMode("append") \
        .option("truncate", "false") \
        .format("memory") \
        .queryName("tabela_spark") \
        .start()

musicas_stream_spark.awaitTermination(1)

False

In [17]:
# Selecionamos as músicas da tabela de stream do Spark
spark_songs = spark.sql("SELECT * FROM tabela_spark")

                                                                                

In [18]:
# Agora sim podemos visualizar o stream em tempo real como tabela do Spark
spark_songs.show(5)

+--------+--------------------+-------------+----------+-----------+--------------+------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|order_id|                  id|         name|popularity|duration_ms|       artists|        id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|           timestamp|
+--------+--------------------+-------------+----------+-----------+--------------+------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|      86|1NWDNrIUlFErxcibR...|   On My Way!|        39|   171984.0|         AWall|OjNTXbkrdGcBstaUOV|  2018-07-14|       0.717| 0.543|  6|  -8.008|   1|     0.0327|       0.257|             0.0|  0.0949|  0.456| 133.96|         

In [19]:
# Podemos visualizar apenas algumas colunas, por exemplo
spark_songs.select('order_id', 'id', 'name', 'popularity', 'duration_ms', 'artists').show(5)

+--------+--------------------+--------------+----------+-----------+--------------+
|order_id|                  id|          name|popularity|duration_ms|       artists|
+--------+--------------------+--------------+----------+-----------+--------------+
|      86|1NWDNrIUlFErxcibR...|    On My Way!|        39|   171984.0|         AWall|
|      87|2OS0B2x6aNyPtcN3j...| Not Your Prey|        33|   214586.0|SquirrelFlower|
|      88|4mulBtb3PtmjuUmXl...|       Vibrate|        28|   196414.0|          host|
|      89|5GFvl7wTZDzkFmXxj...|    Dolla Bill|         5|   237913.0|           Hue|
|      90|3xGUnyCvQW1Hg0bZk...|Feel Like That|        10|   156000.0|           YTK|
+--------+--------------------+--------------+----------+-----------+--------------+
only showing top 5 rows



In [30]:
# Contagem de músicas extraídas em tempo real
spark_songs.count()

102

Aguarde alguns minutos antes de seguir com a execução para que o streaming de dados possa ser coletado.

> Vamos agora trabalhar na extração de dados do Spotify.

In [None]:
# https://pypi.org/project/spotipy/
!pip install spotipy==2.22.1

In [31]:
# Imports
import os
import ujson
import spotipy
import spotipy.util
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [32]:
# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark -a "Data Science Academy" --iversions

Author: Data Science Academy

sys       : 3.9.13 (main, Aug 25 2022, 23:26:10) 
[GCC 11.2.0]
pandas    : 1.3.4
seaborn   : 0.11.2
matplotlib: 3.5.2
ujson     : 5.4.0
spotipy   : 2.22.1
pyspark   : 3.3.1
numpy     : 1.22.4



Leia o manual em pdf no Capítulo 16 do curso com os detalhes sobre a criação da API.

In [33]:
# Aqui você coloca as suas chaves da API do Spotify
os.environ["SPOTIPY_CLIENT_ID"] = 'CLIENT_ID'
os.environ["SPOTIPY_CLIENT_SECRET"] = 'CLIENT_SECRET'
os.environ["SPOTIPY_REDIRECT_URI"] = 'http://localhost:7777/callback'

https://developer.spotify.com/documentation/general/guides/authorization/scopes/

In [34]:
# Escopo de extração das preferências do usuário
scope = 'user-library-read'

In [35]:
# Username no Spotify
username = 'evandrocleto74@gmail.com'

In [36]:
# Criação do tohen de acesso
token = spotipy.util.prompt_for_user_token(username, scope)

In [37]:
# Cria o objeto de autenticação
spotipy_obj = spotipy.Spotify(auth = token)

In [38]:
# Extrai até 50 músicas da lista de favoritos do usuário
saved_tracks = spotipy_obj.current_user_saved_tracks(limit = 50) 

In [39]:
# Número de músicas extraídas
n_tracks = saved_tracks['total']
print('Total de Tracks: %d ' % n_tracks)

Total de Tracks: 372 


In [40]:
# Função para extrair os atributos da lista de músicas do usuário
def select_features(track_response):
    return {        
        'id': str(track_response['track']['id']),
        'name': str(track_response['track']['name']),
        'artists': [artist['name'] for artist in track_response['track']['artists']],
        'popularity': track_response['track']['popularity']
    }

In [41]:
# Aplica a função
tracks = [select_features(track) for track in saved_tracks['items']]

In [42]:
# Extrai os atributos das músicas preferidas do usuário
while saved_tracks['next']:
    saved_tracks = spotipy_obj.next(saved_tracks)
    tracks.extend([select_features(track) for track in saved_tracks['items']])

In [43]:
# Criamos o dataframe do pandas
df_tracks = pd.DataFrame(tracks)
pd.set_option('display.max_rows', len(tracks))
df_tracks['artists'] = df_tracks['artists'].apply(lambda artists: artists[0])

In [44]:
df_tracks.head(10)

Unnamed: 0,id,name,artists,popularity
0,5knlppPUmTNFVkTul5EjnI,Kind,10mg,7
1,4OROzZUy6gOWN4UGQVaZMF,The Trooper - 2015 Remaster,Iron Maiden,76
2,5sqN1halvJIprLduPPqIoe,Metropolis,Motörhead,34
3,4zU8R38mRpOQbbtrSjdUUq,Metropolis,The Church,39
4,05f8Hg3RSfiPSCBQOtxl3i,When the Levee Breaks - Remaster,Led Zeppelin,65
5,2EEinN4Zk8MUv4OQuLsTBj,Age of Consent - 2015 Remaster,New Order,67
6,72Z17vmmeQKAg8bptWvpVG,Space Oddity - 2015 Remaster,David Bowie,73
7,4r8AQvzullpWTDpgv70KxD,The Rover - Remaster,Led Zeppelin,56
8,6k6j3ZUljY1QLTMbc8VqB0,Ballroom Blitz,Sweet,59
9,6BU33g1GBGk1eWdax2Rpmh,Blockbuster,Sweet,39


In [45]:
# Dicionário para os atributos de áudio
audio_features = {}

In [46]:
# Extrai os atributos de áudio
for idd in df_tracks['id'].tolist():
    audio_features[idd] = spotipy_obj.audio_features(idd)[0]

                                                                                

In [47]:
# Adicionamos os atributos de áudio ao dataframe
df_tracks['acousticness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['acousticness'])
df_tracks['speechiness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['speechiness'])
df_tracks['key'] = df_tracks['id'].apply(lambda idd: str(audio_features[idd]['key']))
df_tracks['liveness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['liveness'])
df_tracks['instrumentalness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['instrumentalness'])
df_tracks['energy'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['energy'])
df_tracks['tempo'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['tempo'])
df_tracks['loudness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['loudness'])
df_tracks['danceability'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['danceability'])
df_tracks['valence'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['valence'])

In [48]:
df_tracks.head()

Unnamed: 0,id,name,artists,popularity,acousticness,speechiness,key,liveness,instrumentalness,energy,tempo,loudness,danceability,valence
0,5knlppPUmTNFVkTul5EjnI,Kind,10mg,7,0.00218,0.0415,2,0.105,8.7e-05,0.778,159.992,-4.662,0.526,0.445
1,4OROzZUy6gOWN4UGQVaZMF,The Trooper - 2015 Remaster,Iron Maiden,76,0.0318,0.0705,4,0.311,0.00909,0.908,159.756,-4.601,0.285,0.669
2,5sqN1halvJIprLduPPqIoe,Metropolis,Motörhead,34,5e-05,0.0307,2,0.0916,0.761,0.656,108.987,-7.668,0.426,0.0552
3,4zU8R38mRpOQbbtrSjdUUq,Metropolis,The Church,39,0.0206,0.0311,4,0.064,0.00113,0.637,123.706,-11.341,0.554,0.665
4,05f8Hg3RSfiPSCBQOtxl3i,When the Levee Breaks - Remaster,Led Zeppelin,65,0.00217,0.0329,5,0.0707,0.52,0.766,142.914,-8.03,0.271,0.803


In [49]:
# Selecionamos uma música randomicamente
musica_randomica = random. randint(0,len(df_tracks)-1)
df_musica_randomica = df_tracks.head(musica_randomica)[-1:]
df_musica_randomica

Unnamed: 0,id,name,artists,popularity,acousticness,speechiness,key,liveness,instrumentalness,energy,tempo,loudness,danceability,valence
23,5wmkNYIJBO4JqKYtlBFvgK,Goddess of Dawn,Kadavar,0,0.152,0.0723,3,0.123,0.0356,0.957,186.498,-1.07,0.0896,0.399


In [50]:
# Músicas do streaming do Spark
spark_songs.show(5)

+--------+--------------------+--------------+----------+-----------+--------------+-------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|order_id|                  id|          name|popularity|duration_ms|       artists|         id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|           timestamp|
+--------+--------------------+--------------+----------+-----------+--------------+-------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|      86|1NWDNrIUlFErxcibR...|    On My Way!|        39|   171984.0|         AWall| OjNTXbkrdGcBstaUOV|  2018-07-14|       0.717| 0.543|  6|  -8.008|   1|     0.0327|       0.257|             0.0|  0.0949|  0.456| 133.96| 

In [51]:
# Não precisamos mais dessas colunas
spark_songs = spark_songs.drop('order_id', 
                               'mode', 
                               'release_date', 
                               'id_artists',
                               'time_signature', 
                               'duration_ms',
                               'timestamp')

In [52]:
# Cria o dataframe com a música escolhida randomicamente
df_sp = spark.createDataFrame(df_musica_randomica)

In [53]:
# Concatena músicas do streaming do Spark com a música do Spotify
df = spark_songs.union(df_sp)

In [54]:
df.show(5)

+--------------------+--------------+----------+--------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
|                  id|          name|popularity|       artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+--------------+----------+--------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
|1NWDNrIUlFErxcibR...|    On My Way!|        39|         AWall|       0.717| 0.543|  6|  -8.008|     0.0327|       0.257|             0.0|  0.0949|  0.456| 133.96|
|2OS0B2x6aNyPtcN3j...| Not Your Prey|        33|SquirrelFlower|       0.352|  0.55|  2|   -6.98|     0.0317|        0.27|         0.00656|   0.096|  0.282|135.261|
|4mulBtb3PtmjuUmXl...|       Vibrate|        28|          host|       0.248| 0.674|  1|  -2.765|     0.0719|      0.0593|             0.0|  0.0678|  0.207| 68.661|
|5GFvl7wTZDzkFmX


[Stage 88:>                                                         (0 + 1) / 1]

                                                                                

## Pré-Processamento dos Dados

In [55]:
# Preparamos o VectorAssembler
vetor = VectorAssembler(inputCols = ['danceability',
                                     'energy',
                                     'loudness',
                                     'speechiness',
                                     'acousticness',
                                     'instrumentalness',
                                     'liveness',
                                     'valence',
                                     'tempo'], 
                        outputCol = 'song_features')

In [56]:
# Descartamos valores inválidos
assembled = vetor.setHandleInvalid("skip").transform(df)

In [57]:
# Preparamos o padronizador
std = StandardScaler(inputCol = 'song_features', outputCol = 'standardized')

In [58]:
# Treinamos o padronizador
scale = std.fit(assembled)

                                                                                

In [59]:
# Dataframe com dados padronizados
df = scale.transform(assembled)

In [60]:
df.show(5)

+--------------------+--------------+----------+--------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+
|                  id|          name|popularity|       artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|       song_features|        standardized|
+--------------------+--------------+----------+--------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+
|1NWDNrIUlFErxcibR...|    On My Way!|        39|         AWall|       0.717| 0.543|  6|  -8.008|     0.0327|       0.257|             0.0|  0.0949|  0.456| 133.96|[0.717,0.543,-8.0...|[4.37553952951519...|
|2OS0B2x6aNyPtcN3j...| Not Your Prey|        33|SquirrelFlower|       0.352|  0.55|  2|   -6.98|     0.0317|        0.27|         0.00656|   0.096|  0.282|135.261|[0.352,0.55,-

## Machine Learning com Aprendizado Não Supervisionado

In [61]:
# Cria o objeto do modelo
objeto_KMeans = KMeans(featuresCol = 'standardized', k = 3)

In [62]:
# Treina o modelo
modelo_KMeans = objeto_KMeans.fit(df)

In [63]:
# Previsões do modelo
df_output = modelo_KMeans.transform(df)

## Sistema de Recomendação

In [78]:
# Classe
class RecoSystem():
    
    # Método construtor
    def __init__(self, data):
        self.data_ = data
    
    # Método de recomendação
    def Recomm(self, nome_musica, amount = 1):
        
        # Lista para as distâncias
        distancias = []
        
        # Seleciona a música
        song = self.data_[(self.data_.name.str.lower() == nome_musica.lower())].head(1).values[0]
        res_dt = self.data_[self.data_.name.str.lower() != nome_musica.lower()]
        
        # Loop para o cálculo das distâncias
        for i_song in tqdm(res_dt.values):
            
            # Inicializa a distância
            distancia = 0
            
            # Loop para calcular a distância
            for col in np.arange(len(res_dt.columns)):
                if not col in [0,1,2,14]:
                    distancia = distancia + np.absolute(float(song[col]) - float(i_song[col]))
            
            # Adiciona na lista de distâncias
            distancias.append(distancia)
        
        res_dt['distance'] = distancias
        res_dt = res_dt.sort_values('distance')
        
        columns = ['id','name', 
                   'artists', 
                   'acousticness', 
                   'liveness', 
                   'instrumentalness', 
                   'energy', 
                   'danceability', 
                   'valence']
        
        return res_dt[columns][:amount]

In [79]:
# Nomes das colunas
datalabel = df_output.select('id',
                             'name',
                             'artists',
                             'danceability',
                             'energy',
                             'key',
                             'loudness',
                             'speechiness',
                             'acousticness',
                             'instrumentalness',
                             'liveness',
                             'valence',
                             'tempo',
                             'prediction')

In [80]:
# Dataset final
df_final = datalabel.toPandas()
df_final.drop(df_final[df_final['artists'] == '0'].index, inplace = True)
df_final.drop_duplicates(inplace = True)
df_final.drop(df_final[df_final['danceability'] == 0.0000].index, inplace = True)
df_final.drop(df_final[df_final['liveness'] == 0.000].index, inplace = True)
df_final.drop(df_final[df_final['instrumentalness'] == 0.000000].index, inplace = True)
df_final.drop(df_final[df_final['energy'] == 0.0000].index, inplace = True)
df_final.drop(df_final[df_final['danceability'] == 0.000].index, inplace = True)
df_final.drop(df_final[df_final['valence'] == 0.000].index, inplace = True)

In [81]:
df_final.shape

(287, 14)

In [82]:
df_final.sample(5)

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,prediction
58,0COfJVwsdgpIc1Ytz3WcJ0,Good Ole Urns,Pioneer,0.527,0.534,6,-10.396,0.0399,0.715,0.6,0.126,0.237,87.011,0
14,1bX1nY3TlcAhbuL4UA3tQY,FOA,wwoman,0.713,0.687,2,-8.102,0.0319,0.049,0.564,0.218,0.55,129.985,1
179,73zWJYU92MvjWD3N3v5evB,Tauri,TristanArp,0.552,0.566,10,-6.742,0.126,0.0488,0.891,0.0928,0.436,117.995,1
73,6yQrmJC8m8DUK0wuSDBYih,Every Road,TheBonyKingofNowhere,0.541,0.746,4,-9.869,0.0309,0.00297,0.0868,0.229,0.556,119.802,1
264,37aYPJyhkxbsWVijwXtlrf,Anime,Fellsius,0.776,0.999,11,-2.737,0.113,0.00169,0.315,0.513,0.907,125.999,1


In [83]:
# Cria o objeto
reco_obj = RecoSystem(df_final)

In [84]:
musica = df_musica_randomica['name'].tolist()[0]

In [85]:
print(musica)

Goddess of Dawn


In [86]:
# Executa a recomendação
recomendacao = reco_obj.Recomm(musica)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [75]:
# Extrai a música randômica da lista de favoritos do Spotify
y = df_musica_randomica[['id','name', 
                         'artists',  
                         'acousticness', 
                         'liveness', 
                         'instrumentalness', 
                         'energy', 
                         'danceability', 
                         'valence']]

In [76]:
# Concatena a recomendação com a música randômica da lista de favoritos do Spotify
recomendacao = pd.concat([recomendacao, y])

NameError: name 'recomendacao' is not defined

In [None]:
# Salva a recomendação em disco
recomendacao.to_csv('recomendacoes/recomendacao.csv')

In [None]:
# Carrega o arquivo do disco
df_reco = (spark.read.format("csv").options(header = "true").load("recomendacoes/recomendacao.csv"))

In [None]:
# Recomendação de música
df_reco.show()

# Fim