# Pré-processamento
Removendo colunas desnecessárias e padronizando os nomes das colunas importantes. O resultado é gravado em uma nova tabela, `spotify_dataset_limpo`.

In [0]:
from pyspark.sql.functions import col

# Source table for the clean-up step.
df = spark.table("data.default.spotify_dataset")

# Keep only the columns listed here, renamed to lowercase snake_case.
df_limpo_new = df.select(
    col("Artist(s)").alias("artist"),
    col("song").alias("song"),
    col("Length").alias("length"),
    col("emotion").alias("emotion"),
    col("Genre").alias("genre"),
    col("Album").alias("album"),
    col("Release Date").alias("release_date"),
    col("Explicit").alias("explicit"),
    col("Popularity").alias("popularity"),
    col("Similar Artist 1").alias("similar_artist_1"),
    col("Similar Song 1").alias("similar_song_1"),
    col("Similar Artist 2").alias("similar_artist_2"),
    col("Similar Song 2").alias("similar_song_2"),
    col("Similar Artist 3").alias("similar_artist_3"),
    col("Similar Song 3").alias("similar_song_3")
)

# overwriteSchema (instead of mergeSchema): on an overwrite, mergeSchema only
# adds columns, so a column left out of the select above (e.g. `_rescued_data`)
# would stay in the table schema. overwriteSchema replaces the schema with the
# one selected here. NOTE(review): assumes the target is a Delta table.
# The table name is fully qualified with the `data` catalog so the write does
# not depend on whichever catalog the session currently has selected.
df_limpo_new.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("data.default.spotify_dataset_limpo")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-6975282259532471>, line 22[0m
[1;32m      2[0m df [38;5;241m=[39m spark[38;5;241m.[39mtable([38;5;124m"[39m[38;5;124mdata.default.pmd_trabalho_new2[39m[38;5;124m"[39m)
[1;32m      4[0m df_limpo_new [38;5;241m=[39m df[38;5;241m.[39mselect(
[1;32m      5[0m     col([38;5;124m"[39m[38;5;124mArtist(s)[39m[38;5;124m"[39m)[38;5;241m.[39malias([38;5;124m"[39m[38;5;124martist[39m[38;5;124m"[39m),
[1;32m      6[0m     col([38;5;124m"[39m[38;5;124msong[39m[38;5;124m"[39m)[38;5;241m.[39malias([38;5;124m"[39m[38;5;124msong[39m[38;5;124m"[39m),
[0;32m   (...)[0m
[1;32m     20[0m     col([38;5;124m"[39m[38;5;124msong_id[39m[38;5;124m"[39m)[38;5;241m.[39malias([38;5;124m"[39m[38;5;124msong_id[39m[38;5;124m"[39m)
[1;32m     21[0m )
[0;32m-

# Preparação para integração
Removendo espaços e caracteres especiais do atributo `song` e convertendo o texto para minúsculas. O resultado é armazenado na nova coluna `song_id`.

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re


def clean_song_name(song):
    """Build a normalised key from a song title.

    Every character that is not an ASCII letter or digit is removed and the
    remaining text is lowercased.

    Args:
        song: The song title, or ``None``.

    Returns:
        The cleaned, lowercased string, or ``None`` when ``song`` is ``None``.
    """
    if song is not None:
        # Strip first, then lowercase what is left.
        alnum_only = re.sub(r'[^a-zA-Z0-9]', '', song)
        return alnum_only.lower()
    return None

# Registration kept so any other cell that calls the Python UDF keeps working;
# the song_id column below no longer goes through it.
clean_song_name_udf = udf(clean_song_name, StringType())


# Imported here because this cell's own import lines only bring in `udf`.
from pyspark.sql.functions import lower, regexp_replace

df = spark.table("data.default.spotify_dataset_limpo")
# Same normalisation as clean_song_name (drop everything except ASCII letters
# and digits, then lowercase), expressed with built-in column functions
# instead of a Python UDF. A null `song` yields a null `song_id`.
df = df.withColumn("song_id", lower(regexp_replace(df["song"], r"[^a-zA-Z0-9]", "")))
df.show()
df.write.mode("overwrite").saveAsTable("data.default.spotify_dataset_limpo2")


+----------+--------------------+------+-------+--------------------+--------------------+-------------------+--------+----------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+
|    artist|                song|length|emotion|               genre|               album|       release_date|explicit|popularity|  similar_artist_1|      similar_song_1|    similar_artist_2|      similar_song_2|    similar_artist_3|      similar_song_3|_rescued_data|             song_id|
+----------+--------------------+------+-------+--------------------+--------------------+-------------------+--------+----------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+
|Nickelback|Another Hole In T...| 03:35|sadness|rock,alternative ...|       The Long Road|23rd September 2003|      No|        40|

# Visualização do novo dataset
Feito o tratamento, aqui a tabela é consultada para verificar se está tudo em ordem, e o resultado é baixado como um novo arquivo `.csv`.

In [0]:
%sql
-- Disabled sanity check: rows whose song_id is null.
--use catalog `data`; 
--select * from `default`.`spotify_dataset_limpo2` where song_id is null;

-- Disabled sanity check: rows in which every column is null.
--use catalog `data`; 
--select * from `default`.`spotify_dataset_limpo2` where artist is null and song is null and length is null and emotion is null and genre is null and album is null and release_date is null and explicit is null and popularity is null and similar_artist_1 is null and similar_song_1 is null and similar_artist_2 is null and similar_song_2 is null and similar_artist_3 is null and similar_song_3 is null and song_id is null;

-- Active query: select the `data` catalog, then return every row of
-- `default`.`spotify_dataset_limpo2`.
use catalog `data`; 
select * from `default`.`spotify_dataset_limpo2`;


artist,song,length,emotion,genre,album,release_date,explicit,popularity,similar_artist_1,similar_song_1,similar_artist_2,similar_song_2,similar_artist_3,similar_song_3,song_id
!!!,Even When the Waters Cold,03:47,sadness,hip hop,Thr!!!er,29th April 2013,No,40,Corey Smith,If I Could Do It Again,Toby Keith,Drinks After Work,Space,Neighbourhood,evenwhenthewaterscold
!!!,One Girl / One Boy,04:03,sadness,hip hop,Thr!!!er,29th April 2013,No,42,Hiroyuki Sawano,BRE@TH//LESS,When In Rome,Heaven Knows,Justice Crew,Everybody,onegirloneboy
!!!,Pardon My Freedom,05:51,joy,hip hop,Louden Up Now,8th June 2004,No,29,Ricky Dillard,More Abundantly Medley Live,Juliet,Avalon,The Jacksons,Lovely One,pardonmyfreedom
!!!,Ooo,03:44,joy,hip hop,As If,16th October 2015,No,24,Eric Clapton,Man Overboard,Roxette,Don't Believe In Accidents,Tiwa Savage,My Darlin,ooo
!!!,Freedom 15,06:00,joy,hip hop,As If,16th October 2015,No,30,Cibo Matto,Lint Of Love,Barrington Levy,Better Than Gold,Freestyle,Its Automatic,freedom15
!!!,All U Writers,05:22,love,hip hop,All U Writers / Gonna Guetta Stomp,27th April 2015,No,26,Wish & Fonda Rae,Touch Me All Night Long,Gary Numan,War Songs,Little Dragon,Forever,alluwriters
!!!,Serbia Drums,03:39,sadness,hip hop,Wallop,30th August 2019,No,17,Talking Heads,Ruby Dear,Hanson,Something Going Round,"Hoodie Allen,Jared Evan",Same As Before,serbiadrums
!!!,Must Be the Moon,05:57,joy,hip hop,Myth Takes,5th March 2007,No,27,"Crystal Waters,Steve ""Silk"" Hurley",Makin Happy,Marvin Gaye,Sanctified Lady,Ready For The World,Digital Display,mustbethemoon
!!!,Slyd,04:14,surprise,hip hop,Thr!!!er,29th April 2013,No,33,"Moon Boots,Fiora",I Want Your Attention,"Rhombus,Tiki Taane,MC Antsman,Imon Starr",Seen It All Beast Mix,"Craig Reever,Easton",Special about You,slyd
!!!,Hello? Is This Thing On?,07:33,sadness,hip hop,Louden Up Now,8th June 2004,No,21,Midnight Star,Operator,"Boris Dlugosch,Róisín Murphy,Ricky Mattioli",Never Enough,YACHT,Miles Miles,helloisthisthingon


In [0]:
# Drop the `_rescued_data` column from the DataFrame built in the earlier cell,
# then persist the result, replacing any existing contents of the target table.
df = df.drop("_rescued_data")
(
    df.write
    .mode("overwrite")
    .saveAsTable("data.default.pmd_trabalho_new3")
)
