# 1ª Fase: escolha da Base de Dados

In [60]:
# Referência: https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023

In [61]:
# Importar bibliotecas necessárias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [62]:
# Plot styling: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set(style="whitegrid")
plt.rc('figure', figsize=(12, 6))

# Load the dataset, reading the CSV with the Latin-1 encoding
df = pd.read_csv(
    './dados/spotify-2023.csv',
    encoding='latin1',
)

# 2ª Fase: Análise e Tratamento dos dados

In [63]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


# 2.1 Dicionário das colunas

#### track_name -> Nome da música
#### artist(s)_name -> Nome do(s) artista(s) da música
#### artist_count -> Número de artistas que contribuíram para a música
#### released_year -> Ano de lançamento da música
#### released_month -> Mês de lançamento da música
#### released_day -> Dia de lançamento da música
#### in_spotify_playlists -> Número de playlists do Spotify em que a música está incluída
#### in_spotify_charts -> Presença e classificação da música nas paradas do Spotify
#### streams -> Número total de transmissões no Spotify
#### in_apple_playlists -> Número de playlists do Apple Music em que a música está incluída

In [64]:
# Print each column's dtype and non-null count
df.info()

# Missing values per column (shown as the cell result)
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

track_name               0
artist(s)_name           0
artist_count             0
released_year            0
released_month           0
released_day             0
in_spotify_playlists     0
in_spotify_charts        0
streams                  0
in_apple_playlists       0
in_apple_charts          0
in_deezer_playlists      0
in_deezer_charts         0
in_shazam_charts        50
bpm                      0
key                     95
mode                     0
danceability_%           0
valence_%                0
energy_%                 0
acousticness_%           0
instrumentalness_%       0
liveness_%               0
speechiness_%            0
dtype: int64

In [65]:
# Number of rows and columns, as a (rows, columns) tuple
df.shape

(953, 24)

In [66]:
# Descriptive statistics; include='all' summarises non-numeric columns
# (unique/top/freq) alongside the numeric ones (mean/std/quantiles)
df.describe(include='all')

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
count,953,953,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,...,953.0,858,953,953.0,953.0,953.0,953.0,953.0,953.0,953.0
unique,943,645,,,,,,,949.0,,...,,11,2,,,,,,,
top,Flowers,Taylor Swift,,,,,,,395591396.0,,...,,C#,Major,,,,,,,
freq,2,34,,,,,,,2.0,,...,,120,550,,,,,,,
mean,,,1.556139,2018.238195,6.033578,13.930745,5200.124869,12.009444,,67.812172,...,122.540399,,,66.96957,51.43127,64.279119,27.057712,1.581322,18.213012,10.131165
std,,,0.893044,11.116218,3.566435,9.201949,7897.60899,19.575992,,86.441493,...,28.057802,,,14.63061,23.480632,16.550526,25.996077,8.4098,13.711223,9.912888
min,,,1.0,1930.0,1.0,1.0,31.0,0.0,,0.0,...,65.0,,,23.0,4.0,9.0,0.0,0.0,3.0,2.0
25%,,,1.0,2020.0,3.0,6.0,875.0,0.0,,13.0,...,100.0,,,57.0,32.0,53.0,6.0,0.0,10.0,4.0
50%,,,1.0,2022.0,6.0,13.0,2224.0,3.0,,34.0,...,121.0,,,69.0,51.0,66.0,18.0,0.0,12.0,6.0
75%,,,2.0,2022.0,9.0,22.0,5542.0,16.0,,88.0,...,140.0,,,78.0,70.0,77.0,43.0,0.0,24.0,11.0


# 2.2 Limpeza e Pré-processamento

In [67]:
# Count the missing values in each column and print the result
missing_per_column = df.isna().sum()
print(missing_per_column)

track_name               0
artist(s)_name           0
artist_count             0
released_year            0
released_month           0
released_day             0
in_spotify_playlists     0
in_spotify_charts        0
streams                  0
in_apple_playlists       0
in_apple_charts          0
in_deezer_playlists      0
in_deezer_charts         0
in_shazam_charts        50
bpm                      0
key                     95
mode                     0
danceability_%           0
valence_%                0
energy_%                 0
acousticness_%           0
instrumentalness_%       0
liveness_%               0
speechiness_%            0
dtype: int64


In [68]:
def _parse_number(column):
    """Convert a column to numbers, tolerating thousands separators.

    Values such as "1,021" are parsed as 1021 instead of being coerced to NaN.
    Anything that still cannot be parsed (including missing values) becomes NaN.
    Safe to call on a column that is already numeric, so the cell can be re-run.
    """
    as_text = column.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(as_text, errors='coerce')


# `in_shazam_charts` and `streams` are loaded as text (object dtype in df.info()).
# Strip thousands separators before converting: a plain to_numeric(errors='coerce')
# would turn every comma-formatted value into NaN, and the median fill below
# would then silently overwrite it.
df['in_shazam_charts'] = _parse_number(df['in_shazam_charts'])
df['streams'] = _parse_number(df['streams'])

# Fill missing `in_shazam_charts` values with the column median
df['in_shazam_charts'] = df['in_shazam_charts'].fillna(df['in_shazam_charts'].median())

# Fill unparseable `streams` values with the column mean
# NOTE(review): `streams` is the regression target further down; imputing a
# target fabricates labels - consider dropping those rows instead.
df['streams'] = df['streams'].fillna(df['streams'].mean())

# Rows without a `key` are dropped rather than imputed; restrict the drop to
# that column so a missing value elsewhere never removes rows unintentionally
df = df.dropna(subset=['key'])

# Check the remaining missing values per column
df.isnull().sum()

track_name              0
artist(s)_name          0
artist_count            0
released_year           0
released_month          0
released_day            0
in_spotify_playlists    0
in_spotify_charts       0
streams                 0
in_apple_playlists      0
in_apple_charts         0
in_deezer_playlists     0
in_deezer_charts        0
in_shazam_charts        0
bpm                     0
key                     0
mode                    0
danceability_%          0
valence_%               0
energy_%                0
acousticness_%          0
instrumentalness_%      0
liveness_%              0
speechiness_%           0
dtype: int64

In [69]:
# Rescale the percentage columns from 0-100 to 0-1 to make them easier to interpret
percentage_columns = [
    'danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
    'instrumentalness_%', 'liveness_%', 'speechiness_%'
]

# Only divide while the data is still on the 0-100 scale (some value above 1).
# Without this guard, re-running the cell would divide by 100 a second time.
still_in_percent = df[percentage_columns].max().max() > 1
if still_in_percent:
    df[percentage_columns] = df[percentage_columns] / 100

In [70]:
# Boolean mask: True for every row that exactly repeats an earlier row
duplicados = df.duplicated(keep='first')

# Print only the duplicated rows
print(df.loc[duplicados])

Empty DataFrame
Columns: [track_name, artist(s)_name, artist_count, released_year, released_month, released_day, in_spotify_playlists, in_spotify_charts, streams, in_apple_playlists, in_apple_charts, in_deezer_playlists, in_deezer_charts, in_shazam_charts, bpm, key, mode, danceability_%, valence_%, energy_%, acousticness_%, instrumentalness_%, liveness_%, speechiness_%]
Index: []

[0 rows x 24 columns]


# 3ª Fase: Modelagem e Avaliação

In [74]:
#!pip install tensorflow
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler

In [75]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# 1. Select features and target.
# `features` was not defined by any cell in this notebook, so this cell raised
# NameError on a fresh kernel. Define it here: every numeric column except the target.
# NOTE(review): the originally intended feature list is not visible - confirm this selection.
target = 'streams'
features = [col for col in df.select_dtypes(include='number').columns if col != target]

X = df[features].values
y = df[target].values.reshape(-1, 1)  # Ensure shape (n_samples, 1)

# 2. Split BEFORE scaling so no statistic of the test rows reaches the scalers
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Pre-processing: fit the scalers on the training rows only, then apply to both splits
scaler_x = StandardScaler()
scaler_y = StandardScaler()  # The target is standardised too, for more stable training

X_train = scaler_x.fit_transform(X_train_raw)
X_test = scaler_x.transform(X_test_raw)
y_train = scaler_y.fit_transform(y_train_raw)
y_test = scaler_y.transform(y_test_raw)

# Scaled views of the full dataset, kept under their previous names
X_scaled = scaler_x.transform(X)
y_scaled = scaler_y.transform(y)

In [76]:
# 4. Criar e treinar o modelo
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=500, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.8984 - val_loss: 1.3193
Epoch 2/500
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7601 - val_loss: 1.3152
Epoch 3/500
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7338 - val_loss: 1.3106
Epoch 4/500
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7175 - val_loss: 1.3182
Epoch 5/500
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6996 - val_loss: 1.3189
Epoch 6/500
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6864 - val_loss: 1.3177
Epoch 7/500
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6705 - val_loss: 1.3250
Epoch 8/500
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6567 - val_loss: 1.3324
Epoch 9/500
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x256afeaba10>

In [77]:
# 5. Salvar modelo e scalers
model.save('modelo_spotify_regressao.keras')  # Formato .keras recomendado
joblib.dump(scaler_x, 'scaler_x_spotify.pkl')
joblib.dump(scaler_y, 'scaler_y_spotify.pkl')

# 6. Download (Google Colab) - Remova se estiver executando localmente
files.download('modelo_spotify_regressao.keras')
files.download('scaler_x_spotify.pkl')
files.download('scaler_y_spotify.pkl')

NameError: name 'files' is not defined

# FIM