# Multiclass classification prediction for popularity of a song

Import all the packages

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, softmax

%matplotlib widget
import matplotlib.pyplot as plt

import logging
# Keep the notebook output readable: only let TensorFlow log at ERROR level
# or above, and set AutoGraph verbosity to 0.
tensorflow_logger = logging.getLogger("tensorflow")
tensorflow_logger.setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)

## Import data

In [None]:
# Identifier and name columns that are not needed for prediction.
unused_columns = [
    'track_id',
    'track_name',
    'track_album_id',
    'track_album_name',
    'playlist_id',
    'playlist_name',
]

# Load the raw table and discard the unused columns in one go.
df = pd.read_csv("spotify_data/spotify_songs.csv").drop(columns=unused_columns)
df.head()

In [None]:
# (rows, columns) of the frame after the unused columns were dropped.
df.shape

In [None]:
# Histogram of the raw track_popularity scores (50 bins).
popularity_fig, popularity_ax = plt.subplots(figsize=(10, 6))
popularity_ax.hist(df['track_popularity'], bins=50, color='blue', edgecolor='black')
popularity_ax.set_title('Distribution of Popularity')
popularity_ax.set_xlabel('Track popularity')
popularity_ax.set_ylabel('Amount')
popularity_ax.grid(True)
plt.show()

### Checking for null or NaN values

In [None]:
# True when at least one track has no popularity value (isna is the
# canonical spelling of isnull).
missing_popularity = df['track_popularity'].isna().any()

missing_popularity

## Drop the songs with popularity zero

In [None]:
# Keep only tracks with a non-zero popularity score.  The explicit .copy()
# makes df an independent frame, so the column assignments performed in later
# cells do not raise pandas' SettingWithCopyWarning or risk writing to a
# temporary slice of the original table.
df = df[df['track_popularity'] != 0].copy()

In [None]:
# Histogram of track_popularity again, now without the zero-popularity tracks.
filtered_fig, filtered_ax = plt.subplots(figsize=(10, 6))
filtered_ax.hist(df['track_popularity'], bins=50, color='blue', edgecolor='black')
filtered_ax.set_title('Distribution of Popularity')
filtered_ax.set_xlabel('Track popularity')
filtered_ax.set_ylabel('Amount')
filtered_ax.grid(True)
plt.show()

In [None]:
# Column dtypes and non-null counts after removing the zero-popularity tracks.
df.info()

## Divide track_popularity into classes (labels)

In [None]:
# Bin edges on the popularity scale and the integer class given to each bin.
bin_edges = [0, 25, 40, 55, 70, 100]
bin_labels = [0, 1, 2, 3, 4]  # ['Trash', 'Flop', 'Average', 'Hit', 'Monsterhit']

# Replace the numeric score by its bin (include_lowest=True also closes the
# first bin on the left), then store that bin as a plain integer in the new
# column 'track_popularity_label'.
popularity_bins = pd.cut(
    df['track_popularity'],
    bins=bin_edges,
    labels=bin_labels,
    include_lowest=True,
)
df['track_popularity'] = popularity_bins
df['track_popularity_label'] = df['track_popularity'].astype(int)

In [None]:
# The binned score now lives in 'track_popularity_label'; drop the source column.
df = df.drop(columns=['track_popularity'])

## Change release date to release year + scale duration to seconds

Create a new column `df['release_year']` that stores the first four characters of `df['track_album_release_date']` (a column of object type), for example: `df['release_year'] = df['track_album_release_date'].astype(str).str[0:4].astype(int)`

In [None]:
# Release date to years.
# Parsing the column with pd.to_datetime(..., errors='coerce') is lossy on
# pandas >= 2.0: the format is inferred from the first value and every entry
# that does not match it (e.g. a year-only "2012" next to "2019-06-14") becomes
# NaT, after which the dropna() in a later cell silently deletes those songs.
# Read the year straight from the leading four characters instead.
# NOTE(review): assumes the dates are ISO-style strings starting with the
# year - confirm against the CSV.  Values without a numeric year become NaN.
df['release_year'] = pd.to_numeric(
    df['track_album_release_date'].astype(str).str[:4],
    errors='coerce',
)

df = df.drop(['track_album_release_date'], axis=1)

In [None]:
# Remove every row that still contains a missing value, then re-check the
# dtypes and non-null counts.
df = df.dropna()
df.info()

In [None]:
# Convert the track duration from milliseconds to seconds (the column keeps
# its old name in this cell).
df['duration_ms'] = df['duration_ms'].div(1000)

In [None]:
# The values are seconds now, so give the column a matching name.
df = df.rename({'duration_ms': 'duration_s'}, axis='columns')

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Final check of column names, dtypes and non-null counts before modelling.
df.info()

## Making the model

### Dataset

In [None]:
# Columns that are not model inputs: the target itself plus the artist and
# playlist genre/subgenre columns.
non_feature_columns = [
    'track_popularity_label',
    'track_artist',
    'playlist_genre',
    'playlist_subgenre',
]

# Feature matrix and integer class labels as NumPy arrays.
X = df.drop(columns=non_feature_columns).to_numpy()
y = df['track_popularity_label'].to_numpy()

In [None]:
# Sanity check: X is (samples, features), y is (samples,).
print(f"The shape of X is: {X.shape}")
print(f"The shape of y is: {y.shape}")

In [None]:
# Sequential classifier: two ReLU hidden layers followed by a linear layer
# that emits one logit per popularity class.
tf.random.set_seed(1234) # for consistent results

# Derive the layer sizes from the data instead of hard-coding them.  The
# output layer previously had 20 units although only len(bin_labels) == 5
# classes exist, which left 15 logits that can never be a correct answer.
num_features = X.shape[1]
num_classes = len(bin_labels)

model = Sequential(
    [
        tf.keras.Input(shape=(num_features,)),    # one input per feature column
        Dense(200, activation=relu, name="L1"),
        Dense(100, activation=relu, name="L2"),
        # Linear activation: the loss below applies softmax itself
        # (from_logits=True), which is numerically more stable.
        Dense(num_classes, activation=linear, name="L3"),
    ], name = "my_model"
)
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

In [None]:
# Name the three Dense layers so their weights can be inspected individually.
layer1, layer2, layer3 = model.layers

In [None]:
#### Examine weight shapes
# get_weights() yields the kernel matrix and the bias vector of each layer.
W1, b1 = layer1.get_weights()
W2, b2 = layer2.get_weights()
W3, b3 = layer3.get_weights()

layer_params = [(W1, b1), (W2, b2), (W3, b3)]
for layer_number, (kernel, bias) in enumerate(layer_params, start=1):
    print(f"W{layer_number} shape = {kernel.shape}, b{layer_number} shape = {bias.shape}")

In [None]:
# The model outputs raw logits, so the loss applies softmax internally.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(loss=loss_fn, optimizer=optimizer)

# Train on the full dataset for 40 epochs; `history` records the loss per epoch.
history = model.fit(X, y, epochs=40)

In [None]:
def plot_loss_tf(history):
    """Plot the training loss per epoch.

    Args:
        history: object whose ``history['loss']`` holds one loss value per
            epoch, such as the return value of ``model.fit``.

    The y-axis is fixed to the range 0..10.
    """
    epoch_losses = history.history['loss']

    _, axis = plt.subplots(figsize=(5, 5))
    axis.plot(epoch_losses, label='loss')
    axis.set(ylim=[0, 10], xlabel='Epoch', ylabel='loss (cost)')
    axis.legend()
    axis.grid(True)
    plt.show()

In [None]:
# Draw the loss curve of the training run stored in `history`.
plot_loss_tf(history)

In [None]:
# NOTE: when run as a notebook cell only the last bare expression is rendered,
# so the first two expressions below produce no visible output.
df.head()
# Third row of the frame (positional index 2)
df.iloc[2]
# Row at the first occurrence of the highest track_popularity_label
df.loc[df['track_popularity_label'].idxmax()]

In [None]:
# Predict the class of a single training sample and compare it with its label.
sample_index = 41
Ed_IHE_Remix = X[sample_index]
print(Ed_IHE_Remix.shape)
# reshape(1, -1) builds a batch of one sample for any number of features.
prediction = model.predict(Ed_IHE_Remix.reshape(1, -1))
# The message used to claim "Two" unconditionally; report the sample's
# actual label so it can be compared with the prediction below.
print(f" predicting a {y[sample_index]}: \n{prediction}")
print(f" Largest Prediction index: {np.argmax(prediction)}")

In [None]:
# Turn the logits in `prediction` into class probabilities.
prediction_p = tf.nn.softmax(prediction)

# `prediction` was computed for X[41]; print that sample's actual label
# instead of the previously hard-coded "Two".
print(f" predicting a {y[41]}. Probability vector: \n{prediction_p}")
yhat = np.argmax(prediction_p)

print(f"np.argmax(prediction_p): {yhat}")

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Show "label,prediction" for 64 randomly chosen samples in an 8x8 grid.
m, n = X.shape

fig, axes = plt.subplots(8,8, figsize=(5,5))
fig.tight_layout(pad=0.13,rect=[0, 0.03, 1, 0.91]) #[left, bottom, right, top]

# Pick one random sample per subplot, then classify all of them with a single
# batched predict() call instead of one call (and one progress bar) per sample.
random_indices = [np.random.randint(m) for _ in range(axes.size)]
logits = model.predict(X[random_indices])
# softmax is monotonic, so the argmax of the logits is the predicted class.
yhats = np.argmax(logits, axis=1)

for ax, random_index, yhat in zip(axes.flat, random_indices, yhats):
    # Display "label,prediction" above the (empty) axes
    ax.set_title(f"{y[random_index]},{yhat}",fontsize=10)
    ax.set_axis_off()

fig.suptitle("Label, yhat", fontsize=14)
plt.show()

In [None]:
def display_errors(model,X,y):
    """Show up to eight misclassified samples and return the error count.

    Args:
        model: object with a ``predict(X)`` method returning one row of class
            scores per sample.
        X: feature matrix, one row per sample.
        y: true integer class labels, aligned with the rows of ``X``.

    Returns:
        The number of samples whose predicted class differs from ``y``.
        Prints "no errors found" and draws nothing when that number is zero.
    """
    # One batched prediction is enough; the class is the argmax of each row
    # (softmax would not change the argmax, so it is not needed here).
    scores = model.predict(X)
    yhat = np.argmax(scores, axis=1)
    idxs = np.where(yhat != y)[0]
    if len(idxs) == 0:
        print("no errors found")
    else:
        cnt = min(8, len(idxs))
        # squeeze=False always returns a 2-D array of axes; without it a
        # single error (cnt == 1) yields a bare Axes and indexing it fails.
        fig, axes = plt.subplots(1, cnt, figsize=(5,1.2), squeeze=False)
        fig.tight_layout(pad=0.13,rect=[0, 0.03, 1, 0.80]) #[left, bottom, right, top]
        fig.suptitle("Label, yhat", fontsize=12)

        for ax, j in zip(axes[0], idxs[:cnt]):
            # Display "label,prediction" above the (empty) axes
            ax.set_title(f"{y[j]},{yhat[j]}",fontsize=10)
            ax.set_axis_off()
    return(len(idxs))

In [None]:
# Count misclassifications on the same data the model was trained on.
# The samples are songs, so the message no longer calls them "images".
print( f"{display_errors(model,X,y)} errors out of {len(X)} songs")