#### In this project, we aim to song popularity (target column). Other 12 numeric features include acousticness and danceability etc.

resource : https://www.kaggle.com/datasets/yasserh/song-popularity-dataset
We'll be taking on the role of an up-and-coming music group who are about to release a single that could be a hit. We would like to get an idea of how popular our song will be based on some of the features we can measure about it, namely those included in the aforementioned dataset.

In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import pandas as pd
import numpy as np

In [2]:
X_train, y_train = pd.read_csv('song_data_xtrain.csv'), pd.read_csv('song_data_ytrain.csv')

In [3]:
X_train

Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,0.343646,-0.532610,0.030390,0.185681,-0.348566,1.035072,-0.259083,0.754719,-1.299135,-0.512009,-0.373235,0.135955,-0.675240
1,-0.220969,-0.895902,-1.479864,0.012798,-0.348252,1.035072,1.655417,-0.168131,-1.299135,-0.668973,0.204118,0.135955,1.537082
2,-0.428014,-0.198699,1.771021,-0.543232,-0.348566,1.035072,-0.548325,-0.140935,-1.299135,2.142106,-0.545805,0.135955,0.948220
3,-1.283457,2.079548,0.190374,-1.505772,0.412176,1.035072,-0.595154,-0.821632,0.769743,0.125359,-0.390270,0.135955,-0.744759
4,-0.312481,0.302167,1.291068,-0.146068,-0.348116,0.758103,-0.637852,0.762041,-1.299135,1.980386,-0.653254,0.135955,0.306197
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12614,-0.619217,1.366509,0.497544,-1.431012,-0.344716,0.481134,-0.672974,-2.034491,0.769743,-0.492984,0.102497,0.135955,1.463474
12615,-1.012328,-0.736783,-0.526356,-0.337641,-0.348566,1.035072,-0.520778,0.228048,0.769743,-0.076316,2.128209,0.135955,-0.012770
12616,-0.359941,1.331727,-0.385570,-2.220668,-0.348566,0.481134,-0.417477,-2.683023,0.769743,-0.724148,-1.166387,-3.254583,-0.151807
12617,1.549095,-0.884886,1.476650,0.891232,-0.323466,-1.180678,0.829014,0.358016,0.769743,1.162271,-0.626054,0.135955,-1.145512


# Define the initial RNN model

In [4]:
# Define RNN model
model = tf.keras.Sequential()   # Instantiate model
model.add(layers.SimpleRNN(3, input_shape=(1, 13), activation='relu'))   # Hidden SimpleRNN laoyer
model.add(layers.Dense(1))   # Output layer - to predict 1 numeric value

model.compile(optimizer='adam', loss='mean_squared_error')   # compile the model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 3)                 51        
                                                                 
 dense (Dense)               (None, 1)                 4         
                                                                 
Total params: 55
Trainable params: 55
Non-trainable params: 0
_________________________________________________________________


# Preprocessing - X_train & FIT

##### The required preprocessing steps that are specific to neural networks are:

* Converting the data to a NumPy array.  (turn every row into a list and put into a list/array --> A list of lists)
* Reshaping the data to match what the recurrent model expects.   (become 3 Dimensional)
    ** (num_samples, num_timesteps, num_features)  **

In [5]:
# pre-processing the X_train
    ## convert into Numpy array
X_train_array = np.array(X_train)
    ## Reshape
X_train_reshaped = np.reshape(X_train_array, (X_train_array.shape[0], 1, X_train_array.shape[1]))


# Fit 
model.fit(X_train_reshaped, y_train)



<keras.callbacks.History at 0x7fe80421f0d0>

# Preprocessing - X_test & PREDICT

In [14]:
X_test, y_test = pd.read_csv('song_data_xtest.csv'), pd.read_csv('song_data_ytest.csv')

In [15]:
# pre-processing the X_test
    ## convert into Numpy array
X_test_array = np.array(X_test)
    ## Reshape
X_test_reshaped = np.reshape(X_test_array, (X_test_array.shape[0], 1, X_test_array.shape[1]))

# predict
predictions = model.predict(X_test_reshaped)




# Evaluation

In [18]:
from sklearn.metrics import mean_squared_error

print(f"MSE on test set: {mean_squared_error(y_test, predictions)}")

MSE on test set: 2889.4170744634107


In [24]:
# average of target_column: song_popularity
average = np.round(np.mean(y_test), 2)
print(average)

song_popularity    52.95
dtype: float64


# Optimize the model
* adding more hidden layers
* Nodes per Layer
* Loss Function and Optimizer

In [35]:
mse_lst = []

In [60]:
# Adding hidden layers

model = tf.keras.Sequential()
model.add(tf.keras.layers.SimpleRNN(3, input_shape=(1, X_train.shape[1]), activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='relu'))
# model.add(tf.keras.layers.Dense(3, activation='relu'))
# model.add(tf.keras.layers.Dense(3, activation='relu'))
# model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

# Preparing the training data
X_train_array = np.array(X_train)
X_train_reshaped = np.reshape(X_train_array, (X_train_array.shape[0], 1, X_train_array.shape[1]))

# Fitting the model
model.fit(X_train_reshaped, y_train)

# Preparing the test data
X_test_array = np.array(X_test)
X_test_reshaped = np.reshape(X_test_array, (X_test_array.shape[0], 1, X_test_array.shape[1]))

# Testing the model
y_pred = model.predict(X_test_reshaped)

# Printing results
mse = mean_squared_error(y_test, y_pred)
# mse_lst.append(mse)
print(f"MSE on test set: {mse}")

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_19 (SimpleRNN)   (None, 3)                 51        
                                                                 
 dense_86 (Dense)            (None, 3)                 12        
                                                                 
 dense_87 (Dense)            (None, 3)                 12        
                                                                 
 dense_88 (Dense)            (None, 3)                 12        
                                                                 
 dense_89 (Dense)            (None, 3)                 12        
                                                                 
 dense_90 (Dense)            (None, 3)                 12        
                                                                 
 dense_91 (Dense)            (None, 3)               

In [57]:
mse_lst

[3144.1563246087053,
 2394.6905218600946,
 2216.8790483857233,
 1727.6221543231945,
 1113.3122491838412,
 790.2914480402351,
 3248.020656596565,
 697.7160178524921,
 3247.768700875534,
 3248.0381087853975,
 3247.8980605181214]

In [91]:
mse_dict = {}

In [97]:
# Adding nodes in fixed 5 hidden layers 

model = tf.keras.Sequential()
model.add(tf.keras.layers.SimpleRNN(3, input_shape=(1, X_train.shape[1]), activation='relu'))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dense(1000, activation='relu'))
model.add(tf.keras.layers.Dense(1000, activation='relu'))

model.add(tf.keras.layers.Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

# Preparing the training data
X_train_array = np.array(X_train)
X_train_reshaped = np.reshape(X_train_array, (X_train_array.shape[0], 1, X_train_array.shape[1]))

# Fitting the model
model.fit(X_train_reshaped, y_train)

# Preparing the test data
X_test_array = np.array(X_test)
X_test_reshaped = np.reshape(X_test_array, (X_test_array.shape[0], 1, X_test_array.shape[1]))

# Testing the model
y_pred = model.predict(X_test_reshaped)

# Printing results
mse = mean_squared_error(y_test, y_pred)
# Record the result in mse_dict : number of nodes in 5 hidden layers, result mse
num_nodes = 1000
mse_dict[num_nodes] = mse
print(f"MSE on test set: {mse}")

Model: "sequential_41"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_41 (SimpleRNN)   (None, 3)                 51        
                                                                 
 dense_237 (Dense)           (None, 1000)              4000      
                                                                 
 dense_238 (Dense)           (None, 1000)              1001000   
                                                                 
 dense_239 (Dense)           (None, 1000)              1001000   
                                                                 
 dense_240 (Dense)           (None, 1000)              1001000   
                                                                 
 dense_241 (Dense)           (None, 1000)              1001000   
                                                                 
 dense_242 (Dense)           (None, 1)               

In [98]:
mse_dict

{10: 625.1831218665768,
 500: 482.7768692787711,
 100: 482.31579901428455,
 1000: 534.2162588325954}

#### Optimization - Optimizer & Loss funcs

In [103]:
optimizer_dict = {}

In [111]:
# Loss Function and Optimizer
model = tf.keras.Sequential()
model.add(tf.keras.layers.SimpleRNN(3, input_shape=(1, X_train.shape[1]), activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='relu'))
model.add(tf.keras.layers.Dense(1))
model.compile(optimizer='sgd', loss='binary_crossentropy')
model.summary()

# Preparing the training data
X_train_array = np.array(X_train)
X_train_reshaped = np.reshape(X_train_array, (X_train_array.shape[0], 1, X_train_array.shape[1]))

# Fitting the model
model.fit(X_train_reshaped, y_train)

# Preparing the test data
X_test_array = np.array(X_test)
X_test_reshaped = np.reshape(X_test_array, (X_test_array.shape[0], 1, X_test_array.shape[1]))

# Testing the model
y_pred = model.predict(X_test_reshaped)

# Printing results
print(f"MSE on test set: {mean_squared_error(y_test, y_pred)}")

# Record the results
optimizer = 'sgd-binary_crossentropy'
mse = mean_squared_error(y_test, y_pred)
optimizer_dict[optimizer] = mse

Model: "sequential_51"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_51 (SimpleRNN)   (None, 3)                 51        
                                                                 
 dense_261 (Dense)           (None, 3)                 12        
                                                                 
 dense_262 (Dense)           (None, 1)                 4         
                                                                 
Total params: 67
Trainable params: 67
Non-trainable params: 0
_________________________________________________________________
MSE on test set: 2777.5076421654258


In [112]:
optimizer_dict

{'adam-mean_squared_error': 2699.7559032858767,
 'adam-mean_absolute_error': 2470.855443842236,
 'RMSprop-mean_squared_error': 3061.8675191720813,
 'RMSprop-mean_absolute_error': 3246.997937258182,
 'sgd-mean_squared_error': 485.7910186588187,
 'sgd-mean_absolute_error': 483.0586352661121,
 'sgd-binary_crossentropy': 2777.5076421654258}