In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense

In [2]:
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data = data.drop(columns=["user_id", "track_id"])

data = data.drop(
    columns=["release_date", "key", "loudness", "popularity",
             "duration_ms", "explicit",
             "danceability", "energy", "speechiness",
             "acousticness", "instrumentalness", "liveness", "valence", "tempo"])

data.head(5)

Unnamed: 0,favourite_genres,genres,skipped
0,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False


In [5]:
# Combine genres and favourite_genres
all_genres = list(data['favourite_genres'] + data['genres'])

# One-hot encode the genres
mlb = MultiLabelBinarizer()
encoded_all_genres = mlb.fit_transform(all_genres)

# Split encoded_genres into genres and favourite_genres
encoded_favourite_genres = encoded_all_genres[:, :len(data['favourite_genres'][0])]
encoded_genres = encoded_all_genres[:, len(data['favourite_genres'][0]):]

In [6]:
# Concatenate the one-hot encoded columns
X = np.concatenate([encoded_favourite_genres, encoded_genres], axis=1)

# Extract the labels
y = data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model = Sequential()
model.add(Dense(1977*2, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(1977, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


Metal device set to: Apple M2 Pro


In [8]:
epochs = 20
batch_size = 8

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=epochs,
    batch_size=batch_size,
    verbose=2
)


Epoch 1/20


2023-04-04 13:57:13.144721: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


791/791 - 10s - loss: 0.6149 - accuracy: 0.6706 - val_loss: 0.5930 - val_accuracy: 0.6806 - 10s/epoch - 13ms/step
Epoch 2/20
791/791 - 8s - loss: 0.5391 - accuracy: 0.7131 - val_loss: 0.6249 - val_accuracy: 0.6528 - 8s/epoch - 10ms/step
Epoch 3/20
791/791 - 8s - loss: 0.4816 - accuracy: 0.7508 - val_loss: 0.7041 - val_accuracy: 0.6686 - 8s/epoch - 10ms/step
Epoch 4/20
791/791 - 8s - loss: 0.4093 - accuracy: 0.7947 - val_loss: 0.7891 - val_accuracy: 0.6509 - 8s/epoch - 10ms/step
Epoch 5/20
791/791 - 8s - loss: 0.3533 - accuracy: 0.8267 - val_loss: 0.8339 - val_accuracy: 0.6483 - 8s/epoch - 10ms/step
Epoch 6/20
791/791 - 8s - loss: 0.3137 - accuracy: 0.8430 - val_loss: 1.0805 - val_accuracy: 0.6673 - 8s/epoch - 10ms/step
Epoch 7/20
791/791 - 9s - loss: 0.2853 - accuracy: 0.8567 - val_loss: 1.1635 - val_accuracy: 0.6515 - 9s/epoch - 11ms/step
Epoch 8/20
791/791 - 8s - loss: 0.2560 - accuracy: 0.8709 - val_loss: 1.3332 - val_accuracy: 0.6546 - 8s/epoch - 10ms/step
Epoch 9/20
791/791 - 8s -

In [7]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print("Test set accuracy:", accuracy)


62/62 - 1s - loss: 1.7006 - accuracy: 0.6388 - 683ms/epoch - 11ms/step
Test set accuracy: 0.638846755027771


In [8]:
# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Print the predicted and actual labels
print("Predicted labels:", y_pred_classes.flatten())
print("Actual labels:", y_test)


Predicted labels: [1 0 1 ... 1 0 1]
Actual labels: [0 0 0 ... 1 1 0]


In [None]:
# test with new data

new_data = [
  (["dominican pop","merengue","merengue tipico","tropical"], ["blues rock","country rock","lounge"])
]

labels = [

]

new_df = pd.DataFrame(new_data, columns=["genres", "favourite_genres"])

# Combine genres and favourite_genres
all_new_genres = list(new_df['genres'] + new_df['favourite_genres'])

# One-hot encode the genres using the previously fit MultiLabelBinarizer (mlb)
encoded_new_genres = mlb.transform(all_new_genres)

# Split encoded_new_genres into genres and favourite_genres
encoded_new_genres1 = encoded_new_genres[:, :len(new_df['genres'][0])]
encoded_new_genres2 = encoded_new_genres[:, len(new_df['genres'][0]):]

# Concatenate the one-hot encoded columns
X_new = np.concatenate([encoded_new_genres1, encoded_new_genres2], axis=1)

y_new_pred = model.predict(X_new)
y_new_pred_classes = (y_new_pred > 0.5).astype(int)

# Print the predicted labels
print("Predicted labels:", y_new_pred_classes.flatten())
print("Actual labels:", labels)
