In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout

In [15]:
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data = data.drop(columns=["user_id", "track_id"])

data = data.drop(
    columns=["release_date", "key", "loudness", "popularity",
             "duration_ms", "explicit",
             "danceability", "energy", "speechiness",
             "acousticness", "instrumentalness", "liveness", "valence", "tempo"])

data.head(5)

Unnamed: 0,timestamp,session_id,favourite_genres,genres,skipped
0,2023-01-03 05:09:55.000,124,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,2023-02-08 21:41:02.507,151,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,2023-02-12 01:17:14.946,544,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,2023-01-03 03:59:59.738,534,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,2023-03-07 14:28:25.702,547,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False


In [16]:
# Combine genres and favourite_genres
all_genres = list(data['favourite_genres'] + data['genres'])

# One-hot encode the genres
mlb = MultiLabelBinarizer()
encoded_all_genres = mlb.fit_transform(all_genres)

# Split encoded_genres into genres and favourite_genres
encoded_favourite_genres = encoded_all_genres[:, :len(data['favourite_genres'][0])]
encoded_genres = encoded_all_genres[:, len(data['favourite_genres'][0]):]

In [17]:
# Concatenate the one-hot encoded columns
X = np.concatenate([encoded_favourite_genres, encoded_genres], axis=1)

# Extract the labels
y = data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
model = Sequential()
model.add(Dense(6000, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(3000, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1000, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [19]:
epochs = 25
batch_size = 32

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=epochs,
    batch_size=batch_size,
    verbose=2
)


Epoch 1/50
198/198 - 5s - loss: 0.6134 - accuracy: 0.6655 - val_loss: 0.6011 - val_accuracy: 0.6831 - 5s/epoch - 24ms/step
Epoch 2/50
198/198 - 3s - loss: 0.5430 - accuracy: 0.7179 - val_loss: 0.6205 - val_accuracy: 0.6799 - 3s/epoch - 16ms/step
Epoch 3/50
198/198 - 3s - loss: 0.4699 - accuracy: 0.7636 - val_loss: 0.6685 - val_accuracy: 0.6686 - 3s/epoch - 15ms/step
Epoch 4/50
198/198 - 3s - loss: 0.4035 - accuracy: 0.8066 - val_loss: 0.7605 - val_accuracy: 0.6654 - 3s/epoch - 15ms/step
Epoch 5/50
198/198 - 3s - loss: 0.3532 - accuracy: 0.8311 - val_loss: 0.8664 - val_accuracy: 0.6578 - 3s/epoch - 15ms/step
Epoch 6/50
198/198 - 3s - loss: 0.3128 - accuracy: 0.8477 - val_loss: 1.0232 - val_accuracy: 0.6553 - 3s/epoch - 15ms/step
Epoch 7/50
198/198 - 3s - loss: 0.2782 - accuracy: 0.8653 - val_loss: 1.0803 - val_accuracy: 0.6509 - 3s/epoch - 15ms/step
Epoch 8/50
198/198 - 3s - loss: 0.2638 - accuracy: 0.8714 - val_loss: 1.1853 - val_accuracy: 0.6660 - 3s/epoch - 15ms/step
Epoch 9/50
198/1

In [20]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print("Test set accuracy:", accuracy)


print("TEST")
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_classes))
print("Classification report:\n", classification_report(y_test, y_pred_classes))

print("TRAIN")
y_pred = model.predict(X_train)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_train, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_train, y_pred_classes))
print("Classification report:\n", classification_report(y_train, y_pred_classes))

62/62 - 0s - loss: 3.2886 - accuracy: 0.6368 - 229ms/epoch - 4ms/step
Test set accuracy: 0.6368234753608704
TEST
Accuracy: 0.6368234699038948
Confusion matrix:
 [[905 356]
 [362 354]]
Classification report:
               precision    recall  f1-score   support

           0       0.71      0.72      0.72      1261
           1       0.50      0.49      0.50       716

    accuracy                           0.64      1977
   macro avg       0.61      0.61      0.61      1977
weighted avg       0.64      0.64      0.64      1977

TRAIN
Accuracy: 0.8701923076923077
Confusion matrix:
 [[4459  540]
 [ 486 2419]]
Classification report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.90      4999
           1       0.82      0.83      0.83      2905

    accuracy                           0.87      7904
   macro avg       0.86      0.86      0.86      7904
weighted avg       0.87      0.87      0.87      7904



In [21]:
# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Print the predicted and actual labels
print("Predicted labels:", y_pred_classes.flatten())
print("Actual labels:", y_test)

Predicted labels: [0 0 1 ... 1 0 1]
Actual labels: [0 0 0 ... 1 1 0]


In [22]:
# test with new data

new_data = [
  (["dominican pop","merengue","merengue tipico","tropical"], ["blues rock","country rock","lounge"])
]

labels = [

]

new_df = pd.DataFrame(new_data, columns=["genres", "favourite_genres"])

# Combine genres and favourite_genres
all_new_genres = list(new_df['genres'] + new_df['favourite_genres'])

# One-hot encode the genres using the previously fit MultiLabelBinarizer (mlb)
encoded_new_genres = mlb.transform(all_new_genres)

# Split encoded_new_genres into genres and favourite_genres
encoded_new_genres1 = encoded_new_genres[:, :len(new_df['genres'][0])]
encoded_new_genres2 = encoded_new_genres[:, len(new_df['genres'][0]):]

# Concatenate the one-hot encoded columns
X_new = np.concatenate([encoded_new_genres1, encoded_new_genres2], axis=1)

y_new_pred = model.predict(X_new)
y_new_pred_classes = (y_new_pred > 0.5).astype(int)

# Print the predicted labels
print("Predicted labels:", y_new_pred_classes.flatten())
print("Actual labels:", labels)


Predicted labels: [0]
Actual labels: []
