In [46]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout
# import keras_tuner

In [47]:
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data = data.drop(
    columns=["release_date", "key", "loudness",
             "explicit", ])

data.head(5)

Unnamed: 0,popularity,duration_ms,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,tempo,favourite_genres,genres,skipped
0,34,247707,0.491,0.606,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,34,247707,0.491,0.606,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,34,247707,0.491,0.606,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,35,140067,0.449,0.749,0.0775,0.01,0.0,0.391,0.448,106.861,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,35,140067,0.449,0.749,0.0775,0.01,0.0,0.391,0.448,106.861,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False


In [48]:
# Combine genres and favourite_genres
all_genres = list(data['favourite_genres'] + data['genres'])

# TODO usunac nieznane gatunki z listy genres?

# One-hot encode the genres
mlb = MultiLabelBinarizer()
encoded_all_genres = mlb.fit_transform(all_genres)

# Split encoded_genres into genres and favourite_genres
encoded_favourite_genres = encoded_all_genres[:, :len(data['favourite_genres'][0])]
encoded_genres = encoded_all_genres[:, len(data['favourite_genres'][0]):]

In [49]:
# Concatenate the one-hot encoded columns
X = np.concatenate([encoded_favourite_genres, encoded_genres], axis=1)
# Extract the labels
y = data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
def build_model(hp):
  model = Sequential()
  model.add(Dense(6000, activation='relu', input_shape=(X_train.shape[1],)))
  model.add(Dropout(0.5))
  model.add(Dense(3000, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1000, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1, activation='sigmoid'))

  # model.compile(
  #     optimizer=keras.optimizers.Adam(hp.Choice('learning_date', values=[0.5, 0.1, 0.01])),
  #     loss='binary_crossentropy', metrics=['accuracy'])
  model.compile(
      optimizer='adam',
      loss='binary_crossentropy', metrics=['accuracy'])
  return model


In [51]:
epochs = 25
batch_size = 32

model = build_model(None)

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=epochs,
    batch_size=batch_size,
    verbose=2
)

# tuner = keras_tuner.tuners.Hyperband(
#     build_model,
#     objective='val_accuracy',
#     max_epochs=50,
#     max_trials=10,
#     executions_per_trial=2,
#     directory='my_dir')


Metal device set to: Apple M2 Pro
Epoch 1/25


2023-04-04 21:18:05.910616: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


198/198 - 5s - loss: 0.6398 - accuracy: 0.6453 - val_loss: 0.6102 - val_accuracy: 0.6755 - 5s/epoch - 23ms/step
Epoch 2/25
198/198 - 4s - loss: 0.5765 - accuracy: 0.6937 - val_loss: 0.5935 - val_accuracy: 0.6768 - 4s/epoch - 19ms/step
Epoch 3/25
198/198 - 4s - loss: 0.5349 - accuracy: 0.7221 - val_loss: 0.6284 - val_accuracy: 0.6641 - 4s/epoch - 20ms/step
Epoch 4/25
198/198 - 4s - loss: 0.4917 - accuracy: 0.7457 - val_loss: 0.6629 - val_accuracy: 0.6730 - 4s/epoch - 18ms/step
Epoch 5/25
198/198 - 4s - loss: 0.4468 - accuracy: 0.7759 - val_loss: 0.6572 - val_accuracy: 0.6679 - 4s/epoch - 18ms/step
Epoch 6/25
198/198 - 4s - loss: 0.4104 - accuracy: 0.7955 - val_loss: 0.7284 - val_accuracy: 0.6528 - 4s/epoch - 18ms/step
Epoch 7/25
198/198 - 4s - loss: 0.3872 - accuracy: 0.8156 - val_loss: 0.8476 - val_accuracy: 0.6534 - 4s/epoch - 18ms/step
Epoch 8/25
198/198 - 4s - loss: 0.3546 - accuracy: 0.8246 - val_loss: 0.8365 - val_accuracy: 0.6483 - 4s/epoch - 18ms/step
Epoch 9/25
198/198 - 3s - l

In [52]:
# tuner.search(
#     (X_train, y_train),
#     validation_data=(X_test, y_test),
# )

In [53]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print("Test set accuracy:", accuracy)

print("TEST")
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_classes))
print("Classification report:\n", classification_report(y_test, y_pred_classes))

print("TRAIN")
y_pred = model.predict(X_train)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_train, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_train, y_pred_classes))
print("Classification report:\n", classification_report(y_train, y_pred_classes))

62/62 - 0s - loss: 1.8383 - accuracy: 0.6525 - 287ms/epoch - 5ms/step
Test set accuracy: 0.6525037884712219
TEST
Accuracy: 0.6525037936267072
Confusion matrix:
 [[931 330]
 [357 359]]
Classification report:
               precision    recall  f1-score   support

           0       0.72      0.74      0.73      1261
           1       0.52      0.50      0.51       716

    accuracy                           0.65      1977
   macro avg       0.62      0.62      0.62      1977
weighted avg       0.65      0.65      0.65      1977

TRAIN
Accuracy: 0.8572874493927125
Confusion matrix:
 [[4483  516]
 [ 612 2293]]
Classification report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89      4999
           1       0.82      0.79      0.80      2905

    accuracy                           0.86      7904
   macro avg       0.85      0.84      0.85      7904
weighted avg       0.86      0.86      0.86      7904



In [54]:
# # Make predictions on the test set
# y_pred = model.predict(X_test)
# y_pred_classes = (y_pred > 0.5).astype(int)
#
# # Print the predicted and actual labels
# print("Predicted labels:", y_pred_classes.flatten())
# print("Actual labels:", y_test)

In [55]:
# # test with new data
#
# new_data = [
#   (["dominican pop", "merengue", "merengue tipico", "tropical"],
#    ["blues rock", "country rock", "lounge"])
# ]
#
# labels = [
#
# ]
#
# new_df = pd.DataFrame(new_data, columns=["genres", "favourite_genres"])
#
# # Combine genres and favourite_genres
# all_new_genres = list(new_df['genres'] + new_df['favourite_genres'])
#
# # One-hot encode the genres using the previously fit MultiLabelBinarizer (mlb)
# encoded_new_genres = mlb.transform(all_new_genres)
#
# # Split encoded_new_genres into genres and favourite_genres
# encoded_new_genres1 = encoded_new_genres[:, :len(new_df['genres'][0])]
# encoded_new_genres2 = encoded_new_genres[:, len(new_df['genres'][0]):]
#
# # Concatenate the one-hot encoded columns
# X_new = np.concatenate([encoded_new_genres1, encoded_new_genres2], axis=1)
#
# y_new_pred = model.predict(X_new)
# y_new_pred_classes = (y_new_pred > 0.5).astype(int)
#
# # Print the predicted labels
# print("Predicted labels:", y_new_pred_classes.flatten())
# print("Actual labels:", labels)
