In [10]:
import pandas as pd
import numpy as np
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout

In [11]:
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data.head(5)

Unnamed: 0,favourite_genres,genres,skipped
0,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False


In [12]:
unique_favourite_genres = set()
for favourite_genres in data['favourite_genres']:
    unique_favourite_genres.update(favourite_genres)

unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres)

# remove genres that are not in favourite_genres
# data['genres'] = data['genres'].apply(
#     lambda x: [genre for genre in x if genre in unique_favourite_genres])

print(len(unique_genres))

data.head()

1766


Unnamed: 0,favourite_genres,genres,skipped
0,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False


In [13]:
# map genres to their simpler form
def map_genre(genre):
    if 'rock' in genre:
        return 'rock'
    elif 'pop' in genre:
        return 'pop'
    elif 'folk' in genre:
        return 'folk'
    elif 'jazz' in genre:
        return 'jazz'
    elif 'metal' in genre:
        return 'metal'
    elif 'wave' in genre:
        return 'wave'
    elif 'blues' in genre:
        return 'blues'
    elif 'latino' in genre:
        return 'latino'
    elif 'house' in genre:
        return 'house'
    elif 'hip hop' in genre:
        return 'hip hop'
    elif 'rap' in genre:
        return 'rap'
    elif 'punk' in genre:
        return 'punk'
    elif 'country' in genre:
        return 'country'
    elif 'indie' in genre:
        return 'indie'
    elif 'electronic' in genre:
        return 'electronic'
    elif 'reggae' in genre:
        return 'reggae'
    elif 'dance' in genre:
        return 'dance'
    else:
        return genre

data['genres'] = data['genres'].apply(
    lambda x: [map_genre(genre) for genre in x])

data['favourite_genres'] = data['favourite_genres'].apply(
    lambda x: [map_genre(genre) for genre in x])

# delete not unique genres
data['genres'] = data['genres'].apply(lambda x: list(set(x)))
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: list(set(x)))

# count unique genres
unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres)
for genres in data['favourite_genres']:
    unique_genres.update(genres)
print(len(unique_genres))

# how many tracks are per genre
genre_count = {}
for genres in data['genres']:
    for genre in genres:
        if genre not in genre_count:
            genre_count[genre] = 0
        genre_count[genre] += 1

genre_count = sorted(genre_count.items(), key=lambda x: x[1], reverse=True)
genre_count

794


[('rock', 5144),
 ('pop', 4038),
 ('mellow gold', 2101),
 ('folk', 835),
 ('metal', 739),
 ('wave', 729),
 ('rap', 630),
 ('adult standards', 629),
 ('hip hop', 548),
 ('jazz', 458),
 ('singer-songwriter', 432),
 ('punk', 351),
 ('blues', 339),
 ('british invasion', 301),
 ('soul', 295),
 ('indie', 239),
 ('reggae', 236),
 ('latin', 219),
 ('house', 216),
 ('latino', 205),
 ('country', 197),
 ('motown', 196),
 ('funk', 193),
 ('beatlesque', 189),
 ('new romantic', 183),
 ('quiet storm', 183),
 ('lounge', 182),
 ('disco', 165),
 ('emo', 147),
 ('neo mellow', 121),
 ('merseybeat', 116),
 ('dance', 115),
 ('easy listening', 114),
 ('edm', 101),
 ('latin alternative', 95),
 ('lilith', 93),
 ('filmi', 92),
 ('r&b', 88),
 ('hoerspiel', 86),
 ('urban contemporary', 83),
 ('swing', 82),
 ('sertanejo universitario', 79),
 ('sertanejo', 76),
 ('old school thrash', 73),
 ('classic bollywood', 72),
 ('neo soul', 72),
 ('talent show', 70),
 ('jam band', 68),
 ('mpb', 67),
 ('ranchera', 65),
 ('trop

In [14]:
# balance set
skipped_data = data[data["skipped"] == 1]
not_skipped_data = data[data["skipped"] == 0]

# Split the dataset into subsets based on class labels
print("skipped_data", skipped_data.shape)
print("not_skipped_data", not_skipped_data.shape)

# Calculate the minimum number of samples among all classes
min_samples = min(len(skipped_data), len(not_skipped_data))

# Randomly select samples from each class subset to match the minimum number of samples
skipped_data_balanced = skipped_data.sample(n=min_samples, random_state=42)
not_skipped_data_balanced = not_skipped_data.sample(n=min_samples, random_state=42)

# Merge the balanced subsets to create the final balanced dataset
balanced_data = pd.concat([skipped_data_balanced, not_skipped_data_balanced], axis=0)

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_data.head()

skipped_data (3621, 3)
not_skipped_data (6260, 3)


Unnamed: 0,favourite_genres,genres,skipped
0,"[rock, hoerspiel, ranchera]","[rock, pop, mellow gold]",True
1,"[tropical, wave, mellow gold]","[rock, mellow gold, blues]",False
2,"[rock, regional mexican]","[sufi, pop, classic bollywood, filmi]",False
3,"[rock, pop]","[lounge, bossa nova, jazz]",True
4,"[hoerspiel, pop, quiet storm]","[rock, metal]",True


In [15]:
# Combine genres and favourite_genres
mlb = MultiLabelBinarizer()

encoded_favourite_genres = mlb.fit_transform(balanced_data['favourite_genres'])
encoded_genres = mlb.fit_transform(balanced_data['genres'])
# balanced_data.head()

In [16]:
X = np.concatenate([encoded_genres, encoded_favourite_genres], axis=1)

# Extract the labels
y = balanced_data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# count skipped and not skipped songs in X_train
print("Y_train not skipped", np.count_nonzero(y_train == 0))
print("Y_train skipped", np.count_nonzero(y_train == 1))
print("Y_train skipped %", np.count_nonzero(y_train == 1) / len(y_train))


Y_train not skipped 2891
Y_train skipped 2902
Y_train skipped % 0.500949421715864


In [17]:
def build_model(hp):
  model = Sequential()
  model.add(Dense(10 * 2, activation='relu', input_shape=(X_train.shape[1],)))
  model.add(Dropout(0.5))
  model.add(Dense(10, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1, activation='sigmoid'))

  model.compile(
      optimizer=Adam(),
      loss='binary_crossentropy', metrics=['accuracy'])
  return model


In [18]:
epochs = 100
batch_size = 32

model = build_model(None)

callbacks = [
  keras.callbacks.ModelCheckpoint(
      filepath='../models/model2_{epoch}',
      save_freq='epoch')
]

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=2
)


Epoch 1/100
INFO:tensorflow:Assets written to: ../models/model2_1/assets
182/182 - 2s - loss: 0.6926 - accuracy: 0.5230 - val_loss: 0.6837 - val_accuracy: 0.5963 - 2s/epoch - 12ms/step
Epoch 2/100
INFO:tensorflow:Assets written to: ../models/model2_2/assets
182/182 - 2s - loss: 0.6757 - accuracy: 0.5764 - val_loss: 0.6632 - val_accuracy: 0.6398 - 2s/epoch - 9ms/step
Epoch 3/100
INFO:tensorflow:Assets written to: ../models/model2_3/assets
182/182 - 2s - loss: 0.6607 - accuracy: 0.6132 - val_loss: 0.6533 - val_accuracy: 0.6197 - 2s/epoch - 9ms/step
Epoch 4/100
INFO:tensorflow:Assets written to: ../models/model2_4/assets
182/182 - 2s - loss: 0.6490 - accuracy: 0.6249 - val_loss: 0.6368 - val_accuracy: 0.6563 - 2s/epoch - 8ms/step
Epoch 5/100
INFO:tensorflow:Assets written to: ../models/model2_5/assets
182/182 - 2s - loss: 0.6391 - accuracy: 0.6389 - val_loss: 0.6298 - val_accuracy: 0.6563 - 2s/epoch - 9ms/step
Epoch 6/100
INFO:tensorflow:Assets written to: ../models/model2_6/assets
182/18

<keras.callbacks.History at 0x2da704370>

In [19]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print("Test set accuracy:", accuracy)

print("TEST")
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_classes))
print("Classification report:\n", classification_report(y_test, y_pred_classes))

print("TRAIN")
y_pred = model.predict(X_train)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_train, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_train, y_pred_classes))
print("Classification report:\n", classification_report(y_train, y_pred_classes))

46/46 - 0s - loss: 0.7984 - accuracy: 0.6363 - 185ms/epoch - 4ms/step
Test set accuracy: 0.6363009214401245
TEST
Accuracy: 0.6363008971704623
Confusion matrix:
 [[476 254]
 [273 446]]
Classification report:
               precision    recall  f1-score   support

           0       0.64      0.65      0.64       730
           1       0.64      0.62      0.63       719

    accuracy                           0.64      1449
   macro avg       0.64      0.64      0.64      1449
weighted avg       0.64      0.64      0.64      1449

TRAIN
Accuracy: 0.79147246677024
Confusion matrix:
 [[2339  552]
 [ 656 2246]]
Classification report:
               precision    recall  f1-score   support

           0       0.78      0.81      0.79      2891
           1       0.80      0.77      0.79      2902

    accuracy                           0.79      5793
   macro avg       0.79      0.79      0.79      5793
weighted avg       0.79      0.79      0.79      5793

