In [92]:
import pandas as pd
import numpy as np
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout

In [93]:
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data.head(5)

Unnamed: 0,timestamp,user_id,track_id,session_id,track_name,popularity,duration_ms,explicit,id_artist,release_date,...,premium_user,name,genres,skipped,number_of_matching_genres,month,day_of_week,hour_of_day,genres_with_favourite_genres,common_genres
0,2023-01-03 05:09:55.000,101,2PmGtDUyJIpYBEtI1hQIVp,124,Ballrooms Of Mars,34,247707,0,3dBVyJ7JuOMt4GE9607Qin,1972-07-21,...,False,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,0,1,1,5,"[album rock, art rock, classic rock, folk rock...","[protopunk, rock, glam rock, mandopop, album r..."
1,2023-02-08 21:41:02.507,103,2PmGtDUyJIpYBEtI1hQIVp,151,Ballrooms Of Mars,34,247707,0,3dBVyJ7JuOMt4GE9607Qin,1972-07-21,...,False,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,0,2,2,21,"[album rock, art rock, classic rock, folk rock...","[folk, protopunk, rock, regional mexican, glam..."
2,2023-02-12 01:17:14.946,132,2PmGtDUyJIpYBEtI1hQIVp,544,Ballrooms Of Mars,34,247707,0,3dBVyJ7JuOMt4GE9607Qin,1972-07-21,...,True,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,1,2,6,1,"[album rock, art rock, classic rock, folk rock...","[protopunk, rock, rock en espanol, glam rock, ..."
3,2023-01-03 03:59:59.738,132,5yxYokipsWlpDCt4Th4VVc,534,Solid Gold Easy Action,35,140067,0,3dBVyJ7JuOMt4GE9607Qin,1973-03-16,...,True,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,1,1,1,3,"[album rock, art rock, classic rock, folk rock...","[protopunk, rock, rock en espanol, glam rock, ..."
4,2023-03-07 14:28:25.702,132,5yxYokipsWlpDCt4Th4VVc,547,Solid Gold Easy Action,35,140067,0,3dBVyJ7JuOMt4GE9607Qin,1973-03-16,...,True,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,1,3,1,14,"[album rock, art rock, classic rock, folk rock...","[protopunk, rock, rock en espanol, glam rock, ..."


In [94]:
unique_favourite_genres = set()
for favourite_genres in data['favourite_genres']:
    unique_favourite_genres.update(favourite_genres)

unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres)

# remove genres that are not in favourite_genres
# data['genres'] = data['genres'].apply(
#     lambda x: [genre for genre in x if genre in unique_favourite_genres])

data.head()

In [95]:
# balance set
skipped_data = data[data["skipped"] == 1]
not_skipped_data = data[data["skipped"] == 0]

# Split the dataset into subsets based on class labels
print("skipped_data", skipped_data.shape)
print("not_skipped_data", not_skipped_data.shape)

# Calculate the minimum number of samples among all classes
min_samples = min(len(skipped_data), len(not_skipped_data))

# Randomly select samples from each class subset to match the minimum number of samples
skipped_data_balanced = skipped_data.sample(n=min_samples, random_state=42)
not_skipped_data_balanced = not_skipped_data.sample(n=min_samples, random_state=42)

# Merge the balanced subsets to create the final balanced dataset
balanced_data = pd.concat([skipped_data_balanced, not_skipped_data_balanced], axis=0)

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_data.head()

skipped_data (3621, 34)
not_skipped_data (6260, 34)


Unnamed: 0,timestamp,user_id,track_id,session_id,track_name,popularity,duration_ms,explicit,id_artist,release_date,...,premium_user,name,genres,skipped,number_of_matching_genres,month,day_of_week,hour_of_day,genres_with_favourite_genres,common_genres
0,2023-03-23 18:05:18.096,147,0kVB6PeqBbN2HhwJdWumeZ,732,The Wall Street Shuffle,40,236373,0,6i6WlGzQtXtz7GcC5H5st5,1974,...,True,10cc,"[album rock, art rock, classic rock, mellow go...",True,1,3,3,18,"[album rock, art rock, classic rock, classic u...","[rock, classic uk pop, glam rock, hoerspiel, a..."
1,2023-01-22 05:30:03.002,141,4pJKzul9oXW1lBQmOpf96m,646,Deep Dark Dungeon,38,129133,0,06nsZ3qSOYZ2hPVIMcr1IN,1981-02,...,False,J.J. Cale,"[album rock, blues rock, classic rock, country...",False,1,1,6,5,"[album rock, blues, blues rock, classic rock, ...","[rock, album rock, country rock, folk rock, cl..."
2,2023-02-23 05:54:20.102,125,7t6zMZSO6DfCeE4IcinxUJ,443,Chala Chala Navbala,0,193320,0,61JrslREXq98hurYL2hYoc,1943-12-31,...,False,Lata Mangeshkar,[filmi],False,0,2,3,5,"[classic bollywood, desi pop, filmi, sufi, cla...","[regional mexican, classic bollywood, sufi, cl..."
3,2023-02-26 04:48:49.355,149,62UVjAjo8ra1PVSeHfnxk3,754,I Love Paris,21,353200,0,3XOVABzceOUTbR3iEz0ImO,1959-01-01,...,False,Cal Tjader,[lounge],True,0,2,6,4,"[bossa nova, cool jazz, jazz, jazz funk, jazz ...","[new wave pop, latin jazz, jazz, lounge, bossa..."
4,2023-01-31 09:21:51.897,134,7DDfGy1cstvJGETyYYnfig,559,The One to Sing the Blues,30,187213,0,1DFr97A9HnbV3SKTJFu62M,1991-02-26,...,True,Motörhead,"[album rock, alternative metal, hard rock, met...",True,0,1,1,9,"[album rock, alternative metal, hard rock, met...","[rock, album rock, brill building pop, metal, ..."


In [96]:
# Combine genres and favourite_genres
mlb = MultiLabelBinarizer()

encoded_favourite_genres = mlb.fit_transform(balanced_data['favourite_genres'])
encoded_genres = mlb.fit_transform(balanced_data['genres'])
# balanced_data.head()

In [97]:
X = np.concatenate([encoded_genres, encoded_favourite_genres], axis=1)

# Extract the labels
y = balanced_data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# count skipped and not skipped songs in X_train
print("Y_train not skipped", np.count_nonzero(y_train == 0))
print("Y_train skipped", np.count_nonzero(y_train == 1))
print("Y_train skipped %", np.count_nonzero(y_train == 1) / len(y_train))


Y_train not skipped 2875
Y_train skipped 2918
Y_train skipped % 0.5037113757983773


In [98]:
def build_model(hp):
  model = Sequential()
  model.add(Dense(10 * 2, activation='relu', input_shape=(X_train.shape[1],)))
  model.add(Dropout(0.5))
  model.add(Dense(10, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1, activation='sigmoid'))

  model.compile(
      optimizer=Adam(),
      loss='binary_crossentropy', metrics=['accuracy'])
  return model


In [99]:
epochs = 100
batch_size = 128

model = build_model(None)

callbacks = [
  keras.callbacks.ModelCheckpoint(
      filepath='../models/model2_{epoch}',
      save_freq='epoch')
]

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=2
)


Epoch 1/100
INFO:tensorflow:Assets written to: ../models/model2_1/assets
46/46 - 2s - loss: 0.7055 - accuracy: 0.5194 - val_loss: 0.6870 - val_accuracy: 0.5611 - 2s/epoch - 50ms/step
Epoch 2/100
INFO:tensorflow:Assets written to: ../models/model2_2/assets
46/46 - 1s - loss: 0.6951 - accuracy: 0.5215 - val_loss: 0.6853 - val_accuracy: 0.5790 - 696ms/epoch - 15ms/step
Epoch 3/100
INFO:tensorflow:Assets written to: ../models/model2_3/assets
46/46 - 1s - loss: 0.6869 - accuracy: 0.5474 - val_loss: 0.6813 - val_accuracy: 0.5921 - 754ms/epoch - 16ms/step
Epoch 4/100
INFO:tensorflow:Assets written to: ../models/model2_4/assets
46/46 - 1s - loss: 0.6857 - accuracy: 0.5529 - val_loss: 0.6786 - val_accuracy: 0.6121 - 772ms/epoch - 17ms/step
Epoch 5/100
INFO:tensorflow:Assets written to: ../models/model2_5/assets
46/46 - 1s - loss: 0.6774 - accuracy: 0.5781 - val_loss: 0.6717 - val_accuracy: 0.6460 - 662ms/epoch - 14ms/step
Epoch 6/100
INFO:tensorflow:Assets written to: ../models/model2_6/assets


<keras.callbacks.History at 0x3aeacb850>

In [101]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print("Test set accuracy:", accuracy)

print("TEST")
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_classes))
print("Classification report:\n", classification_report(y_test, y_pred_classes))

print("TRAIN")
y_pred = model.predict(X_train)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_train, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_train, y_pred_classes))
print("Classification report:\n", classification_report(y_train, y_pred_classes))

46/46 - 0s - loss: 0.6176 - accuracy: 0.6694 - 330ms/epoch - 7ms/step
Test set accuracy: 0.669427216053009
TEST
Accuracy: 0.6694271911663217
Confusion matrix:
 [[454 292]
 [187 516]]
Classification report:
               precision    recall  f1-score   support

           0       0.71      0.61      0.65       746
           1       0.64      0.73      0.68       703

    accuracy                           0.67      1449
   macro avg       0.67      0.67      0.67      1449
weighted avg       0.67      0.67      0.67      1449

TRAIN
Accuracy: 0.7270844122216468
Confusion matrix:
 [[1865 1010]
 [ 571 2347]]
Classification report:
               precision    recall  f1-score   support

           0       0.77      0.65      0.70      2875
           1       0.70      0.80      0.75      2918

    accuracy                           0.73      5793
   macro avg       0.73      0.73      0.73      5793
weighted avg       0.73      0.73      0.73      5793

