In [167]:
import pandas as pd
import numpy as np
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout
# import keras_tuner

In [168]:
merged_data_path = '../data/merged_data.jsonl'

# Read the line-delimited JSON file and drop the release_date and name columns
# in a single expression.
data = pd.read_json(merged_data_path, lines=True).drop(
    columns=["release_date", "name"])

# Disabled alternative: additionally drop every numeric audio-feature column.
# data = data.drop(
#     columns=["release_date", "key", "loudness",
#              "explicit", "popularity", "duration_ms", "danceability", "energy", "speechiness",
#              "acousticness", "instrumentalness", "liveness", "valence", "tempo"])

data.head(5)

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,favourite_genres,genres,skipped,number_of_matching_genres
0,34,247707,0,0.491,0.606,7,-10.172,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False,0
1,34,247707,0,0.491,0.606,7,-10.172,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False,0
2,34,247707,0,0.491,0.606,7,-10.172,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False,1
3,35,140067,0,0.449,0.749,9,-8.585,0.0775,0.01,0.0,0.391,0.448,106.861,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False,1
4,35,140067,0,0.449,0.749,9,-8.585,0.0775,0.01,0.0,0.391,0.448,106.861,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False,1


In [178]:
# Distinct genre names found in the users' favourite lists and in the track genre lists
unique_favourite_genres = {
    name for favourites in data['favourite_genres'] for name in favourites}
unique_genres = {
    name for track_genres in data['genres'] for name in track_genres}

# Report both vocabulary sizes and the genres the two vocabularies share
print("unique_favourite_genres", len(unique_favourite_genres))
print("unique_genres", len(unique_genres))
common_genres = unique_favourite_genres & unique_genres
print("common genres", common_genres)

# Keep, for every track, only the genres that also occur among the favourite genres
data['genres'] = data['genres'].apply(
    lambda track_genres: [name for name in track_genres if name in common_genres])

data.head()

# TODO: remove this filtering step? It may be unnecessary.

unique_favourite_genres 46
unique_genres 46
common genres {'blues rock', 'singer-songwriter', 'pop', 'quiet storm', 'album rock', 'ranchera', 'motown', 'vocal jazz', 'classic rock', 'soul', 'art rock', 'pop rock', 'folk', 'lounge', 'j-pop', 'filmi', 'hard rock', 'new wave', 'mellow gold', 'country rock', 'new romantic', 'permanent wave', 'alternative rock', 'new wave pop', 'latin alternative', 'soft rock', 'rock en espanol', 'brill building pop', 'c-pop', 'adult standards', 'alternative metal', 'mpb', 'turkish pop', 'metal', 'funk', 'europop', 'latin', 'latin pop', 'latin rock', 'tropical', 'rock', 'psychedelic rock', 'hoerspiel', 'dance pop', 'mandopop', 'regional mexican'}


Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,favourite_genres,genres,skipped,number_of_matching_genres
0,34,247707,0,0.491,0.606,7,-10.172,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, psychedel...",False,0
1,34,247707,0,0.491,0.606,7,-10.172,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, psychedel...",False,0
2,34,247707,0,0.491,0.606,7,-10.172,0.0377,0.00327,8e-06,0.341,0.669,123.025,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, psychedel...",False,1
3,35,140067,0,0.449,0.749,9,-8.585,0.0775,0.01,0.0,0.391,0.448,106.861,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, psychedel...",False,1
4,35,140067,0,0.449,0.749,9,-8.585,0.0775,0.01,0.0,0.391,0.448,106.861,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, psychedel...",False,1


In [170]:
# Concatenate each row's favourite_genres and genres lists so that a single
# binarizer learns one shared genre vocabulary for both columns.
all_genres = list(data['favourite_genres'] + data['genres'])

# Multi-hot encode the genres. fit_transform() fits the binarizer itself, so a
# separate fit() call beforehand would only repeat the same work.
mlb = MultiLabelBinarizer()
encoded_all_genres = mlb.fit_transform(all_genres)

# Encode the two columns separately with the shared vocabulary; both matrices
# therefore have one column per genre, in the same column order.
encoded_favourite_genres = mlb.transform(data['favourite_genres'])
encoded_genres = mlb.transform(data['genres'])


def min_max_normalize(values):
  """Scale values linearly to [0, 1] and return them as a column vector.

  Args:
    values: 1-D array-like of numbers.

  Returns:
    Float array of shape (n, 1). If every value is identical the range is
    zero, so a column of zeros is returned instead of dividing by zero.
  """
  column = np.asarray(values, dtype=float).reshape(-1, 1)
  lowest = np.min(column)
  value_range = np.max(column) - lowest
  if value_range == 0:
    return np.zeros_like(column)
  return (column - lowest) / value_range


# NOTE(review): min/max are computed over the whole dataset, i.e. they also see
# rows that may later be held out for testing.
popularity_normalized = min_max_normalize(data['popularity'].values)
duration_ms_normalized = min_max_normalize(data['duration_ms'].values)

# Inspection frame: raw and normalized numeric features next to the label
df = pd.DataFrame(
  data={'popularity': data['popularity'], 'popularity_normalized': popularity_normalized.reshape(-1),
        'duration_ms': data['duration_ms'], 'duration_ms_normalized': duration_ms_normalized.reshape(-1),
        'skipped': data['skipped']})

df.head(5)

Unnamed: 0,popularity,popularity_normalized,duration_ms,duration_ms_normalized,skipped
0,34,0.354167,247707,0.092836,False
1,34,0.354167,247707,0.092836,False
2,34,0.354167,247707,0.092836,False
3,35,0.364583,140067,0.046724,False
4,35,0.364583,140067,0.046724,False


In [171]:
# TODO: use the Keras tokenizer instead?

# Feature matrix: multi-hot favourite genres, multi-hot track genres, then the
# normalized popularity and duration columns.
X = np.concatenate([encoded_favourite_genres, encoded_genres, popularity_normalized, duration_ms_normalized], axis=1)
# Numeric-only alternative (disabled):
# X = np.concatenate([popularity_normalized, duration_ms_normalized], axis=1)

# Labels: 1 = skipped, 0 = not skipped
y = data['skipped'].astype(int).values

# Split the data into training and testing sets. A fixed seed makes the split
# (and therefore the reported metrics' data) reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)

# Class balance of the TRAINING labels. The counts must come from y_train so
# that they match the printed labels and the len(y_train) denominator.
train_not_skipped = np.count_nonzero(y_train == 0)
train_skipped = np.count_nonzero(y_train == 1)
print("Y_train not skipped", train_not_skipped)
print("Y_train skipped", train_skipped)
print("Y_train skipped %", train_skipped / len(y_train))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,93
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354167,0.092836
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354167,0.092836
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354167,0.092836
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364583,0.046724
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364583,0.046724


In [172]:
def build_model(hp, input_dim=None):
  """Build and compile the feed-forward binary classifier.

  Architecture: Dense(64, relu) -> Dropout(0.5) -> Dense(32, relu) ->
  Dropout(0.5) -> Dense(1, sigmoid), compiled with Adam (default settings),
  binary cross-entropy loss and the accuracy metric.

  Args:
    hp: Hyperparameter object; currently unused. It is kept so the function
      can be handed to a tuner as a model-building callback (the tuner code in
      this notebook is commented out).
    input_dim: Number of input features. When None (the default) the width of
      the global X_train is used, which is the original behaviour.

  Returns:
    The compiled Sequential model.
  """
  if input_dim is None:
    # Fall back to the training matrix defined earlier in the notebook
    input_dim = X_train.shape[1]

  model = Sequential()
  model.add(Dense(32 * 2, activation='relu', input_shape=(input_dim,)))
  model.add(Dropout(0.5))
  model.add(Dense(32, activation='relu'))
  model.add(Dropout(0.5))
  # Single sigmoid unit: the output is read as the probability of the positive class
  model.add(Dense(1, activation='sigmoid'))

  # Tunable-learning-rate variant (disabled):
  # model.compile(
  #     optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[0.5, 0.1, 0.01])),
  #     loss='binary_crossentropy', metrics=['accuracy'])
  model.compile(
      optimizer=Adam(),
      loss='binary_crossentropy', metrics=['accuracy'])
  return model


In [173]:
# Training hyperparameters
epochs, batch_size = 50, 32

# Fresh model; no tuner is involved here, so the hp argument is simply None
model = build_model(None)

# Fit on the training split; loss and accuracy on (X_test, y_test) are logged
# after every epoch.
history = model.fit(
    X_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
    verbose=2,
)

# Hyperparameter search (disabled):
# tuner = keras_tuner.tuners.Hyperband(
#     build_model,
#     objective='val_accuracy',
#     max_epochs=50,
#     max_trials=10,
#     executions_per_trial=2,
#     directory='my_dir')


Epoch 1/50
247/247 - 3s - loss: 0.6094 - accuracy: 0.6704 - val_loss: 0.5711 - val_accuracy: 0.6889 - 3s/epoch - 13ms/step
Epoch 2/50
247/247 - 3s - loss: 0.5832 - accuracy: 0.6862 - val_loss: 0.5516 - val_accuracy: 0.6945 - 3s/epoch - 10ms/step
Epoch 3/50
247/247 - 3s - loss: 0.5783 - accuracy: 0.6941 - val_loss: 0.5529 - val_accuracy: 0.6990 - 3s/epoch - 11ms/step
Epoch 4/50
247/247 - 3s - loss: 0.5739 - accuracy: 0.6979 - val_loss: 0.5516 - val_accuracy: 0.6950 - 3s/epoch - 10ms/step
Epoch 5/50
247/247 - 3s - loss: 0.5680 - accuracy: 0.7051 - val_loss: 0.5610 - val_accuracy: 0.7056 - 3s/epoch - 10ms/step
Epoch 6/50
247/247 - 3s - loss: 0.5649 - accuracy: 0.7033 - val_loss: 0.5550 - val_accuracy: 0.6985 - 3s/epoch - 11ms/step
Epoch 7/50
247/247 - 3s - loss: 0.5618 - accuracy: 0.7056 - val_loss: 0.5536 - val_accuracy: 0.7026 - 3s/epoch - 10ms/step
Epoch 8/50
247/247 - 3s - loss: 0.5554 - accuracy: 0.7064 - val_loss: 0.5599 - val_accuracy: 0.6960 - 3s/epoch - 11ms/step
Epoch 9/50
247/2

In [174]:
# tuner.search(
#     (X_train, y_train),
#     validation_data=(X_test, y_test),
# )

In [175]:
# Loss and accuracy on the test split as computed by Keras itself
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print("Test set accuracy:", accuracy)

# The same scikit-learn report for both splits: test first, then train
for split_label, split_X, split_y in (("TEST", X_test, y_test), ("TRAIN", X_train, y_train)):
  print(split_label)
  y_pred = model.predict(split_X)
  # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
  y_pred_classes = (y_pred > 0.5).astype(int)
  print("Accuracy:", accuracy_score(split_y, y_pred_classes))
  print("Confusion matrix:\n", confusion_matrix(split_y, y_pred_classes))
  print("Classification report:\n", classification_report(split_y, y_pred_classes))

62/62 - 0s - loss: 0.8142 - accuracy: 0.6925 - 224ms/epoch - 4ms/step
Test set accuracy: 0.6924633383750916
TEST
Accuracy: 0.6924633282751644
Confusion matrix:
 [[1039  207]
 [ 401  330]]
Classification report:
               precision    recall  f1-score   support

           0       0.72      0.83      0.77      1246
           1       0.61      0.45      0.52       731

    accuracy                           0.69      1977
   macro avg       0.67      0.64      0.65      1977
weighted avg       0.68      0.69      0.68      1977

TRAIN
Accuracy: 0.7982034412955465
Confusion matrix:
 [[4640  374]
 [1221 1669]]
Classification report:
               precision    recall  f1-score   support

           0       0.79      0.93      0.85      5014
           1       0.82      0.58      0.68      2890

    accuracy                           0.80      7904
   macro avg       0.80      0.75      0.77      7904
weighted avg       0.80      0.80      0.79      7904



In [176]:
# # Make predictions on the test set
# y_pred = model.predict(X_test)
# y_pred_classes = (y_pred > 0.5).astype(int)
#
# # Print the predicted and actual labels
# print("Predicted labels:", y_pred_classes.flatten())
# print("Actual labels:", y_test)

In [177]:
# # test with new data
#
# new_data = [
#   (["dominican pop", "merengue", "merengue tipico", "tropical"],
#    ["blues rock", "country rock", "lounge"])
# ]
#
# labels = [
#
# ]
#
# new_df = pd.DataFrame(new_data, columns=["genres", "favourite_genres"])
#
# # Combine genres and favourite_genres
# all_new_genres = list(new_df['genres'] + new_df['favourite_genres'])
#
# # One-hot encode the genres using the previously fit MultiLabelBinarizer (mlb)
# encoded_new_genres = mlb.transform(all_new_genres)
#
# # Split encoded_new_genres into genres and favourite_genres
# encoded_new_genres1 = encoded_new_genres[:, :len(new_df['genres'][0])]
# encoded_new_genres2 = encoded_new_genres[:, len(new_df['genres'][0]):]
#
# # Concatenate the one-hot encoded columns
# X_new = np.concatenate([encoded_new_genres1, encoded_new_genres2], axis=1)
#
# y_new_pred = model.predict(X_new)
# y_new_pred_classes = (y_new_pred > 0.5).astype(int)
#
# # Print the predicted labels
# print("Predicted labels:", y_new_pred_classes.flatten())
# print("Actual labels:", labels)
