In [224]:
import pandas as pd
import numpy as np
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout

In [225]:
merged_data_path = '../data/v2/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data.head(5)

Unnamed: 0,favourite_genres,genres,skipped
0,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False


In [226]:
unique_favourite_genres = set()
for favourite_genres in data['favourite_genres']:
    unique_favourite_genres.update(favourite_genres)

unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres)

# remove genres that are not in favourite_genres
# data['genres'] = data['genres'].apply(
#     lambda x: [genre for genre in x if genre in unique_favourite_genres])

print(len(unique_genres))

data.head()

1766


Unnamed: 0,favourite_genres,genres,skipped
0,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False


In [227]:
# K-MEANS

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Prepare the list of unique genres
unique_genres = list(set([genre for genres in data['genres'] for genre in genres])) + list(set([genre for genres in data['favourite_genres'] for genre in genres]))

# Convert the genres to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(unique_genres)

# Apply K-means clustering
num_clusters = 100  # Adjust this value according to your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
kmeans.fit(vectorized)
labels = kmeans.labels_

# Group the genres based on the cluster labels
clustered_genres = {}
for i, label in enumerate(labels):
    if label not in clustered_genres:
        clustered_genres[label] = []
    clustered_genres[label].append(unique_genres[i])

# Map genres to their cluster label
genre_to_cluster = {genre: label for label, genres in clustered_genres.items() for genre in genres}

# Define the function for mapping genres to simpler forms
def map_genre(genre):
    cluster_label = genre_to_cluster[genre]
    representative_genre = clustered_genres[cluster_label][0]  # Use the first genre in the cluster as the representative
    return representative_genre

# Apply the mapping function to both 'genres' and 'favourite_genres' columns
data['genres'] = data['genres'].apply(lambda x: [map_genre(genre) for genre in x])
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: [map_genre(genre) for genre in x])

# Delete not unique genres
data['genres'] = data['genres'].apply(lambda x: list(set(x)))
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: list(set(x)))

data.head(10)


Unnamed: 0,favourite_genres,genres,skipped
0,"[funk metal, no wave, classic mandopop]","[irish pub song, rock alternatif francais, psy...",False
1,"[mexican son, indie rock mexicano, irish pub s...","[irish pub song, rock alternatif francais, psy...",False
2,"[rock alternatif francais, psychedelic hip hop...","[irish pub song, rock alternatif francais, psy...",False
3,"[rock alternatif francais, psychedelic hip hop...","[irish pub song, rock alternatif francais, psy...",False
4,"[rock alternatif francais, psychedelic hip hop...","[irish pub song, rock alternatif francais, psy...",False
5,"[magyar alternative, classic cantopop, latin pop]","[irish pub song, rock alternatif francais, psy...",False
6,"[rock alternatif francais, cumbia ranchera, kl...","[irish pub song, rock alternatif francais, psy...",True
7,"[rock alternatif francais, cumbia ranchera, kl...","[irish pub song, rock alternatif francais, psy...",True
8,"[rock alternatif francais, cumbia ranchera, kl...","[irish pub song, rock alternatif francais, psy...",True
9,"[rock alternatif francais, cumbia ranchera, kl...","[irish pub song, rock alternatif francais, psy...",True


In [228]:
# count unique genres
unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres)
for genres in data['favourite_genres']:
    unique_genres.update(genres)
print(len(unique_genres))

# how many tracks are per genre
genre_count = {}
for genres in data['genres']:
    for genre in genres:
        if genre not in genre_count:
            genre_count[genre] = 0
        genre_count[genre] += 1

genre_count = sorted(genre_count.items(), key=lambda x: x[1], reverse=True)
genre_count

100


[('irish pub song', 5191),
 ('rock alternatif francais', 4499),
 ('classic cantopop', 3248),
 ('guinean pop', 2845),
 ('hard rock', 1141),
 ('art pop', 1126),
 ('country gospel', 997),
 ('no wave', 990),
 ('indie rock mexicano', 967),
 ('punk blues', 891),
 ('psychedelic hip hop', 817),
 ('dance rock', 623),
 ('viral rap', 608),
 ('doom metal', 574),
 ('russian post-punk', 563),
 ('turkish hip hop', 547),
 ('taiwan singer-songwriter', 544),
 ('magyar alternative', 531),
 ('british blues', 467),
 ('soul blues', 445),
 ('greek trap', 445),
 ('progressive death metal', 405),
 ('latin pop', 369),
 ('funk metal', 319),
 ('modern blues rock', 316),
 ('texas pop punk', 298),
 ('canadian ccm', 295),
 ('midwest emo', 202),
 ('tropical house', 199),
 ('piano rock', 176),
 ('hindustani vocal', 176),
 ('japanese electropop', 158),
 ('melodic metalcore', 130),
 ('reggaeton', 130),
 ('kleine hoerspiel', 111),
 ('roots reggae', 109),
 ('post-grunge', 108),
 ('mexican son', 108),
 ('pop edm', 103),
 (

In [229]:
# balance set
skipped_data = data[data["skipped"] == 1]
not_skipped_data = data[data["skipped"] == 0]

# Split the dataset into subsets based on class labels
print("skipped_data", skipped_data.shape)
print("not_skipped_data", not_skipped_data.shape)

# Calculate the minimum number of samples among all classes
min_samples = min(len(skipped_data), len(not_skipped_data))

# Randomly select samples from each class subset to match the minimum number of samples
skipped_data_balanced = skipped_data.sample(n=min_samples, random_state=42)
not_skipped_data_balanced = not_skipped_data.sample(n=min_samples, random_state=42)

# Merge the balanced subsets to create the final balanced dataset
balanced_data = pd.concat([skipped_data_balanced, not_skipped_data_balanced], axis=0)

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_data.head()

skipped_data (3621, 3)
not_skipped_data (6260, 3)


Unnamed: 0,favourite_genres,genres,skipped
0,"[rock alternatif francais, cumbia ranchera, kl...","[rock alternatif francais, classic cantopop, a...",True
1,"[no wave, irish pub song]","[punk blues, irish pub song, rock alternatif f...",False
2,"[mexican son, classic cantopop, guinean pop]","[irish pub song, classic cantopop, guinean pop]",False
3,"[psychedelic hip hop, no wave, magyar alternat...","[funk metal, latin pop, soul blues, irish pub ...",True
4,"[guinean pop, kleine hoerspiel, irish pub song]","[rock alternatif francais, hard rock, doom met...",True


In [230]:
# Combine genres and favourite_genres
mlb = MultiLabelBinarizer()

encoded_favourite_genres = mlb.fit_transform(balanced_data['favourite_genres'])
encoded_genres = mlb.fit_transform(balanced_data['genres'])
# balanced_data.head()

# save mlb to file
import pickle
with open('../models/model2_mlb.pickle', 'wb') as f:
    pickle.dump(mlb, f)


In [231]:
X = np.concatenate([encoded_genres, encoded_favourite_genres], axis=1)

# Extract the labels
y = balanced_data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# count skipped and not skipped songs in X_train
print("Y_train not skipped", np.count_nonzero(y_train == 0))
print("Y_train skipped", np.count_nonzero(y_train == 1))
print("Y_train skipped %", np.count_nonzero(y_train == 1) / len(y_train))


Y_train not skipped 2865
Y_train skipped 2928
Y_train skipped % 0.5054375970999482


In [232]:
def build_model(hp):
  model = Sequential()
  model.add(Dense(10 * 2, activation='relu', input_shape=(X_train.shape[1],)))
  model.add(Dropout(0.5))
  model.add(Dense(10, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1, activation='sigmoid'))

  model.compile(
      optimizer=Adam(),
      loss='binary_crossentropy', metrics=['accuracy'])
  return model


In [233]:
epochs = 50
batch_size = 32

model = build_model(None)

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2
)

# save model to file
model.save('../models/model2.h5')

Epoch 1/50
182/182 - 2s - loss: 0.6991 - accuracy: 0.5129 - val_loss: 0.6879 - val_accuracy: 0.5645 - 2s/epoch - 11ms/step
Epoch 2/50
182/182 - 1s - loss: 0.6880 - accuracy: 0.5329 - val_loss: 0.6822 - val_accuracy: 0.5804 - 1s/epoch - 6ms/step
Epoch 3/50
182/182 - 1s - loss: 0.6822 - accuracy: 0.5482 - val_loss: 0.6752 - val_accuracy: 0.5970 - 1s/epoch - 6ms/step
Epoch 4/50
182/182 - 1s - loss: 0.6753 - accuracy: 0.5645 - val_loss: 0.6673 - val_accuracy: 0.6052 - 1s/epoch - 6ms/step
Epoch 5/50
182/182 - 1s - loss: 0.6695 - accuracy: 0.5874 - val_loss: 0.6580 - val_accuracy: 0.6156 - 1s/epoch - 6ms/step
Epoch 6/50
182/182 - 1s - loss: 0.6677 - accuracy: 0.5987 - val_loss: 0.6514 - val_accuracy: 0.6301 - 1s/epoch - 6ms/step
Epoch 7/50
182/182 - 1s - loss: 0.6546 - accuracy: 0.6211 - val_loss: 0.6444 - val_accuracy: 0.6190 - 1s/epoch - 6ms/step
Epoch 8/50
182/182 - 1s - loss: 0.6469 - accuracy: 0.6273 - val_loss: 0.6396 - val_accuracy: 0.6239 - 1s/epoch - 6ms/step
Epoch 9/50
182/182 - 1s

In [234]:
print("TEST")
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_classes))
print("Classification report:\n", classification_report(y_test, y_pred_classes))

print("TRAIN")
y_pred = model.predict(X_train)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_train, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_train, y_pred_classes))
print("Classification report:\n", classification_report(y_train, y_pred_classes))


TEST
Accuracy: 0.6694271911663217
Confusion matrix:
 [[465 291]
 [188 505]]
Classification report:
               precision    recall  f1-score   support

           0       0.71      0.62      0.66       756
           1       0.63      0.73      0.68       693

    accuracy                           0.67      1449
   macro avg       0.67      0.67      0.67      1449
weighted avg       0.67      0.67      0.67      1449

TRAIN
Accuracy: 0.7322630761263594
Confusion matrix:
 [[1946  919]
 [ 632 2296]]
Classification report:
               precision    recall  f1-score   support

           0       0.75      0.68      0.72      2865
           1       0.71      0.78      0.75      2928

    accuracy                           0.73      5793
   macro avg       0.73      0.73      0.73      5793
weighted avg       0.73      0.73      0.73      5793

