In [107]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
import keras_tuner

from tensorboard.plugins.hparams import api as hp

In [108]:
DATA_VERSION = "v2"

merged_data_path = f'../data/{DATA_VERSION}/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

In [109]:
# K-MEANS

# Prepare the list of unique genres
unique_genres = list(set([genre for genres in data['genres'] for genre in genres]) | set(
    [genre for genres in data['favourite_genres'] for genre in genres]))

# Convert the genres to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(unique_genres)

# Apply K-means clustering
num_clusters = 100  # Adjust this value according to your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
kmeans.fit(vectorized)
labels = kmeans.labels_

# Group the genres based on the cluster labels
clustered_genres = {}
for i, label in enumerate(labels):
    if label not in clustered_genres:
        clustered_genres[label] = []
    clustered_genres[label].append(unique_genres[i])

# Map genres to their cluster label
genre_to_cluster = {genre: label for label, genres in clustered_genres.items() for genre in genres}


# Define the function for mapping genres to simpler forms
def map_genre(genre):
    cluster_label = genre_to_cluster[genre]
    representative_genre = clustered_genres[cluster_label][
        0]  # Use the first genre in the cluster as the representative
    return representative_genre


# Save the genre_to_cluster and clustered_genres dictionaries to use for new data prediction
with open('../microservice/models/model2_genre_to_cluster.pickle', 'wb') as f:
    pickle.dump(genre_to_cluster, f)

with open('../microservice/models/model2_clustered_genres.pickle', 'wb') as f:
    pickle.dump(clustered_genres, f)

# Apply the mapping function to both 'genres' and 'favourite_genres' columns
data['genres'] = data['genres'].apply(lambda x: [map_genre(genre) for genre in x])
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: [map_genre(genre) for genre in x])

# Delete not unique genres
data['genres'] = data['genres'].apply(lambda x: list(set(x)))
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: list(set(x)))


In [110]:
# count unique genres
unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres)
for genres in data['favourite_genres']:
    unique_genres.update(genres)
print(len(unique_genres))

# how many tracks are per genre
genre_count = {}
for genres in data['genres']:
    for genre in genres:
        if genre not in genre_count:
            genre_count[genre] = 0
        genre_count[genre] += 1

genre_count = sorted(genre_count.items(), key=lambda x: x[1], reverse=True)
genre_count

100


[('vallenato moderno', 5622),
 ('math rock', 4695),
 ('classic greek rock', 3014),
 ('pop emo', 2771),
 ('flemish folk', 1619),
 ('country', 984),
 ('jump blues', 913),
 ('pop rap', 658),
 ('belly dance', 622),
 ('dark minimal techno', 580),
 ('swedish hip hop', 580),
 ('classic bulgarian pop', 546),
 ('polish alternative', 546),
 ('singer-songwriter', 544),
 ('new wave of thrash metal', 544),
 ('glam metal', 523),
 ('indonesian metal', 518),
 ('indie soul', 386),
 ('trap latino', 381),
 ('latin worship', 374),
 ('progressive groove metal', 364),
 ('hungarian punk', 341),
 ('brit funk', 319),
 ('canadian indie', 295),
 ('neo-crust', 221),
 ('jazz trumpet', 215),
 ('soulful house', 215),
 ('disco house', 180),
 ('contemporary vocal jazz', 175),
 ('girl group', 173),
 ('rap underground espanol', 137),
 ('drill francais', 123),
 ('mexican rock-and-roll', 111),
 ('beat italiano', 108),
 ('australian indigenous', 107),
 ('brazilian edm', 104),
 ('indian fusion', 102),
 ('mandopop', 100),
 (

In [111]:
# balance set
skipped_data = data[data["skipped"] == 1]
not_skipped_data = data[data["skipped"] == 0]

# Split the dataset into subsets based on class labels
print("skipped_data", skipped_data.shape)
print("not_skipped_data", not_skipped_data.shape)

# Calculate the minimum number of samples among all classes
min_samples = min(len(skipped_data), len(not_skipped_data))

# Randomly select samples from each class subset to match the minimum number of samples
skipped_data_balanced = skipped_data.sample(n=min_samples, random_state=42)
not_skipped_data_balanced = not_skipped_data.sample(n=min_samples, random_state=42)

# Merge the balanced subsets to create the final balanced dataset
balanced_data = pd.concat([skipped_data_balanced, not_skipped_data_balanced], axis=0)

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

skipped_data (3621, 3)
not_skipped_data (6260, 3)


In [112]:
# Combine genres and favourite_genres
mlb = MultiLabelBinarizer()

data['combined_genres'] = data.apply(lambda x: x['genres'] + x['favourite_genres'], axis=1)
mlb.fit(data['combined_genres'])

encoded_favourite_genres = mlb.transform(balanced_data['favourite_genres'])
encoded_genres = mlb.transform(balanced_data['genres'])

# save mlb to file
with open('../microservice/models/model2_mlb.pickle', 'wb') as f:
    pickle.dump(mlb, f)


In [113]:
X = np.concatenate([encoded_genres, encoded_favourite_genres], axis=1)

# Extract the labels
y = balanced_data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# count skipped and not skipped songs in X_train
print("Y_train not skipped", np.count_nonzero(y_train == 0))
print("Y_train skipped", np.count_nonzero(y_train == 1))
print("Y_train skipped %", np.count_nonzero(y_train == 1) / len(y_train))

Y_train not skipped 2921
Y_train skipped 2872
Y_train skipped % 0.4957707578111514


In [114]:
# mode = "train"
mode = "tune"

# NOTATKI Z TRAINU
# 1 layer - ???
# 2 layer - 100,100 neuronow 0.001 learning rate (20 epok, 0.655 acc) dropout (0.5)
# 2 layer - 111,41  neuronow 0.003 learning rate (20 epok, 0.670 acc) dropout (0.5)

# NOTATKI Z TUNINGU
# 1 layer - 31 neuronow, 0.0067 learning rate (x epok, y ac) dropout (0.5)
# 2 layer - 100,100 neuronow, 0.001 learning rate (10 epok, 0.641 acc) bez dropoutu
#         - 111,41 neuronow, 0.003 learning rate bez dropoutu

In [115]:
def build_model(hp):
    if mode == "train":
        model = Sequential()
        model.add(Input(shape=(X_train.shape[1])))
        model.add(Dense(units=110, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(units=40, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation="sigmoid"))

        model.compile(
            optimizer=Adam(learning_rate=0.003),
            loss='binary_crossentropy', metrics=['accuracy'])
    else:
        model = Sequential()
        model.add(Input(shape=(X_train.shape[1])))

        for i in range(hp.Int("num_layers", min_value=1, max_value=3)):
            model.add(Dense(units=hp.Int(f"units_{i}", min_value=1, max_value=200, step=5), activation="relu"))
            if hp.Boolean("dropout"):
                model.add(Dropout(0.5))

        model.add(Dense(1, activation="sigmoid"))

        model.compile(optimizer=Adam(learning_rate=hp.Float("learning_rate", min_value=0.001, max_value=0.01, sampling="log")),loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [116]:
if mode == "train":
    model = build_model(None)
    model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=20,
        batch_size=32,
        verbose=2
    )
else:
    # tuner = keras_tuner.RandomSearch(
    #     hypermodel=build_model,
    #     objective="val_accuracy",
    #     max_trials=50,
    #     executions_per_trial=1,
    #     overwrite=True,
    #     directory="tuner",
    #     project_name="IUM",
    # )

    tuner = keras_tuner.Hyperband(
        hypermodel=build_model,
        objective="val_accuracy",
        max_epochs=20,
        factor=3,
        executions_per_trial=1,
        overwrite=True,
        directory="tuner",
        project_name="IUM",
    )

    stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir="logs")

    tuner.search_space_summary()

    tuner.search(X_train, y_train, epochs=50, validation_data=(X_test, y_test), callbacks=[stop_early, tensorboard_callback])

    tuner.results_summary()

Trial 1 Complete [00h 01m 46s]
val_accuracy: 0.6507936418056488

Best val_accuracy So Far: 0.6507936418056488
Total elapsed time: 00h 01m 46s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
3                 |2                 |num_layers
111               |136               |units_0
False             |True              |dropout
0.0028229         |0.0013438         |learning_rate
36                |1                 |units_1

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

KeyboardInterrupt: 

In [None]:
if mode == "train":
    # save model to file
    model.save('../models/model2.h5')

    print("TEST")
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)
    print("Accuracy:", accuracy_score(y_test, y_pred_classes))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_classes))
    print("Classification report:\n", classification_report(y_test, y_pred_classes))

    print("TRAIN")
    y_pred = model.predict(X_train)
    y_pred_classes = (y_pred > 0.5).astype(int)
    print("Accuracy:", accuracy_score(y_train, y_pred_classes))
    print("Confusion matrix:\n", confusion_matrix(y_train, y_pred_classes))
    print("Classification report:\n", classification_report(y_train, y_pred_classes))

    print("ALL")
    y_pred = model.predict(X)
    y_pred_classes = (y_pred > 0.5).astype(int)
    print("Accuracy:", accuracy_score(y, y_pred_classes))
    print("Confusion matrix:\n", confusion_matrix(y, y_pred_classes))
    print("Classification report:\n", classification_report(y, y_pred_classes))


TEST
Accuracy: 0.6701173222912353
Confusion matrix:
 [[439 291]
 [187 532]]
Classification report:
               precision    recall  f1-score   support

           0       0.70      0.60      0.65       730
           1       0.65      0.74      0.69       719

    accuracy                           0.67      1449
   macro avg       0.67      0.67      0.67      1449
weighted avg       0.67      0.67      0.67      1449

TRAIN
Accuracy: 0.7431382703262558
Confusion matrix:
 [[1904  987]
 [ 501 2401]]
Classification report:
               precision    recall  f1-score   support

           0       0.79      0.66      0.72      2891
           1       0.71      0.83      0.76      2902

    accuracy                           0.74      5793
   macro avg       0.75      0.74      0.74      5793
weighted avg       0.75      0.74      0.74      5793

ALL
Accuracy: 0.7285280309306821
Confusion matrix:
 [[2343 1278]
 [ 688 2933]]
Classification report:
               precision    recall  f1-