In [208]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
import keras_tuner

In [209]:
DATA_VERSION = "v2"

merged_data_path = f'../data/{DATA_VERSION}/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

In [210]:
# K-MEANS

# Prepare the list of unique genres
unique_genres = list(set([genre for genres in data['genres'] for genre in genres]) | set(
    [genre for genres in data['favourite_genres'] for genre in genres]))

# Convert the genres to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(unique_genres)

# Apply K-means clustering
num_clusters = 100  # Adjust this value according to your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
kmeans.fit(vectorized)
labels = kmeans.labels_

# Group the genres based on the cluster labels
clustered_genres = {}
for i, label in enumerate(labels):
    if label not in clustered_genres:
        clustered_genres[label] = []
    clustered_genres[label].append(unique_genres[i])

# Map genres to their cluster label
genre_to_cluster = {genre: label for label, genres in clustered_genres.items() for genre in genres}


# Define the function for mapping genres to simpler forms
def map_genre(genre):
    cluster_label = genre_to_cluster[genre]
    representative_genre = clustered_genres[cluster_label][
        0]  # Use the first genre in the cluster as the representative
    return representative_genre


# Save the genre_to_cluster and clustered_genres dictionaries to use for new data prediction
with open('../models/model2_genre_to_cluster.pickle', 'wb') as f:
    pickle.dump(genre_to_cluster, f)

with open('../models/model2_clustered_genres.pickle', 'wb') as f:
    pickle.dump(clustered_genres, f)

# Apply the mapping function to both 'genres' and 'favourite_genres' columns
data['genres'] = data['genres'].apply(lambda x: [map_genre(genre) for genre in x])
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: [map_genre(genre) for genre in x])

# Delete not unique genres
data['genres'] = data['genres'].apply(lambda x: list(set(x)))
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: list(set(x)))


In [211]:
# count unique genres
unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres)
for genres in data['favourite_genres']:
    unique_genres.update(genres)
print(len(unique_genres))

# how many tracks are per genre
genre_count = {}
for genres in data['genres']:
    for genre in genres:
        if genre not in genre_count:
            genre_count[genre] = 0
        genre_count[genre] += 1

genre_count = sorted(genre_count.items(), key=lambda x: x[1], reverse=True)
genre_count

100


[('murcia indie', 5937),
 ('slovak rock', 4521),
 ('swamp pop', 2722),
 ('classic italian folk pop', 1554),
 ('hard rock', 1141),
 ('art rock', 1126),
 ('country quebecois', 993),
 ('piano blues', 919),
 ('permanent wave', 915),
 ('jazz rap', 658),
 ('bubblegum dance', 622),
 ('post-metal', 589),
 ('symphonic metal', 576),
 ('swedish singer-songwriter', 544),
 ('jewish hip hop', 541),
 ('alternative r&b', 519),
 ('northern soul', 440),
 ('latin metal', 374),
 ('trap', 354),
 ('modern rock', 321),
 ('p funk', 319),
 ('classic canadian rock', 295),
 ('polish punk', 268),
 ('acid house', 216),
 ('rockabilly', 194),
 ('deep disco', 180),
 ('j-pop boy group', 145),
 ('power-pop punk', 139),
 ('nz reggae', 111),
 ('mexican pop punk', 108),
 ('pop flamenco', 105),
 ('turkish trap pop', 105),
 ('pop edm', 104),
 ('australian pop', 102),
 ('deep talent show', 99),
 ('german metal', 97),
 ('hardcore', 82),
 ('irish folk', 81),
 ('german thrash metal', 81),
 ('neo-synthpop', 77),
 ('classic dutch

In [212]:
# balance set
skipped_data = data[data["skipped"] == 1]
not_skipped_data = data[data["skipped"] == 0]

# Split the dataset into subsets based on class labels
print("skipped_data", skipped_data.shape)
print("not_skipped_data", not_skipped_data.shape)

# Calculate the minimum number of samples among all classes
min_samples = min(len(skipped_data), len(not_skipped_data))

# Randomly select samples from each class subset to match the minimum number of samples
skipped_data_balanced = skipped_data.sample(n=min_samples, random_state=42)
not_skipped_data_balanced = not_skipped_data.sample(n=min_samples, random_state=42)

# Merge the balanced subsets to create the final balanced dataset
balanced_data = pd.concat([skipped_data_balanced, not_skipped_data_balanced], axis=0)

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

skipped_data (3621, 3)
not_skipped_data (6260, 3)


In [213]:
# Combine genres and favourite_genres
mlb = MultiLabelBinarizer()

data['combined_genres'] = data.apply(lambda x: x['genres'] + x['favourite_genres'], axis=1)
mlb.fit(data['combined_genres'])

encoded_favourite_genres = mlb.transform(balanced_data['favourite_genres'])
encoded_genres = mlb.transform(balanced_data['genres'])

# save mlb to file
with open('../models/model2_mlb.pickle', 'wb') as f:
    pickle.dump(mlb, f)


In [214]:
X = np.concatenate([encoded_genres, encoded_favourite_genres], axis=1)

# Extract the labels
y = balanced_data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# count skipped and not skipped songs in X_train
print("Y_train not skipped", np.count_nonzero(y_train == 0))
print("Y_train skipped", np.count_nonzero(y_train == 1))
print("Y_train skipped %", np.count_nonzero(y_train == 1) / len(y_train))

Y_train not skipped 2873
Y_train skipped 2920
Y_train skipped % 0.5040566200586916


In [215]:
mode = "train"
# mode = "tune"

# NOTATKI Z TRAINU
# 1 layer - ???
# 2 layer - 16,8 neuronow 0.0015 learning rate (50 epok, 0.648 ac) dropout (0.5)

# NOTATKI Z TUNINGU
# 1 layer - 946 neuronow, 0.001 learning rate (10 epok, 0.680 acc) dropout (0.5)
# 2 layer - 108,376 neuronow, 0.0015 learning rate (10 epok, 0.641 acc) dropout (0.5)
# 3 layer - 836,2576,166 neuronow, 0.0015 learning rate (10 epok, 0.652 acc) dropout (0.5)

In [216]:
def build_model(hp):
    if mode == "train":
        model = Sequential()
        model.add(Input(shape=(X_train.shape[1])))
        model.add(Dense(units=16, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(units=8, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation="sigmoid"))

        model.compile(
            optimizer=Adam(learning_rate=0.0015),
            loss='binary_crossentropy', metrics=['accuracy'])
    else:
        model = Sequential()
        model.add(Input(shape=(X_train.shape[1])))

        for i in range(hp.Int("num_layers", min_value=1, max_value=3)):
            model.add(Dense(units=hp.Int(f"units_{i}", min_value=1, max_value=3000, step=5), activation="relu"))
            if hp.Boolean("dropout"):
                model.add(Dropout(hp.Float("dropout_rate", min_value=0.1, max_value=0.99)))

        model.add(Dense(1, activation="sigmoid"))

        model.compile(optimizer=Adam(learning_rate=hp.Float("learning_rate", min_value=0.001, max_value=0.01)),loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [217]:
if mode == "train":
    model = build_model(None)
    model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=50,
        batch_size=32,
        verbose=2
    )
else:
    tuner = keras_tuner.RandomSearch(
        hypermodel=build_model,
        objective="val_accuracy",
        max_trials=5,
        executions_per_trial=1,
        overwrite=True,
        directory="tuner",
        project_name="IUM",
    )

    tuner.search_space_summary()

    tuner.search(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

    tuner.results_summary()

Trial 5 Complete [00h 00m 15s]
val_accuracy: 0.6328502297401428

Best val_accuracy So Far: 0.639061450958252
Total elapsed time: 00h 01m 28s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in tuner/IUM
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x359d8bcd0>
Trial summary
Hyperparameters:
num_layers: 3
units_0: 611
dropout: False
learning_rate: 0.005390015309739632
units_1: 1836
units_2: 2051
dropout_rate: 0.4511226946998379
Score: 0.639061450958252
Trial summary
Hyperparameters:
num_layers: 2
units_0: 1211
dropout: True
learning_rate: 0.008414265177676794
units_1: 2121
units_2: 2931
dropout_rate: 0.1
Score: 0.6363009214401245
Trial summary
Hyperparameters:
num_layers: 1
units_0: 2236
dropout: False
learning_rate: 0.008086612225360367
units_1: 2346
units_2: 1056
dropout_rate: 0.3884727844115068
Score: 0.6328502297401428
Trial summary
Hyperparameters:
num_layers: 1
units_0: 1556
dropout: True
learning_rate: 0.0058727861910414176
units_1: 286
un

In [218]:
if mode == "train":
    # save model to file
    model.save('../models/model2.h5')

    print("TEST")
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)
    print("Accuracy:", accuracy_score(y_test, y_pred_classes))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_classes))
    print("Classification report:\n", classification_report(y_test, y_pred_classes))

    print("TRAIN")
    y_pred = model.predict(X_train)
    y_pred_classes = (y_pred > 0.5).astype(int)
    print("Accuracy:", accuracy_score(y_train, y_pred_classes))
    print("Confusion matrix:\n", confusion_matrix(y_train, y_pred_classes))
    print("Classification report:\n", classification_report(y_train, y_pred_classes))
