In [43]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
import keras_tuner

In [44]:
merged_data_path = '../data/v2/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

In [45]:
# K-MEANS

# Prepare the list of unique genres
unique_genres = list(set([genre for genres in data['genres'] for genre in genres]) | set(
    [genre for genres in data['favourite_genres'] for genre in genres]))

# Convert the genres to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(unique_genres)

# Apply K-means clustering
num_clusters = 100  # Adjust this value according to your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
kmeans.fit(vectorized)
labels = kmeans.labels_

# Group the genres based on the cluster labels
clustered_genres = {}
for i, label in enumerate(labels):
    if label not in clustered_genres:
        clustered_genres[label] = []
    clustered_genres[label].append(unique_genres[i])

# Map genres to their cluster label
genre_to_cluster = {genre: label for label, genres in clustered_genres.items() for genre in genres}


# Define the function for mapping genres to simpler forms
def map_genre(genre):
    cluster_label = genre_to_cluster[genre]
    representative_genre = clustered_genres[cluster_label][
        0]  # Use the first genre in the cluster as the representative
    return representative_genre


# Save the genre_to_cluster and clustered_genres dictionaries to use for new data prediction
with open('../models/model2_genre_to_cluster.pickle', 'wb') as f:
    pickle.dump(genre_to_cluster, f)

with open('../models/model2_clustered_genres.pickle', 'wb') as f:
    pickle.dump(clustered_genres, f)

# Apply the mapping function to both 'genres' and 'favourite_genres' columns
data['genres'] = data['genres'].apply(lambda x: [map_genre(genre) for genre in x])
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: [map_genre(genre) for genre in x])

# Delete not unique genres
data['genres'] = data['genres'].apply(lambda x: list(set(x)))
data['favourite_genres'] = data['favourite_genres'].apply(lambda x: list(set(x)))


In [46]:
# count unique genres
unique_genres = set()
for genres in data['genres']:
    unique_genres.update(genres)
for genres in data['favourite_genres']:
    unique_genres.update(genres)
print(len(unique_genres))

# how many tracks are per genre
genre_count = {}
for genres in data['genres']:
    for genre in genres:
        if genre not in genre_count:
            genre_count[genre] = 0
        genre_count[genre] += 1

genre_count = sorted(genre_count.items(), key=lambda x: x[1], reverse=True)
genre_count

100


[('cowboy western', 7065),
 ('modern blues rock', 3510),
 ('classic bollywood', 3383),
 ('colombian pop', 2968),
 ('mellow gold', 2218),
 ('kurdish folk', 1619),
 ('hard bop', 1140),
 ('texas blues', 911),
 ('psychedelic hip hop', 817),
 ('pop rap', 619),
 ('post-punk argentina', 615),
 ('doom metal', 577),
 ('norwegian alternative rock', 560),
 ('danish singer-songwriter', 544),
 ('czech hip hop', 537),
 ('swedish soul', 435),
 ('vapor trap', 428),
 ('progressive bluegrass', 359),
 ('latin pop', 357),
 ('danish jazz', 328),
 ('brit funk', 319),
 ("canadian children's music", 297),
 ('early us punk', 286),
 ('piano rock', 242),
 ('el paso indie', 242),
 ('hip house', 215),
 ('pop emo', 202),
 ('japanese electropop', 158),
 ('finnish power metal', 139),
 ('reggae peruano', 109),
 ('regional mexican', 108),
 ('australian psych', 105),
 ('german indie', 101),
 ('rap nacional antigo', 92),
 ('irish folk', 80),
 ('argentine reggae', 78),
 ('neo-synthpop', 77),
 ('finnish modern jazz', 70),


In [47]:
# balance set
skipped_data = data[data["skipped"] == 1]
not_skipped_data = data[data["skipped"] == 0]

# Split the dataset into subsets based on class labels
print("skipped_data", skipped_data.shape)
print("not_skipped_data", not_skipped_data.shape)

# Calculate the minimum number of samples among all classes
min_samples = min(len(skipped_data), len(not_skipped_data))

# Randomly select samples from each class subset to match the minimum number of samples
skipped_data_balanced = skipped_data.sample(n=min_samples, random_state=42)
not_skipped_data_balanced = not_skipped_data.sample(n=min_samples, random_state=42)

# Merge the balanced subsets to create the final balanced dataset
balanced_data = pd.concat([skipped_data_balanced, not_skipped_data_balanced], axis=0)

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

skipped_data (3621, 3)
not_skipped_data (6260, 3)


In [48]:
# Combine genres and favourite_genres
mlb = MultiLabelBinarizer()

data['combined_genres'] = data.apply(lambda x: x['genres'] + x['favourite_genres'], axis=1)
mlb.fit(data['combined_genres'])

encoded_favourite_genres = mlb.transform(balanced_data['favourite_genres'])
encoded_genres = mlb.transform(balanced_data['genres'])

# save mlb to file
with open('../models/model2_mlb.pickle', 'wb') as f:
    pickle.dump(mlb, f)


In [49]:
X = np.concatenate([encoded_genres, encoded_favourite_genres], axis=1)

# Extract the labels
y = balanced_data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# count skipped and not skipped songs in X_train
# print("Y_train not skipped", np.count_nonzero(y_train == 0))
# print("Y_train skipped", np.count_nonzero(y_train == 1))
# print("Y_train skipped %", np.count_nonzero(y_train == 1) / len(y_train))

In [50]:
def build_model(hp):
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1])))

    for i in range(hp.Int("num_layers", min_value=1, max_value=3)):
        model.add(Dense(units=hp.Int(f"units_{i}", min_value=1, max_value=200), activation="relu"))
        if hp.Boolean("dropout"):
            model.add(Dropout(hp.Float("dropout_rate", min_value=0.1, max_value=0.99)))

    model.add(Dense(1, activation="sigmoid"))

    # TODO hp. wybor optimizera
    model.compile(
        optimizer=Adam(learning_rate=hp.Float("learning_rate", min_value=0.001, max_value=0.01, sampling="log")),
        loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [51]:
# epochs = 50
# batch_size = 32

# model = build_model(keras_tuner.HyperParameters())

# model.fit(
#     X_train,
#     y_train,
#     validation_data=(X_test, y_test),
#     epochs=epochs,
#     batch_size=batch_size,
#     verbose=2
# )

tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective="val_accuracy",
    max_trials=500,
    executions_per_trial=2,
    overwrite=True,
    directory="tuner",
    project_name="IUM",
)

tuner.search_space_summary()


Search space summary
Default search space size: 4
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 3, 'step': 1, 'sampling': None}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 200, 'step': 1, 'sampling': None}
dropout (Boolean)
{'default': False, 'conditions': []}
learning_rate (Float)
{'default': 0.001, 'conditions': [], 'min_value': 0.001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}


In [52]:
tuner.search(X_train, y_train, epochs=20, validation_data=(X_test, y_test))

tuner.results_summary()

Trial 25 Complete [00h 01m 03s]
val_accuracy: 0.6438923478126526

Best val_accuracy So Far: 0.6576949656009674
Total elapsed time: 00h 24m 18s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in tuner/IUM
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x292dd2bf0>
Trial summary
Hyperparameters:
num_layers: 1
units_0: 187
dropout: False
learning_rate: 0.0014048301447808303
dropout_rate: 0.7432039675976436
units_1: 129
units_2: 22
Score: 0.6576949656009674
Trial summary
Hyperparameters:
num_layers: 2
units_0: 109
dropout: True
learning_rate: 0.0014704628420197305
dropout_rate: 0.7124349643875012
units_1: 142
units_2: 109
Score: 0.6563147008419037
Trial summary
Hyperparameters:
num_layers: 2
units_0: 32
dropout: False
learning_rate: 0.004039257002706661
dropout_rate: 0.8081616757923842
units_1: 59
Score: 0.6542443037033081
Trial summary
Hyperparameters:
num_layers: 1
units_0: 17
dropout: False
learning_rate: 0.0020676659154745367
dropout_rate: 0.1949

In [53]:
models = tuner.get_best_models(num_models=5)
for model in models:
    model.summary()

model = models[0]
model.summary()

# save model to file
model.save('../models/model2.h5')

# NOTATKI Z TUNINGU
# 61 neuronów to podejrzanie dobra liczba (moze trzeba zmniejszyc step (do 5 albo 1) bo pewnie optymalna wartosc jest w poblizu)
# najlepszy wynik byl na 2/3 warstwach, moze warto zwiekszyc liczbe
# hidden activation relu
# dropout False/True (0.4-0.5)
# output activation sigmoid
# learning rate lekko powyzej 0.002 (albo miedzy 0.001 a 0.002)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 187)               37587     
                                                                 
 dense_1 (Dense)             (None, 1)                 188       
                                                                 
Total params: 37,775
Trainable params: 37,775
Non-trainable params: 0
_________________________________________________________________
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 109)               21909     
                                                                 
 dropout (Dropout)           (None, 109)               0         
                                                                 
 dense_1 (Dense)             (No

In [54]:
print("TEST")
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_classes))
print("Classification report:\n", classification_report(y_test, y_pred_classes))

print("TRAIN")
y_pred = model.predict(X_train)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_train, y_pred_classes))
print("Confusion matrix:\n", confusion_matrix(y_train, y_pred_classes))
print("Classification report:\n", classification_report(y_train, y_pred_classes))


TEST
Accuracy: 0.6597653554175293
Confusion matrix:
 [[430 296]
 [197 526]]
Classification report:
               precision    recall  f1-score   support

           0       0.69      0.59      0.64       726
           1       0.64      0.73      0.68       723

    accuracy                           0.66      1449
   macro avg       0.66      0.66      0.66      1449
weighted avg       0.66      0.66      0.66      1449

TRAIN
Accuracy: 0.7105126877265665
Confusion matrix:
 [[1868 1027]
 [ 650 2248]]
Classification report:
               precision    recall  f1-score   support

           0       0.74      0.65      0.69      2895
           1       0.69      0.78      0.73      2898

    accuracy                           0.71      5793
   macro avg       0.71      0.71      0.71      5793
weighted avg       0.71      0.71      0.71      5793

