In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import pandas as pd
import numpy as np
import pickle

In [65]:
# Merged dataset in JSON Lines format: one JSON record per line.
merged_data_path = '../data/v2/merged_data.jsonl'
data = pd.read_json(path_or_buf=merged_data_path, lines=True)

In [66]:
# K-MEANS

# Prepare the list of unique genres seen in either genre column.
# The vocabulary is sorted on purpose: iterating a set of strings yields an
# order that depends on the interpreter's per-process hash seed, which would
# change the row order given to K-means (and therefore the clusters and each
# cluster's representative genre) on every kernel restart.
unique_genres = sorted(
    {
        genre
        for column in ('genres', 'favourite_genres')
        for genres in data[column]
        for genre in genres
    }
)

# Convert the genres to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(unique_genres)

# Apply K-means clustering
num_clusters = 100  # Adjust this value according to your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
kmeans.fit(vectorized)
labels = kmeans.labels_

# Group the genres based on the cluster labels.
# labels[i] is the cluster of unique_genres[i]; because unique_genres is
# sorted, the first genre stored for each cluster is its alphabetically
# smallest member. Labels are converted to plain ints so the dictionaries
# (which are pickled later) contain only built-in types.
clustered_genres = {}
for genre, label in zip(unique_genres, labels):
    clustered_genres.setdefault(int(label), []).append(genre)

# Map genres to their cluster label
genre_to_cluster = {genre: label for label, genres in clustered_genres.items() for genre in genres}

# Collapse a raw genre onto the representative genre of its cluster
def map_genre(genre):
    """Return the representative genre for the cluster containing `genre`.

    The representative is the first genre stored for that cluster in
    `clustered_genres`; the cluster is looked up through `genre_to_cluster`.

    Raises:
        KeyError: if `genre` is not a key of `genre_to_cluster`.
    """
    return clustered_genres[genre_to_cluster[genre]][0]

# Persist both lookup dictionaries so the same genre mapping can be applied to new data at prediction time
for artifact_path, artifact in (
    ('../microservice/models/model_genre_to_cluster.pickle', genre_to_cluster),
    ('../microservice/models/model_clustered_genres.pickle', clustered_genres),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

# Replace every genre in 'genres' and 'favourite_genres' with its cluster
# representative and drop the duplicates this creates within a row.
# NOTE: this overwrites the raw columns in place, so the cell is not idempotent.
for column in ('genres', 'favourite_genres'):
    data[column] = data[column].apply(
        lambda raw_genres: list(set(map_genre(genre) for genre in raw_genres))
    )


In [67]:
# count unique genres across both genre columns
unique_genres = {
    name
    for column in ('genres', 'favourite_genres')
    for row_genres in data[column]
    for name in row_genres
}
print(len(unique_genres))

# how many tracks are per genre (only the 'genres' column is counted)
genre_count = {}
for row_genres in data['genres']:
    for name in row_genres:
        genre_count[name] = genre_count.get(name, 0) + 1

# list of (genre, count) pairs, most frequent first
genre_count = sorted(genre_count.items(), key=lambda pair: pair[1], reverse=True)
genre_count

100


[('lo-fi beats', 5476),
 ('rock-and-roll', 5079),
 ('classic mandopop', 3320),
 ('hong kong indie', 2695),
 ('folk punk', 1549),
 ('norwegian country', 997),
 ('italian new wave', 990),
 ('jump blues', 910),
 ('dance rock', 619),
 ('old school hip hop', 591),
 ('post-screamo', 590),
 ('j-metal', 573),
 ('norwegian alternative rock', 554),
 ('singer-songwriter', 544),
 ('chicago soul', 435),
 ('swedish progressive metal', 411),
 ('latin arena pop', 374),
 ('russian trap', 370),
 ('modern hard rock', 323),
 ('g funk', 319),
 ('jazz accordion', 295),
 ('j-pop boy group', 274),
 ('boston punk', 264),
 ('canadian indie', 260),
 ('emo rap', 202),
 ('electro house', 189),
 ('post-disco', 180),
 ('neo-synthpop', 144),
 ('brazilian gospel', 142),
 ('us power metal', 139),
 ('melodic hardcore', 134),
 ('reggaeton colombiano', 125),
 ('irish pub song', 124),
 ('tropical', 113),
 ('mexican rock-and-roll', 108),
 ('reggae peruano', 105),
 ('turkish metal', 104),
 ('german post-hardcore', 102),
 ('v

In [68]:
# balance set

# Split the dataset into subsets based on class labels
skipped_data = data.loc[data["skipped"] == 1]
not_skipped_data = data.loc[data["skipped"] == 0]

for subset_name, subset in (("skipped_data", skipped_data), ("not_skipped_data", not_skipped_data)):
    print(subset_name, subset.shape)

# Size of the smaller class; both classes are undersampled to this size
min_samples = min(map(len, (skipped_data, not_skipped_data)))

# Draw the same number of rows from each class (each draw is seeded with 42)
skipped_data_balanced, not_skipped_data_balanced = (
    subset.sample(n=min_samples, random_state=42)
    for subset in (skipped_data, not_skipped_data)
)

# Stack the two samples, shuffle the rows, and renumber the index from 0
balanced_data = (
    pd.concat([skipped_data_balanced, not_skipped_data_balanced], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

skipped_data (3621, 3)
not_skipped_data (6260, 3)


In [69]:
# Build one label list per row from genres + favourite_genres so the
# binarizer learns every genre that occurs in either column of the full dataset
data['combined_genres'] = data.apply(lambda row: row['genres'] + row['favourite_genres'], axis=1)
mlb = MultiLabelBinarizer().fit(data['combined_genres'])

# Multi-hot encode each column of the balanced set with the shared binarizer
encoded_genres = mlb.transform(balanced_data['genres'])
encoded_favourite_genres = mlb.transform(balanced_data['favourite_genres'])

# save mlb to file
with open('../microservice/models/model_mlb.pickle', 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)


In [70]:
# Feature matrix: the encoded track genres followed by the encoded favourite genres
X = np.concatenate([encoded_genres, encoded_favourite_genres], axis=1)

# Extract the labels
y = balanced_data['skipped'].astype(int).values

# Split the data into training and testing sets.
# random_state is fixed so the split -- and with it the saved model and the
# reported metrics -- is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# count skipped and not skipped songs in X_train
# print("Y_train not skipped", np.count_nonzero(y_train == 0))
# print("Y_train skipped", np.count_nonzero(y_train == 1))
# print("Y_train skipped %", np.count_nonzero(y_train == 1) / len(y_train))

In [71]:
# Train a random forest (seeded) on the training split; fit() returns the estimator itself
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# save model to file
with open('../microservice/models/model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

In [72]:
# Report the same metrics for the held-out split first, then for the training
# split; a large gap between the two indicates overfitting.
for split_name, split_features, split_targets in (
    ("TEST", X_test, y_test),
    ("TRAIN", X_train, y_train),
):
    print(split_name)
    y_pred = model.predict(split_features)
    print("Accuracy:", accuracy_score(split_targets, y_pred))
    print("Confusion matrix:\n", confusion_matrix(split_targets, y_pred))
    print("Classification report:\n", classification_report(split_targets, y_pred))


TEST
Accuracy: 0.6155969634230504
Confusion matrix:
 [[445 256]
 [301 447]]
Classification report:
               precision    recall  f1-score   support

           0       0.60      0.63      0.62       701
           1       0.64      0.60      0.62       748

    accuracy                           0.62      1449
   macro avg       0.62      0.62      0.62      1449
weighted avg       0.62      0.62      0.62      1449

TRAIN
Accuracy: 0.8518902123252201
Confusion matrix:
 [[2503  417]
 [ 441 2432]]
Classification report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85      2920
           1       0.85      0.85      0.85      2873

    accuracy                           0.85      5793
   macro avg       0.85      0.85      0.85      5793
weighted avg       0.85      0.85      0.85      5793

