In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras


# Przygotowanie danych do uczenia


In [5]:
# Source file is in JSON Lines format (one record per line), hence lines=True.
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

# Columns removed before modelling: the two identifiers plus the track
# metadata / audio-feature columns. What remains is favourite_genres, genres
# and the skipped flag.
# TODO: decide whether the audio features should stay dropped. An alternative
# that was considered drops release_date, explicit, key, loudness,
# favourite_genres and genres instead.
identifier_columns = ["user_id", "track_id"]
track_feature_columns = [
    "release_date", "key", "loudness", "popularity",
    "duration_ms", "explicit",
    "danceability", "energy", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo",
]
data = data.drop(columns=identifier_columns + track_feature_columns)

# TODO: map fine-grained genres onto coarse families. Sketch (not enabled):
#
#   def map_genres(genre):
#       for family in ("rock", "pop", "dance", "wave", "metal"):
#           if family in genre:
#               return family
#       return genre
#
#   data["genres"] = data["genres"].apply(
#       lambda genres: list(set(map(map_genres, genres))))
#   data["favourite_genres"] = data["favourite_genres"].apply(
#       lambda genres: list(set(map(map_genres, genres))))

# Preview of the pruned frame.
data.head(500)

Unnamed: 0,favourite_genres,genres,skipped
0,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
...,...,...,...
495,"[hard rock, alternative metal, singer-songwriter]","[album rock, art rock, blues, blues rock, brit...",False
496,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, classic u...",True
497,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, classic u...",True
498,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, classic u...",False


# Zamiana wartości kategorycznych (genres) na liczbowe

In [6]:
# Build one shared genre vocabulary from both list-valued columns so that track
# genres and user favourite genres are encoded against the same id space.
# `set().union(...)` (rather than `set.union(...)`) also works on an empty frame.
unique_genres = set().union(*data["genres"].apply(set).tolist(),
                            *data["favourite_genres"].apply(set).tolist())

# Map every genre name to an integer id.
encoder = LabelEncoder()
encoder.fit(sorted(unique_genres))

# Encode each row's list of genre names as a list of integer ids. The results
# are kept in local Series so the source columns are not overwritten mid-cell.
genre_ids = data["genres"].apply(lambda genres: encoder.transform(genres).tolist())
favourite_genre_ids = data["favourite_genres"].apply(
    lambda genres: encoder.transform(genres).tolist())

# Convert the id lists into multi-hot indicator arrays.
# The binarizer is given the complete id range up front: when it is fitted on
# the track genres only, any favourite genre that never occurs as a track genre
# is unknown to it and is dropped (with a warning) by `transform`.
mlb = MultiLabelBinarizer(classes=list(range(len(encoder.classes_))))
genres_binarized = mlb.fit_transform(genre_ids)
favourite_genres_binarized = mlb.transform(favourite_genre_ids)

# Wrap the arrays in DataFrames with prefixed column names; reusing
# `data.index` keeps the index-based join below aligned row for row.
X_genres_df = pd.DataFrame(genres_binarized, index=data.index,
                           columns=[f'g_{col}' for col in mlb.classes_.tolist()])
X_fav_genres_df = pd.DataFrame(favourite_genres_binarized, index=data.index,
                               columns=[f'fav_{col}' for col in mlb.classes_.tolist()])

# Replace the two list-valued columns with their indicator columns.
data = (
    data
    .drop(columns=["genres", "favourite_genres"])
    .join(X_genres_df)
    .join(X_fav_genres_df)
)

data.head(5)


Unnamed: 0,skipped,g_0,g_1,g_2,g_3,g_4,g_5,g_6,g_7,g_8,...,fav_1756,fav_1757,fav_1758,fav_1759,fav_1760,fav_1761,fav_1762,fav_1763,fav_1764,fav_1765
0,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Ekstrakcja labeli

In [7]:
# Target vector: the `skipped` flag. Feature matrix: every remaining column.
Y = data["skipped"]
X = data.drop(columns="skipped")

# Podział danych na zbiór treningowy i testowy

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.2)

# Print the class counts of the training labels, then of the test labels.
for split_labels in (Y_train, Y_test):
    print(split_labels.value_counts())

False    4999
True     2905
Name: skipped, dtype: int64
False    1261
True      716
Name: skipped, dtype: int64


# Train (RandomForestClassifier)

In [9]:
# Fit a random forest with a fixed seed on the training split.
model = RandomForestClassifier(random_state=42).fit(X_train, Y_train)

# Print the same three metrics for the training split and then the test split.
# The test split is evaluated last, so `y_pred` holds the test predictions
# once the loop finishes.
evaluation_splits = (
    ("TRENINGOWY", X_train, Y_train),
    ("TESTOWY", X_test, Y_test),
)
for split_label, split_features, split_targets in evaluation_splits:
    print(split_label)
    y_pred = model.predict(split_features)
    print("Accuracy:", accuracy_score(split_targets, y_pred))
    print("Confusion matrix:\n", confusion_matrix(split_targets, y_pred))
    print("Classification report:\n", classification_report(split_targets, y_pred))


TRENINGOWY
Accuracy: 0.917257085020243
Confusion matrix:
 [[4712  287]
 [ 367 2538]]
Classification report:
               precision    recall  f1-score   support

       False       0.93      0.94      0.94      4999
        True       0.90      0.87      0.89      2905

    accuracy                           0.92      7904
   macro avg       0.91      0.91      0.91      7904
weighted avg       0.92      0.92      0.92      7904

TESTOWY
Accuracy: 0.661608497723824
Confusion matrix:
 [[993 268]
 [401 315]]
Classification report:
               precision    recall  f1-score   support

       False       0.71      0.79      0.75      1261
        True       0.54      0.44      0.48       716

    accuracy                           0.66      1977
   macro avg       0.63      0.61      0.62      1977
weighted avg       0.65      0.66      0.65      1977

