In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from joblib import load, dump

# Przygotowanie danych do uczenia


In [5]:
# Read the merged JSON-lines dataset and strip the columns this notebook does
# not feed to the models: the two id columns first, then the per-track
# numeric attributes.
merged_data_path = '../data/merged_data.jsonl'

data = (
    pd.read_json(merged_data_path, lines=True)
    .drop(columns=["user_id", "track_id"])
    # TODO delete? (decide whether these attributes should stay as features)
    .drop(columns=[
        "popularity", "duration_ms", "explicit", "danceability", "energy",
        "speechiness", "acousticness", "instrumentalness", "liveness",
        "valence", "tempo",
    ])
)

# TODO map genres

def map_genres(genre):
  """Collapse a genre name into a coarse family based on a keyword match.

  The keywords are tried in a fixed priority order (rock, pop, dance, wave,
  metal); the first keyword contained in ``genre`` is returned. If none of
  them is contained in ``genre``, the value is returned unchanged.
  """
  for family in ("rock", "pop", "dance", "wave", "metal"):
    if family in genre:
      return family
  return genre


# Replace each genre name with its coarse family and de-duplicate within the
# row, since several sub-genres can collapse onto the same family.
data = data.assign(
    genres=data["genres"].apply(
        lambda names: list({map_genres(name) for name in names})),
    favourite_genres=data["favourite_genres"].apply(
        lambda names: list({map_genres(name) for name in names})),
)

data.head(500)

2     2649
1     2164
3     1959
4     1382
5      791
6      474
7      168
8      139
9      113
10      22
11      15
12       5
Name: genres, dtype: int64

# Zamiana wartości kategorycznych (genres) na liczbowe

In [162]:
# Shared genre vocabulary: every genre that occurs in either list column.
# `set().union(...)` also works when the frame is empty, whereas
# `set.union(*[])` raises a TypeError.
unique_genres = set().union(*data["genres"], *data["favourite_genres"])

# Map each genre name to an integer code; the codes are reused for both columns.
encoder = LabelEncoder()
encoder.fit(sorted(unique_genres))

# Convert the lists of genre names into lists of integer codes.
genres_encoded = data["genres"].apply(
  lambda genres: encoder.transform(genres).tolist())
favourite_genres_encoded = data["favourite_genres"].apply(
  lambda genres: encoder.transform(genres).tolist())

# Convert the encoded lists into binary indicator arrays.
# The binarizer is given the FULL code range up front. Fitting it on
# data["genres"] alone would make transform() ignore (with a warning) every
# favourite genre that never occurs as a track genre, silently losing those
# features.
mlb = MultiLabelBinarizer(classes=list(range(len(encoder.classes_))))
genres_binarized = mlb.fit_transform(genres_encoded)
favourite_genres_binarized = mlb.transform(favourite_genres_encoded)

# Wrap the binary arrays in frames with prefixed column names. They reuse
# data.index so the join below aligns row-by-row even if `data` does not have
# a default RangeIndex.
X_genres_df = pd.DataFrame(genres_binarized, index=data.index,
                           columns=[f'g_{col}' for col in mlb.classes_.tolist()])
X_fav_genres_df = pd.DataFrame(favourite_genres_binarized, index=data.index,
                               columns=[f'fav_{col}' for col in mlb.classes_.tolist()])

# Replace the original list columns with their binary indicator columns.
data = (
  data.drop(columns=["genres", "favourite_genres"])
  .join(X_genres_df)
  .join(X_fav_genres_df)
)

data.head(5)


Unnamed: 0,skipped,g_0,g_1,g_2,g_3,g_4,g_5,g_6,g_7,g_8,...,fav_1218,fav_1219,fav_1220,fav_1221,fav_1222,fav_1223,fav_1224,fav_1225,fav_1226,fav_1227
0,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Ekstrakcja labeli

In [163]:
# Target vector is the "skipped" column; the feature matrix is every other column.
Y = data["skipped"]
X = data.drop(["skipped"], axis="columns")

# Podział danych na zbiór treningowy i testowy

In [164]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=42,
)

# Show the class balance of each partition.
print(Y_train.value_counts())
print(Y_test.value_counts())

False    4999
True     2905
Name: skipped, dtype: int64
False    1261
True      716
Name: skipped, dtype: int64


# Train (RandomForestClassifier)

In [165]:
def print_evaluation(title, y_true, y_hat):
  """Print a heading followed by accuracy, confusion matrix and classification report."""
  print(title)
  print("Accuracy:", accuracy_score(y_true, y_hat))
  print("Confusion matrix:\n", confusion_matrix(y_true, y_hat))
  print("Classification report:\n", classification_report(y_true, y_hat))


# Fit a random forest with default hyper-parameters and a fixed seed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)

# Report on the training set first, then on the held-out test set.
y_pred = model.predict(X_train)
print_evaluation("TRENINGOWY", Y_train, y_pred)

y_pred = model.predict(X_test)
print_evaluation("TESTOWY", Y_test, y_pred)

TRENINGOWY
Accuracy: 0.8594382591093117
Confusion matrix:
 [[4502  497]
 [ 614 2291]]
Classification report:
               precision    recall  f1-score   support

       False       0.88      0.90      0.89      4999
        True       0.82      0.79      0.80      2905

    accuracy                           0.86      7904
   macro avg       0.85      0.84      0.85      7904
weighted avg       0.86      0.86      0.86      7904

TESTOWY
Accuracy: 0.6575619625695498
Confusion matrix:
 [[969 292]
 [385 331]]
Classification report:
               precision    recall  f1-score   support

       False       0.72      0.77      0.74      1261
        True       0.53      0.46      0.49       716

    accuracy                           0.66      1977
   macro avg       0.62      0.62      0.62      1977
weighted avg       0.65      0.66      0.65      1977



# Train (MLPClassifier)

In [166]:
# Cross-validated grid search over MLPClassifier hyper-parameters.
# - Both the estimator and the shuffled KFold are seeded so repeated runs give
#   the same folds and weight initialisation, matching the random_state=42
#   used for the split and the random forest.
# - `(1,)` is a one-element tuple (a single hidden layer with one unit);
#   the previous `(1)` was just the integer 1.
mlpc_grid = GridSearchCV(MLPClassifier(random_state=42), {
  'hidden_layer_sizes': [(1,)],
  'activation': ['relu'],
  'solver': ['adam'],
  'learning_rate': ['constant'],
}, n_jobs=-1, cv=ms.KFold(shuffle=True, random_state=42), verbose=10)

# Search on the training partition only. Fitting on the full X, Y would let
# the held-out test rows influence model selection and make any later
# evaluation on X_test optimistic.
mlpc_grid.fit(X_train, Y_train)

# To evaluate on the held-out set, predict with `mlpc_grid` itself (with the
# default refit=True it delegates to the best candidate refitted on X_train),
# not with `model`, which is the random forest.
pd.DataFrame(mlpc_grid.cv_results_)


Fitting 5 folds for each of 1 candidates, totalling 5 fits




KeyboardInterrupt: 