In [117]:
import torch
import torch.nn as nn

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from joblib import load, dump
from skorch import NeuralNetClassifier



# Przygotowanie danych do uczenia


In [118]:
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data = data.drop(columns=["user_id", "track_id"])

# TODO delete?
data = data.drop(
    columns=["release_date", "key", "loudness", "popularity", "duration_ms", "explicit",
             "danceability", "energy", "speechiness",
             "acousticness", "instrumentalness", "liveness", "valence", "tempo"])

# data = data.drop(columns=["release_date", "explicit", "key", "loudness", "favourite_genres", "genres"])


# TODO map genres

# def map_genres(genre):
#   if "rock" in genre:
#     return "rock"
#   elif "pop" in genre:
#     return "pop"
#   elif "dance" in genre:
#     return "dance"
#   elif "wave" in genre:
#     return "wave"
#   elif "metal" in genre:
#     return "metal"
#   else:
#     return genre
#
#
# data["genres"] = data["genres"].apply(lambda genres: list(set(map(map_genres, genres))))
# data["favourite_genres"] = data["favourite_genres"].apply(lambda genres: list(set(map(map_genres, genres))))

data.head(500)

Unnamed: 0,favourite_genres,genres,skipped
0,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
...,...,...,...
495,"[hard rock, alternative metal, singer-songwriter]","[album rock, art rock, blues, blues rock, brit...",False
496,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, classic u...",True
497,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, classic u...",True
498,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, classic u...",False


# Zamiana wartości kategorycznych (genres) na liczbowe

In [119]:
unique_genres = set.union(*data["genres"].apply(set).tolist(),
                          *data["favourite_genres"].apply(set).tolist())

encoder = LabelEncoder()
encoder.fit(list(unique_genres))

# convert the categorical values into numeric - favourite_genres is a list of strings
data["genres"] = data["genres"].apply(lambda genres: encoder.transform(genres).tolist())
data["favourite_genres"] = data["favourite_genres"].apply(
    lambda genres: encoder.transform(genres).tolist())

# convert encoded lists into binary arrays
mlb = MultiLabelBinarizer()
genres_binarized = mlb.fit_transform(data["genres"])
favourite_genres_binarized = mlb.transform(data["favourite_genres"])

# Combine the binary arrays with appropriate column suffixes
X_genres_df = pd.DataFrame(genres_binarized, columns=[f'g_{col}' for col in mlb.classes_.tolist()])
X_fav_genres_df = pd.DataFrame(favourite_genres_binarized,
                               columns=[f'fav_{col}' for col in mlb.classes_.tolist()])

# Join the binary arrays with the original DataFrame
data = data.join(X_genres_df).join(X_fav_genres_df)

# Drop the original columns
data = data.drop(columns=["genres", "favourite_genres"])

data.head(5)


Unnamed: 0,skipped,g_0,g_1,g_2,g_3,g_4,g_5,g_6,g_7,g_8,...,fav_1756,fav_1757,fav_1758,fav_1759,fav_1760,fav_1761,fav_1762,fav_1763,fav_1764,fav_1765
0,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Ekstrakcja labeli

In [120]:
X = data.drop(columns=["skipped"])
Y = data["skipped"]

# Podział danych na zbiór treningowy i testowy

In [121]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("{}".format(Y_train.value_counts()))
print("{}".format(Y_test.value_counts()))

# Train (RandomForestClassifier)

In [122]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)

print("TRENINGOWY")
y_pred = model.predict(X_train)
print("Accuracy:", accuracy_score(Y_train, y_pred))
print("Confusion matrix:\n", confusion_matrix(Y_train, y_pred))
print("Classification report:\n", classification_report(Y_train, y_pred))

print("TESTOWY")
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(Y_test, y_pred))
print("Classification report:\n", classification_report(Y_test, y_pred))

# Train (MLPClassifier)

In [123]:
class ClassifierModule(nn.Module):
  def __init__(self, num_inputs=3532, num_hidden=10, num_outputs=1):
    super(ClassifierModule, self).__init__()

    self.layer1 = nn.Linear(num_inputs, num_hidden)
    self.relu = nn.ReLU()
    self.layer2 = nn.Linear(num_hidden, num_outputs)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x = self.layer1(x)
    x = self.relu(x)
    x = self.layer2(x)
    x = self.sigmoid(x)
    return x

net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=20,
    lr=0.1,
    optimizer__momentum=0.9,
    verbose=0,
    train_split=False,
)

params = {
  'lr': [0.05, 0.1],
  'module__num_hidden': [1, 3],
  'optimizer__nesterov': [False, True],
}

mlpc_grid = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy', verbose=2)

# mlpc_grid = GridSearchCV(MLPClassifier(), {
#   'hidden_layer_sizes': [(3), (3, 3)],
#   'activation': ['relu'],
#   'solver': ['adam'],
#   'learning_rate': ['constant'],
# }, n_jobs=-1, cv=ms.KFold(shuffle=True), verbose=10)
mlpc_grid.fit(X, Y)
df = pd.DataFrame(mlpc_grid.cv_results_)
df.drop(
  columns=["split0_test_score", "split1_test_score", "split2_test_score", "split3_test_score"])

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.6525[0m       [32m0.6333[0m        [35m0.6549[0m  1.2894
      2        [36m0.6345[0m       0.6333        [35m0.6518[0m  0.8149
      3        [36m0.6143[0m       [32m0.6358[0m        [35m0.6511[0m  0.7097
      4        [36m0.5954[0m       0.6146        0.6528  0.8322
      5        [36m0.5806[0m       0.5918        0.6555  0.7092
      6        [36m0.5720[0m       0.5781        0.6568  0.7060
      7        [36m0.5641[0m       0.5797        0.6576  0.7221
      8        [36m0.5589[0m       0.5797        0.6584  0.7319
      9        [36m0.5533[0m       0.5787        0.6582  0.7057
     10        [36m0.5487[0m       0.5852        0.6586  0.7072
     11        [36m0.5445[0m       0.5862        0.6590  0.8168
     12        [36m0.5408[0m       0.5883        0.6596  0.7108
     13        [36m0.5370[0m       0.5878      

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (fc1): Linear(in_features=3532, out_features=7000, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (output): Linear(in_features=7000, out_features=2, bias=True)
  ),
)

In [124]:
print("TRENINGOWY")
y_pred = mlpc_grid.predict(X_train)
print("Accuracy:", accuracy_score(Y_train, y_pred))
print("Confusion matrix:\n", confusion_matrix(Y_train, y_pred))
print("Classification report:\n", classification_report(Y_train, y_pred))

print("TESTOWY")
y_pred = mlpc_grid.predict(X_test)
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(Y_test, y_pred))
print("Classification report:\n", classification_report(Y_test, y_pred))


CALOSC
Accuracy: 0.8338224876024694
Confusion matrix:
 [[5339  921]
 [ 721 2900]]
Classification report:
               precision    recall  f1-score   support

           0       0.88      0.85      0.87      6260
           1       0.76      0.80      0.78      3621

    accuracy                           0.83      9881
   macro avg       0.82      0.83      0.82      9881
weighted avg       0.84      0.83      0.83      9881



(1, 3532)
[0]
