In [150]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from joblib import load, dump

# Przygotowanie danych do uczenia


In [157]:
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data = data.drop(columns=["user_id", "track_id"])

# TODO delete?
data = data.drop(
    columns=["popularity", "duration_ms", "explicit", "danceability", "energy", "speechiness",
             "acousticness", "instrumentalness", "liveness", "valence", "tempo"])


# TODO map genres

def map_genres(genre):
  if "rock" in genre:
    return "rock"
  elif "pop" in genre:
    return "pop"
  elif "dance" in genre:
    return "dance"
  else:
    return genre


data["genres"] = data["genres"].apply(lambda genres: list(set(map(map_genres, genres))))
data["favourite_genres"] = data["favourite_genres"].apply(lambda genres: list(set(map(map_genres, genres))))


data.head(500)


Unnamed: 0,favourite_genres,genres,skipped
0,"[permanent wave, pop, funk]","[protopunk, rock]",False
1,"[filmi, folk, regional mexican]","[protopunk, rock]",False
2,[rock],"[protopunk, rock]",False
3,[rock],"[protopunk, rock]",False
4,[rock],"[protopunk, rock]",False
...,...,...,...
495,"[alternative metal, singer-songwriter, rock]","[blues, british blues, electric blues, rock]",False
496,"[permanent wave, pop, funk]","[mellow gold, permanent wave, pop, new wave, r...",True
497,"[permanent wave, pop, funk]","[mellow gold, permanent wave, pop, new wave, r...",True
498,"[filmi, folk, regional mexican]","[mellow gold, permanent wave, pop, new wave, r...",False


# Zamiana wartości kategorycznych (genres) na liczbowe

In [152]:
unique_genres = set.union(*data["genres"].apply(set).tolist(),
                          *data["favourite_genres"].apply(set).tolist())

encoder = LabelEncoder()
encoder.fit(list(unique_genres))

# convert the categorical values into numeric - favourite_genres is a list of strings
data["genres"] = data["genres"].apply(lambda genres: encoder.transform(genres).tolist())
data["favourite_genres"] = data["favourite_genres"].apply(
  lambda genres: encoder.transform(genres).tolist())

# convert encoded lists into binary arrays
mlb = MultiLabelBinarizer()
genres_binarized = mlb.fit_transform(data["genres"])
favourite_genres_binarized = mlb.transform(data["favourite_genres"])

# Combine the binary arrays with appropriate column suffixes
X_genres_df = pd.DataFrame(genres_binarized, columns=[f'g_{col}' for col in mlb.classes_.tolist()])
X_fav_genres_df = pd.DataFrame(favourite_genres_binarized,
                               columns=[f'fav_{col}' for col in mlb.classes_.tolist()])

# Join the binary arrays with the original DataFrame
data = data.join(X_genres_df).join(X_fav_genres_df)

# Drop the original columns
data = data.drop(columns=["genres", "favourite_genres"])

data.head(5)


Unnamed: 0,skipped,g_0,g_1,g_2,g_3,g_4,g_5,g_6,g_7,g_8,...,fav_1307,fav_1308,fav_1309,fav_1310,fav_1311,fav_1312,fav_1313,fav_1314,fav_1315,fav_1316
0,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,False,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Ekstrakcja labeli

In [153]:
X = data.drop(columns=["skipped"])
Y = data["skipped"]

# Podział danych na zbiór treningowy i testowy

In [154]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("{}".format(Y_train.value_counts()))
print("{}".format(Y_test.value_counts()))

False    4999
True     2905
Name: skipped, dtype: int64
False    1261
True      716
Name: skipped, dtype: int64


# Train (RandomForestClassifier)

In [155]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)

print("TRENINGOWY")
y_pred = model.predict(X_train)
print("Accuracy:", accuracy_score(Y_train, y_pred))
print("Confusion matrix:\n", confusion_matrix(Y_train, y_pred))
print("Classification report:\n", classification_report(Y_train, y_pred))

print("TESTOWY")
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(Y_test, y_pred))
print("Classification report:\n", classification_report(Y_test, y_pred))

TRENINGOWY
Accuracy: 0.8667763157894737
Confusion matrix:
 [[4521  478]
 [ 575 2330]]
Classification report:
               precision    recall  f1-score   support

       False       0.89      0.90      0.90      4999
        True       0.83      0.80      0.82      2905

    accuracy                           0.87      7904
   macro avg       0.86      0.85      0.86      7904
weighted avg       0.87      0.87      0.87      7904

TESTOWY
Accuracy: 0.65402124430956
Confusion matrix:
 [[965 296]
 [388 328]]
Classification report:
               precision    recall  f1-score   support

       False       0.71      0.77      0.74      1261
        True       0.53      0.46      0.49       716

    accuracy                           0.65      1977
   macro avg       0.62      0.61      0.61      1977
weighted avg       0.65      0.65      0.65      1977



# Train (MLPClassifier)

In [156]:
mlpc_grid = GridSearchCV(MLPClassifier(), {
  'hidden_layer_sizes': [(1)],
  'activation': ['relu'],
  'solver': ['adam'],
  'learning_rate': ['constant'],
}, n_jobs=-1, cv=ms.KFold(shuffle=True), verbose=10)
mlpc_grid.fit(X, Y)
pd.DataFrame(mlpc_grid.cv_results_)

# print("TRENINGOWY")
# y_pred = model.predict(X_train)
# print("Accuracy:", accuracy_score(Y_train, y_pred))
# print("Confusion matrix:\n", confusion_matrix(Y_train, y_pred))
# print("Classification report:\n", classification_report(Y_train, y_pred))
#
# print("TESTOWY")
# y_pred = model.predict(X_test)
# print("Accuracy:", accuracy_score(Y_test, y_pred))
# print("Confusion matrix:\n", confusion_matrix(Y_test, y_pred))
# print("Classification report:\n", classification_report(Y_test, y_pred))


Fitting 5 folds for each of 1 candidates, totalling 5 fits




[CV 4/5; 1/1] START activation=relu, hidden_layer_sizes=1, learning_rate=constant, solver=adam
[CV 4/5; 1/1] END activation=relu, hidden_layer_sizes=1, learning_rate=constant, solver=adam;, score=0.682 total time=  41.1s
[CV 3/5; 1/1] START activation=relu, hidden_layer_sizes=1, learning_rate=constant, solver=adam
[CV 3/5; 1/1] END activation=relu, hidden_layer_sizes=1, learning_rate=constant, solver=adam;, score=0.679 total time=  41.6s
[CV 2/5; 1/1] START activation=relu, hidden_layer_sizes=1, learning_rate=constant, solver=adam
[CV 2/5; 1/1] END activation=relu, hidden_layer_sizes=1, learning_rate=constant, solver=adam;, score=0.644 total time=  42.2s




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_hidden_layer_sizes,param_learning_rate,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,30.695895,0.25351,0.039361,0.001278,relu,1,constant,adam,"{'activation': 'relu', 'hidden_layer_sizes': 1...",0.645422,0.649291,0.646761,0.667004,0.663462,0.654388,0.009011,1
