In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np


In [9]:
merged_data_path = '../data/merged_data.jsonl'

# Columns that are removed right after loading.
columns_to_drop = [
    "release_date", "key", "loudness", "explicit", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo",
]

# Read the JSON-lines file (one record per line) and discard the columns above.
data = pd.read_json(merged_data_path, lines=True).drop(columns=columns_to_drop)

# Disabled alternative column selection:
# data = data.drop(
#     columns=["release_date", "genres", "favourite_genres", "name"])

data.head(5)

Unnamed: 0,timestamp,user_id,track_id,session_id,track_name,popularity,duration_ms,id_artist,danceability,energy,...,street,favourite_genres,premium_user,name,genres,skipped,number_of_matching_genres,month,day_of_week,hour_of_day
0,2023-01-03 05:09:55.000,101,2PmGtDUyJIpYBEtI1hQIVp,124,Ballrooms Of Mars,34,247707,3dBVyJ7JuOMt4GE9607Qin,0.491,0.606,...,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",False,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,0,1,1,5
1,2023-02-08 21:41:02.507,103,2PmGtDUyJIpYBEtI1hQIVp,151,Ballrooms Of Mars,34,247707,3dBVyJ7JuOMt4GE9607Qin,0.491,0.606,...,al. Głogowa 14/10,"[filmi, regional mexican, folk]",False,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,0,2,2,21
2,2023-02-12 01:17:14.946,132,2PmGtDUyJIpYBEtI1hQIVp,544,Ballrooms Of Mars,34,247707,3dBVyJ7JuOMt4GE9607Qin,0.491,0.606,...,aleja Olchowa 16,"[psychedelic rock, country rock, rock en espanol]",True,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,1,2,6,1
3,2023-01-03 03:59:59.738,132,5yxYokipsWlpDCt4Th4VVc,534,Solid Gold Easy Action,35,140067,3dBVyJ7JuOMt4GE9607Qin,0.449,0.749,...,aleja Olchowa 16,"[psychedelic rock, country rock, rock en espanol]",True,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,1,1,1,3
4,2023-03-07 14:28:25.702,132,5yxYokipsWlpDCt4Th4VVc,547,Solid Gold Easy Action,35,140067,3dBVyJ7JuOMt4GE9607Qin,0.449,0.749,...,aleja Olchowa 16,"[psychedelic rock, country rock, rock en espanol]",True,T. Rex,"[album rock, art rock, classic rock, folk rock...",False,1,3,1,14


In [10]:
# Distinct genre labels that appear in any row's favourite_genres list.
unique_favourite_genres = {
    genre for genres in data['favourite_genres'] for genre in genres
}

# Distinct genre labels that appear in any row's genres list.
unique_genres = {genre for genres in data['genres'] for genre in genres}

# Report both vocabulary sizes and the labels present in both vocabularies.
print("unique_favourite_genres", len(unique_favourite_genres))
print("unique_genres", len(unique_genres))
common_genres = unique_favourite_genres.intersection(unique_genres)
print("common genres", common_genres)


def _only_common_genres(row_genres):
    """Return the entries of row_genres that are also in common_genres."""
    return [genre for genre in row_genres if genre in common_genres]


# Keep, per row, only the genres that also occur among the favourite genres.
data['genres'] = data['genres'].apply(_only_common_genres)

data.head()

# TODO: remove? This filtering step may be unnecessary.

unique_favourite_genres 46
unique_genres 1766
common genres {'latin pop', 'mandopop', 'hard rock', 'latin rock', 'regional mexican', 'album rock', 'europop', 'classic rock', 'latin alternative', 'metal', 'vocal jazz', 'c-pop', 'brill building pop', 'dance pop', 'rock', 'adult standards', 'turkish pop', 'mellow gold', 'alternative metal', 'tropical', 'soft rock', 'hoerspiel', 'mpb', 'j-pop', 'funk', 'permanent wave', 'quiet storm', 'art rock', 'motown', 'latin', 'filmi', 'pop rock', 'ranchera', 'pop', 'new romantic', 'new wave', 'soul', 'new wave pop', 'blues rock', 'country rock', 'lounge', 'singer-songwriter', 'alternative rock', 'rock en espanol', 'folk', 'psychedelic rock'}


Unnamed: 0,timestamp,user_id,track_id,session_id,track_name,popularity,duration_ms,id_artist,danceability,energy,...,street,favourite_genres,premium_user,name,genres,skipped,number_of_matching_genres,month,day_of_week,hour_of_day
0,2023-01-03 05:09:55.000,101,2PmGtDUyJIpYBEtI1hQIVp,124,Ballrooms Of Mars,34,247707,3dBVyJ7JuOMt4GE9607Qin,0.491,0.606,...,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",False,T. Rex,"[album rock, art rock, classic rock, psychedel...",False,0,1,1,5
1,2023-02-08 21:41:02.507,103,2PmGtDUyJIpYBEtI1hQIVp,151,Ballrooms Of Mars,34,247707,3dBVyJ7JuOMt4GE9607Qin,0.491,0.606,...,al. Głogowa 14/10,"[filmi, regional mexican, folk]",False,T. Rex,"[album rock, art rock, classic rock, psychedel...",False,0,2,2,21
2,2023-02-12 01:17:14.946,132,2PmGtDUyJIpYBEtI1hQIVp,544,Ballrooms Of Mars,34,247707,3dBVyJ7JuOMt4GE9607Qin,0.491,0.606,...,aleja Olchowa 16,"[psychedelic rock, country rock, rock en espanol]",True,T. Rex,"[album rock, art rock, classic rock, psychedel...",False,1,2,6,1
3,2023-01-03 03:59:59.738,132,5yxYokipsWlpDCt4Th4VVc,534,Solid Gold Easy Action,35,140067,3dBVyJ7JuOMt4GE9607Qin,0.449,0.749,...,aleja Olchowa 16,"[psychedelic rock, country rock, rock en espanol]",True,T. Rex,"[album rock, art rock, classic rock, psychedel...",False,1,1,1,3
4,2023-03-07 14:28:25.702,132,5yxYokipsWlpDCt4Th4VVc,547,Solid Gold Easy Action,35,140067,3dBVyJ7JuOMt4GE9607Qin,0.449,0.749,...,aleja Olchowa 16,"[psychedelic rock, country rock, rock en espanol]",True,T. Rex,"[album rock, art rock, classic rock, psychedel...",False,1,3,1,14


In [11]:
# Per-row concatenation of the favourite_genres and genres lists
# (adding two Series of lists joins the lists row by row).
all_genres = list(data['favourite_genres'] + data['genres'])

# Multi-hot encode the genres. fit_transform learns the label vocabulary and
# encodes in a single pass, so a separate fit() call beforehand is redundant.
mlb = MultiLabelBinarizer()
encoded_all_genres = mlb.fit_transform(all_genres)

# Encode each column separately with the shared vocabulary, so both matrices
# use the same column layout (one column per entry of mlb.classes_).
encoded_favourite_genres = mlb.transform(data['favourite_genres'])
encoded_genres = mlb.transform(data['genres'])

def min_max_normalize(values):
    """Min-max scale a sequence of numbers and return it as a column vector.

    Args:
        values: array-like of numbers (e.g. ``data['popularity'].values``).

    Returns:
        ``numpy.ndarray`` of shape ``(n, 1)`` holding
        ``(values - min) / (max - min)``. When every value is equal the range
        is zero; a column of zeros is returned instead of dividing by zero
        (which would produce NaN).
    """
    column = np.asarray(values).reshape(-1, 1)
    lowest = np.min(column)
    value_range = np.max(column) - lowest
    if value_range == 0:
        # Constant column: avoid 0/0 and map everything to 0.
        return np.zeros(column.shape, dtype=float)
    return (column - lowest) / value_range


# NOTE(review): min/max are taken over the whole dataset, i.e. before the
# train/test split performed later in the notebook.
popularity_normalized = min_max_normalize(data['popularity'].values)
duration_ms_normalized = min_max_normalize(data['duration_ms'].values)
danceability_normalized = min_max_normalize(data['danceability'].values)
energy_normalized = min_max_normalize(data['energy'].values)
number_of_matching_genres_normalized = min_max_normalize(
    data['number_of_matching_genres'].values)

# Inspection frame: each raw feature next to its min-max scaled version,
# followed by the skipped label.
scaled_by_column = {
    'popularity': popularity_normalized,
    'duration_ms': duration_ms_normalized,
    'number_of_matching_genres': number_of_matching_genres_normalized,
}

# Dict insertion order determines the column order of the resulting frame.
frame_columns = {}
for column_name, scaled_values in scaled_by_column.items():
    frame_columns[column_name] = data[column_name]
    # Scaled arrays are (n, 1); flatten them to 1-D for use as a column.
    frame_columns[column_name + '_normalized'] = scaled_values.reshape(-1)
frame_columns['skipped'] = data['skipped']

df = pd.DataFrame(data=frame_columns)

df.head(5)

Unnamed: 0,popularity,popularity_normalized,duration_ms,duration_ms_normalized,number_of_matching_genres,number_of_matching_genres_normalized,skipped
0,34,0.354167,247707,0.092836,0,0.0,False
1,34,0.354167,247707,0.092836,0,0.0,False
2,34,0.354167,247707,0.092836,1,0.333333,False
3,35,0.364583,140067,0.046724,1,0.333333,False
4,35,0.364583,140067,0.046724,1,0.333333,False


In [12]:
# Feature matrix: the scaled danceability and energy columns side by side.
X = np.hstack((danceability_normalized, energy_normalized))

# Disabled alternative feature sets:
# X = np.concatenate([encoded_favourite_genres, encoded_genres, number_of_matching_genres_normalized], axis=1)
# X = np.concatenate([popularity_normalized, duration_ms_normalized, number_of_matching_genres_normalized], axis=1)
# X = data.drop(columns=['skipped']).values

# Target vector: the skipped flag cast to integers.
y = data['skipped'].astype(int).values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Random forest with library-default hyperparameters; only the seed is fixed.
rf_model = RandomForestClassifier(random_state=42)

# Train on the training split (fit returns the estimator, which the cell displays).
rf_model.fit(X=X_train, y=y_train)

In [14]:
# Print the same three metrics for the held-out split first, then for the
# training split. Note that y_pred is overwritten on each pass, so after this
# cell it holds the TRAIN predictions.
evaluation_splits = (
    ("TEST", X_test, y_test),
    ("TRAIN", X_train, y_train),
)

for split_name, split_features, split_labels in evaluation_splits:
    print(split_name)
    y_pred = rf_model.predict(split_features)
    print("Accuracy:", accuracy_score(split_labels, y_pred))
    print("Confusion matrix:\n", confusion_matrix(split_labels, y_pred))
    print("Classification report:\n", classification_report(split_labels, y_pred))


TEST
Accuracy: 0.5882650480526049
Confusion matrix:
 [[957 304]
 [510 206]]
Classification report:
               precision    recall  f1-score   support

           0       0.65      0.76      0.70      1261
           1       0.40      0.29      0.34       716

    accuracy                           0.59      1977
   macro avg       0.53      0.52      0.52      1977
weighted avg       0.56      0.59      0.57      1977

TRAIN
Accuracy: 0.8126265182186235
Confusion matrix:
 [[4572  427]
 [1054 1851]]
Classification report:
               precision    recall  f1-score   support

           0       0.81      0.91      0.86      4999
           1       0.81      0.64      0.71      2905

    accuracy                           0.81      7904
   macro avg       0.81      0.78      0.79      7904
weighted avg       0.81      0.81      0.81      7904

