In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np


In [2]:
# Location of the merged records, stored as JSON Lines (one object per line).
merged_data_path = '../data/merged_data.jsonl'

# Columns discarded right after loading.
# (An alternative variant dropped only
#  ["release_date", "genres", "favourite_genres", "name"].)
unused_columns = [
    "release_date", "key", "loudness", "explicit",
    "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo",
]

# Read the file and drop the unused columns in a single chained expression.
data = pd.read_json(merged_data_path, lines=True).drop(columns=unused_columns)

# Preview the first rows of the resulting frame.
data.head(5)

Unnamed: 0,timestamp,user_id,track_id,session_id,track_name,popularity,duration_ms,id_artist,danceability,energy,...,street,favourite_genres,premium_user,name,genres,skipped,number_of_matching_genres,month,day_of_week,hour_of_day
0,2023-01-04 00:12:59.000,101,0NPjiwqT1xrA3ck05xKoA8,124,"Anton skaffar sig hund, del 4",21,159812,1c6OwPjqCGGUg770n3zhbq,0.667,0.199,...,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",False,Margaretha Krook,[barnsagor],False,0,1,2,0
1,2023-02-04 01:20:47.302,926,0NPjiwqT1xrA3ck05xKoA8,11284,"Anton skaffar sig hund, del 4",21,159812,1c6OwPjqCGGUg770n3zhbq,0.667,0.199,...,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,Margaretha Krook,[barnsagor],False,0,2,5,1
2,2023-01-28 02:30:48.635,926,1hviQqMhM4NyY4O6CWZABO,11281,"Det finns väl ingen med kniv här i stan, del 3",20,113858,1c6OwPjqCGGUg770n3zhbq,0.676,0.33,...,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,Margaretha Krook,[barnsagor],True,0,1,5,2
3,2023-02-14 21:59:20.619,940,0NPjiwqT1xrA3ck05xKoA8,11473,"Anton skaffar sig hund, del 4",21,159812,1c6OwPjqCGGUg770n3zhbq,0.667,0.199,...,ulica Witosa 13,"[ranchera, pop, latin alternative]",True,Margaretha Krook,[barnsagor],True,0,2,1,21
4,2023-02-06 18:41:13.873,1095,0NPjiwqT1xrA3ck05xKoA8,13459,"Anton skaffar sig hund, del 4",21,159812,1c6OwPjqCGGUg770n3zhbq,0.667,0.199,...,pl. Orzechowa 97/37,"[singer-songwriter, pop rock, post-teen pop]",True,Margaretha Krook,[barnsagor],False,0,2,0,18


In [3]:
# Distinct labels found across all rows of each list-valued genre column.
unique_favourite_genres = {
    label for row_labels in data['favourite_genres'] for label in row_labels
}
unique_genres = {
    label for row_labels in data['genres'] for label in row_labels
}

# Report both vocabulary sizes and the labels the two columns have in common.
print("unique_favourite_genres", len(unique_favourite_genres))
print("unique_genres", len(unique_genres))
common_genres = unique_favourite_genres & unique_genres
print("common genres", common_genres)


def _keep_common_genres(row_labels):
    """Return the labels of one row that also appear in ``common_genres``."""
    return [label for label in row_labels if label in common_genres]


# Restrict each row's `genres` list to labels that also occur in `favourite_genres`.
data['genres'] = data['genres'].apply(_keep_common_genres)

data.head()

# TODO: consider removing this filtering step - it may be unnecessary.

unique_favourite_genres 50
unique_genres 2875
common genres {'mpb', 'country rock', 'alternative metal', 'album rock', 'funk', 'classic rock', 'brill building pop', 'c-pop', 'argentine rock', 'latin alternative', 'regional mexican', 'soul', 'quiet storm', 'new wave', 'folk rock', 'permanent wave', 'j-pop', 'mandopop', 'alternative rock', 'new romantic', 'hard rock', 'psychedelic rock', 'filmi', 'motown', 'art rock', 'ranchera', 'tropical', 'lounge', 'pop rock', 'metal', 'blues rock', 'latin', 'post-teen pop', 'rock en espanol', 'hoerspiel', 'rock', 'turkish pop', 'latin rock', 'singer-songwriter', 'dance pop', 'folk', 'adult standards', 'roots rock', 'soft rock', 'pop', 'europop', 'new wave pop', 'mellow gold', 'vocal jazz', 'latin pop'}


Unnamed: 0,timestamp,user_id,track_id,session_id,track_name,popularity,duration_ms,id_artist,danceability,energy,...,street,favourite_genres,premium_user,name,genres,skipped,number_of_matching_genres,month,day_of_week,hour_of_day
0,2023-01-04 00:12:59.000,101,0NPjiwqT1xrA3ck05xKoA8,124,"Anton skaffar sig hund, del 4",21,159812,1c6OwPjqCGGUg770n3zhbq,0.667,0.199,...,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",False,Margaretha Krook,[],False,0,1,2,0
1,2023-02-04 01:20:47.302,926,0NPjiwqT1xrA3ck05xKoA8,11284,"Anton skaffar sig hund, del 4",21,159812,1c6OwPjqCGGUg770n3zhbq,0.667,0.199,...,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,Margaretha Krook,[],False,0,2,5,1
2,2023-01-28 02:30:48.635,926,1hviQqMhM4NyY4O6CWZABO,11281,"Det finns väl ingen med kniv här i stan, del 3",20,113858,1c6OwPjqCGGUg770n3zhbq,0.676,0.33,...,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,Margaretha Krook,[],True,0,1,5,2
3,2023-02-14 21:59:20.619,940,0NPjiwqT1xrA3ck05xKoA8,11473,"Anton skaffar sig hund, del 4",21,159812,1c6OwPjqCGGUg770n3zhbq,0.667,0.199,...,ulica Witosa 13,"[ranchera, pop, latin alternative]",True,Margaretha Krook,[],True,0,2,1,21
4,2023-02-06 18:41:13.873,1095,0NPjiwqT1xrA3ck05xKoA8,13459,"Anton skaffar sig hund, del 4",21,159812,1c6OwPjqCGGUg770n3zhbq,0.667,0.199,...,pl. Orzechowa 97/37,"[singer-songwriter, pop rock, post-teen pop]",True,Margaretha Krook,[],False,0,2,0,18


In [4]:
def _min_max_normalize(values):
    """Min-max scale ``values`` into [0, 1] and return them as a column vector.

    Parameters
    ----------
    values : array-like
        One-dimensional numeric data (e.g. ``data['popularity'].values``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 1)`` holding ``(v - min) / (max - min)``.
        If every value is identical (``max == min``) an all-zero column is
        returned instead of dividing by zero and producing NaNs.
    """
    column = np.asarray(values, dtype=float).reshape(-1, 1)
    lowest = column.min()
    span = column.max() - lowest
    if span == 0:
        # Constant column: avoid 0/0, which would yield NaN plus a RuntimeWarning.
        return np.zeros_like(column)
    return (column - lowest) / span


# Row-wise concatenation of the two list columns, so the binarizer learns one
# shared vocabulary covering both favourite genres and track genres.
all_genres = list(data['favourite_genres'] + data['genres'])

# One-hot encode the genres. `fit_transform` already fits the binarizer, so a
# separate `fit` call beforehand would only repeat the same pass over the data.
mlb = MultiLabelBinarizer()
encoded_all_genres = mlb.fit_transform(all_genres)

# Encode each column separately with the shared vocabulary; both matrices
# therefore have the same column layout (one column per entry of mlb.classes_).
encoded_favourite_genres = mlb.transform(data['favourite_genres'])
encoded_genres = mlb.transform(data['genres'])

# Min-max scaled numeric features, each shaped (n_rows, 1).
# NOTE: min and max are taken over every row of `data`, i.e. this cell does
# not distinguish between training and test rows.
popularity_normalized = _min_max_normalize(data['popularity'].values)
duration_ms_normalized = _min_max_normalize(data['duration_ms'].values)
danceability_normalized = _min_max_normalize(data['danceability'].values)
energy_normalized = _min_max_normalize(data['energy'].values)
number_of_matching_genres_normalized = _min_max_normalize(
    data['number_of_matching_genres'].values)

# Side-by-side view of raw vs. normalized popularity, duration and
# matching-genre count, together with the `skipped` label.
df = pd.DataFrame(
    data={'popularity': data['popularity'],
          'popularity_normalized': popularity_normalized.reshape(-1),
          'duration_ms': data['duration_ms'],
          'duration_ms_normalized': duration_ms_normalized.reshape(-1),
          'number_of_matching_genres': data['number_of_matching_genres'],
          'number_of_matching_genres_normalized': number_of_matching_genres_normalized.reshape(-1),
          'skipped': data['skipped']})

df.head(5)

Unnamed: 0,popularity,popularity_normalized,duration_ms,duration_ms_normalized,number_of_matching_genres,number_of_matching_genres_normalized,skipped
0,21,0.21875,159812,0.037853,0,0.0,False
1,21,0.21875,159812,0.037853,0,0.0,False
2,20,0.208333,113858,0.026689,0,0.0,True
3,21,0.21875,159812,0.037853,0,0.0,True
4,21,0.21875,159812,0.037853,0,0.0,False


In [5]:
# Feature matrix: the normalized danceability and energy columns, side by side.
# Other feature sets that were tried here and are currently disabled:
#   * encoded_favourite_genres + encoded_genres + number_of_matching_genres_normalized
#   * popularity_normalized + duration_ms_normalized + number_of_matching_genres_normalized
#   * data.drop(columns=['skipped']).values
feature_blocks = (danceability_normalized, energy_normalized)
X = np.concatenate(feature_blocks, axis=1)

# Target vector: the `skipped` flag cast to integers.
y = data['skipped'].astype(int).values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Fit a random forest on the training split.
# random_state=42 fixes the seed so the fitted forest is reproducible.
# n_jobs=-1 builds (and later evaluates) the trees on all available cores;
# it affects run time only, not the resulting model for a fixed random_state.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

In [7]:
# Report the same three metrics for the held-out split first and the training
# split second, so the two sets of scores can be compared directly.
# After the loop `y_pred` holds the predictions for the training split.
for split_name, split_features, split_labels in (
        ("TEST", X_test, y_test),
        ("TRAIN", X_train, y_train),
):
    print(split_name)
    y_pred = rf_model.predict(split_features)
    print("Accuracy:", accuracy_score(split_labels, y_pred))
    print("Confusion matrix:\n", confusion_matrix(split_labels, y_pred))
    print("Classification report:\n", classification_report(split_labels, y_pred))


TEST
Accuracy: 0.6313268207847961
Confusion matrix:
 [[153873   9652]
 [ 84471   7306]]
Classification report:
               precision    recall  f1-score   support

           0       0.65      0.94      0.77    163525
           1       0.43      0.08      0.13     91777

    accuracy                           0.63    255302
   macro avg       0.54      0.51      0.45    255302
weighted avg       0.57      0.63      0.54    255302

TRAIN
Accuracy: 0.6491459134671879
Confusion matrix:
 [[626787  29053]
 [329242  36126]]
Classification report:
               precision    recall  f1-score   support

           0       0.66      0.96      0.78    655840
           1       0.55      0.10      0.17    365368

    accuracy                           0.65   1021208
   macro avg       0.60      0.53      0.47   1021208
weighted avg       0.62      0.65      0.56   1021208

