In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np


In [6]:
merged_data_path = '../data/merged_data.jsonl'
data = pd.read_json(merged_data_path, lines=True)

data = data.drop(columns=["user_id", "track_id"])

data = data.drop(
    columns=["release_date", "key", "loudness", "popularity",
             "duration_ms", "explicit",
             "danceability", "energy", "speechiness",
             "acousticness", "instrumentalness", "liveness", "valence", "tempo"])

data.head(5)

Unnamed: 0,timestamp,session_id,favourite_genres,genres,skipped
0,2023-01-03 05:09:55.000,124,"[permanent wave, mandopop, funk]","[album rock, art rock, classic rock, folk rock...",False
1,2023-02-08 21:41:02.507,151,"[filmi, regional mexican, folk]","[album rock, art rock, classic rock, folk rock...",False
2,2023-02-12 01:17:14.946,544,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
3,2023-01-03 03:59:59.738,534,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False
4,2023-03-07 14:28:25.702,547,"[psychedelic rock, country rock, rock en espanol]","[album rock, art rock, classic rock, folk rock...",False


In [7]:
# Combine genres and favourite_genres
all_genres = list(data['favourite_genres'] + data['genres'])

# One-hot encode the genres
mlb = MultiLabelBinarizer()
encoded_all_genres = mlb.fit_transform(all_genres)

# Split encoded_genres into genres and favourite_genres
encoded_favourite_genres = encoded_all_genres[:, :len(data['favourite_genres'][0])]
encoded_genres = encoded_all_genres[:, len(data['favourite_genres'][0]):]

In [8]:
# Concatenate the one-hot encoded columns
X = np.concatenate([encoded_favourite_genres, encoded_genres], axis=1)

# Extract the labels
y = data['skipped'].astype(int).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [14]:
print("TEST")
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

print("TRAIN")
y_pred = rf_model.predict(X_train)
print("Accuracy:", accuracy_score(y_train, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_train, y_pred))
print("Classification report:\n", classification_report(y_train, y_pred))


TEST
Accuracy: 0.6535154274152757
Confusion matrix:
 [[998 263]
 [422 294]]
Classification report:
               precision    recall  f1-score   support

           0       0.70      0.79      0.74      1261
           1       0.53      0.41      0.46       716

    accuracy                           0.65      1977
   macro avg       0.62      0.60      0.60      1977
weighted avg       0.64      0.65      0.64      1977

TRAIN
Accuracy: 0.917004048582996
Confusion matrix:
 [[4714  285]
 [ 371 2534]]
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93      4999
           1       0.90      0.87      0.89      2905

    accuracy                           0.92      7904
   macro avg       0.91      0.91      0.91      7904
weighted avg       0.92      0.92      0.92      7904

