In [123]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)

artists_data_path = "../data/artists.jsonl"
sessions_data_path = "../data/sessions.jsonl"
track_storage_data_path = "../data/track_storage.jsonl"
tracks_data_path = "../data/tracks.jsonl"
users_data_path = "../data/users.jsonl"

artists_data_frame = pd.read_json(artists_data_path, lines=True)
sessions_data_frame = pd.read_json(sessions_data_path, lines=True)
tracks_data_frame = pd.read_json(tracks_data_path, lines=True)
users_data_frame = pd.read_json(users_data_path, lines=True)

# Scalenie danych

In [124]:
merged_data_frame = pd.merge(sessions_data_frame, tracks_data_frame, left_on="track_id",
                             right_on="id")
merged_data_frame = pd.merge(merged_data_frame, users_data_frame, left_on="user_id",
                             right_on="user_id")
merged_data_frame = pd.merge(merged_data_frame, artists_data_frame, left_on="id_artist",
                             right_on="id")
# rename column
merged_data_frame = merged_data_frame.rename(columns={"name_x": "track_name"})
merged_data_frame = merged_data_frame.rename(columns={"name_y": "artist_name"})
merged_data_frame.head(5)


Unnamed: 0,timestamp,user_id,track_id,event_type,session_id,id_x,track_name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artist_name,city,street,favourite_genres,premium_user,id_y,name,genres
0,2023-01-04 00:12:59.000,101,0NPjiwqT1xrA3ck05xKoA8,PLAY,124,0NPjiwqT1xrA3ck05xKoA8,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Klara Herdzik,Szczecin,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",False,1c6OwPjqCGGUg770n3zhbq,Margaretha Krook,[barnsagor]
1,2023-02-04 01:20:47.302,926,0NPjiwqT1xrA3ck05xKoA8,PLAY,11284,0NPjiwqT1xrA3ck05xKoA8,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Dagmara Łaszczyk,Szczecin,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,1c6OwPjqCGGUg770n3zhbq,Margaretha Krook,[barnsagor]
2,2023-02-04 01:22:54.580,926,0NPjiwqT1xrA3ck05xKoA8,LIKE,11284,0NPjiwqT1xrA3ck05xKoA8,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Dagmara Łaszczyk,Szczecin,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,1c6OwPjqCGGUg770n3zhbq,Margaretha Krook,[barnsagor]
3,2023-01-28 02:29:05.362,926,1hviQqMhM4NyY4O6CWZABO,PLAY,11281,1hviQqMhM4NyY4O6CWZABO,"Det finns väl ingen med kniv här i stan, del 3",20,113858,0,1c6OwPjqCGGUg770n3zhbq,1992-10-30,0.676,0.33,8,1.0,-16.983,0.954,0.737,0.0,0.189,0.301,79.707,4,Dagmara Łaszczyk,Szczecin,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,1c6OwPjqCGGUg770n3zhbq,Margaretha Krook,[barnsagor]
4,2023-01-28 02:30:48.635,926,1hviQqMhM4NyY4O6CWZABO,SKIP,11281,1hviQqMhM4NyY4O6CWZABO,"Det finns väl ingen med kniv här i stan, del 3",20,113858,0,1c6OwPjqCGGUg770n3zhbq,1992-10-30,0.676,0.33,8,1.0,-16.983,0.954,0.737,0.0,0.189,0.301,79.707,4,Dagmara Łaszczyk,Szczecin,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,1c6OwPjqCGGUg770n3zhbq,Margaretha Krook,[barnsagor]


## Wyliczenie, czy dana piosenka w danej sesji została pominięta

In [125]:
# delete rows with event_type other than PLAY and SKIP
merged_data_frame = merged_data_frame[merged_data_frame["event_type"].isin(["PLAY", "SKIP"])]

# if for one user_id and track_id and session_id there is only PLAY event, then the track was not skipped
merged_data_frame["skipped"] = merged_data_frame.groupby(["user_id", "track_id", "session_id"])[
  "event_type"].transform(lambda x: x != "PLAY").astype(bool)

# if there are 2 rows with the same user_id, track_id and session_id, and different "skipped" value, remove the row with "skipped" = 0
merged_data_frame = merged_data_frame.drop_duplicates(subset=["user_id", "track_id", "session_id"],
                                                      keep="last")



## Dodanie nowych atrybutów

In [126]:
merged_data_frame["number_of_matching_genres"] = merged_data_frame.apply(lambda x: len(set(x["genres"]).intersection(set(x["favourite_genres"]))), axis=1)
merged_data_frame["month"] = merged_data_frame["timestamp"].dt.month
merged_data_frame["day_of_week"] = merged_data_frame["timestamp"].dt.dayofweek
merged_data_frame["hour_of_day"] = merged_data_frame["timestamp"].dt.hour

## Usunięcie niepotrzebnych kolumn

In [127]:
merged_data_frame = merged_data_frame.drop(columns=["id_x", "id_y", "event_type"])
merged_data_frame.head(10)

Unnamed: 0,timestamp,user_id,track_id,session_id,track_name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artist_name,city,street,favourite_genres,premium_user,name,genres,skipped,number_of_matching_genres,month,day_of_week,hour_of_day
0,2023-01-04 00:12:59.000,101,0NPjiwqT1xrA3ck05xKoA8,124,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Klara Herdzik,Szczecin,pl. Floriana 55/22,"[permanent wave, mandopop, funk]",False,Margaretha Krook,[barnsagor],False,0,1,2,0
1,2023-02-04 01:20:47.302,926,0NPjiwqT1xrA3ck05xKoA8,11284,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Dagmara Łaszczyk,Szczecin,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,Margaretha Krook,[barnsagor],False,0,2,5,1
4,2023-01-28 02:30:48.635,926,1hviQqMhM4NyY4O6CWZABO,11281,"Det finns väl ingen med kniv här i stan, del 3",20,113858,0,1c6OwPjqCGGUg770n3zhbq,1992-10-30,0.676,0.33,8,1.0,-16.983,0.954,0.737,0.0,0.189,0.301,79.707,4,Dagmara Łaszczyk,Szczecin,pl. Lawendowa 35/83,"[alternative metal, rock, c-pop]",False,Margaretha Krook,[barnsagor],True,0,1,5,2
6,2023-02-14 21:59:20.619,940,0NPjiwqT1xrA3ck05xKoA8,11473,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Stanisław Butrym,Wrocław,ulica Witosa 13,"[ranchera, pop, latin alternative]",True,Margaretha Krook,[barnsagor],True,0,2,1,21
7,2023-02-06 18:41:13.873,1095,0NPjiwqT1xrA3ck05xKoA8,13459,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Fabian Zbroja,Wrocław,pl. Orzechowa 97/37,"[singer-songwriter, pop rock, post-teen pop]",True,Margaretha Krook,[barnsagor],False,0,2,0,18
8,2023-01-12 06:42:49.000,1295,0NPjiwqT1xrA3ck05xKoA8,16327,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Klara Majzner,Gdynia,plac Witosa 89/15,"[pop, pop, funk]",True,Margaretha Krook,[barnsagor],False,0,1,3,6
11,2023-02-15 23:36:35.331,1467,0NPjiwqT1xrA3ck05xKoA8,18574,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Aurelia Miąsko,Poznań,plac Wierzbowa 28,"[post-teen pop, argentine rock, latin rock]",True,Margaretha Krook,[barnsagor],True,0,2,2,23
14,2023-03-16 19:24:41.953,1544,0NPjiwqT1xrA3ck05xKoA8,19590,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Dariusz Tokarek,Kraków,pl. Toruńska 266,"[europop, filmi, alternative metal]",True,Margaretha Krook,[barnsagor],True,0,3,3,19
16,2023-03-14 16:20:47.409,2039,0NPjiwqT1xrA3ck05xKoA8,26304,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Bruno Jeszke,Szczecin,al. Kamienna 457,"[mpb, soul, alternative rock]",True,Margaretha Krook,[barnsagor],True,0,3,1,16
19,2023-01-13 23:33:23.809,2076,0NPjiwqT1xrA3ck05xKoA8,26766,"Anton skaffar sig hund, del 4",21,159812,0,1c6OwPjqCGGUg770n3zhbq,1975,0.667,0.199,1,,-19.284,0.952,0.47,0.0,0.0882,0.487,96.391,3,Fabian Snoch,Kraków,ulica Sybiraków 305,"[argentine rock, latin rock, ranchera]",False,Margaretha Krook,[barnsagor],True,0,1,4,23


In [128]:
merged_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1276510 entries, 0 to 2179841
Data columns (total 34 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   timestamp                  1276510 non-null  datetime64[ns]
 1   user_id                    1276510 non-null  int64         
 2   track_id                   1276510 non-null  object        
 3   session_id                 1276510 non-null  int64         
 4   track_name                 1276510 non-null  object        
 5   popularity                 1276510 non-null  int64         
 6   duration_ms                1276510 non-null  int64         
 7   explicit                   1276510 non-null  int64         
 8   id_artist                  1276510 non-null  object        
 9   release_date               1276510 non-null  object        
 10  danceability               1276510 non-null  float64       
 11  energy                     1276510 non-nul

In [129]:
merged_data_frame.describe()

Unnamed: 0,timestamp,user_id,session_id,popularity,duration_ms,explicit,danceability,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,number_of_matching_genres,month,day_of_week,hour_of_day
count,1276510,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,279578.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0,1276510.0
mean,2023-02-17 06:30:55.759123968,2859.073,37242.97,46.37075,230934.1,0.08232603,0.5665693,0.5997763,5.279286,0.639825,-9.098161,0.08181135,0.3240204,0.07121924,0.2046845,0.5397278,120.4185,3.930228,0.1659979,2.09589,2.974657,11.50624
min,2023-01-01 00:04:29,101.0,124.0,0.0,4000.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,2023-01-24 11:10:13.823000064,1486.0,18831.0,26.0,180283.0,0.0,0.455,0.431,2.0,0.0,-11.481,0.0335,0.0361,0.0,0.0942,0.341,98.544,4.0,0.0,1.0,1.0,6.0
50%,2023-02-17 05:30:08.064999936,2869.0,37296.0,41.0,214947.0,0.0,0.573,0.62,5.0,1.0,-8.315,0.0425,0.212,4.41e-05,0.126,0.547,119.965,4.0,0.0,2.0,3.0,12.0
75%,2023-03-13 01:19:03.110000128,4234.0,55764.0,75.0,262240.0,0.0,0.686,0.789,9.0,1.0,-6.05,0.0768,0.574,0.0052,0.247,0.743,136.892,4.0,0.0,3.0,5.0,17.0
max,2023-04-05 20:24:55.388000,5600.0,74131.0,96.0,4120258.0,1.0,0.982,1.0,11.0,1.0,2.487,0.967,0.996,0.999,1.0,1.0,218.072,5.0,3.0,4.0,6.0,23.0
std,,1589.681,21359.82,25.37431,98975.27,0.2748609,0.1657664,0.2278273,3.559949,0.480052,4.34466,0.1169348,0.3107948,0.2033808,0.1895666,0.2515016,28.48244,0.3764612,0.4159961,0.9210477,2.009431,6.903861


## Współczynnik informacji wzajemnej

Sprawdzamy, czy atrbuty niosą ze sobą informację o zmiennej celu.
Przed wyliczeniem współczynnika dla poszczególnych zbiorów, sprawdzimy wartość entropii zbioru etykiet "skipped", aby oszacować jaka może być maksymalna wartość MI.

In [130]:
from sklearn import metrics
from scipy.stats import entropy

print("Entropy of skipped column: ", str(round(entropy(merged_data_frame["skipped"].value_counts()), 4)))

mutual_info_score = {}

for column in merged_data_frame.columns:
  if column != "skipped" and column != "genres" and column != "favourite_genres" and column != "encoded_genres":
    mutual_info_score[column] = round(metrics.mutual_info_score(merged_data_frame["skipped"], merged_data_frame[column]), 4)

# print sorted by value
for key, value in sorted(mutual_info_score.items(), key=lambda item: item[1], reverse=True):
  print("%s: %s" % (key, value))

Entropy of skipped column:  0.6523




ValueError: Input contains NaN.

## Macierz korelacji

In [131]:
df = merged_data_frame.drop(columns=["genres", "favourite_genres", "track_id", "track_name", "id_artist", "release_date", "artist_name", "city", "street", "name"])

df["skipped"] = df["skipped"].astype(int)
df["premium_user"] = df["premium_user"].astype(int)
df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce")

correlation_matrix = df.corrwith(df["skipped"])
correlation_matrix

timestamp                   -0.013453
user_id                      0.003888
session_id                   0.003906
popularity                  -0.006764
duration_ms                  0.052508
explicit                    -0.010313
danceability                -0.075586
energy                      -0.000453
key                          0.003896
mode                         0.003584
loudness                    -0.003847
speechiness                 -0.030642
acousticness                -0.007098
instrumentalness             0.013742
liveness                     0.009499
valence                     -0.045396
tempo                        0.011392
time_signature              -0.003515
premium_user                 0.262185
skipped                      1.000000
number_of_matching_genres    0.003511
month                       -0.012886
day_of_week                  0.001368
hour_of_day                 -0.002158
dtype: float64

In [132]:
# save to jsonl file
merged_data_frame.to_json("../data/merged_data.jsonl", orient="records", lines=True)


## Podsumowanie