In [66]:
import pandas as pd

# Display settings: never truncate rows, columns or cell contents in outputs.
# Full option names are used on purpose: pandas resolves abbreviated names
# (e.g. 'max_colwidth') by pattern matching, which can break once another
# option with a similar name is introduced.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Locations of the raw JSON Lines inputs, relative to this notebook.
artists_data_path = "../data/artists.jsonl"
sessions_data_path = "../data/sessions.jsonl"
# NOTE(review): track_storage_data_path is defined but never read in the
# cells visible here - confirm whether it is still needed.
track_storage_data_path = "../data/track_storage.jsonl"
tracks_data_path = "../data/tracks.jsonl"
users_data_path = "../data/users.jsonl"

# Each file holds one JSON record per line (lines=True).
artists_data_frame = pd.read_json(artists_data_path, lines=True)
sessions_data_frame = pd.read_json(sessions_data_path, lines=True)
tracks_data_frame = pd.read_json(tracks_data_path, lines=True)
users_data_frame = pd.read_json(users_data_path, lines=True)

# Scalenie danych

In [67]:
# Build one wide frame: every session event enriched with its track, the
# user who generated it and the track's artist (inner joins throughout).
# Colliding column names receive pandas' default "_x"/"_y" suffixes, which
# the final rename turns into readable names.
# NOTE(review): if users.jsonl also has a "name" column, "name_y" holds the
# user's name and the artist's name stays in "name" - confirm that
# "artist_name" really is the artist before relying on it.
merged_data_frame = (
    sessions_data_frame
    .merge(tracks_data_frame, left_on="track_id", right_on="id")
    .merge(users_data_frame, on="user_id")
    .merge(artists_data_frame, left_on="id_artist", right_on="id")
    .rename(columns={"name_x": "track_name", "name_y": "artist_name"})
)
merged_data_frame.head(5)


## Wyliczenie etykiety, czy dana piosenka w danej sesji została pominięta przez danego użytkownika

In [68]:
# Keep only playback events; rows with any other event_type are discarded.
# .copy() detaches the filtered frame so that adding a column below does not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
merged_data_frame = merged_data_frame[
    merged_data_frame["event_type"].isin(["PLAY", "SKIP"])
].copy()

# One "listening" = one track heard by one user within one session.
listening_keys = ["user_id", "track_id", "session_id"]

# A listening is skipped as soon as ANY of its events is a SKIP; a listening
# with only PLAY events is not skipped. Aggregating over the whole group makes
# the label independent of row order (previously the label of whichever event
# happened to be last in the group won).
merged_data_frame["skipped"] = (
    merged_data_frame["event_type"]
    .eq("SKIP")
    .groupby([merged_data_frame[key] for key in listening_keys])
    .transform("any")
    .astype(bool)
)

# Collapse each listening to a single row. Every row of a group now carries
# the same label, so keep="last" only decides which event's remaining columns
# (e.g. timestamp) survive - the same row the original code kept.
merged_data_frame = merged_data_frame.drop_duplicates(subset=listening_keys,
                                                      keep="last")



## Dodanie nowych atrybutów

In [69]:
# Number of genres shared by the track's "genres" and the user's
# "favourite_genres". A plain zip over the two columns avoids the slow
# row-wise DataFrame.apply(axis=1) and also works on an empty frame.
merged_data_frame["number_of_matching_genres"] = [
    len(set(track_genres) & set(user_genres))
    for track_genres, user_genres in zip(merged_data_frame["genres"],
                                         merged_data_frame["favourite_genres"])
]

# Calendar features derived from the event timestamp.
merged_data_frame["month"] = merged_data_frame["timestamp"].dt.month
merged_data_frame["day_of_week"] = merged_data_frame["timestamp"].dt.dayofweek
merged_data_frame["hour_of_day"] = merged_data_frame["timestamp"].dt.hour

# Element-wise concatenation of the two genre columns (duplicates kept).
merged_data_frame["genres_with_favourite_genres"] = merged_data_frame["genres"] + merged_data_frame["favourite_genres"]

# De-duplicated version of the concatenation above. Despite the column name
# this is the UNION of both genre lists, not their intersection.
# sorted() gives a deterministic element order: list(set(...)) order can vary
# between runs, which made the saved file and the stringified categories used
# later non-reproducible.
merged_data_frame["common_genres"] = merged_data_frame["genres_with_favourite_genres"].apply(lambda x: sorted(set(x)))

## Usunięcie niepotrzebnych kolumn

In [70]:
# Columns no longer needed at this point: the two suffixed "id" columns left
# over from the merges and the raw event type.
redundant_columns = ["id_x", "id_y", "event_type"]
merged_data_frame = merged_data_frame.drop(columns=redundant_columns)
merged_data_frame.head(10)

In [71]:
# Print the column names, dtypes and non-null counts of the merged frame.
merged_data_frame.info()

In [72]:
# Display descriptive statistics of the merged frame.
merged_data_frame.describe()

## Współczynnik informacji wzajemnej

#### Pominięto atrybuty ciągłe
Sprawdzamy, czy atrybuty niosą ze sobą informację o zmiennej celu.

In [73]:
from sklearn import metrics
from scipy.stats import entropy

# Reference value: entropy of the target computed from its class counts.
skipped_entropy = round(entropy(merged_data_frame["skipped"].value_counts()), 4)
print("Entropy of skipped column: ", skipped_entropy)

# Work on a separate frame without the four audio features listed below.
df = merged_data_frame.drop(columns=["danceability", "energy", "loudness", "speechiness"])

# The genre columns are turned into their string representation so that each
# distinct value can serve as one category label.
for genre_column in ("favourite_genres", "genres", "genres_with_favourite_genres", "common_genres"):
  df[genre_column] = df[genre_column].map(str)

# Timestamps are likewise compared as plain strings.
df["timestamp"] = df["timestamp"].astype(str)

# Mutual information between the target and every other column, rounded to
# four decimal places.
mutual_info_score = {
  column: round(metrics.mutual_info_score(df["skipped"], df[column]), 4)
  for column in df.columns
  if column != "skipped"
}

# Report the scores from the most to the least informative column.
for key in sorted(mutual_info_score, key=mutual_info_score.get, reverse=True):
  print("%s: %s" % (key, mutual_info_score[key]))

df.info()

## Macierz korelacji

#### Pominięto atrybuty dyskretne

In [74]:
# Columns left out of the correlation analysis (genre lists, identifiers,
# names and address fields).
excluded_columns = ["genres", "favourite_genres", "genres_with_favourite_genres", "common_genres", "track_id", "track_name", "id_artist", "release_date", "artist_name", "city", "street", "name"]

# Build a frame for correlation: the two flag columns become integers and
# the timestamp is passed through pd.to_numeric (unconvertible values -> NaN).
df = (
    merged_data_frame
    .drop(columns=excluded_columns)
    .assign(
        skipped=lambda frame: frame["skipped"].astype(int),
        premium_user=lambda frame: frame["premium_user"].astype(int),
        timestamp=lambda frame: pd.to_numeric(frame["timestamp"], errors="coerce"),
    )
)

# Correlation of every remaining column with the target; despite its name the
# result is a one-dimensional Series, not a full matrix.
correlation_matrix = df.corrwith(df["skipped"])
correlation_matrix.head(50)

In [75]:
# Persist the prepared frame as JSON Lines: one record per row.
merged_data_path = "../data/merged_data.jsonl"
merged_data_frame.to_json(merged_data_path, orient="records", lines=True)


## Podsumowanie

Jak widać na podstawie współczynników informacji wzajemnej, połączenie atrybutów "genres" i "favourite_genres" jest dobrym atrybutem do uczenia naszego modelu predykcyjnego.
Innymi dobrymi atrybutami są: "tempo", "duration_ms" i "loudness".

Początkowo zdecydowaliśmy się na użycie atrybutów "genres" i "favourite_genres" jako danych wejściowych modelu. Zmienną celu jest etykieta "skipped".
