Imports

In [1]:
import json
import os
from typing import List, Tuple

import pandas as pd  # type: ignore

Utility functions

In [2]:
def get_data_from_path(data_dir: str, lines: bool) -> pd.DataFrame:
    """Read the first data file found in ``data_dir`` into a DataFrame.

    Parameters
    ----------
    data_dir:
        Directory that contains the JSON data file.
    lines:
        Forwarded to ``pd.read_json``; ``True`` means one JSON object per line.

    Returns
    -------
    pd.DataFrame
        The parsed content of the selected file.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` contains no regular, non-hidden file.
    """
    # os.listdir returns entries in arbitrary order, so sort them to make the
    # choice deterministic. Hidden entries and sub-directories (for example
    # the ``.ipynb_checkpoints`` folder Jupyter creates) are never data files.
    candidates = sorted(
        entry
        for entry in os.listdir(data_dir)
        if not entry.startswith(".")
        and os.path.isfile(os.path.join(data_dir, entry))
    )
    if not candidates:
        raise FileNotFoundError(f"No data file found in {data_dir!r}")
    return pd.read_json(os.path.join(data_dir, candidates[0]), lines=lines)

In [3]:
def split_by_week(session_data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``session_data`` with parsed timestamps and a week label.

    Parameters
    ----------
    session_data:
        Frame with a ``timestamp`` column holding epoch milliseconds.

    Returns
    -------
    pd.DataFrame
        A new frame in which ``timestamp`` is converted to datetimes and a
        ``week`` column of the form ``"<year>-W<week>"`` is appended. The week
        number uses ``%U`` (Sunday as first day of the week; days before the
        first Sunday of the year fall into week ``00``).
    """
    timestamps = pd.to_datetime(session_data["timestamp"], unit="ms")
    # ``assign`` builds a new frame, so the caller's DataFrame is left
    # untouched instead of being modified as a side effect.
    return session_data.assign(
        timestamp=timestamps,
        week=timestamps.dt.strftime("%Y-W%U"),
    )

File paths

In [4]:
# Base directory: abspath('') resolves to the current working directory.
dirname = os.path.abspath('')

# Input directories that hold the raw track and session dumps.
sessions_dir = os.path.join(dirname, "data/data/sessions/")
tracks_dir = os.path.join(dirname, "data/data/tracks/")

# Output path prefixes; the export cell appends "_<n>.json" to each of them.
basic_training_data_by_weeks_file = os.path.join(
    dirname, "data/basic_training_data_by_weeks/basic_training_data"
)
training_data_by_weeks_file = os.path.join(
    dirname, "data/training_data_by_weeks/training_data"
)

Data frames

In [5]:
# Load both raw datasets (tracks first, then sessions); each file is parsed
# as line-delimited JSON.
tracks_data_frame, sessions_data_frame = (
    get_data_from_path(source_dir, lines=True)
    for source_dir in (tracks_dir, sessions_dir)
)

1. Parse tracks

In [6]:
# Count play / like / skip events per track: one row per track_id, one column
# per event type.
agg_df = (
    sessions_data_frame
    .loc[sessions_data_frame["event_type"].isin(["play", "like", "skip"])]
    .groupby(["track_id", "event_type"])
    .size()
    .unstack(fill_value=0)
    # Guarantee all three count columns exist (and fix their order) even when
    # an event type never occurs; otherwise the astype/rename below would
    # raise a KeyError.
    .reindex(columns=["play", "like", "skip"], fill_value=0)
    .reset_index()
)

# Attach the counts to the track metadata. Tracks without any counted event
# get NaN from the left merge and end up with 0 after fillna.
# NOTE(review): fillna(0) replaces every NaN in the merged frame, including
# missing track attributes, not just the counts - confirm this is intended.
tracks_new_data_frame = (
    tracks_data_frame
    .merge(agg_df, left_on="id", right_on="track_id", how="left")
    .fillna(0)
    .astype({"play": int, "like": int, "skip": int})
    # The count columns already arrive in play/like/skip order, so no
    # position-based column shuffling is needed.
    .rename(columns={"play": "play_count", "like": "likes", "skip": "number_of_skip"})
    .drop(columns=["track_id"])
)

2. Remove useless tracks

In [7]:
# Number of tracks before filtering.
print(len(tracks_new_data_frame))

# Keep only the tracks whose play count is non-zero.
tracks_new_data_frame = tracks_new_data_frame.loc[
    tracks_new_data_frame["play_count"] != 0
]

# Number of tracks that remain.
print(len(tracks_new_data_frame))

# Ids of the remaining tracks in ascending order.
tmp_tracks_df = tracks_new_data_frame.sort_values(by=["id"])
unique_track_ids = tmp_tracks_df["id"].unique().tolist()

# Persist the id list as JSON.
with open(f"{dirname}/data/unique_ids.json", 'w+') as f:
    json.dump(unique_track_ids, f, indent=4)

22412
4071


3. Remove useless sessions

In [8]:
# Number of session events before filtering.
print(len(sessions_data_frame))

# Membership test against the retained tracks, computed once instead of once
# per event type.
is_kept_track = sessions_data_frame["track_id"].isin(tracks_new_data_frame["id"])

# Keep play, like and skip events of retained tracks. The parts are
# concatenated in that order (all plays, then likes, then skips); pd.concat
# already returns a new frame, so no intermediate copies are needed.
sessions_new_data_frame = pd.concat([
    sessions_data_frame[is_kept_track & (sessions_data_frame["event_type"] == event_type)]
    for event_type in ("play", "like", "skip")
])

# Number of session events that remain.
print(len(sessions_new_data_frame))

909936
786103


4. Split sessions by week and remove the global play/like/skip counts from tracks

In [9]:
# Label every session event with its week, then discard the columns that are
# not used after this point.
sessions_new_data_frame = split_by_week(sessions_new_data_frame).drop(
    columns=["session_id", "timestamp", "user_id"]
)

# Drop the all-time counters from the track table.
tracks_new_data_frame = tracks_new_data_frame.drop(
    columns=["play_count", "likes", "number_of_skip"]
)

5. Add track data to sessions

In [10]:
# Join each session event with its track attributes, then collapse the result
# into one list of record dicts per week (a Series indexed by the week label).
sessions_new_data_frame = (
    sessions_new_data_frame
    .merge(tracks_new_data_frame, left_on="track_id", right_on="id")
    .drop(columns=["id"])
    .groupby("week")
    .apply(lambda week_rows: week_rows.to_dict(orient="records"))
)

6. Split training data by weeks

In [11]:
# One (week label, row) pair per week. The label is the "%Y-W%U" string used
# as the group key, so the first tuple element is a str, not an int.
# NOTE(review): training_weeks is not referenced by the later cells visible
# in this notebook - confirm whether it is still needed.
training_weeks: List[Tuple[str, pd.Series]] = list(
    pd.DataFrame(sessions_new_data_frame).iterrows()
)

7. Remove useless columns, count events per track and save the weekly training data

In [12]:
# Columns that are set to 0 for tracks without any event in a given week.
_ZERO_FILLED_COLUMNS = (
    "popularity", "duration_ms", "explicit", "danceability", "energy", "key",
    "loudness", "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "release_date_year", "release_date_week",
    "play_count", "likes", "number_of_skips",
)


def _build_weekly_track_frame(week_records, all_track_ids):
    """Aggregate one week of session records into one row per known track.

    Parameters
    ----------
    week_records : list of dict
        The event records of a single week (one dict per session event,
        already joined with the track attributes).
    all_track_ids : list
        Every track id that must be present in the result.

    Returns
    -------
    pd.DataFrame
        One row per track, sorted by ``track_id``. Tracks with events carry
        their attributes plus ``play_count``, ``likes`` and
        ``number_of_skips`` for the week; all other tracks get a row of zeros.
    """
    events = pd.DataFrame(week_records)

    # Split the release date into numeric year and week-of-year features,
    # then drop the columns that are not exported.
    release_dates = pd.to_datetime(events["release_date"])
    events = events.assign(
        release_date_year=release_dates.dt.strftime("%Y").astype(int),
        release_date_week=release_dates.dt.strftime("%U").astype(int),
    ).drop(columns=["week", "name", "id_artist", "release_date"])

    # Per-track event counts in a single vectorised pass. The reindex makes
    # sure all three counters exist even if an event type is absent this week.
    event_counts = (
        events.groupby(["track_id", "event_type"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=["play", "like", "skip"], fill_value=0)
        .rename(columns={"play": "play_count", "like": "likes", "skip": "number_of_skips"})
    )

    # One attribute row per track (the first occurrence) plus its counters.
    weekly = (
        events.drop_duplicates(subset=["track_id"])
        .drop(columns=["event_type"])
        .join(event_counts, on="track_id")
    )

    # Known tracks without events this week get a single zero row each. This
    # is a plain set difference, and all rows are appended with one concat
    # instead of one concat per missing track.
    missing_ids = list(set(all_track_ids) - set(weekly["track_id"]))
    if missing_ids:
        padding = pd.DataFrame(
            {"track_id": missing_ids, **dict.fromkeys(_ZERO_FILLED_COLUMNS, 0)}
        )
        weekly = pd.concat([weekly, padding], ignore_index=True)

    return weekly.sort_values(by=["track_id"], ascending=True)


# Make sure both output directories exist before the export starts.
os.makedirs(os.path.dirname(training_data_by_weeks_file), exist_ok=True)
os.makedirs(os.path.dirname(basic_training_data_by_weeks_file), exist_ok=True)

total_weeks = len(sessions_new_data_frame)
for i, data_series in enumerate(sessions_new_data_frame):
    print(f"{i + 1}/{total_weeks}")
    data_frame = _build_weekly_track_frame(data_series, unique_track_ids)

    # save full training data (all track features plus weekly counters)
    training_json = json.loads(data_frame.to_json(orient="records"))
    with open(f"{training_data_by_weeks_file}_{i + 1}.json", 'w') as f:
        json.dump(training_json, f, indent=4)

    # The basic variant drops every feature column and the like/skip counters.
    data_frame = data_frame.drop(columns=["popularity", "duration_ms", "explicit", "danceability",
                                          "energy", "key", "loudness", "speechiness", "acousticness",
                                          "instrumentalness", "liveness", "valence", "tempo",
                                          "release_date_year", "release_date_week", "likes",
                                          "number_of_skips"])

    # save basic training data
    basic_training_json = json.loads(data_frame.to_json(orient="records"))
    with open(f"{basic_training_data_by_weeks_file}_{i + 1}.json", 'w') as f:
        json.dump(basic_training_json, f, indent=4)

1/53


  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_

2/53


  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_

3/53


  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_

4/53


  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_frame.append({
  data_frame = data_

5/53


KeyboardInterrupt: 