# SetUp решения

In [144]:
# configuration
from omegaconf import OmegaConf
from typing import Any

# utils
import numpy as np
import pandas as pd
import numexpr
import os

# dl setup
import sklearn
import torch

In [3]:


path_conf = OmegaConf.create({
    'path': './cmc-ml-spotify-recommendations',
    'train_assigns': '${concat_path:${path},"train.csv"}',
    'assign_info': '${concat_path:${path},"added_info.csv"}',
    'audio_features': '${concat_path:${path},"audio_features.csv"}',
    'track_info': '${concat_path:${path},"tracks_info.csv"}',
})

print(OmegaConf.to_yaml(path_conf, resolve=True))

path: ./cmc-ml-spotify-recommendations
train_assigns: ./cmc-ml-spotify-recommendations/train.csv
assign_info: ./cmc-ml-spotify-recommendations/added_info.csv
audio_features: ./cmc-ml-spotify-recommendations/audio_features.csv
track_info: ./cmc-ml-spotify-recommendations/tracks_info.csv



# Parsing

In [293]:
OmegaConf.register_new_resolver('concat_path', lambda *joints: os.path.join(*joints), replace=True)

conf = OmegaConf.create({
    # path config
    'path': {
        'standard_path': './cmc-ml-spotify-recommendations',
        'train_assigns': '${concat_path:${path.standard_path},"train.csv"}',
        'assign_info': '${concat_path:${path.standard_path},"added_info.csv"}',
        'audio_features': '${concat_path:${path.standard_path},"audio_features.csv"}',
        'track_info': '${concat_path:${path.standard_path},"tracks_info.csv"}',
    },

    # parsing audio features
    'audio_features_conf': {
        'drop': ['key', 'time_signature'],
        'normalize': ['danceability', 'energy', 'loudness', 
                    'speechiness', 'acousticness', 'instrumentalness', 
                    'liveness', 'valence', 'tempo', 'duration_ms'],
    },

    # parsing assignment info
    'assignment_info_conf': {
        'drop': ['added_by_type', 'added_by_id', 'playlist_id'],
        'parse_time': True,
    },

    # parsing track info
    'track_info_conf': {
        'drop': ['track_artists', 'track_available_markets', 'track_name', 'track_type', 'track_album_album',
                 'track_album_artists', 'track_album_id', 'track_album_name', 'track_album_type',
                 'track_disc_number', 'track_track_number', 'track_album_disc_number', 'track_album_track_number'],
        'bool': ['track_episode', 'track_explicit', 'track_is_local', 'track_track', 'track_album_episode', 
                 'track_album_explicit', 'track_album_is_local', 'track_album_track'],
        'normalize': ['track_duration_ms', 'track_popularity', 'track_album_duration_ms', 'track_album_popularity'],
        
    },

    # learning config
    'learning': {
        'dataloader': {
            'split_seed': 666,
            'n_workers': 4,
            'train_test_split': [0.8, 0.2],
            'batch_size': 512,
        }

    },
})

In [280]:
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler

In [327]:
class AudioFeaturesDataset(Dataset):
    def __init__(self, path, config):
        self.df = pd.read_csv(path).drop(columns=config.drop)
        self.normalize = StandardScaler(with_mean=True, with_std=True).set_output(transform='pandas')

        for column in config.normalize:
            self.df[column] = self.df[column].astype(np.float32)
        
        self.df.loc[:, config.normalize] = self.normalize.fit_transform(self.df.loc[:, config.normalize])
        self.df['track_id'] = self.df['id']
        self.df = self.df.drop(columns='id')

    def __len__(self) -> int:
        return len(self.df)
    
    def __getitem__(self, index: int) -> Any:
        return self.df[self.df['track_id'] == index]

In [328]:
class AssignmentInfoDataset(Dataset):
    def __init__(self, path, config):
        self.df = pd.read_csv(path).drop(columns=config.drop)
        
        if config.parse_time:
            self.df['added_at'] = pd.to_datetime(self.df['added_at']).dt.tz_localize(None)
            self.df['added_at'] = ((self.df['added_at'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')).astype(np.float32)

            self.time_normalize = StandardScaler(with_mean=True, with_std=True).set_output(transform='pandas')
            self.df['added_at'] = self.time_normalize.fit_transform((self.df[['added_at']]))


    def __len__(self) -> int:
        return len(self.df)
    
    def __getitem__(self, index: int) -> Any:
        return self.df[self.df['track_id'] == index]

In [333]:
class SpotifyTrackDataset(Dataset):
    def __init__(self, path, config, audio_ds):        
        data = pd.read_csv(path).drop(columns=config.drop)

        for column in config.bool:
            data[column] = data[column].astype(np.float32)
        normalize = StandardScaler(with_mean=True, with_std=True).set_output(transform='pandas')
        data.loc[:, config.normalize] = normalize.fit_transform(data.loc[:, config.normalize])

        self.keys = {key: i for i, key in enumerate(data.track_id)}
        self.data = data.merge(audio_ds.df, on='track_id', how='left').drop(columns=['track_id']).to_numpy()

    def __len__(self) -> int:
        return len(self.data)
    
    def __getitem__(self, index: int) -> Any:
        return torch.tensor(self.data[self.keys[index]])

In [334]:
audio_features_dataset = AudioFeaturesDataset(conf.path.audio_features, conf.audio_features_conf)
# assignment_info_dataset = AssignmentInfoDataset(conf.path.assign_info, conf.assignment_info_conf)
spotify_dataset = SpotifyTrackDataset(conf.path.track_info, conf.track_info_conf, audio_features_dataset)

In [337]:
test = pd.read_csv(conf.path.track_info)

In [338]:
for ind in test.track_id:
   assert spotify_dataset['4E2TwkPZq7cYiTkYkmHMbz'].shape[0] != 0

In [341]:
spotify_dataset.data.shape

(900411, 23)

In [340]:
len(test)

899712

In [335]:
torch.random.manual_seed(conf.learning.dataloader.split_seed)
train_ds, test_ds = random_split(spotify_dataset, conf.learning.dataloader.train_test_split)
train_dl = DataLoader(train_ds, batch_size=conf.learning.dataloader.batch_size)

In [336]:
next(iter(train_dl))

KeyError: 81389