## Data Preprocessing for Artist Embedding

In [None]:
def hex2id(hex):
    """Translate an ``artist_id`` value into the matching ``id`` value.

    Looks the value up in the module-level DataFrame ``df`` and returns the
    ``id`` of the first row whose ``artist_id`` equals ``hex``.  Every call
    performs a full boolean-mask scan of ``df``.

    :param hex: value to compare against ``df.artist_id``
    :return: the ``id`` entry of the first matching row
    :raises IndexError: if no row of ``df`` matches ``hex``
    """
    id_column = df.id
    matching_ids = id_column[df.artist_id == hex]
    return matching_ids.values[0]

In [None]:
# Start every artist (keyed by the value hex2id returns) with its own empty
# set of similar-artist ids; a comprehension is used so no set is shared.
id_similar_artistx = {
    hex2id(artist_hex): set() for artist_hex in tqdm.tqdm(final_all_artists)
}

In [None]:
import tqdm

# Make the similarity relation symmetric: whenever B is listed as similar to
# A, record B under A and A under B.
#
# hex2id scans the whole DataFrame on every call, so the source artist's id
# (and its target set) is resolved once per artist instead of twice per
# pair, and each similar artist's id is resolved once instead of twice.
for artist_hex, similar_hexes in tqdm.tqdm(similar_all_artists.items()):
    artist_key = None
    artist_neighbours = None
    for similar_hex in similar_hexes:
        if artist_neighbours is None:
            # Resolved lazily so that an artist with no similar artists
            # triggers no lookup at all, exactly as before.
            artist_key = hex2id(artist_hex)
            artist_neighbours = id_similar_artistx[artist_key]
        similar_key = hex2id(similar_hex)
        artist_neighbours.add(similar_key)
        id_similar_artistx[similar_key].add(artist_key)

In [None]:
# Persist the id -> similar-ids mapping.  The context manager closes (and
# therefore flushes) the file as soon as the dump finishes instead of leaving
# an open handle for the garbage collector.
with open('id_similar_artists.pkl', 'wb') as handle:
    pickle.dump(id_similar_artistx, handle)

In [None]:
# One entry per artist: its set of similar-artist ids converted to a list.
simar = [list(similar_ids) for similar_ids in id_similar_artistx.values()]

# Single-column frame; the "rated" column is what the training cells read.
df_train = pd.DataFrame({"rated": simar})
df_train.head()

In [None]:
import pandas as pd
# Load the matches training file as CSV.
# NOTE(review): in the visible cells `data` is only previewed with head();
# the model below is trained on `df_train`, not on this frame.
data = pd.read_csv("matches_train_for_d2v.dat")

In [None]:
# Preview the first rows of the loaded file.
data.head()

## Training the Model

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec, KeyedVectors
import multiprocessing
import pickle

# Seed NumPy's global RNG; D2V_Recommender.build_data_iterator shuffles the
# training lists with np.random.shuffle, which draws from this generator.
np.random.seed(0)
# presumably registers tqdm's progress_* helpers on pandas objects — TODO
# confirm; no progress_* call appears in the visible cells.
tqdm.pandas()


class ModelSaver(CallbackAny2Vec):
    """Gensim callback that checkpoints a recommender every few epochs.

    Each time ``on_epoch_end`` is invoked on an epoch whose 1-based number is
    a multiple of ``log_frequency``, the model being trained is attached to
    ``d2v_object`` and both the word vectors and the full model are written
    to the configured paths.
    """

    def __init__(
        self, d2v_object, rated_embeddings_path, w2v_model_path, log_frequency=5
    ):
        """
        :param d2v_object: object exposing ``save_rated_vec`` and
            ``save_w2v_model``; its ``w2v_model`` / ``wv`` attributes are
            overwritten at every checkpoint
        :param rated_embeddings_path: destination handed to ``save_rated_vec``
        :param w2v_model_path: destination handed to ``save_w2v_model``
        :param log_frequency: number of epochs between two checkpoints
        """
        self.d2v_object = d2v_object
        self.rated_embeddings_path = rated_embeddings_path
        self.w2v_model_path = w2v_model_path
        self.log_frequency = log_frequency
        # 1-based number of the epoch whose end is reported next.
        self.epoch = 1

    def on_epoch_begin(self, model):
        """Nothing to do when an epoch starts."""
        return None

    def on_epoch_end(self, model):
        """Checkpoint ``model`` if this epoch is due, then advance the counter."""
        checkpoint_due = self.epoch % self.log_frequency == 0
        if checkpoint_due:
            recommender = self.d2v_object
            # Point the recommender at the live model so its save helpers
            # write the current weights.
            recommender.w2v_model = model
            recommender.wv = model.wv
            recommender.save_rated_vec(self.rated_embeddings_path)
            recommender.save_w2v_model(self.w2v_model_path)
        self.epoch += 1


class D2V_Recommender:
    """Skip-gram Word2Vec wrapper that embeds ids from lists of co-occurring ids.

    Each training example is a list of string ids; ids that appear in the
    same list are treated as context for one another.  The class also offers
    small helpers to save/load the trained vectors, the full model and an
    arbitrary ``data_dict``.
    """

    def __init__(
        self,
        embedding_size=100,
        window=3,
        min_count=1,
        # Never drop to 0 workers on a single-core machine.
        workers=max(1, multiprocessing.cpu_count() - 1),
        num_epochs=50,
        sample=0,  # do not downsample
    ):
        """
        :param embedding_size: dimensionality of the vectors (``vector_size``)
        :param window: Word2Vec context window
        :param min_count: minimum frequency for an id to enter the vocabulary
        :param workers: number of training threads
        :param num_epochs: number of epochs passed to ``Word2Vec.train``
        :param sample: Word2Vec ``sample`` value (0 disables downsampling)
        """
        self.embedding_size = embedding_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.num_epochs = num_epochs
        self.sample = sample

        # Set by fit_rated_embeddings / load_w2v_model / load_rated_vec.
        self.w2v_model = None
        self.wv = None
        # Not populated by this class; get_single_rater_vec reads it with
        # ``.loc[str(rater_id)].values``, i.e. it is expected to be a pandas
        # object indexed by string ids that the caller assigns.
        self.mean_embeddings = None
        self.data_dict = None  # dict of arrays with X_train, X_test, y_train, y_test

    def fit_rated_embeddings(
        self, d2v_train, w2v_model_path, rated_embeddings_path, resume_training=False
    ):
        """
        Fit a skip-gram Word2Vec model on ``d2v_train`` and save the result.

        :param d2v_train: A pd.Series of lists of ids (strings); ids sharing a
            list are treated as co-occurring.  The lists are shuffled in
            place during training.
        :param w2v_model_path: where the full model is saved (and loaded from
            when ``resume_training`` is true)
        :param rated_embeddings_path: where the KeyedVectors are saved
        :param resume_training: if true, continue training the model stored
            at ``w2v_model_path`` (extending its vocabulary with any new ids)
            instead of starting from a fresh model
        """
        # Prepare the data iterator
        d2v_train_iterator = self.build_data_iterator(d2v_train)

        model_saver = ModelSaver(self, rated_embeddings_path, w2v_model_path)

        if resume_training:
            # Training updates the weights, so load the model fully into
            # memory rather than through a read-only memory map.
            model = self.load_w2v_model(w2v_model_path, mmap=None)
            model.build_vocab(d2v_train_iterator, update=True)
        else:
            model = Word2Vec(
                vector_size=self.embedding_size,
                window=self.window,
                min_count=self.min_count,
                workers=self.workers,
                sample=self.sample,
                sg=1,  # skip-gram
                hs=0,
                negative=5,
                seed=0,
            )
            self.w2v_model = model
            model.build_vocab(d2v_train_iterator)

        # Train and save the final model.  The checkpoint callback has to be
        # given to train(): gensim ignores callbacks handed to the Word2Vec
        # constructor when no corpus is passed along with them.
        model.train(
            d2v_train_iterator,
            total_examples=model.corpus_count,
            epochs=self.num_epochs,
            compute_loss=True,
            callbacks=[model_saver],
        )

        self.w2v_model = model
        self.wv = model.wv
        self.save_rated_vec(rated_embeddings_path)
        self.save_w2v_model(w2v_model_path)

    def build_data_iterator(self, data, passes=5):
        """Create a restartable iterable of shuffled passes over ``data``.

        Every ``iter()`` on the returned object yields ``passes * len(data)``
        lists: the rows of ``data`` in order, repeated ``passes`` times, with
        the contents of every list reshuffled in place (global NumPy RNG)
        before each pass.

        :param data: pd.Series of lists; the lists are mutated in place
        :param passes: number of passes over ``data`` per iteration
        :return: an iterable that can be iterated any number of times
        """

        class shuffle_generator:
            def __init__(self, data):
                self.data = data

            def __iter__(self):
                # np.random.shuffle works in place and returns None, so the
                # Series built by apply() is discarded on purpose.
                self.data.apply(np.random.shuffle)
                return shuffle_generator_iter(self.data)

        class shuffle_generator_iter:
            def __init__(self, data):
                self.i = 0
                self.data = data
                self.data_length = len(data)

            def __iter__(self):
                # Iterators are iterables too.
                return self

            def __next__(self):
                if self.i >= passes * self.data_length:
                    raise StopIteration()
                # Reshuffle at the start of every pass (this includes the
                # first one, right after the shuffle done in __iter__).
                if self.i % self.data_length == 0:
                    self.data.apply(np.random.shuffle)
                position = self.i % self.data_length
                self.i += 1
                # Positional access: works whatever the Series index is.
                return self.data.iloc[position]  # a list

        return shuffle_generator(data)

    def get_single_rated_vec(self, rated_id):
        """Return the embedding stored under ``str(rated_id)``.

        :return: the vector, or None if the id is not a key of ``self.wv``
            (it did not appear in the training data)
        """
        try:
            return self.wv[str(rated_id)]
        except KeyError:
            # The rated user did not appear in the training dataset
            return None

    def get_single_rater_vec(self, rater_id):
        """Return the row of ``self.mean_embeddings`` labelled ``str(rater_id)``.

        :return: the row's values, or None if the label is absent
        """
        try:
            return self.mean_embeddings.loc[str(rater_id)].values
        except KeyError:
            return None

    def save_rated_vec(self, wordvectors_path):
        """Save ``self.wv`` to ``wordvectors_path``."""
        self.wv.save(str(wordvectors_path))

    def load_rated_vec(self, wordvectors_path):
        """Load KeyedVectors (memory-mapped read-only) into ``self.wv`` and return them."""
        self.wv = KeyedVectors.load(str(wordvectors_path), mmap="r")
        return self.wv

    def save_w2v_model(self, w2v_model_path):
        """Save the full Word2Vec model to ``w2v_model_path``."""
        self.w2v_model.save(str(w2v_model_path))

    def load_w2v_model(self, w2v_model_path, mmap="r"):
        """Load a saved Word2Vec model into ``self.w2v_model`` and return it.

        :param w2v_model_path: path the model was saved to
        :param mmap: memory-map mode forwarded to ``Word2Vec.load``; the
            default ``"r"`` suits read-only use, pass ``None`` when the model
            is going to be trained further
        """
        self.w2v_model = Word2Vec.load(str(w2v_model_path), mmap=mmap)
        return self.w2v_model

    def save_data_dict(self, data_dict_path):
        """Pickle ``self.data_dict`` to ``data_dict_path``."""
        with open(data_dict_path, "wb") as handle:
            pickle.dump(self.data_dict, handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    def load_data_dict(self, data_dict_path):
        """Unpickle ``data_dict_path`` into ``self.data_dict`` and return it.

        NOTE: pickle executes arbitrary code on load; only use trusted files.
        """
        with open(data_dict_path, "rb") as handle:
            self.data_dict = pickle.load(handle)
        return self.data_dict


In [None]:
def process(x):
    """Return a new list holding ``str(item)`` for every item of ``x``."""
    return list(map(str, x))
    
# Recommender with the default hyperparameters of D2V_Recommender.__init__.
recommender = D2V_Recommender()

# Convert every id in each "rated" list to a string; the recommender looks
# vectors up by str(id) (see D2V_Recommender.get_single_rated_vec).
d2v_train = df_train["rated"].map(process)

In [None]:
# False trains a fresh model; True continues from the model saved at the
# w2v_model_path given below.
resume_training = False
recommender.fit_rated_embeddings(
    d2v_train,
    w2v_model_path="model.bin",
    rated_embeddings_path="rated.vectors",
    resume_training=resume_training,
)
# The training Series is no longer needed once fitting has finished.
del d2v_train

In [None]:
# Number of keys (embedded ids) held by the trained word vectors.
len(recommender.wv.index_to_key)