<a href="https://colab.research.google.com/github/DomizianoScarcelli/big-data-project/blob/nn-model/NN_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install dependencies

In [5]:
!pip install petastorm -qq
!pip install pyspark -qq
!pip install -U -q PyDrive -qq
!apt install openjdk-8-jdk-headless -qq

openjdk-8-jdk-headless is already the newest version (8u372-ga~us1-0ubuntu1~20.04).
0 upgraded, 0 newly installed, 0 to remove and 28 not upgraded.


In [6]:
#@title Imports
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly

import pyspark
from pyspark.sql import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, LongType
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.ml.linalg import SparseVector, DenseVector

from tqdm.notebook import tqdm
import time
import gc

from google.colab import drive
from petastorm import make_batch_reader
from petastorm.pytorch import DataLoader

In [7]:
#@title Set up variables
# Java installation used by Spark (installed by the apt step above).
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"

# Google Drive mount point and the project folders that live inside it.
GDRIVE_DIR = "/content/drive"
GDRIVE_HOME_DIR = GDRIVE_DIR + "/MyDrive"
GDRIVE_DATA_DIR = GDRIVE_HOME_DIR + "/Big Data/datasets"

# Dataset locations (all relative to GDRIVE_DATA_DIR).
DATASET_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_friendly_spotify_playlist_dataset")
AUDIO_FEATURES_FILE = os.path.join(GDRIVE_DATA_DIR, "pyspark_track_features")
LITTLE_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "little_slice")
SMALL_SLICE_FILE = os.path.join(GDRIVE_DATA_DIR, "small_slice")
# Backward-compatible alias: the misspelled name is still referenced by later cells.
SMALL_SLICE_FLIE = SMALL_SLICE_FILE
LITTLE_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "little_slice_audio_features")
MICRO_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "micro_slice_audio_features")
SPLITTED_SLICE_AUDIO_FEATURES = os.path.join(GDRIVE_DATA_DIR, "splitted_pyspark_track_features")
SAVED_DFS_PATH = os.path.join(GDRIVE_DATA_DIR, "saved_dfs")

RANDOM_SEED = 42 # for reproducibility

# Environment variables read by PySpark when it launches the JVM / Python workers.
os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["PYSPARK_PYTHON"]="python"

In [8]:
# Mount Google Drive at GDRIVE_DIR so the dataset paths defined above resolve.
# force_remount=True is passed so that re-running the cell mounts again.
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [9]:
#@title Create the session
# Spark configuration for a single-machine (local[*]) session.
# The UI port must match the port the ngrok tunnel forwards further below.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '12G')
    .set('spark.driver.memory', '12G')
    .set('spark.driver.maxResultSize', '100G')
    .set("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .setAppName("PySparkTutorial")
    .setMaster("local[*]")
)

# Create (or reuse) the context. getOrCreate() keeps the cell idempotent:
# constructing a second SparkContext in the same process raises an error,
# which made re-running this cell fail.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

## Setup ngrok

In [10]:
!pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok
  Downloading pyngrok-6.0.0.tar.gz (681 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m681.2/681.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-6.0.0-py3-none-any.whl size=19867 sha256=36e3772e65e2b9041b1bf6d776df2a76bbfac4489d74bb619b7d60732b325565
  Stored in directory: /root/.cache/pip/wheels/5c/42/78/0c3d438d7f5730451a25f7ac6cbf4391759d22a67576ed7c2c
Successfully built pyngrok
Installing collected packages: pyngrok
Successfully installed pyngrok-6.0.0


In [11]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never hard-code the ngrok authtoken in the notebook. It ends up in
# version control and in the saved cell outputs. Any token that was committed
# previously must be revoked from the ngrok dashboard.
# The token is read from the NGROK_AUTHTOKEN environment variable, falling back
# to an interactive prompt that does not echo the value.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [12]:
from pyngrok import ngrok

# Open a ngrok tunnel on the port 4050 where Spark is running
# (this is the value given to 'spark.ui.port' when the session was configured).
port = '4050'
# Keep only the public URL of the tunnel; it is printed in the next cell.
public_url = ngrok.connect(port).public_url



In [13]:
# Show which public ngrok URL forwards to the local Spark Web UI port.
print("To access the Spark Web UI console, please click on the following link to the ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))

To access the Spark Web UI console, please click on the following link to the ngrok tunnel "https://feb0-35-201-201-181.ngrok-free.app" -> "http://127.0.0.1:4050"


In [14]:
#@title Check if everything is ok
# Display the session object together with the configuration entries held by the context.
spark, sc._conf.getAll()


(<pyspark.sql.session.SparkSession at 0x7f89e22f61d0>,
 [('spark.executor.extraJavaOptions',
   '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'),
  ('spark.app.name', 'PySparkTutorial'),
  ('spark.driver.host', 'e8b508dba625'),


# Data acquisition

In [15]:
# Schema of one entry of a playlist's "tracks" array; every field is nullable.
song_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("pos", IntegerType()),
        ("artist_name", StringType()),
        ("track_uri", StringType()),
        ("artist_uri", StringType()),
        ("track_name", StringType()),
        ("album_uri", StringType()),
        ("duration_ms", LongType()),
        ("album_name", StringType()),
    )
])

# Schema of one playlist record; "tracks" is an array of song_schema structs.
# Every field is nullable.
playlist_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("name", StringType()),
        ("collaborative", StringType()),
        ("pid", IntegerType()),
        ("modified_at", IntegerType()),
        ("num_tracks", IntegerType()),
        ("num_albums", IntegerType()),
        ("num_followers", IntegerType()),
        ("tracks", ArrayType(song_schema)),
        ("num_edits", IntegerType()),
        ("duration_ms", IntegerType()),
        ("num_artists", IntegerType()),
    )
])

# Schema of one audio-features record of a track; every field is nullable.
audio_features_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("danceability", FloatType()),
        ("energy", FloatType()),
        ("key", IntegerType()),
        ("loudness", FloatType()),
        ("mode", IntegerType()),
        ("speechiness", FloatType()),
        ("acousticness", FloatType()),
        ("instrumentalness", FloatType()),
        ("liveness", FloatType()),
        ("valence", FloatType()),
        ("tempo", FloatType()),
        ("type", StringType()),
        ("id", StringType()),
        ("uri", StringType()),
        ("track_href", StringType()),
        ("analysis_url", StringType()),
        ("duration_ms", LongType()),
        ("time_signature", IntegerType()),
    )
])


In [16]:
# Load the "small_slice" playlists as multi-line JSON using the explicit playlist schema.
# (SMALL_SLICE_FLIE is the spelling used by the configuration cell.)
slice_df = spark.read.schema(playlist_schema).json(SMALL_SLICE_FLIE, multiLine=True)

# Import pyspark dataframe

In [17]:
import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so later deprecation or
# runtime warnings will not be shown. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [18]:
from pyspark.ml.linalg import VectorUDT

# Same columns as playlist_schema, except that "tracks" holds an ML vector
# (VectorUDT) instead of an array of song structs. Every field is nullable.
playlist_schema_mapped = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("name", StringType()),
        ("collaborative", StringType()),
        ("pid", IntegerType()),
        ("modified_at", IntegerType()),
        ("num_tracks", IntegerType()),
        ("num_albums", IntegerType()),
        ("num_followers", IntegerType()),
        ("tracks", VectorUDT()),
        ("num_edits", IntegerType()),
        ("duration_ms", IntegerType()),
        ("num_artists", IntegerType()),
    )
])

In [19]:
# Locations of the pre-computed playlist embeddings inside SAVED_DFS_PATH:
# the Parquet source and the JSON copy that is written from it on first use.
PLAYLIST_EMBEDDINGS = os.path.join(SAVED_DFS_PATH, "playlist_embeddings_new.parquet")
JSON_PLAYLIST_EMBEDDINGS = os.path.join(SAVED_DFS_PATH, "playlist_embeddings_NN.json")

In [20]:
# Load the playlist embeddings. The first run reads the Parquet file and writes
# a JSON copy of it; later runs read that JSON copy directly.
if not os.path.exists(JSON_PLAYLIST_EMBEDDINGS):
  mapped_slice_df = spark.read.schema(playlist_schema_mapped).parquet(PLAYLIST_EMBEDDINGS)
  mapped_slice_df.write.json(JSON_PLAYLIST_EMBEDDINGS)
else:
  mapped_slice_df = spark.read.schema(playlist_schema_mapped).json(JSON_PLAYLIST_EMBEDDINGS)

# Extract artist matrix $\mathbf{A}$

From the main dataframe `slice_df`, which contains the playlists and their songs, I want to obtain for each playlist a binary artist `SparseVector` $\mathbf{a}$ that describes which artists appear in that playlist.
Stacked together, these vectors form the artist matrix $\mathbf{A} \in \mathbb{R}^{m \times k}$, where $m$ is the number of playlists and $k$ the number of unique artists.

In [21]:
# Preview the raw playlists (one row per playlist, tracks still nested).
slice_df.show()

+--------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|          name|collaborative|  pid|modified_at|num_tracks|num_albums|num_followers|              tracks|num_edits|duration_ms|num_artists|
+--------------+-------------+-----+-----------+----------+----------+-------------+--------------------+---------+-----------+-----------+
|         Ratch|        false|45000| 1508976000|        88|        70|            1|[{0, Beyoncé, spo...|       50|   20047039|         48|
|  slow it down|        false|45001| 1505952000|        80|        77|            1|[{0, Twinbed, spo...|       20|   20365984|         65|
|    Phat Beats|        false|45002| 1466640000|        24|        15|            5|[{0, Baths, spoti...|       16|    5127143|         14|
|           ✌🏽|        false|45003| 1509148800|        77|        63|            3|[{0, Owl City, sp...|       50|   17201663|         54|
|          💘💘|       

In [37]:
# Explode the "tracks" array of a single playlist into one row per track and
# expand the struct fields into columns to inspect the song schema.
# NOTE(review): the one-row frame is cached but never unpersisted.
slice_df.limit(1).cache().select(explode("tracks")).select("col.*").show()

+---+--------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+
|pos|   artist_name|           track_uri|          artist_uri|          track_name|           album_uri|duration_ms|          album_name|
+---+--------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+
|  0|       Beyoncé|spotify:track:7te...|spotify:artist:6v...|         ***Flawless|spotify:album:2UJ...|     250960|BEYONCÉ [Platinum...|
|  1|     Rich Gang|spotify:track:7DT...|spotify:artist:6h...|           Lifestyle|spotify:album:4EZ...|     269693|           Lifestyle|
|  2|     Fetty Wap|spotify:track:5NQ...|spotify:artist:6P...|679 (feat. Remy B...|spotify:album:0Ty...|     196693|679 (feat. Remy B...|
|  3|        T-Pain|spotify:track:6lb...|spotify:artist:3a...|Up Down (Do This ...|spotify:album:184...|     231093|T-Pain Presents H...|
|  4|       Jidenna|spotify:track:

In [38]:
def get_all_artists(playlist_df: DataFrame) -> DataFrame:
   """
   Returns a single-column DataFrame ("artist_uri") with the distinct artist URIs
   that appear in the "tracks" array of the given playlists.

   The return annotation used to be `Tuple[DataFrame, int]`, which was both wrong
   (only a DataFrame is returned) and referenced `Tuple` before it was imported,
   raising a NameError when the notebook is run top to bottom on a fresh kernel.
   """
   distinct_artists = playlist_df.select(explode("tracks.artist_uri").alias("artist_uri")).distinct()
   return distinct_artists

from typing import Tuple


def create_artists_pos_mapping(artists_df: DataFrame) -> Tuple[DataFrame, int]:
  """
  Builds the artist -> position mapping for the given playlists.

  Args:
    artists_df: playlist DataFrame (with a "tracks" array column) from which the
      distinct artists are extracted. Despite its name it is the playlist frame:
      that is what the caller passes in.

  Returns:
    A tuple (mapping_df, n_artists) where mapping_df has the columns "pos" and
    "artist_uri" and n_artists is the number of distinct artists.
    "pos" comes from SQL row_number() and is therefore 1-based.

  Side effects:
    Registers (or replaces) the temporary view "ARTISTS".
  """
  # Use the argument instead of silently reading the global slice_df.
  artists_df = get_all_artists(artists_df)
  artists_df.createOrReplaceTempView("ARTISTS")
  # Ordering the window by artist_uri (instead of by a constant) makes the
  # assigned positions deterministic across runs.
  artists_df = spark.sql("""
  SELECT 
      row_number() OVER (
          PARTITION BY '' 
          ORDER BY artist_uri 
      ) as pos,
      *
  FROM 
      ARTISTS
  """)

  artists_df = artists_df.sort("artist_uri")

  ARTIST_VECTOR_LENGTH = artists_df.count()

  return artists_df, ARTIST_VECTOR_LENGTH


In [None]:
# Build the artist -> position mapping and the number of distinct artists from the playlist slice.
artists_df, ARTIST_VECTOR_LENGTH = create_artists_pos_mapping(slice_df)

In [47]:
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, ArrayType
from functools import reduce

# Driver-side dict {artist_uri: pos} collected from the mapping DataFrame; it is
# read as a global by the UDF defined in create_artists_vector below.
artist_uri_to_id = artists_df.select('artist_uri', 'pos').rdd.collectAsMap() # TODO: Pass it as a parameter maybe?
#TODO: Since going through .rdd is very slow, the position could be embedded inside each track itself,
# so that the UDF can just do pos_list.add(row.rating_position) in a few milliseconds.
def create_artists_vector(playlist_df: DataFrame, mapping: DataFrame) -> DataFrame:
    """
    Returns a DataFrame containing the playlists, where the "tracks" column is replaced
    by a binary SparseVector marking the artists that appear in each playlist.

    Args:
      playlist_df: playlists with a "tracks" array whose elements expose `artist_uri`.
      mapping: currently unused. The artist positions are read from the global
        `artist_uri_to_id` dict and the vector size from the global
        `ARTIST_VECTOR_LENGTH` (kept for interface compatibility).

    Returns:
      playlist_df with "tracks" overwritten by a SparseVector of size
      ARTIST_VECTOR_LENGTH + 1 (positions are 1-based, so index 0 stays unused).
    """
    vector_size = ARTIST_VECTOR_LENGTH + 1

    @udf(returnType=VectorUDT())
    def extract_vector(tracks):
      # A null "tracks" value yields an empty vector instead of crashing the UDF.
      positions = {artist_uri_to_id.get(row.artist_uri) for row in (tracks or [])}
      # Artists missing from the mapping map to None: drop them, otherwise the
      # sort and the SparseVector construction would fail.
      positions.discard(None)
      ordered_positions = sorted(positions)
      return SparseVector(vector_size, ordered_positions, [1.0] * len(ordered_positions))

    # Apply the mapping UDF on the "tracks" column of the playlist dataframe
    mapped_df = playlist_df.withColumn('tracks', extract_vector(col('tracks')))

    return mapped_df

# Build (and cache) the per-playlist artist vectors.
# The commented-out lines below are the disabled Parquet caching of the song
# embeddings (mapped_slice_df), kept for reference.
# if not os.path.exists(PLAYLIST_EMBEDDINGS):
artists_slice_df = create_artists_vector(slice_df, artists_df).cache()
  # mapped_slice_df.write.parquet(PLAYLIST_EMBEDDINGS)
# else:
  # mapped_slice_df = spark.read.schema(playlist_schema_mapped).parquet(PLAYLIST_EMBEDDINGS)

100000

# Convert PySpark DataFrame into PyTorch DataLoader

In [22]:
import numpy as np
from pyspark.sql.functions import udf, length
from pyspark.ml.linalg import VectorUDT
import torch
from typing import Tuple, List


def convert_sparse_to_indices(df: DataFrame, max_songs: int = None) -> DataFrame:
  """
  Given a dataframe with a "tracks" column of (binary) SparseVectors, returns a new
  dataframe where that column is replaced by "embedding_indices": the list of the
  non-zero indices, right-padded with -1 so that every row has the same length.
  (The values are dropped; the vectors are binary, so they are all ones.)

  Args:
    df: dataframe containing the "tracks" SparseVector column.
    max_songs: padded length of every index list. When omitted it falls back to
      the previous behaviour: the maximum "num_tracks" of the global
      `mapped_slice_df`.

  Returns:
    df without "tracks" and with the new "embedding_indices" array<int> column.
  """
  # Explicit module alias: the wildcard import of pyspark.sql.functions shadows
  # Python's builtin max(), which makes a bare max(...) ambiguous to the reader.
  from pyspark.sql import functions as F

  if max_songs is None:
    max_songs = mapped_slice_df.select(F.max("num_tracks")).first()[0]
  print(f"Max number of songs: {max_songs}")

  @udf(returnType=ArrayType(IntegerType()))
  def transform_array(item: SparseVector):
    """
    Given a (binary) SparseVector it returns its indices padded with -1 up to max_songs.
    """
    # A null vector becomes an all-padding row instead of raising inside the UDF.
    indices_list = [] if item is None else item.indices.tolist()
    padding_width = max_songs - len(indices_list)
    return indices_list + [-1] * padding_width

  df = df.withColumn("embedding_indices", transform_array(col("tracks"))).drop("tracks")
  return df


In [50]:
def padded_tensors_to_sparse_matrix(padded_tensor: torch.Tensor, shape: tuple) -> torch.Tensor:
  """
  Converts a batch of -1-padded index lists into one sparse binary matrix.

  Args:
    padded_tensor: integer tensor of shape (batch_size, max_len); each row holds
      the active column indices of one sample, padded with -1.
    shape: 1-tuple `(n_columns,)` with the width of every row vector.

  Returns:
    A sparse COO float tensor of shape (batch_size, n_columns) with a one at
    every listed index (repeated indices add up once the tensor is coalesced or
    densified, as before).

  The matrix is built in one shot instead of stacking one sparse tensor per row,
  which also makes an empty batch return an empty (0, n_columns) matrix instead
  of failing inside torch.stack.
  """
  indices = padded_tensor.long()
  batch_size = indices.size(0)
  is_real = indices != -1
  # Row id of every entry, broadcast to the padded layout and filtered like the columns.
  row_ids = torch.arange(batch_size, device=indices.device).unsqueeze(1).expand_as(indices)[is_real]
  col_ids = indices[is_real]
  values = torch.ones(col_ids.numel(), device=indices.device)
  return torch.sparse_coo_tensor(torch.stack((row_ids, col_ids)), values, (batch_size, *shape))

def padded_tensors_to_dense_matrix(padded_tensor: torch.Tensor, shape: tuple) -> torch.Tensor:
  """
  Converts a batch of -1-padded index lists into a dense binary matrix.

  Args:
    padded_tensor: integer tensor of shape (batch_size, max_len); each row holds
      the active column indices of one sample, padded with -1.
    shape: 1-tuple `(n_columns,)` with the width of every row vector.

  Returns:
    A dense float tensor of shape (batch_size, n_columns) with a one at every
    listed index (repeated indices are summed, as before).

  The whole batch goes through a single 2-D sparse tensor instead of one sparse
  tensor per row, which also makes an empty batch return a (0, n_columns) matrix
  instead of failing inside torch.stack.
  """
  indices = padded_tensor.long()
  batch_size = indices.size(0)
  is_real = indices != -1
  # Row id of every entry, broadcast to the padded layout and filtered like the columns.
  row_ids = torch.arange(batch_size, device=indices.device).unsqueeze(1).expand_as(indices)[is_real]
  col_ids = indices[is_real]
  values = torch.ones(col_ids.numel(), device=indices.device)
  sparse = torch.sparse_coo_tensor(torch.stack((row_ids, col_ids)), values, (batch_size, *shape))
  return sparse.to_dense()

In the paper they use two matrices. Let $n$ be the number of unique songs, $m$ the number of playlists and $k$ the number of unique artists:

- $P \in \mathbb{R}^{m \times n}$, where $P_{ij} = 1$ if song $j$ is in playlist $i$ and $P_{ij} = 0$ otherwise
- $A \in \mathbb{R}^{m \times k}$, where $A_{ij} = 1$ if artist $j$ is present in playlist $i$ and $A_{ij} = 0$ otherwise

In [53]:
from petastorm.spark import SparkDatasetConverter, make_spark_converter
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField
from petastorm.codecs import ScalarCodec, CompressedImageCodec, NdarrayCodec
from torchvision import transforms
from petastorm import TransformSpec

# Petastorm materialises the DataFrames as Parquet under this cache directory.
# The previous os.path.join(GDRIVE_DIR, "/big_data/cache_5") always evaluated to
# exactly this local path, because os.path.join discards the earlier components
# when a later one is absolute. Spell it out so nobody assumes it lives on Drive.
CACHE = "/big_data/cache_5"
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, f'file://{CACHE}')

pytorch_songs_df = convert_sparse_to_indices(mapped_slice_df.select("tracks"))
songs_converter = make_spark_converter(pytorch_songs_df)

pytorch_artists_df = convert_sparse_to_indices(artists_slice_df.select("tracks"))
# Fixed copy-paste bug: the artist converter was built from the *songs* DataFrame,
# so the model was fed song indices in place of artist indices.
artist_converter = make_spark_converter(pytorch_artists_df)

Max number of songs: 250




Max number of songs: 250




# PyTorch Model

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pickle


class DAE_tied(nn.Module):
    """
    Denoising auto-encoder with tied weights: the decoder multiplies by the
    transpose of the encoder matrix.

    `conf` is a dict with the keys "save", "batch", "n_input", "hidden", "lr",
    "reg_lambda", "keep_prob" and "input_keep_prob".

    Parameters are created by `init_weight()`, which must be called before the
    first forward pass. They are stored in `nn.ParameterDict`s (previously plain
    dicts), so they are registered with the module: `parameters()`,
    `state_dict()` and `.to(device)` now see them. `d_params` is still exposed
    as a plain list for the optimizer.
    """

    def __init__(self, conf):
        super(DAE_tied, self).__init__()
        self.save_dir = conf["save"]

        self.n_batch = conf["batch"]
        self.n_input = conf["n_input"]
        self.n_hidden = conf["hidden"]
        self.learning_rate = conf["lr"]
        self.reg_lambda = conf["reg_lambda"]

        # Placeholders for the planned sparse-input forward pass (see TODO in forward()).
        self.x_positions = torch.LongTensor()
        self.x_ones = torch.FloatTensor()

        self.y_positions = torch.LongTensor()
        self.y_ones = torch.FloatTensor()

        self.keep_prob = torch.tensor(conf["keep_prob"], dtype=torch.float32)
        self.input_keep_prob = torch.tensor(conf["input_keep_prob"], dtype=torch.float32)

        self.weights = nn.ParameterDict()
        self.biases = nn.ParameterDict()
        self.d_params = []

    def init_weight(self):
        """Creates the encoder matrix (Xavier-uniform) and the two zero-initialised biases."""
        self.weights['encoder_h'] = nn.Parameter(torch.empty(self.n_input, self.n_hidden))
        nn.init.xavier_uniform_(self.weights['encoder_h'])
        self.biases['encoder_b'] = nn.Parameter(torch.zeros(self.n_hidden))
        self.biases['decoder_b'] = nn.Parameter(torch.zeros(self.n_input))
        self.d_params = [self.weights['encoder_h'], self.biases['encoder_b'], self.biases['decoder_b']]

    # Building the encoder
    def encoder(self, x):
        """Maps a (batch, n_input) input to the (batch, n_hidden) code: sigmoid(xW + b), then dropout."""
        layer = torch.add(torch.matmul(x, self.weights['encoder_h']), self.biases['encoder_b'])
        layer = torch.sigmoid(layer)
        # p must be a plain float; training=self.training disables dropout in eval() mode
        # (it used to be applied unconditionally).
        layer = torch.nn.functional.dropout(layer, p=float(1 - self.keep_prob), training=self.training)

        return layer

    # Building the decoder
    def decoder(self, x):
        """Maps the (batch, n_hidden) code back to (batch, n_input) using the transposed encoder matrix."""
        layer = torch.sigmoid(torch.add(torch.matmul(x, self.weights['encoder_h'].t()), self.biases['decoder_b']))
        return layer

    def l2_loss(self):
      """Sum of the squared entries of every parameter (the squared L2 norms)."""
      encoder_h_l2 = self.weights['encoder_h'].pow(2).sum()
      decoder_b_l2 = self.biases['decoder_b'].pow(2).sum()
      encoder_b_l2 = self.biases['encoder_b'].pow(2).sum()

      l2 = encoder_h_l2 + decoder_b_l2 + encoder_b_l2
      return l2

    def forward(self, x, y):
        """
        Runs one pass and stores the results on the module.

        Args:
          x: input matrix of shape (n_input, batch); it is transposed internally.
          y: target matrix of shape (n_input, batch); it is transposed internally.

        Returns:
          The scalar training cost, also stored as `self.cost`. The reconstruction
          is stored as `self.y_pred`.
        """
        # TODO: Take sparse matrix representation instead of dense
        # (build x / y from x_positions, x_ones, y_positions, y_ones).
        self.x = x.t()
        self.y = y.t()

        # Input corruption, followed by re-normalisation of each row to unit sum.
        x_dropout = torch.nn.functional.dropout(
            self.x, p=float(1 - self.input_keep_prob), training=self.training)
        reduce_sum = torch.sum(x_dropout, dim=1, keepdim=True)
        self.x_dropout = torch.div(x_dropout, reduce_sum + 1e-10)

        encoder_op = self.encoder(self.x_dropout)
        self.y_pred = self.decoder(encoder_op)

        l2 = self.l2_loss()

        # Weighted binary cross-entropy: the negative (zero) entries are down-weighted by 0.55.
        L = -torch.sum(self.y * torch.log(self.y_pred + 1e-10) +
                       0.55 * (1 - self.y) * torch.log(1 - self.y_pred + 1e-10), dim=1)
        self.cost = torch.mean(L) + self.reg_lambda * l2
        return self.cost

    def save_model(self):
        """
        Pickles the parameters (as numpy arrays, in `d_params` order) to `self.save_dir`.

        If `self.save_dir` is an existing directory the file
        "<ClassName>_params.pkl" is written inside it; previously open() failed
        on a directory path.
        """
        # .cpu() so that parameters living on an accelerator can be converted to numpy.
        params = [param.detach().cpu().numpy() for param in self.d_params]
        target = self.save_dir
        if os.path.isdir(target):
            target = os.path.join(target, f"{type(self).__name__}_params.pkl")
        with open(target, 'wb') as f:
            pickle.dump(params, f)

            
class DAE(DAE_tied):
    """
    Denoising auto-encoder with an independent decoder matrix ("decoder_h").

    On top of the keys read by DAE_tied, `conf["initval"]` is either the string
    'NULL' (random initialisation) or the path of a pickle file holding the four
    parameters in the order [encoder_h, decoder_h, encoder_b, decoder_b].
    """

    def __init__(self, conf):
        super(DAE, self).__init__(conf)
        self.initval_dir = conf["initval"]

    def init_weight(self):
        """Creates the parameters, either randomly or from the pickle at `initval_dir`."""
        expected_shapes = [
            (self.n_input, self.n_hidden),  # encoder_h
            (self.n_input, self.n_hidden),  # decoder_h
            (self.n_hidden,),               # encoder_b
            (self.n_input,),                # decoder_b
        ]
        if self.initval_dir == 'NULL':
            encoder_h = torch.empty(self.n_input, self.n_hidden)
            nn.init.xavier_uniform_(encoder_h)
            decoder_h = torch.empty(self.n_input, self.n_hidden)
            nn.init.xavier_uniform_(decoder_h)
            encoder_b = torch.zeros(self.n_hidden)
            decoder_b = torch.zeros(self.n_input)
        else:
            # SECURITY: pickle.load can execute arbitrary code. Only load
            # initial values from files you created yourself.
            with open(self.initval_dir, 'rb') as f:
                emb = pickle.load(f)
            if len(emb) != len(expected_shapes):
                raise ValueError(
                    f"Expected {len(expected_shapes)} arrays in {self.initval_dir}, found {len(emb)}")
            loaded = [torch.as_tensor(array, dtype=torch.float32).clone() for array in emb]
            # Fail early with a clear message instead of a shape error deep inside forward().
            for tensor, expected in zip(loaded, expected_shapes):
                if tuple(tensor.shape) != expected:
                    raise ValueError(
                        f"Initial value of shape {tuple(tensor.shape)} does not match the expected shape {expected}")
            encoder_h, decoder_h, encoder_b, decoder_b = loaded

        self.weights['encoder_h'] = nn.Parameter(encoder_h)
        self.weights['decoder_h'] = nn.Parameter(decoder_h)
        self.biases['encoder_b'] = nn.Parameter(encoder_b)
        self.biases['decoder_b'] = nn.Parameter(decoder_b)

        self.d_params = [self.weights['encoder_h'], self.weights['decoder_h'],
                         self.biases['encoder_b'], self.biases['decoder_b']]

    def decoder(self, x):
        """Maps the (batch, n_hidden) code back to (batch, n_input) using the separate decoder matrix."""
        layer = torch.sigmoid(torch.add(torch.matmul(x, self.weights['decoder_h'].t()), self.biases['decoder_b']))
        return layer

    def l2_loss(self):
      """Parent regulariser plus the squared L2 norm of the decoder matrix."""
      decoder_h_l2 = self.weights['decoder_h'].pow(2).sum()
      return super().l2_loss() + decoder_h_l2

I'm trying to figure out what `x` and `y` are. In the original source code, `data_reader.py:48` contains:
```python
trk_positions = np.concatenate(trk_positions)
art_positions = np.concatenate(art_positions)
y_positions = np.concatenate((trk_positions, art_positions), 0)
```
So I can assume that y is just the concatenation of p_i and a_i.

On the other hand, from `data_reader.py:250`, we can see:
```python
trk_positions = np.concatenate(trk_positions)
art_positions = np.concatenate(art_positions)
x_positions = np.concatenate((trk_positions, art_positions), 0)
```

From this I assume that `x` and `y` are the same thing, namely the concatenation of $P$ and $A$ (track positions first, then artist positions).

In [56]:
#Hyperparameters used in the paper
NUM_SONGS = 681_806
NUM_PLAYLISTS = 100_000
# NOTE: this rebinds the name `conf`, which held the SparkConf object earlier in the notebook.
conf = {
    'batch': 32,
    'n_input': NUM_SONGS + ARTIST_VECTOR_LENGTH,
    'hidden': 64,
    'lr': 0.001,
    'reg_lambda': 0.001,
    'initval': 'NULL',
    "keep_prob": 0.8,
    "input_keep_prob": 0.8,
    # save_model() open()s this path for writing, so it must name a file:
    # the previous './' is a directory and made open() fail.
    'save': './dae_params.pkl'
}
# Seed the weight initialisation and dropout masks so runs are reproducible.
torch.manual_seed(RANDOM_SEED)
dae_model = DAE(conf)
dae_model.init_weight()
optimizer = optim.Adam(dae_model.d_params, lr=conf['lr'])

In [None]:
from tqdm.notebook import tqdm

num_epochs = 50
batch_size = conf["batch"]
losses = []  # last-batch loss of every epoch, stored as plain floats
ARTIST_SHAPE = (ARTIST_VECTOR_LENGTH, )
SONG_SHAPE = (NUM_SONGS, )

# Petastorm loaders are infinite by default (num_epochs=None), so iterating one
# directly never finishes the first epoch. Each epoch is therefore driven by an
# explicit number of steps. len(converter) is the number of rows; the old
# `dataloader.__sizeof__()` is the object's memory footprint, not its length.
steps_per_epoch = max(1, len(songs_converter) // batch_size)

# The two loaders are consumed in lock-step, so independent row-group shuffling
# and multi-worker reads would pair songs and artists of different playlists.
# NOTE(review): this still relies on both DataFrames being materialised in the
# same playlist order. The robust fix is a single converter over one DataFrame
# that holds both index columns joined on "pid".
loader_kwargs = dict(batch_size=batch_size, num_epochs=None,
                     workers_count=1, shuffle_row_groups=False)

dae_model.train()
with songs_converter.make_torch_dataloader(**loader_kwargs) as songs_dataloader:
  with artist_converter.make_torch_dataloader(**loader_kwargs) as artists_dataloader:
    paired_batches = zip(songs_dataloader, artists_dataloader)

    for epoch in tqdm(range(num_epochs), desc="Training..."):
      for _ in tqdm(range(steps_per_epoch), desc=f"Executing epoch n. {epoch}"):
        song, artist = next(paired_batches)
        padded_song_tensor = song["embedding_indices"]
        padded_artist_tensor = artist["embedding_indices"]

        # Artist positions come from SQL row_number() and are 1-based (the
        # SparseVectors are built with size ARTIST_VECTOR_LENGTH + 1 for that
        # reason). Shift them to 0-based so the largest position fits inside
        # ARTIST_SHAPE; the -1 padding is left untouched.
        padded_artist_tensor = torch.where(
            padded_artist_tensor > 0, padded_artist_tensor - 1, padded_artist_tensor)

        song_dense = padded_tensors_to_dense_matrix(padded_song_tensor, SONG_SHAPE)
        artist_dense = padded_tensors_to_dense_matrix(padded_artist_tensor, ARTIST_SHAPE)

        optimizer.zero_grad()
        # The model expects (n_input, batch) and transposes internally; input and
        # target are the same concatenation [songs | artists].
        x = torch.concat((song_dense, artist_dense), dim=1).t()
        y = x
        dae_model(x, y)
        loss = dae_model.cost
        loss.backward()
        optimizer.step()

      # .item() detaches the value, so the autograd graph of every epoch is not kept alive.
      losses.append(loss.item())
      print(f"Loss: {loss.item()}")


Training...:   0%|          | 0/50 [00:00<?, ?it/s]

Executing epoch n. 0:   0%|          | 0/32 [00:00<?, ?it/s]

In [34]:
# import numpy as np
# from torch.utils.data import Dataset
# from torch.utils.data import DataLoader
# from torch.utils.data import random_split

# np.random.seed(42)  # Set the random seed for reproducibility

# # n = conf["n_input"]  # Number of unique songs
# n = 10_000
# m = 2_000   # Number of playlists
# k = 4_000 # Number of unique artists

# # Generate random binary matrix P
# P = np.random.randint(0, 2, size=(m, n))

# # Generate random binary matrix A
# A = np.random.randint(0, 2, size=(m, k))

# A = torch.tensor(A, dtype=torch.float)
# P = torch.tensor(P, dtype=torch.float)

# class MyDataset(Dataset):
#     def __init__(self, A, P):
#         self.A = A
#         self.P = P

#     def __len__(self):
#         return len(self.A)

#     def __getitem__(self, index):
#         return (self.A[index], self.P[index])

# dataset = MyDataset(A, P)
# train_ratio = 0.8
# test_ratio = 0.2

# batch_size = conf["batch"]

# train_size = int(train_ratio * len(dataset))
# test_size = len(dataset) - train_size
# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# from tqdm.notebook import tqdm
# num_epochs = 50
# losses = []
# for epoch in tqdm(range(num_epochs), desc="Training..."):
#     for p, a in train_loader:
#         optimizer.zero_grad()
#         print(p.shape)
#         print(a.shape)

#         #TODO: this gives an error because the shapes are not compatible
#         # in particular p = (batch_size, num_songs)
#         # and a = (batch_size, num_artists)
#         # I want something like (2 * batch_size * num_songs)

#         x = torch.concat((p, a), dim=1).t() 
#         y = torch.concat((p, a), dim=1).t()

#         print(y.shape)

#         dae_model(x, y)
#         loss = dae_model.cost
#         dae_model.cost.backward()
#         optimizer.step()
#     losses.append(loss)
#     print(f"Loss: {loss}")