In [1]:
import argparse
import errno
import os

import numpy as np
import torch
!pip install wget
import wget
from torch.nn.functional import softplus

!pip install pyro-ppl
#!pip install pyro


import pyro
import pyro.optim as optim
from pyro.contrib.easyguide import EasyGuide
from pyro.contrib.examples.util import get_data_directory
from pyro.distributions import Gamma, Normal, Poisson
from pyro.infer import SVI, TraceMeanField_ELBO
from pyro.infer.autoguide import AutoDiagonalNormal, init_to_feasible
import pandas as pd
torch.set_default_tensor_type("torch.FloatTensor")
pyro.util.set_rng_seed(0)
from scipy import sparse

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
randseed = 29266137

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=9714d22eb22353207e698c19c698f8630b31910a176b63a08fb4f3f76dbf6214
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting pyro-ppl
  Downloading pyro_ppl-1.8.1-py3-none-any.whl (718 kB)
[K     |████████████████████████████████| 718 kB 5.4 MB/s 
Collecting pyro-api>=0.1.1
  Downloading pyro_api-0.1.2-py3-none-any.whl (11 kB)
Installing collected packages: pyro-api, pyro-ppl
Successfully installed pyro-api-0.1.2 pyro-ppl-1.8.1


In [2]:
# helper for initializing variational parameters
def rand_tensor(shape, mean, sigma):
    return mean * torch.ones(shape) + sigma * torch.randn(shape)

class SparseGammaDEF:
    def __init__(self,users,items):
        # define the sizes of the layers in the deep exponential family
        self.top_width = 10
        self.mid_width = 4
        self.bottom_width = 5

#        self.image_size = 64 * 64
        self.users = users
        self.items = items
        self.image_size = items

        # define hyperparameters that control the prior
        self.alpha_z = torch.tensor(0.1)
        self.beta_z = torch.tensor(0.1)
        self.alpha_w = torch.tensor(0.1)
        self.beta_w = torch.tensor(0.3)

        # define parameters used to initialize variational parameters
        self.alpha_init = 0.5
        self.mean_init = 0.0
        self.sigma_init = 0.1




      # define the model
    def model(self, x):
        x_size = x.size(0)

        # sample the global weights
        with pyro.plate("w_top_plate", self.top_width * self.mid_width):
            w_top = pyro.sample("w_top", Gamma(self.alpha_w, self.beta_w))
        with pyro.plate("w_mid_plate", self.mid_width * self.bottom_width):
            w_mid = pyro.sample("w_mid", Gamma(self.alpha_w, self.beta_w))
        with pyro.plate("w_bottom_plate", self.bottom_width * self.image_size):
            w_bottom = pyro.sample("w_bottom", Gamma(self.alpha_w, self.beta_w))

        # sample the local latent random variables
        # (the plate encodes the fact that the z's for different datapoints are conditionally independent)
        with pyro.plate("data", x_size):
            z_top = pyro.sample(
                "z_top",
                Gamma(self.alpha_z, self.beta_z).expand([self.top_width]).to_event(1),
            )
            # note that we need to use matmul (batch matrix multiplication) as well as appropriate reshaping
            # to make sure our code is fully vectorized
            w_top = (
                w_top.reshape(self.top_width, self.mid_width)
                if w_top.dim() == 1
                else w_top.reshape(-1, self.top_width, self.mid_width)
            )
            mean_mid = torch.matmul(z_top, w_top)
            z_mid = pyro.sample(
                "z_mid", Gamma(self.alpha_z, self.beta_z / mean_mid).to_event(1)
            )

            w_mid = (
                w_mid.reshape(self.mid_width, self.bottom_width)
                if w_mid.dim() == 1
                else w_mid.reshape(-1, self.mid_width, self.bottom_width)
            )
            mean_bottom = torch.matmul(z_mid, w_mid)
            z_bottom = pyro.sample(
                "z_bottom", Gamma(self.alpha_z, self.beta_z / mean_bottom).to_event(1)
            )

            w_bottom = (
                w_bottom.reshape(self.bottom_width, self.image_size)
                if w_bottom.dim() == 1
                else w_bottom.reshape(-1, self.bottom_width, self.image_size)
            )
            mean_obs = torch.matmul(z_bottom, w_bottom)

            # observe the data using a poisson likelihood
            pyro.sample("obs", Poisson(mean_obs).to_event(1), obs=x)

    # define our custom guide a.k.a. variational distribution.
        # (note the guide is mean field gamma)
    def guide(self, x):
      x_size = x.size(0)

      # define a helper function to sample z's for a single layer
      def sample_zs(name, width):
          alpha_z_q = pyro.param(
              "alpha_z_q_%s" % name,
              lambda: rand_tensor((x_size, width), self.alpha_init, self.sigma_init),
          )
          mean_z_q = pyro.param(
              "mean_z_q_%s" % name,
              lambda: rand_tensor((x_size, width), self.mean_init, self.sigma_init),
          )
          alpha_z_q, mean_z_q = softplus(alpha_z_q), softplus(mean_z_q)
          pyro.sample(
              "z_%s" % name, Gamma(alpha_z_q, alpha_z_q / mean_z_q).to_event(1)
          )

      # define a helper function to sample w's for a single layer
      def sample_ws(name, width):
          alpha_w_q = pyro.param(
              "alpha_w_q_%s" % name,
              lambda: rand_tensor((width), self.alpha_init, self.sigma_init),
          )
          mean_w_q = pyro.param(
              "mean_w_q_%s" % name,
              lambda: rand_tensor((width), self.mean_init, self.sigma_init),
          )
          alpha_w_q, mean_w_q = softplus(alpha_w_q), softplus(mean_w_q)
          pyro.sample("w_%s" % name, Gamma(alpha_w_q, alpha_w_q / mean_w_q))

      # sample the global weights
      with pyro.plate("w_top_plate", self.top_width * self.mid_width):
          sample_ws("top", self.top_width * self.mid_width)
      with pyro.plate("w_mid_plate", self.mid_width * self.bottom_width):
          sample_ws("mid", self.mid_width * self.bottom_width)
      with pyro.plate("w_bottom_plate", self.bottom_width * self.image_size):
          sample_ws("bottom", self.bottom_width * self.image_size)

      # sample the local latent random variables
      with pyro.plate("data", x_size):
          sample_zs("top", self.top_width)
          sample_zs("mid", self.mid_width)
          sample_zs("bottom", self.bottom_width)

    #def getBottomZExpectations(self):        # grab the learned variational parameters
     # return pyro.param("mean_z_q_bottom")

# define a helper function to clip parameters defining the custom guide.
# (this is to avoid regions of the gamma distributions with extremely small means)
def clip_params():
    for param, clip in zip(("alpha", "mean"), (-2.5, -4.5)):
        for layer in ["_q_top", "_q_mid", "_q_bottom"]:
            for wz in ["_w", "_z"]:
                pyro.param(param + wz + layer).data.clamp_(min=clip)


# Define a guide using the EasyGuide class.
# Unlike the 'auto' guide, this guide supports data subsampling.
# This is the best performing of the three guides.
#
# This guide is functionally similar to the auto guide, but performs
# somewhat better. The reason seems to be some combination of: i) the better
# numerical stability of the softplus; and ii) the custom initialization.
# Note however that for both the easy guide and auto guide KL divergences
# are not computed analytically in the ELBO because the ELBO thinks the
# mean-field condition is not satisfied, which leads to higher variance gradients.
class MyEasyGuide(EasyGuide):
    def guide(self, x):
        # group all the latent weights into one large latent variable
        global_group = self.group(match="w_.*")
        global_mean = pyro.param(
            "w_mean", lambda: rand_tensor(global_group.event_shape, 0.5, 0.1)
        )
        global_scale = softplus(
            pyro.param(
                "w_scale", lambda: rand_tensor(global_group.event_shape, 0.0, 0.1)
            )
        )
        # use a mean field Normal distribution on all the ws
        global_group.sample("ws", Normal(global_mean, global_scale).to_event(1))

        # group all the latent zs into one large latent variable
        local_group = self.group(match="z_.*")
        x_shape = x.shape[:1] + local_group.event_shape

        with self.plate("data", x.size(0)):
            local_mean = pyro.param("z_mean", lambda: rand_tensor(x_shape, 0.5, 0.1))
            local_scale = softplus(
                pyro.param("z_scale", lambda: rand_tensor(x_shape, 0.0, 0.1))
            )
            # use a mean field Normal distribution on all the zs
            local_group.sample("zs", Normal(local_mean, local_scale).to_event(1))


In [3]:
#Preprocessing the graph
df = pd.read_csv('/content/songsDataset.csv')
df = df.rename({"\'userID\'": "userId", "\'songID\'": "movieId", "\'rating\'": "rating"}, axis='columns')
df = df.drop(labels = range(10000,2000000), axis = 0)
songIntCode, songUniques = pd.factorize(df['movieId'], sort=True) #Reindexing songs ids
df['movieId'] = songIntCode

exposureDf = df.copy()
exposureDf['rating'] = exposureDf['rating'].where(exposureDf['rating'] == 0, 1)
nusers = exposureDf['userId'].nunique()
nitems = exposureDf['movieId'].nunique()
a_matrix = sparse.coo_matrix((exposureDf["rating"],(exposureDf["userId"],exposureDf["movieId"])),shape=(nusers,nitems))
a_matrix = a_matrix.todense()
data = torch.tensor(a_matrix) #Required by our model
 
users, items = data.shape
print('Users Count')
print(users)
print('Items Count')
print(items)

print("data processed...")

sparse_gamma_def = SparseGammaDEF(users,items)

# Due to the special logic in the custom guide (e.g. parameter clipping), the custom guide
# seems to be more amenable to higher learning rates.
# Nevertheless, the easy guide performs the best (presumably because of numerical instabilities
# related to the gamma distribution in the custom guide).
#learning_rate = 0.2 if args.guide in ["auto", "easy"] else 4.5
learning_rate = 4.5
#momentum = 0.05 if args.guide in ["auto", "easy"] else 0.1
momentum = 0.1
opt = optim.AdagradRMSProp({"eta": learning_rate, "t": momentum})

# use one of our three different guide types
# if args.guide == "auto":
#     guide = AutoDiagonalNormal(sparse_gamma_def.model, init_loc_fn=init_to_feasible)
# elif args.guide == "easy":
#     guide = MyEasyGuide(sparse_gamma_def.model)
# else:
#     guide = sparse_gamma_def.guide
guid_type = 'custom'
guide = sparse_gamma_def.guide

num_epochs = 30
eval_frequency = 25
eval_particles = 20


# this is the svi object we use during training; we use TraceMeanField_ELBO to
# get analytic KL divergences
svi = SVI(sparse_gamma_def.model, guide, opt, loss=TraceMeanField_ELBO())

# we use svi_eval during evaluation; since we took care to write down our model in
# a fully vectorized way, this computation can be done efficiently with large tensor ops
svi_eval = SVI(
    sparse_gamma_def.model,
    guide,
    opt,
    loss=TraceMeanField_ELBO(
        num_particles=eval_particles, vectorize_particles=True
    ),
)

print("\nbeginning training with %s guide..." % guid_type)

# the training loop
for k in range(num_epochs):
    loss = svi.step(data)
    # for the custom guide we clip parameters after each gradient step
    if guid_type == "custom":
        clip_params()

    if k % eval_frequency == 0 and k > 0 or k == num_epochs - 1:
        loss = svi_eval.evaluate_loss(data)
        print("[epoch %04d] training elbo: %.4g" % (k, -loss))


z_b =  pyro.param("mean_z_q_bottom") #Bottom modt layer latents

Users Count
1000
Items Count
7389
data processed...

beginning training with custom guide...
[epoch 0025] training elbo: -3.577e+05
[epoch 0029] training elbo: -1.28e+06


In [4]:
def get_ratings_matrix(df, train_size=0.75):
    user_to_row = {}
    movie_to_column = {}
    df_values = df.values
    n_dims = 10
    parameters = {}
    
    uniq_users = np.unique(df_values[:, 0])
    uniq_movies = np.unique(df_values[:, 1])

    for i, UserId in enumerate(uniq_users):
        user_to_row[UserId] = i

    for j, ItemId in enumerate(uniq_movies):
        movie_to_column[ItemId] = j
    
    n_users = len(uniq_users)
    n_movies = len(uniq_movies)
    
    R = np.zeros((n_users, n_movies))
    
    df_copy = df.copy()
    train_set = df_copy.sample(frac=train_size, random_state=0)
    test_set = df_copy.drop(train_set.index)
    
    for index, row in train_set.iterrows():
        i = user_to_row[row.userId]
        j = movie_to_column[row.movieId]
        R[i, j] = row.rating

    return R, train_set, test_set, n_dims, n_users, n_movies, user_to_row, movie_to_column

R, train_set, test_set, n_dims, n_users, n_movies, user_to_row, movie_to_column = get_ratings_matrix(df, 0.8)



def matrix_X(R):
  X = []
  for i in range(len(R)):
    row = [1 if val == 1 else 0 for val in R[i]]
    X.append(row)
  return X

X = matrix_X(R)
ratings = df['rating']
y = R
pmfU = z_b.detach().numpy()

y_scaler = preprocessing.StandardScaler().fit(y)
y_scaled = y_scaler.fit_transform(y)

X_scaler = preprocessing.StandardScaler().fit(X)
X_scaled = X_scaler.fit_transform(X)

pmfU_scaler = preprocessing.StandardScaler().fit(pmfU)
pmfU_scaled = pmfU_scaler.fit_transform(pmfU)

X_train, X_test = train_test_split(X_scaled, test_size=0.20, random_state=randseed)
y_train, y_test = train_test_split(y_scaled, test_size=0.20, random_state=randseed)
pmfU_train, pmfU_test = train_test_split(pmfU_scaled, test_size=0.20, random_state=randseed)
n_users, n_items = X_train.shape

In [5]:
import warnings
warnings.filterwarnings('ignore')
reg = linear_model.Ridge(normalize=True)
for i in range(n_items):
    # if i%100 == 0:
    #   print('---- Fitting row', i, '----')
    reg.fit(np.column_stack([X_train[:,i], pmfU_train]), y_train[:,i])


test_items = X_test.shape[1]
prediction = []

for i in range(test_items):
    # if i%100 == 0:
    #   print('---- Predicting row', i, '----')
#    print(len([X_test[:,i], pmfU_test]))
    res = reg.predict(np.column_stack([X_test[:,i], pmfU_test]))
    prediction.append(res)    

In [6]:
y_test = np.transpose(y_test)
rmse = mean_squared_error(y_test, prediction, squared=False)
print(rmse)

0.8880783998490532
