In [1]:
import os
import sys
import pickle
import warnings
import numpy as np 
import pandas as pd 
from ast import literal_eval
import matplotlib.pyplot as plt
from scipy import sparse, stats
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.model_selection import train_test_split

randseed = 29266137

In [18]:
!pip install tensorflow_probability
%tensorflow_version 1.x
import tensorflow as tf
import numpy as np 
import pandas as pd
import numpy.random as npr
from scipy import sparse
import tensorflow_probability as tfp
from tensorflow_probability import edward2 as ed

TensorFlow 1.x selected.


In [6]:
def PreprocessData():
    df = pd.read_csv('/content/songsDataset.csv')
    df = df.rename({"\'userID\'": "userId", "\'songID\'": "movieId", "\'rating\'": "rating"}, axis='columns')
    
    df = df.drop(labels = range(10000,2000000), axis = 0)
    songIntCode, songUniques = pd.factorize(df['movieId'], sort=True) #Reindexing songs ids
    df['movieId'] = songIntCode
    return df


In [7]:
df = PreprocessData()

In [8]:
df.head()

Unnamed: 0,userId,movieId,rating
0,0,382,5
1,0,448,4
2,0,1185,4
3,0,1891,5
4,0,4466,5


In [14]:
exposureDf = df.copy()
exposureDf['rating'] = exposureDf['rating'].where(exposureDf['rating'] == 0, 1)
nusers = exposureDf['userId'].nunique()
nitems = exposureDf['movieId'].nunique()

In [15]:
exposureDf.head()

Unnamed: 0,userId,movieId,rating
0,0,382,1
1,0,448,1
2,0,1185,1
3,0,1891,1
4,0,4466,1


In [17]:
a_matrix = sparse.coo_matrix((exposureDf["rating"],(exposureDf["userId"],exposureDf["movieId"])),shape=(nusers,nitems))
a_matrix = a_matrix.todense()
#matrix = matrix.tocsr()[:10000,:5000].tocoo()
#a_matrix = a_matrix.tocsr()[:10000,:5000].tocoo()
#matrix = matrix.tocsr().tocoo()
#a_matrix = a_matrix.tocsr().tocoo()

#df_values = df.values

In [20]:
type(a_matrix)
a_matrix.shape

(1000, 7389)

In [23]:
def GetRowFactors(latent_dim, a_matrix):
  stddv_datapoints = 0.1
  num_datapoints, data_dim = a_matrix.shape


  # we allow both linear and quadratic model
  # for linear model x_n has mean z_n * W
  # for quadratic model x_n has mean b + z_n * W + (z_n**2) * W_2
  # quadractice model needs to change the checking step accordingly

  def ppca_model(data_dim, latent_dim, num_datapoints, stddv_datapoints, form="quadratic"):
      w = ed.Normal(loc=tf.zeros([latent_dim, data_dim]),
                  scale=tf.ones([latent_dim, data_dim]),
                  name="w")  # parameter
      z = ed.Normal(loc=tf.zeros([num_datapoints, latent_dim]),
                  scale=tf.ones([num_datapoints, latent_dim]), 
                  name="z")  # local latent variable / substitute confounder
      if form == "linear":
#          x = ed.Normal(loc=tf.multiply(tf.matmul(z, w), a_matrix),
          x = ed.Normal(loc=tf.matmul(z, w),
                      scale=stddv_datapoints * tf.ones([num_datapoints, data_dim]),
                      name="x")  # (modeled) data
      elif form == "quadratic":
          b = ed.Normal(loc=tf.zeros([1, data_dim]),
                  scale=tf.ones([1, data_dim]),
                  name="b")  # intercept
          w2 = ed.Normal(loc=tf.zeros([latent_dim, data_dim]),
                  scale=tf.ones([latent_dim, data_dim]),
                  name="w2")  # quadratic parameter
#          x = ed.Normal(loc=tf.multiply(b + tf.matmul(z, w) + tf.matmul(tf.square(z), w2), a_matrix),
          x = ed.Normal(loc=b + tf.matmul(z, w) + tf.matmul(tf.square(z), w2),                        
                      scale=stddv_datapoints * tf.ones([num_datapoints, data_dim]),
                      name="x")  # (modeled) data
      return x, (w, z)

  log_joint = ed.make_log_joint_fn(ppca_model)


  def variational_model(qb_mean, qb_stddv, qw_mean, qw_stddv, 
                      qw2_mean, qw2_stddv, qz_mean, qz_stddv):
    qb = ed.Normal(loc=qb_mean, scale=qb_stddv, name="qb")
    qw = ed.Normal(loc=qw_mean, scale=qw_stddv, name="qw")
    qw2 = ed.Normal(loc=qw2_mean, scale=qw2_stddv, name="qw2")
    qz = ed.Normal(loc=qz_mean, scale=qz_stddv, name="qz")
    return qb, qw, qw2, qz


  log_q = ed.make_log_joint_fn(variational_model)

  def target(b, w, w2, z):
      """Unnormalized target density as a function of the parameters."""
      return log_joint(data_dim=data_dim,
                    latent_dim=latent_dim,
                    num_datapoints=num_datapoints,
                    stddv_datapoints=stddv_datapoints,
                    w=w, z=z, w2=w2, b=b, x=a_matrix)

  def target_q(qb, qw, qw2, qz):
      return log_q(qb_mean=qb_mean, qb_stddv=qb_stddv,
                  qw_mean=qw_mean, qw_stddv=qw_stddv,
                  qw2_mean=qw2_mean, qw2_stddv=qw2_stddv,
                  qz_mean=qz_mean, qz_stddv=qz_stddv,
                  qw=qw, qz=qz, qw2=qw2, qb=qb)

  qb_mean = tf.Variable(np.ones([1, data_dim]), dtype=tf.float32)
  qw_mean = tf.Variable(np.ones([latent_dim, data_dim]), dtype=tf.float32)
  qw2_mean = tf.Variable(np.ones([latent_dim, data_dim]), dtype=tf.float32)
  qz_mean = tf.Variable(np.ones([num_datapoints, latent_dim]), dtype=tf.float32)
  qb_stddv = tf.nn.softplus(tf.Variable(0 * np.ones([1, data_dim]), dtype=tf.float32))
  qw_stddv = tf.nn.softplus(tf.Variable(-4 * np.ones([latent_dim, data_dim]), dtype=tf.float32))
  qw2_stddv = tf.nn.softplus(tf.Variable(-4 * np.ones([latent_dim, data_dim]), dtype=tf.float32))
  qz_stddv = tf.nn.softplus(tf.Variable(-4 * np.ones([num_datapoints, latent_dim]), dtype=tf.float32))

  qb, qw, qw2, qz = variational_model(qb_mean=qb_mean, qb_stddv=qb_stddv,
                                      qw_mean=qw_mean, qw_stddv=qw_stddv,
                                      qw2_mean=qw2_mean, qw2_stddv=qw2_stddv,
                                      qz_mean=qz_mean, qz_stddv=qz_stddv)


  energy = target(qb, qw, qw2, qz)
  entropy = -target_q(qb, qw, qw2, qz)

  elbo = energy + entropy


  optimizer = tf.train.AdamOptimizer(learning_rate = 0.05)
  train = optimizer.minimize(-elbo)

  init = tf.global_variables_initializer()

  t = []

  num_epochs = 500

  with tf.Session() as sess:
      sess.run(init)

      for i in range(num_epochs):
          sess.run(train)
          if i % 5 == 0:
              t.append(sess.run([elbo]))

          b_mean_inferred = sess.run(qb_mean)
          b_stddv_inferred = sess.run(qb_stddv)
          w_mean_inferred = sess.run(qw_mean)
          w_stddv_inferred = sess.run(qw_stddv)
          w2_mean_inferred = sess.run(qw2_mean)
          w2_stddv_inferred = sess.run(qw2_stddv)
          z_mean_inferred = sess.run(qz_mean)
          z_stddv_inferred = sess.run(qz_stddv)
        
  return z_mean_inferred, z_stddv_inferred

In [24]:
z_mean_inferred, z_stddv_inferred= GetRowFactors(10, a_matrix)

In [25]:
z_mean_inferred.shape

(1000, 10)

In [26]:
def get_ratings_matrix(df, train_size=0.75):
    user_to_row = {}
    movie_to_column = {}
    df_values = df.values
    n_dims = 10
    parameters = {}
    
    uniq_users = np.unique(df_values[:, 0])
    uniq_movies = np.unique(df_values[:, 1])

    for i, user_id in enumerate(uniq_users):
        user_to_row[user_id] = i

    for j, movie_id in enumerate(uniq_movies):
        movie_to_column[movie_id] = j
    
    n_users = len(uniq_users)
    n_movies = len(uniq_movies)
    
    R = np.zeros((n_users, n_movies))
    
    df_copy = df.copy()
    train_set = df_copy.sample(frac=train_size, random_state=0)
    test_set = df_copy.drop(train_set.index)
    
    for index, row in train_set.iterrows():
        i = user_to_row[row.userId]
        j = movie_to_column[row.movieId]
        R[i, j] = row.rating

    return R, train_set, test_set, n_dims, n_users, n_movies, user_to_row, movie_to_column

In [27]:
R, train_set, test_set, n_dims, n_users, n_movies, user_to_row, movie_to_column = get_ratings_matrix(df, 0.8)

In [28]:
def matrix_X(R):
  X = []
  for i in range(len(R)):
    row = [1 if val == 1 else 0 for val in R[i]]
    X.append(row)
  return X

X = matrix_X(R)
y = R #Rating matrix
y_scaler = preprocessing.StandardScaler().fit(y)
y_scaled = y_scaler.fit_transform(y)

X_scaler = preprocessing.StandardScaler().fit(X)
X_scaled = X_scaler.fit_transform(X)

pmfU_scaler = preprocessing.StandardScaler().fit(z_mean_inferred)
pmfU_scaled = pmfU_scaler.fit_transform(z_mean_inferred)

In [29]:
X_train, X_test = train_test_split(X_scaled, test_size=0.20, random_state=randseed)
y_train, y_test = train_test_split(y_scaled, test_size=0.20, random_state=randseed)
pmfU_train, pmfU_test = train_test_split(pmfU_scaled, test_size=0.20, random_state=randseed)
n_users, n_items = X_train.shape

In [30]:
import warnings
warnings.filterwarnings('ignore')
reg = linear_model.Ridge(normalize=True)
for i in range(n_items):
    reg.fit(np.column_stack([X_train[:,i], pmfU_train]), y_train[:,i])

In [31]:
test_items = X_test.shape[1]
prediction = []

for i in range(test_items):
    res = reg.predict(np.column_stack([X_test[:,i], pmfU_test]))
    prediction.append(res)

In [32]:
y_test = np.transpose(y_test)
rmse = mean_squared_error(y_test, prediction, squared=False)
print(rmse)

0.8880783998490532
