In [3]:
import os
import shutil
import sys
import boto3
import json
from io import BytesIO # python2: BytesIO 
import pandas as pd
import logging
import numpy as np
import time
import scipy.stats as ss
from itertools import product
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from scipy import sparse
import pickle
import scipy
import random 
from multiprocessing import Pool
from sklearn.linear_model import LinearRegression, Lasso
from collections import Counter
import matplotlib.pyplot as plt
import bottleneck as bn

from tensorflow.keras.layers import Input, Concatenate, Dense, Dropout, Embedding, Flatten, Dot, MultiHeadAttention, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1L2, l1, l2
from tensorflow.keras.callbacks import Callback
import tensorflow as tf

### Load training/validation data, hyperparameters, and evaluation functions

Load the pre-processed training and validation data

In [8]:
# Load the pre-processed interaction matrices (scipy sparse matrices of
# dtype numpy.float64).
# `pickle.loads` expects the pickled *bytes*, not a file path, so each file is
# opened in binary mode and read with `pickle.load`.
# SECURITY: unpickling can execute arbitrary code -- only load trusted files.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_data = _load_pickle("<path>-train.pickle")
# The "-vad" file is the validation split and the "-test" file is the test
# split; the two were previously bound to each other's variable name.
cold_vad_data = _load_pickle("<path>-vad.pickle")
cold_test_data = _load_pickle("<path>-test.pickle")



In [10]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Per-row normalized discounted cumulative gain@k for binary relevance.

    X_pred is a dense 2-D score array and heldout_batch a sparse matrix with
    the same row/column layout; one value is returned per row.
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance

    Rows of heldout_batch without any stored entry have an ideal DCG of 0, so
    the final division yields a non-finite value for them.
    '''
    n_users = X_pred.shape[0]
    row_selector = np.arange(n_users)[:, np.newaxis]

    # Partition so that the k highest-scored columns come first (unordered),
    # then sort just those k candidates by descending score.
    candidate_cols = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    candidate_scores = X_pred[row_selector, candidate_cols]
    order_within_topk = np.argsort(-candidate_scores, axis=1)
    # X_pred[row_selector, ranked_cols] is the sorted top-k predicted score
    ranked_cols = candidate_cols[row_selector, order_within_topk]

    # Discount template: 1 / log2(rank + 1) for ranks 1..k
    discounts = 1. / np.log2(np.arange(2, k + 2))

    hits = heldout_batch[row_selector, ranked_cols].toarray()
    dcg = (hits * discounts).sum(axis=1)

    # Ideal DCG: every held-out entry (at most k of them) ranked at the top.
    relevant_per_user = heldout_batch.getnnz(axis=1)
    idcg = np.array([discounts[:min(count, k)].sum()
                     for count in relevant_per_user])

    return dcg / idcg


def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Per-row recall@k of dense scores X_pred against sparse heldout_batch.

    The denominator is min(k, number of positive held-out entries in the row),
    so a row with no positive held-out entry produces a non-finite value.
    '''
    n_users = X_pred.shape[0]
    row_selector = np.arange(n_users)[:, np.newaxis]

    # Boolean mask of the k highest-scored columns of every row.
    topk_cols = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    recommended = np.zeros_like(X_pred, dtype=bool)
    recommended[row_selector, topk_cols] = True

    # Boolean mask of the held-out positives.
    relevant = (heldout_batch > 0).toarray()

    n_hits = np.logical_and(relevant, recommended).sum(axis=1).astype(np.float32)
    n_relevant_capped = np.minimum(k, relevant.sum(axis=1))
    return n_hits / n_relevant_capped

### Compute metadata features

In [11]:
def get_movieId_from_indexes(idx, unique_id_list):
    """Return the entry (or entries) of `unique_id_list` selected by `idx`.

    This is a plain `unique_id_list[idx]` lookup, so `idx` can be anything the
    container accepts as an index (an int, or an index array for numpy arrays).
    """
    movie_ids = unique_id_list[idx]
    return movie_ids


In [14]:
# Read the header-less id file and keep its first column as a numpy array.
# NOTE(review): presumably the row order matches the item (column) order of
# the interaction matrices -- confirm against the pre-processing step.
unique_id_csv = pd.read_csv(
    's3://jul-atv/Netflix/pro_sg/unique_sid_netflix.csv',
    header=None,
)
unique_id_list = unique_id_csv.iloc[:, 0].to_numpy()


In [15]:
# Item metadata keyed by item. `json.loads` parses a JSON *document string*,
# not a file path, so the file is opened and parsed with `json.load`.
with open("<path>-netflix-metadata.json", "r", encoding="utf-8") as metadata_file:
    json_content = json.load(metadata_file)


array([    1,     3,     6, ..., 17764, 17769, 17770])

In [17]:
# Empty accumulators for the metadata pass. Every name gets its own list
# object (the generators create a fresh list per target).
(list_genres, list_actors, list_directors,
 list_producers, list_composers) = ([] for _ in range(5))

# Placeholders for scalar attributes.
(list_year, list_runtime, list_adult) = ([] for _ in range(3))

# Flat list of all title tokens.
vocabulary_title = []


In [None]:
def cosine_sim(encoding, shrinkage=0.1):
    """Row-by-row cosine similarity with a shrinkage term in the denominator.

    Entry (i, j) of the result is
    ``encoding[i] . encoding[j] / (||encoding[i]|| * ||encoding[j]|| + shrinkage)``.
    A larger `shrinkage` damps the similarity of rows with small norms.
    """
    row_norms = np.linalg.norm(encoding, axis=1)
    denominator = np.outer(row_norms, row_norms) + shrinkage
    return encoding.dot(encoding.T) / denominator

In [18]:
# Build the per-item metadata encodings that the IxI similarity matrices are
# computed from. Those matrices must use the same item indexing as the sparse
# UxI click matrix; every corpus below is built by iterating `json_content`.
# NOTE(review): this relies on `json_content` being ordered like the columns
# of the UxI matrix -- confirm against the pre-processing step.


def _frequent_values(values, column, min_count=2):
    """Count `values` and keep those occurring at least `min_count` times.

    Returns `(counts, frame)`: the `Counter` over `values`, and a DataFrame
    with columns `[column, 'count']` sorted by descending count and restricted
    to rows whose count is >= `min_count`.
    """
    counts = Counter(values)
    frame = pd.DataFrame(list(counts.items()), columns=[column, 'count'])
    frame = frame.sort_values("count", ascending=False)
    return counts, frame[frame["count"] >= min_count]


def _binarize(corpus, classes):
    """Multi-hot encode `corpus` (one label list per item) over `classes`.

    Labels that are not in `classes` are ignored by MultiLabelBinarizer, which
    emits a warning listing them.
    """
    return MultiLabelBinarizer(classes=list(classes)).fit_transform(corpus)


# Start from empty accumulators on every run: extending lists created in an
# earlier cell would double every count when this cell is re-executed, which
# silently defeats the `count >= 2` vocabulary filter below.
vocabulary_title = []
list_actors = []
list_directors = []
list_composers = []
list_producers = []
list_genres = []

for item_metadata in json_content.values():
    vocabulary_title.extend(item_metadata['title'].split())

    # Actors and actresses share a single vocabulary.
    list_actors.extend(item_metadata['actors'])
    list_actors.extend(item_metadata['actresses'])
    list_directors.extend(item_metadata['directors'])
    list_composers.extend(item_metadata['composers'])
    list_producers.extend(item_metadata['producers'])
    list_genres.extend(item_metadata['genre'])

counter = len(json_content)  # number of items processed

set_actors = set(list_actors)
set_directors = set(list_directors)
set_genres = set(list_genres)
set_vocabulary_title = set(vocabulary_title)

# One entry per item, in `json_content` iteration order.
corpus_title = [json_content[key]['title'] for key in json_content]
corpus_actors = [list(set(json_content[key]['actors'] + json_content[key]['actresses']))
                 for key in json_content]
corpus_composers = [json_content[key]['composers'] for key in json_content]
corpus_producers = [json_content[key]['producers'] for key in json_content]
corpus_directors = [json_content[key]['directors'] for key in json_content]
corpus_genres = [json_content[key]['genre'] for key in json_content]

# Vocabularies: keep only people credited at least twice. The printed numbers
# are the resulting vocabulary sizes (actors, directors, producers, composers).
counter_actors, df_actors = _frequent_values(list_actors, 'actors')
print(len(df_actors))

counter_directors, df_directors = _frequent_values(list_directors, 'directors')
print(len(df_directors))

counter_producers, df_producers = _frequent_values(list_producers, 'producers')
print(len(df_producers))

counter_composers, df_composers = _frequent_values(list_composers, 'composers')
print(len(df_composers))

# Titles: bag-of-words counts over the `max_features` most frequent
# non-stop-word tokens.
# TODO: should use transformer embeddings for titles (and descriptions) instead.
max_features = 1000
pipe_title = Pipeline([
    ('count', CountVectorizer(stop_words='english', max_features=max_features)),
]).fit(corpus_title)
data_title = pipe_title.transform(corpus_title).toarray()

# Multi-hot encodings restricted to the frequent vocabularies built above.
encoding_actors = _binarize(corpus_actors, df_actors.actors)
encoding_directors = _binarize(corpus_directors, df_directors.directors)
encoding_composers = _binarize(corpus_composers, df_composers.composers)
encoding_producers = _binarize(corpus_producers, df_producers.producers)

# Genres use every label seen in the corpus (no frequency filter).
mlb = MultiLabelBinarizer()
encoding_genres = mlb.fit_transform(corpus_genres)


5352
1604
1818
923


  .format(sorted(unknown, key=str)))
  .format(sorted(unknown, key=str)))
  .format(sorted(unknown, key=str)))
  .format(sorted(unknown, key=str)))


In [21]:
# Item-by-feature matrices, one per metadata field. The title/people encodings
# are copied into fresh numpy arrays; the genre encoding is used as-is.
Btitle, Bactors, Bdirectors, Bproducers, Bcomposers = (
    np.array(encoded)
    for encoded in (data_title, encoding_actors, encoding_directors,
                    encoding_producers, encoding_composers)
)
Bgenres = encoding_genres

In [1]:
from sklearn.preprocessing import MinMaxScaler

# A single scaler instance is re-fitted on each feature matrix in turn, so
# every matrix is rescaled independently; after this cell `scaler` holds the
# fit of the last matrix (Btitle).
scaler = MinMaxScaler()

(Bgenres, Bactors, Bdirectors,
 Bcomposers, Bproducers, Btitle) = (
    scaler.fit_transform(features)
    for features in (Bgenres, Bactors, Bdirectors,
                     Bcomposers, Bproducers, Btitle)
)


NameError: name 'Bgenres' is not defined

In [None]:
def cosine_sim(encoding, shrinkage=0.1):
    """Shrunk cosine similarity between all pairs of rows of `encoding`.

    Computes ``(encoding @ encoding.T) / (outer(norms, norms) + shrinkage)``
    where `norms` are the Euclidean norms of the rows.
    """
    norms = np.linalg.norm(encoding, axis=1)
    dot_products = encoding.dot(encoding.T)
    shrunk_norm_products = np.outer(norms, norms) + shrinkage
    return dot_products / shrunk_norm_products

In [None]:
# Item-item similarity per metadata field, all with shrinkage 50:
# B1=title, B2=actors, B3=directors, B4=producers, B5=composers, B6=genres.
B1, B2, B3, B4, B5, B6 = (
    cosine_sim(features, shrinkage=50)
    for features in (Btitle, Bactors, Bdirectors,
                     Bproducers, Bcomposers, Bgenres)
)



In [None]:
# Remove self-similarity: set the diagonal of every similarity matrix to 0.
# `np.fill_diagonal` writes in place and replaces the per-index Python loop.
size = B1.shape[0]
for similarity in (B1, B2, B3, B4, B5, B6):
    np.fill_diagonal(similarity, 0)



## Baseline: sum of cosine similarities

In [None]:
# Second-order terms: element-wise product of every pair among B1..B5
# (B6 is not combined with the others here).
B12, B13, B14, B15 = (B1 * other for other in (B2, B3, B4, B5))
B23, B24, B25 = (B2 * other for other in (B3, B4, B5))
B34, B35 = (B3 * other for other in (B4, B5))
B45 = B4 * B5


In [None]:
X = train_data.toarray()
X1 = X.dot(B1)
X2 = X.dot(B2)
X3 = X.dot(B3)
X4 = X.dot(B4)
X5 = X.dot(B5)
X6 = X.dot(B6)
X12 = X.dot(B12)
X13 = X.dot(B13)
X14 = X.dot(B14)
X15 = X.dot(B15)
X23 = X.dot(B23)
X24 = X.dot(B24)
X25 = X.dot(B25)
X34 = X.dot(B34)
X35 = X.dot(B35)
X45 = X.dot(B45)

print("X^{(i)} computed")
# del B1, B2, B3, B4, B5, json_content

X = np.asarray(X).reshape((X.shape[0]*X.shape[1],1))
Xtot = np.concatenate((np.asarray(X1).reshape((X1.shape[0]*X1.shape[1],1)),
                      np.asarray(X2).reshape((X2.shape[0]*X2.shape[1],1)),
                       np.asarray(X3).reshape((X3.shape[0]*X3.shape[1],1)),
                      np.asarray(X4).reshape((X4.shape[0]*X4.shape[1],1)),
                       np.asarray(X5).reshape((X5.shape[0]*X5.shape[1],1)),
                      np.asarray(X12).reshape((X12.shape[0]*X12.shape[1],1)),
                       np.asarray(X13).reshape((X13.shape[0]*X13.shape[1],1)),
                      np.asarray(X14).reshape((X14.shape[0]*X14.shape[1],1)),
                       np.asarray(X15).reshape((X15.shape[0]*X15.shape[1],1)),
                      np.asarray(X23).reshape((X23.shape[0]*X23.shape[1],1)),
                       np.asarray(X24).reshape((X24.shape[0]*X24.shape[1],1)),
                      np.asarray(X25).reshape((X25.shape[0]*X25.shape[1],1)),
                       np.asarray(X34).reshape((X34.shape[0]*X34.shape[1],1)),
                       np.asarray(X35).reshape((X35.shape[0]*X35.shape[1],1)),
                       np.asarray(X45).reshape((X45.shape[0]*X45.shape[1],1))),axis=1)

my_array = X.copy()
my_array[my_array == 0] = 0.1
my_array = my_array.flatten()


reg = LinearRegression(positive=True).fit(Xtot, X, sample_weight=my_array)

print("linear regression fitted")



In [None]:
reg.coef_

In [None]:
X1bis = reg.coef_[0][0]*X1
X2bis = reg.coef_[0][1]*X2
X3bis = reg.coef_[0][2]*X3
X4bis = reg.coef_[0][3]*X4
X5bis = reg.coef_[0][4]*X5
X6bis = reg.coef_[0][5]*X6

X6bis = reg.coef_[0][6]*X12
X7bis = reg.coef_[0][7]*X13
X8bis = reg.coef_[0][8]*X14
X9bis = reg.coef_[0][9]*X15
X10bis = reg.coef_[0][10]*X23
X11bis = reg.coef_[0][11]*X24
X12bis = reg.coef_[0][12]*X25
X13bis = reg.coef_[0][13]*X34
X14bis = reg.coef_[0][14]*X35
X15bis = reg.coef_[0][15]*X45
X16bis = reg.coef_[0][16]*X45

pred_val = X1bis+X2bis+X3bis+X4bis+X5bis+X6bis+X7bis+X8bis+X9bis+X10bis+X11bis+X12bis+X13bis+X14bis+X15bis+X16bis
values = train_data.nonzero()
pred_val[values] =  -np.inf

In [None]:
# Unbind the unweighted first-order score matrices; their weighted versions
# (X1bis..X5bis) are what later cells use. Memory is released only if nothing
# else still references these arrays.
del X1
del X2
del X3
del X4
del X5

In [None]:
# Per-user metrics of the baseline prediction on `cold_test_data`.
# Each result is stored as a flat per-user array (as in the hyper-parameter
# sweep cell) rather than a one-element list wrapping that array. This way
# `len(...)` is the number of users, which the report cell uses as the sample
# size for the standard error.
r10_list = np.concatenate([Recall_at_k_batch(pred_val, cold_test_data, k=10)])
n10_list = np.concatenate([NDCG_binary_at_k_batch(pred_val, cold_test_data, k=10)])
r25_list = np.concatenate([Recall_at_k_batch(pred_val, cold_test_data, k=25)])
n25_list = np.concatenate([NDCG_binary_at_k_batch(pred_val, cold_test_data, k=25)])
r50_list = np.concatenate([Recall_at_k_batch(pred_val, cold_test_data, k=50)])
n50_list = np.concatenate([NDCG_binary_at_k_batch(pred_val, cold_test_data, k=50)])
r100_list = np.concatenate([Recall_at_k_batch(pred_val, cold_test_data, k=100)])
n100_list = np.concatenate([NDCG_binary_at_k_batch(pred_val, cold_test_data, k=100)])

In [None]:
# Baseline: content similarity with 2nd order regression (B1 to B5, B34).
# Prints "mean (std / sqrt(len))" while ignoring NaN entries. Only recall is
# reported; the NDCG lists are computed but not printed here.
for report_format, per_user_scores in (
    ("Test Recall@10=%.5f (%.5f)", r10_list),
    ("Test Recall@25=%.5f (%.5f)", r25_list),
    ("Test Recall@50=%.5f (%.5f)", r50_list),
    ("Test Recall@100=%.5f (%.5f)", r100_list),
):
    mean_score = np.nanmean(per_user_scores)
    spread = np.nanstd(per_user_scores) / np.sqrt(len(per_user_scores))
    print(report_format % (mean_score, spread))

### Run our method 

In [None]:
# Dense interactions and the coordinates of the observed (training) entries.
X = train_data.toarray()
values = train_data.nonzero()

# Hyper-parameter grids explored by the sweep.
lambda1s = [100, 300, 500, 700, 900]
lambda2s = [1, 10, 50, 100]
lambda3s = [1]
ks = [0]

# Per-item mean of X and the mean of those per-item means.
b = np.mean(X, axis=0)
xo = b.mean()

# Content-based score matrix: the sum of the sixteen weighted score matrices.
Xtilde = (X1bis + X2bis + X3bis + X4bis
          + X5bis + X6bis + X7bis + X8bis
          + X9bis + X10bis + X11bis + X12bis
          + X13bis + X14bis + X15bis + X16bis)

In [None]:
# Grid search over (k, lambda1, lambda2, lambda3). For every combination an
# item-item weight matrix B_k with a zero diagonal is solved in closed form
# and scored against `cold_vad_data`.

# ---- quantities that do not depend on the hyper-parameters ----
perc_value = 10
rho = 1
gram = X.T.dot(X)            # X^T X
cross = X.T.dot(Xtilde)      # X^T Xtilde
vector = np.sum(X, axis=0)   # column sums of X
# Threshold at or below which an item receives the extra content-based term.
percentile = max(np.percentile(vector, perc_value), 1)

for k in ks:
    for lambda1 in lambda1s:
        for lambda2 in lambda2s:
            for lambda3 in lambda3s:

                # Per-item L2 penalty (all equal to lambda1 when k == 0).
                y = lambda1 + k * b - k * xo

                # Per-item weight of the content term: decreases linearly from
                # lambda2 (column sum 0) to 0 (column sum == percentile).
                # This slope used to be stored in `k`, clobbering the loop
                # variable, so later iterations built `y` from a stale value.
                slope = lambda2 / percentile
                vector_tr = np.where(vector <= percentile,
                                     slope * (percentile - vector),
                                     0.0)

                # Same as cross.dot(np.diag(vector_tr)): scales column j by
                # vector_tr[j] without building the dense diagonal matrix.
                weighted_cross = cross * vector_tr

                # compute P
                P0 = rho * gram + np.diag(y) + weighted_cross
                P = np.linalg.inv(P0)
                print("P computed")
                del P0

                # update of Bk
                B_temp = rho * gram + lambda3 * weighted_cross
                B_tilde = P.dot(B_temp)
                gamma = np.diag(B_tilde) / np.diag(P)
                # Same as B_tilde - P.dot(np.diag(gamma)); this makes the
                # diagonal of B_k exactly zero.
                B_k = B_tilde - P * gamma
                print("B_k updated")

                pred_val = X.dot(B_k)
                # Never recommend items already present in the training data.
                pred_val[values] = -np.inf

                # Per-user metrics (the *20 names hold the k=25 results).
                n10_list = np.concatenate([NDCG_binary_at_k_batch(pred_val, cold_vad_data, k=10)])
                n20_list = np.concatenate([NDCG_binary_at_k_batch(pred_val, cold_vad_data, k=25)])
                n50_list = np.concatenate([NDCG_binary_at_k_batch(pred_val, cold_vad_data, k=50)])
                n100_list = np.concatenate([NDCG_binary_at_k_batch(pred_val, cold_vad_data, k=100)])
                r10_list = np.concatenate([Recall_at_k_batch(pred_val, cold_vad_data, k=10)])
                r20_list = np.concatenate([Recall_at_k_batch(pred_val, cold_vad_data, k=25)])
                r50_list = np.concatenate([Recall_at_k_batch(pred_val, cold_vad_data, k=50)])
                r100_list = np.concatenate([Recall_at_k_batch(pred_val, cold_vad_data, k=100)])

                print("lambda1={}".format(str(lambda1)))
                print("lambda2={}".format(str(lambda2)))
                print("lambda3={}".format(str(lambda3)))

                # NOTE(review): the labels say "Test" but the metrics above are
                # computed on `cold_vad_data` -- confirm which split is meant.
                print("Test NDCG@10=%.5f (%.5f)" % (np.nanmean(n10_list), np.nanstd(n10_list) / np.sqrt(len(n10_list))))
                print("Test NDCG@25=%.5f (%.5f)" % (np.nanmean(n20_list), np.nanstd(n20_list) / np.sqrt(len(n20_list))))
                print("Test NDCG@50=%.5f (%.5f)" % (np.nanmean(n50_list), np.nanstd(n50_list) / np.sqrt(len(n50_list))))
                print("Test NDCG@100=%.5f (%.5f)" % (np.nanmean(n100_list), np.nanstd(n100_list) / np.sqrt(len(n100_list))))
                print("Test Recall@10=%.5f (%.5f)" % (np.nanmean(r10_list), np.nanstd(r10_list) / np.sqrt(len(r10_list))))
                print("Test Recall@25=%.5f (%.5f)" % (np.nanmean(r20_list), np.nanstd(r20_list) / np.sqrt(len(r20_list))))
                print("Test Recall@50=%.5f (%.5f)" % (np.nanmean(r50_list), np.nanstd(r50_list) / np.sqrt(len(r50_list))))
                print("Test Recall@100=%.5f (%.5f)" % (np.nanmean(r100_list), np.nanstd(r100_list) / np.sqrt(len(r100_list))))
