In [None]:
import gzip, json
import fsspec
import math
import datetime
import random
import bottleneck as bn
import pickle5 as pickle
import pandas as pd
import numpy as np

from scipy import sparse
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from collections import Counter

import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.layers import Input, Concatenate, Dense, Dropout, Embedding, Flatten, Dot, MultiHeadAttention, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1L2, l1, l2
from tensorflow.keras.callbacks import Callback


## Amazon dataset

Amazon Video Games (Ni et al. 2019): a collection of reviews gathered from
the Amazon website over a 22-year period between 1996 and 2018. Reviews contain ratings on a 5-star scale, and products are accompanied by detailed metadata.
In this case too, the preferences have been binarized as described for MovieLens.

### Now dealing with 80-20 cold splits

In [None]:
#get click data
# Each matrix is stored as a SciPy sparse .npz archive; load_npz accepts the file path directly.
# The original cell passed a literal `<path>` placeholder, which is not valid Python.
path = "data/splits/AmazonReviewData/AmazonGames/original/implicit_3.0/kcore_user_5_item_5_5_feature_5_reshaped/ICM_all.npz"
df_icm = sparse.load_npz(path)

# Training interactions of the cold-item hold-out split.
path = "data/splits/AmazonReviewData/AmazonGames/original/implicit_3.0/kcore_user_5_item_5_5_feature_5_reshaped/cold_items_holdout_0.80_0.00_0.20_testtreshold_0.0_no_cold_users/URM_all_0_train.npz"
df_urm = sparse.load_npz(path)

# Held-out test interactions of the same split.
path = "data/splits/AmazonReviewData/AmazonGames/original/implicit_3.0/kcore_user_5_item_5_5_feature_5_reshaped/cold_items_holdout_0.80_0.00_0.20_testtreshold_0.0_no_cold_users/URM_all_0_test.npz"
df_urm_test = sparse.load_npz(path)

In [None]:
path = "data/splits/AmazonReviewData/AmazonGames/original/implicit_3.0/kcore_user_5_item_5_5_feature_5_reshaped/URM_all_mapper"
# pickle.loads() expects the serialized bytes, not a file name, so open the file and use pickle.load().
# NOTE: unpickling can execute arbitrary code - only load mapper files that come from a trusted source.
with open(path, "rb") as mapper_file:
    urm_mapper = pickle.load(mapper_file)

In [None]:
key = "data/splits/AmazonReviewData/AmazonGames/original/implicit_3.0/kcore_user_5_item_5_5_feature_5_reshaped/ICM_all_mapper"
# Load from `key` (the ICM mapper). The original cell referred to `path`, which at this point
# holds another file's location, and used pickle.loads(), which expects bytes rather than a file name.
# NOTE: unpickling can execute arbitrary code - only load mapper files that come from a trusted source.
with open(key, "rb") as mapper_file:
    icm_mapper = pickle.load(mapper_file)

In [None]:
key = "data/raw-datasets/AmazonReviewData/AmazonGames/Video_Games.json.gz"
# Read the gzip-compressed JSON-lines dump: one JSON document per line.
# The original cell wrapped an undefined `response` object in a never-imported BytesIO;
# reading the file at `key` directly keeps the cell runnable on a fresh kernel.
# Iterating the text stream splits on real newlines only (str.splitlines would also split on
# exotic Unicode separators that may occur inside a JSON string) and blank lines are skipped
# so that json.loads never sees an empty document.
with gzip.open(key, "rt", encoding="utf-8") as fin:
    json_lines = [line.rstrip("\r\n") for line in fin if line.strip()]

In [None]:
#review data about asins (in the case of cold start, no data)
# Index every parsed JSON record by its "asin"; if an asin occurs on several lines the last record wins.
metadata_big = {record["asin"]: record for record in map(json.loads, json_lines)}
    

In [None]:
#metadata about asins
# Map asin -> parsed JSON record (later lines overwrite earlier ones with the same asin).
# NOTE(review): the cells below read "category", "main_cat", "brand", "title", "description" and
# "feature" from these records - confirm that json_lines holds the product-metadata dump at this point.
metadata = dict((record["asin"], record) for record in map(json.loads, json_lines))
    

In [None]:
# Names of the metadata fields of interest.
# NOTE(review): this list is reassigned (without 'date' and 'price') before the feature-extraction
# loop further down, and no visible cell reads it - confirm whether this cell is still needed.
feature_keys = ['category','main_cat', 'brand', 'description','title', 'feature',  'date', 'price']


In [None]:
# First element of each mapper tuple: original id -> matrix index.
dict_user_index = urm_mapper[0]
dict_asin_index = icm_mapper[0]
# Inverse lookup: matrix index -> asin.
dict_index_asin = {index: asin for asin, index in dict_asin_index.items()}


In [None]:
feature_keys = ['category','main_cat', 'brand', 'title', 'description', 'feature']

# One entry per item, appended in ascending item-index order.
list_category = []
list_main_cat = []
list_brand = []
list_date = []
list_price = []
list_title = []
list_description = []
list_feature = []

# Iterate in ascending index order (not dict insertion order) so that position i of every list
# describes the item whose index is i. The feature matrices built from these lists are later
# multiplied with the item axis of the URM, so rows must line up with item indices.
for key in sorted(dict_index_asin):
    asin = dict_index_asin[key]
    item_meta = metadata[asin]

    #category
    # Build a filtered copy instead of calling list.remove(): this drops every occurrence of the
    # HTML residue and leaves `metadata` untouched, so the cell can be re-run safely.
    category = [label for label in item_meta.get("category", []) if label != '</span></span></span>']
    list_category.append(category)

    #main_cat
    main_cat = item_meta.get("main_cat", "")
    if main_cat!="":
        list_main_cat.append([main_cat])
    else:
        list_main_cat.append([])

    #brand
    # Strip only a standalone leading "by" token ("by Nintendo" -> "Nintendo").
    # A blanket str.replace('by', "") would also corrupt names that merely contain
    # the letters "by" (e.g. "Kirby" -> "Kir").
    brand = item_meta.get("brand", "").strip()
    if brand == "by" or (brand.startswith("by") and brand[2:3].isspace()):
        brand = brand[2:].strip()
    # An empty brand contributes no label at all (rather than the empty string as a label).
    list_brand.append([brand] if brand else [])

    #title
    list_title.append(item_meta.get("title", ""))

    #description
    # Only the first description paragraph is used; items without one get an empty document.
    description = item_meta.get("description", [])
    list_description.append(description[0] if description else "")

    #feature
    # Join the bullet-point features into one document per item.
    list_feature.append(' '.join(item_meta.get("feature", [])))
    
    

In [None]:
# Flatten the per-item label lists into one long list per attribute.
total_list_category = [label for item_labels in list_category for label in item_labels]

total_list_main_cat = [label for item_labels in list_main_cat for label in item_labels]

total_list_brand = [label for item_labels in list_brand for label in item_labels]


In [None]:
max_features = 2000


def _fit_tfidf(documents, vocabulary_size):
    """Fit a CountVectorizer -> TfidfTransformer pipeline on `documents`.

    Returns the fitted pipeline and the dense TF-IDF matrix of the same documents.
    """
    tfidf_pipe = Pipeline([
        ('count', CountVectorizer(stop_words='english', max_features=vocabulary_size)),
        ('tfid', TfidfTransformer()),
    ]).fit(documents)
    return tfidf_pipe, tfidf_pipe.transform(documents).toarray()


# One independent vocabulary (capped at max_features terms) per text field.
pipe_title, data_title = _fit_tfidf(list_title, max_features)
pipe_feature, data_feature = _fit_tfidf(list_feature, max_features)
pipe_description, data_description = _fit_tfidf(list_description, max_features)


In [None]:
# Label frequencies per attribute, most frequent first; labels seen fewer than twice are dropped.
counter_brand = Counter(total_list_brand)
df_brand = (
    pd.DataFrame(list(counter_brand.items()), columns=['brand', 'count'])
    .sort_values("count", ascending=False)
    .loc[lambda frame: frame["count"] >= 2]
)

counter_category = Counter(total_list_category)
df_category = (
    pd.DataFrame(list(counter_category.items()), columns=['category', 'count'])
    .sort_values("count", ascending=False)
    .loc[lambda frame: frame["count"] >= 2]
)

counter_main_cat = Counter(total_list_main_cat)
df_main_cat = (
    pd.DataFrame(list(counter_main_cat.items()), columns=['main_cat', 'count'])
    .sort_values("count", ascending=False)
    .loc[lambda frame: frame["count"] >= 2]
)

# Multi-hot encode each attribute, restricted to the retained labels.
mlb = MultiLabelBinarizer(classes=df_brand["brand"].tolist())
encoding_brand = mlb.fit_transform(list_brand)

mlb = MultiLabelBinarizer(classes=df_main_cat["main_cat"].tolist())
encoding_main_cat = mlb.fit_transform(list_main_cat)

mlb = MultiLabelBinarizer(classes=df_category["category"].tolist())
encoding_category = mlb.fit_transform(list_category)


In [None]:
# Sanity check: print the shape of every feature block, one per line.
for feature_block in (data_description, data_title, data_feature,
                      encoding_main_cat, encoding_category, encoding_brand):
    print(feature_block.shape)

In [None]:
# Rescale every feature block column-wise with min-max scaling. The single scaler object is
# re-fitted for each block in turn, so after this cell it holds the fit of the brand encoding.
scaler = MinMaxScaler()
Bdescription, Btitle, Bfeature, Bmaincat, Bcategory, Bbrand = (
    scaler.fit_transform(feature_block)
    for feature_block in (data_description, data_title, data_feature,
                          encoding_main_cat, encoding_category, encoding_brand)
)


In [None]:
# Item content matrix: all scaled feature blocks stacked side by side (one row per item).
Bconstant = np.hstack([Btitle, Bdescription, Bfeature, Bmaincat, Bcategory, Bbrand])


In [None]:
### item KNN
def cosine_sim(encoding,shrinkage=1):
    """Shrunk cosine similarity between all pairs of rows of `encoding`.

    Entry (i, j) is dot(row_i, row_j) / (||row_i|| * ||row_j|| + shrinkage).
    """
    row_norms = np.linalg.norm(encoding, axis=1)
    denominator = np.outer(row_norms, row_norms) + shrinkage
    dot_products = encoding.dot(encoding.T)
    return dot_products / denominator

In [None]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance

    X_pred:        dense 2-D array of predicted scores, one row per user.
    heldout_batch: sparse matrix of the same shape (needs fancy indexing,
                   .toarray() and .getnnz()); non-zero entries are relevant.
    k:             cut-off; values >= the number of columns rank all columns.

    Returns a 1-D array with one NDCG value per row. Rows without any
    held-out entry yield NaN (callers aggregate with np.nanmean).
    '''
    batch_users, n_items = X_pred.shape
    rows = np.arange(batch_users)[:, np.newaxis]

    if k < n_items:
        # Partial selection of the k best columns, then sort only those k.
        idx_topk_part = bn.argpartition(-X_pred, k, axis=1)[:, :k]
        topk_part = X_pred[rows, idx_topk_part]
        idx_part = np.argsort(-topk_part, axis=1)
        # X_pred[rows, idx_topk] is the sorted topk predicted score
        idx_topk = idx_topk_part[rows, idx_part]
    else:
        # The partition index must be smaller than the number of columns;
        # when k covers every column simply rank all of them.
        k = n_items
        idx_topk = np.argsort(-X_pred, axis=1)

    # build the discount template (shared by DCG and the ideal DCG)
    tp = 1. / np.log2(np.arange(2, k + 2))
    DCG = (heldout_batch[rows, idx_topk].toarray() * tp).sum(axis=1)
    # Ideal DCG: all of a row's relevant entries (at most k) ranked first.
    IDCG = np.array([(tp[:min(n, k)]).sum() for n in heldout_batch.getnnz(axis=1)])

    # 0/0 for rows without held-out entries is intentional -> NaN, no warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return DCG / IDCG


def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    recall@k for binary relevance, normalised by min(k, number of relevant entries)

    X_pred:        dense 2-D array of predicted scores, one row per user.
    heldout_batch: sparse matrix of the same shape; entries > 0 are relevant.
    k:             cut-off; values >= the number of columns select all columns.

    Returns a 1-D array with one recall value per row. Rows without any
    held-out entry yield NaN (callers aggregate with np.nanmean).
    '''
    batch_users, n_items = X_pred.shape

    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    if k < n_items:
        idx = bn.argpartition(-X_pred, k, axis=1)
        X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    else:
        # The partition index must be smaller than the number of columns;
        # when k covers every column, every column is in the top-k.
        X_pred_binary[:] = True

    X_true_binary = (heldout_batch > 0).toarray()

    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
    # 0/0 for rows without held-out entries is intentional -> NaN, no warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall

### Cosine similarity weighting

In [None]:
Bf = cosine_sim(Bconstant, shrinkage=50)
# Remove self-similarity BEFORE propagating the interactions. The original cell zeroed the
# diagonal only after computing Xf, so Xf[u, i] contained X[u, i] * Bf[i, i]: the regression
# below was fitted on a feature that leaks its own target, while the hybrid cell further down
# applies the fitted coefficient to X.dot(Bf) with a zero diagonal.
np.fill_diagonal(Bf, 0)

X = df_urm.toarray()
Xf = X.dot(Bf)# that is the Xtilde

# One regression sample per matrix cell. The target stays a column vector so that reg.coef_
# keeps its 2-D shape (it is read as reg.coef_[0][0]). X itself is left as the dense 2-D matrix.
target = X.reshape(-1, 1)
Xtot = Xf.reshape(-1, 1)

# Sample weights: the cell value itself where it is non-zero, 0.01 for the zero cells.
# np.where yields a float array, so 0.01 is not truncated to 0 when the URM has an integer dtype.
my_array = np.where(target.ravel() == 0, 0.01, target.ravel()).astype(np.float64)

reg = LinearRegression().fit(Xtot, target, sample_weight=my_array)

pred_val = reg.coef_[0][0]*Xf#normalised Xtilde
# Training interactions must never be recommended again.
values = df_urm.nonzero()
pred_val[values] =  -np.inf

In [None]:
# Evaluate the content-similarity predictions on the held-out test interactions.
test_data = df_urm_test.copy()

# One list of per-user score arrays per metric / cut-off.
(n10_list, r10_list, n25_list, r25_list,
 n50_list, r50_list, n100_list, r100_list) = ([] for _ in range(8))

r10_list.append(Recall_at_k_batch(pred_val, test_data, k=10))
n10_list.append(NDCG_binary_at_k_batch(pred_val, test_data, k=10))
r25_list.append(Recall_at_k_batch(pred_val, test_data, k=25))
n25_list.append(NDCG_binary_at_k_batch(pred_val, test_data, k=25))

In [None]:
# Test metrics of the cosine-similarity baseline: mean over users (standard error).
# Each *_list holds per-user score arrays; concatenate them first so that the standard error
# divides by sqrt(number of users), as the hybrid-method cell does, instead of by
# sqrt(number of arrays in the list), which is 1 here.
for metric_label, per_user_batches in (("NDCG@10", n10_list), ("NDCG@25", n25_list),
                                       ("Recall@10", r10_list), ("Recall@25", r25_list)):
    per_user_scores = np.concatenate(per_user_batches)
    print("Test %s=%.5f (%.5f)" % (metric_label,
                                   np.nanmean(per_user_scores),
                                   np.nanstd(per_user_scores) / np.sqrt(len(per_user_scores))))

### Test hybrid method

In [None]:
#initialization

# Dense copy of the training interactions.
X = df_urm.toarray()

# Hyper-parameter grids explored by the search cell below.
lambda1s = [1, 500, 1000]
lambda2s = [1, 10, 100, 1000]
lambda3s = [1, 10]
coefs = [0.0]
ks = [0]

# Column means of X and their overall mean.
b = X.mean(axis=0)
xo = b.mean()

In [None]:
# Evaluate the hybrid method on the held-out test interactions. No `df_urm_vad` is defined in the
# cells above (the split directory is named 0.80_0.00_0.20, i.e. no validation share), and the
# search cell below reports its numbers as "Test" metrics.
vad_test_data = df_urm_test

In [None]:
n10_list, r10_list, n25_list, r25_list, n50_list, r50_list, n100_list, r100_list = [], [],[], [],[], [],[], []

# Coefficient of the column-mean term in the per-item penalty y; with k = 0 every item gets lambda1.
# The original loop reused the name `k` for lambda2/percentile, so from the second grid point on
# y was silently built from the previous iteration's value. That scale is now `ir_scale`.
k = 0
perc_value = 10  # percentile of the column sums of X used as threshold for the IR weights
rho = 1          # weight of X^T X

# Training interactions are masked out of every ranking.
values = df_urm.nonzero()

# ---- quantities that do not depend on the grid: computed once ----
gram = X.T.dot(X)
vector = np.sum(X, axis=0)
percentile = max(np.percentile(vector, perc_value), 1)
#Xtilde
Xtilde = reg.coef_[0][0]*X.dot(Bf)
print("Xtilde computed")
cross_gram = X.T.dot(Xtilde)

for coef in coefs:
    # Collaborative + content Gram matrices, shared by P and by the update of B_k.
    base = rho*gram + coef*Bconstant.dot(Bconstant.T)

    for lambda1 in lambda1s:
        # Per-item diagonal penalty.
        y = np.array([lambda1+k*bi-k*xo for bi in b])

        for lambda2 in lambda2s:
            # Diagonal of IR: columns whose sum is at or below the percentile get a weight that
            # grows linearly as the sum decreases; all other columns get 0.
            ir_scale = lambda2/percentile
            vector_tr = np.where(vector <= percentile, ir_scale*(percentile-vector), 0.0)
            # Same as X.T.dot(Xtilde).dot(np.diag(vector_tr)): right-multiplying by a diagonal
            # matrix scales the columns, so broadcasting avoids an n x n matrix product.
            cold_term = cross_gram*vector_tr

            #compute P (independent of lambda3, so invert once per (lambda1, lambda2))
            P0 = base + np.diag(y) + cold_term
            P = np.linalg.inv(P0)
            print("P computed")
            del P0

            for lambda3 in lambda3s:
                #update of Bk
                B_temp = base + lambda3*cold_term
                B_tilde = P.dot(B_temp)
                gamma = np.diag(B_tilde) / np.diag(P)
                # Same as B_tilde - P.dot(np.diag(gamma)); leaves a zero diagonal in B_k.
                B_k = B_tilde - P*gamma
                print("B_k updated")

                pred_val = X.dot(B_k)
                pred_val[values] = -np.inf

                # Per-user scores for this grid point.
                n10_list = NDCG_binary_at_k_batch(pred_val, vad_test_data, k=10)
                n25_list = NDCG_binary_at_k_batch(pred_val, vad_test_data, k=25)
                r10_list = Recall_at_k_batch(pred_val, vad_test_data, k=10)
                r25_list = Recall_at_k_batch(pred_val, vad_test_data, k=25)

                #0cold-
                print("lambda1={}".format(str(lambda1)))
                print("lambda2={}".format(str(lambda2)))
                print("lambda3={}".format(str(lambda3)))

                print("Test NDCG@10=%.5f (%.5f)" % (np.nanmean(n10_list), np.nanstd(n10_list) / np.sqrt(len(n10_list))))
                print("Test Recall@10=%.5f (%.5f)" % (np.nanmean(r10_list), np.nanstd(r10_list) / np.sqrt(len(r10_list))))

                print("Test NDCG@25=%.5f (%.5f)" % (np.nanmean(n25_list), np.nanstd(n25_list) / np.sqrt(len(n25_list))))
                print("Test Recall@25=%.5f (%.5f)" % (np.nanmean(r25_list), np.nanstd(r25_list) / np.sqrt(len(r25_list))))
