In [None]:
import json
import os
import pickle
import random
import sys

import bottleneck as bn
import numpy as np
import tensorflow as tf
from scipy import sparse
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Input, Concatenate, Dense, Dropout, Embedding, Flatten, Dot, MultiHeadAttention, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1L2, l1, l2

## Loading training and testing data

In [None]:
# pickle.loads() expects the pickled *bytes*, not a file name, so the files are
# opened in binary mode and read with pickle.load().
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open("<path>-train.pickle", "rb") as train_file:
    train_data = pickle.load(train_file)
with open("<path>-test.pickle", "rb") as test_file:
    cold_test_data = pickle.load(test_file)

## Get metadata and pre-computed embeddings

In [None]:
def cosine_sim(encoding, shrinkage=0.1):
    """Row-wise cosine similarity with a shrinkage term in the denominator.

    Args:
        encoding: 2-D array whose rows are the vectors to compare.
        shrinkage: constant added to every product of norms before dividing;
            it damps the similarity of low-norm rows and, when positive,
            avoids a division by zero for all-zero rows.

    Returns:
        Square array ``S`` with ``S[a, b] = <row_a, row_b> / (|row_a| * |row_b| + shrinkage)``.
    """
    row_norms = np.linalg.norm(encoding, axis=1)
    dot_products = encoding.dot(encoding.T)
    denominators = np.outer(row_norms, row_norms) + shrinkage
    return dot_products / denominators

In [None]:
# json.loads() parses a JSON *document* held in a string; to read a file it has
# to be opened first and handed to json.load().
with open("<path>-ML-10M-metadata.json", "r", encoding="utf-8") as metadata_file:
    json_content = json.load(metadata_file)

In [None]:
# Flat accumulators filled while scanning the metadata: one independent list
# per field (every value / token of every item, duplicates included).
(
    list_actors,
    list_genres,
    list_directors,
    vocabulary_title,
    vocabulary_description,
    list_tags,
) = ([] for _ in range(6))


In [None]:
# The item-item similarity matrices must use the same item indexing as the
# sparse user-item click matrix, so every structure below is filled by walking
# json_content in its own iteration order.
B6_0_id = []  # positions (in iteration order) of items whose tag list is exactly ['Not']
for counter, key in enumerate(json_content):
    item_meta = json_content[key]

    # free-text fields contribute their whitespace-separated tokens
    vocabulary_title.extend(item_meta['title'].split())
    vocabulary_description.extend(item_meta['description'].split())

    # list-valued fields contribute their elements as they are
    list_actors.extend(item_meta['actors'])
    list_directors.extend(item_meta['directors'])
    list_genres.extend(item_meta['genres'])
    list_tags.extend(item_meta['tags'])

    if item_meta['tags'] == ['Not']:
        B6_0_id.append(counter)
counter = len(json_content)  # total number of items scanned

# distinct values seen for each field
set_actors = set(list_actors)
set_directors = set(list_directors)
set_genres = set(list_genres)
set_vocabulary_title = set(vocabulary_title)
set_vocabulary_description = set(vocabulary_description)
set_tags = set(list_tags)


def _per_item(field):
    """Return the raw `field` value of every item, in json_content order."""
    return [json_content[item_key][field] for item_key in json_content]


corpus_title = _per_item('title')
corpus_description = _per_item('description')
corpus_actors = _per_item('actors')
corpus_directors = _per_item('directors')
corpus_genres = _per_item('genres')
corpus_tags = _per_item('tags')

In [None]:
# Multi-hot encode the per-item genre lists: one row per item, one indicator
# column per distinct genre.
encoding_genres = MultiLabelBinarizer().fit_transform(corpus_genres)

# Same encoding for the per-item tag lists; `mlb` stays bound to the fitted
# tag binarizer.
mlb = MultiLabelBinarizer()
encoding_tags = mlb.fit_transform(corpus_tags)

In [None]:
## Falcon embeddings for concatenated title and description
# pickle.loads() needs the pickled bytes, not a path: open the file in binary
# mode and use pickle.load().
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open("<path>-cat_embs.pickle", "rb") as embeddings_file:
    cat_llm_emb = pickle.load(embeddings_file)


In [None]:
# Short aliases used when assembling the item-feature matrix:
# genre indicators and the title/description LLM embeddings.
Bgenres, Bllmcat = encoding_genres, cat_llm_emb

In [None]:
# Min-max scale every genre column independently (MinMaxScaler is imported in
# the notebook's import cell). The result is a floating-point array.
scaler = MinMaxScaler()
Bgenres = scaler.fit_transform(Bgenres)

In [None]:
# Pre-computed actor and director features.
# pickle.loads() needs the pickled bytes, not a path: open each file in binary
# mode and use pickle.load().
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open("<path>-actors_list.pickle", "rb") as actors_file:
    Bactors = pickle.load(actors_file)
with open("<path>-dirs_list.pickle", "rb") as dirs_file:
    Bdirs = pickle.load(dirs_file)


In [None]:
# Convert the loaded actor / director features to NumPy arrays and show their
# shapes as a quick sanity check.
Bactors, Bdirs = np.array(Bactors), np.array(Bdirs)

for feature_block in (Bactors, Bdirs):
    print(feature_block.shape)

## Metrics definition

In [None]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Normalized discounted cumulative gain@k for binary relevance.
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance

    X_pred: dense 2-D score array, one row per user and one column per item.
    heldout_batch: held-out interactions aligned with X_pred; it must support
        2-D fancy indexing followed by .toarray() and getnnz(axis=1)
        (e.g. a SciPy sparse matrix).
    k: cut-off of the ranking.

    Returns one NDCG value per row. Rows without any held-out item have an
    ideal DCG of 0, so their entry is the result of a division by zero.
    '''
    n_users = X_pred.shape[0]
    row_ids = np.arange(n_users)[:, np.newaxis]

    # k best-scored items per row, still in arbitrary order
    candidates = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    candidate_scores = X_pred[row_ids, candidates]

    # order those k candidates by decreasing score
    rank_order = np.argsort(-candidate_scores, axis=1)
    ranked_items = candidates[row_ids, rank_order]

    # discount template: 1 / log2(rank + 1) for ranks 1..k
    discounts = 1. / np.log2(np.arange(2, k + 2))

    relevance = heldout_batch[row_ids, ranked_items].toarray()
    DCG = (relevance * discounts).sum(axis=1)

    # ideal DCG: all held-out items of the row (at most k) placed at the top
    heldout_counts = heldout_batch.getnnz(axis=1)
    IDCG = np.array([discounts[:min(n, k)].sum() for n in heldout_counts])

    return DCG / IDCG

def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Recall@k per row, normalised by min(k, number of held-out items).

    X_pred: dense 2-D score array, one row per user and one column per item.
    heldout_batch: held-out interactions aligned with X_pred; it must support
        the `> 0` comparison followed by .toarray() (e.g. a SciPy sparse matrix).
    k: cut-off of the ranking.

    Returns one float32-based recall value per row. Rows without any held-out
    item are divided by zero.
    '''
    n_users = X_pred.shape[0]

    # mark the k best-scored items of every row
    top_items = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    recommended = np.zeros_like(X_pred, dtype=bool)
    recommended[np.arange(n_users)[:, np.newaxis], top_items] = True

    relevant = (heldout_batch > 0).toarray()
    hits = np.logical_and(relevant, recommended).sum(axis=1).astype(np.float32)
    return hits / np.minimum(k, relevant.sum(axis=1))

## Preparing training dataset for Siamese network

In [None]:
# Item-feature matrix, one row per item. Column layout:
# [title/description LLM embedding | actors | directors | genres].
Bconstant = np.concatenate((Bllmcat, Bactors, Bdirs, Bgenres), axis=1)

# Dense copy of the training interactions; built once, since this is a large
# allocation.
X = train_data.toarray()

# Keep only the items (columns of X) with at least one training interaction,
# and the matching rows of the feature matrix.
Xsum = np.sum(X, 0)
indices_ite = np.where(Xsum >= 1)  # at least 1 rating per item
X = X[:, indices_ite[0]]
Bconstant_updated = Bconstant[indices_ite[0], :]

print(X.shape)
print(Bconstant.shape)
print(Bconstant_updated.shape)

In [None]:
# Column boundaries inside the concatenated feature matrix
# ([LLM embedding | actors | directors | genres]): each value is the column
# where the named block ends and the next one starts.
title_des_index = Bllmcat.shape[1]
actor_index = Bactors.shape[1] + title_des_index
dir_index = Bdirs.shape[1] + actor_index

for boundary in (title_des_index, actor_index, dir_index):
    print(boundary)

In [None]:
# Item-item cosine similarity computed from the columns of the dense
# interaction matrix (Y and dense_output are left at their defaults:
# compare X.T with itself, return a dense array).
Bnew = cosine_similarity(X.T)

In [None]:
# Build the labelled item pairs used to train the Siamese network:
#   (i, j, 1) -> j is one of the MAX_INDICES items most similar to i in Bnew
#   (i, j, 0) -> j has exactly zero similarity with i in Bnew
MAX_INDICES = 50  # maximum number of nearest neighbours considered per item

pos_samples = []
neg_samples = []
# pos_couples[i] lists the partners j for which the pair (i, j) was emitted; it
# is consulted to avoid emitting the mirrored pair (j, i) a second time.
pos_couples = {i: [] for i in range(len(Bnew))}

for i in range(len(Bnew)):
    mylist = Bnew[i, :]

    # Neighbours by decreasing similarity. The item itself is removed by index
    # rather than by position: with ties or rounding in the similarity values
    # it is not guaranteed to be ranked first.
    # NOTE(review): if fewer than MAX_INDICES items have non-zero similarity,
    # zero-similarity items fill the remaining positive slots and may also be
    # drawn as negatives below -- confirm this is intended.
    ranked = np.argsort(-mylist)
    pos_indices = list(ranked[ranked != i][:MAX_INDICES])

    # candidate negatives: items with exactly zero similarity, in random order
    datafiltneg = list(np.where(mylist == 0.)[0])
    random.shuffle(datafiltneg)

    counter_pos = 0
    for indice in pos_indices:
        # every index is a key of pos_couples, so only the mirror check matters
        if i not in pos_couples[indice]:
            pos_samples.append((i, indice, 1))
            pos_couples[i].append(indice)
            counter_pos += 1

    # as many negatives as positives emitted for this item (fewer if not
    # enough zero-similarity items exist)
    neg_indices = datafiltneg[:counter_pos]
    for indice in neg_indices:
        neg_samples.append((i, indice, 0))


In [None]:
def _split_80_20(all_samples):
    """Randomly pick 80% of `all_samples` for training; the rest is the test part."""
    train_part = random.sample(all_samples, k=round(len(all_samples) * 0.8))
    test_part = list(set(train_part) ^ set(all_samples))
    return train_part, test_part


# positives and negatives are split separately so both parts keep the same mix
pos_samples_train, pos_samples_test = _split_80_20(pos_samples)
neg_samples_train, neg_samples_test = _split_80_20(neg_samples)

samples_train = pos_samples_train + neg_samples_train
samples_test = pos_samples_test + neg_samples_test

# every pair, used for the final fit on all data
samples = pos_samples + neg_samples

In [None]:
# Mix positives and negatives: each list was built as "all positives, then all
# negatives", so shuffle each one in place.
for sample_list in (samples_train, samples_test, samples):
    random.shuffle(sample_list)

In [None]:
class CustomDataGen(tf.keras.utils.Sequence):
    """Keras Sequence serving mini-batches of labelled item pairs.

    Args:
        B: 2-D feature array indexed as ``B[rows, :]``; row ``r`` holds the
            features of item ``r``.
        samples: sequence of ``(i, j, label)`` tuples, where ``i`` and ``j``
            are row indices into ``B``.
        batch_size: number of pairs per batch.
        shuffle: when True, the pairs are reshuffled at the end of every epoch.
    """

    def __init__(self, B, samples,
                 batch_size,
                 shuffle=True):
        # Let the Keras base class initialise its own state.
        super().__init__()

        self.batch_size = batch_size
        self.B = B
        # Private copy: on_epoch_end() shuffles in place and must not reorder
        # the list owned by the caller.
        self.samples = list(samples)
        self.shuffle = shuffle
        self.n = len(self.samples)

    def on_epoch_end(self):
        """Reshuffle the pairs between epochs when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.samples)

    def __getitem__(self, index):
        """Return batch number `index` as ``((features_i, features_j), labels)``."""
        start = index * self.batch_size
        batched_samples = self.samples[start:start + self.batch_size]

        indexes_i = [item[0] for item in batched_samples]
        indexes_j = [item[1] for item in batched_samples]
        y = np.array([item[2] for item in batched_samples])

        batches = (np.array(self.B[indexes_i, :]), np.array(self.B[indexes_j, :]))
        return (batches, y)

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return self.n // self.batch_size

## Defining Cross-Attention Siamese network

In [None]:
## TF model
# Two-input network: each input is one row of the concatenated item-feature
# matrix; the model output is the cosine similarity of the two branch outputs.
embedding_dim = 768 // 2  # width of every projected feature "token" fed to the attention layer
num_heads = 3

input1 = Input(shape=(Bconstant_updated.shape[1],),name='ratings1')
input2 = Input(shape=(Bconstant_updated.shape[1],),name='ratings2')

# FCs [proj, i] project sparse features only (all together)
# `layeract` and `layermulti` are each created once and called on both inputs,
# so their weights are shared between the two branches.
layeract = Dense(768, activation="linear",name='layer1')
layermulti = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)

# Columns from dir_index onward: the last block of the feature matrix (genres,
# given the concatenation order used to build Bconstant).
T11 = layeract(input1[:,dir_index:]) # all categorical
T21 = layeract(input2[:,dir_index:])# all categorical

# Every Dense(embedding_dim) below is a new layer instance, so these six
# projections have independent weights (not shared between input1 and input2).
# expand_dims(axis=1) adds a length-1 sequence axis for the attention layer.
T12 = tf.expand_dims(Dense(embedding_dim)(input1[:,:title_des_index]), axis=1) # concatenated title/description
T13 = tf.expand_dims(Dense(embedding_dim)(input1[:,title_des_index:actor_index]), axis=1) # actors
T14 = tf.expand_dims(Dense(embedding_dim)(input1[:,actor_index:dir_index]), axis=1) # directors

T22 = tf.expand_dims(Dense(embedding_dim)(input2[:,:title_des_index]), axis=1) # concatenated title/description
T23 = tf.expand_dims(Dense(embedding_dim)(input2[:,title_des_index:actor_index]), axis=1) # actors
T24 = tf.expand_dims(Dense(embedding_dim)(input2[:,actor_index:dir_index]), axis=1) # directors

# Two chained attention calls with the same layer (first positional argument is
# the query, second the value): title/description attends to actors, then that
# result attends to directors. squeeze removes the length-1 sequence axis again.
T1X = tf.squeeze(layermulti(layermulti(T12, T13), T14), axis=1) # CROSS 1x
T2X = tf.squeeze(layermulti(layermulti(T22, T23), T24), axis=1) # CROSS 1x

# Join the attention output with the projected categorical block.
T31 = Concatenate(axis=1)([T1X, T11])
T32 = Concatenate(axis=1)([T2X, T21])

# Shared two-layer head applied to both branches.
layer3 = Dense(768, activation="relu",name='layer3')
layer4 = Dense(768, activation="relu",name='layer4')
# NOTE(review): `layerattention` is created but never called in this cell.
layerattention = Attention(use_scale=True,name='att')

T31 = layer4(layer3(T31))
T32 = layer4(layer3(T32))

# scaled pairwise cosine similarities (+shrinkage?) [i, i]
# normalize=True L2-normalises both inputs before the dot product, so T3 is
# the cosine similarity of the two branch outputs.
T3 = Dot(axes=(1, 1), normalize=True)([T31, T32])

# # our model will accept the inputs of the two branches and then output a single value
model = Model(inputs=[input1,input2], outputs=T3)

In [None]:
# Print the layer-by-layer architecture and parameter counts as a sanity check.
model.summary()

## Training

In [None]:
# Binary cross-entropy on the model output taken as a probability (not a
# logit), without label smoothing.
loss = tf.keras.losses.BinaryCrossentropy(
    from_logits=False,
    label_smoothing=0.,
)

# Nadam optimizer with a fixed learning rate.
optimizer = tf.keras.optimizers.experimental.Nadam(
    learning_rate=2e-4,
)

In [None]:
num_epochs = 30


def _pair_batches(pair_list):
    """Wrap a list of (i, j, label) samples in a CustomDataGen over Bconstant_updated."""
    return CustomDataGen(B=Bconstant_updated, samples=pair_list, batch_size=256)


traingen = _pair_batches(samples_train)
vadgen = _pair_batches(samples_test)
finaltraingen = _pair_batches(samples)

model.compile(loss=loss, optimizer=optimizer)

# Train/validation run, kept for reference:
# model.fit(traingen,epochs=num_epochs,validation_data=vadgen)

# Final run: fit on every sampled pair (train and test parts together).
model.fit(finaltraingen, epochs=num_epochs)

## Testing

In [None]:
# Sub-model that stops at 'layer3' and is used as an item encoder.
# NOTE(review): 'layer3' is called once per branch when the model is built;
# confirm which of the two calls `.output` resolves to in the installed Keras.
layer3_output = model.get_layer('layer3').output
feature_network = Model(model.input, layer3_output)

# Encode ALL items (Bconstant, not the filtered Bconstant_updated); the same
# matrix is fed to both inputs.
feature = feature_network.predict([Bconstant, Bconstant])

# Item-item cosine similarity of the learned features, with self-similarity
# removed.
B = cosine_similarity(feature, dense_output=True)
np.fill_diagonal(B, 0)

In [None]:
# Calibrate the scale of the similarity-based scores: fit a one-feature linear
# regression from the raw scores X.Bf to the observed interactions, then
# rescale the scores by the fitted coefficient.
# (LinearRegression is imported in the notebook's import cell.)
X = train_data.toarray()

Bf = B.copy()
# Remove self-similarity BEFORE scoring, so an item never votes for itself.
np.fill_diagonal(Bf, 0)
Xf = X.dot(Bf)

# Flatten both matrices into single-column arrays: one sample per
# (user, item) cell.
X = np.asarray(X).reshape((-1, 1))
Xtot = np.asarray(Xf).reshape((-1, 1))

# Sample weights: observed cells keep their value, unobserved cells get a small
# weight of 0.01. The copy is made in a floating dtype so that 0.01 is not
# truncated to 0 when the interaction matrix has an integer or boolean dtype.
my_array = X.astype(np.result_type(X.dtype, np.float32), copy=True)
my_array[my_array == 0] = 0.01
my_array = my_array.flatten()

reg = LinearRegression().fit(Xtot, X, sample_weight=my_array)
# Only the slope is applied; the fitted intercept is not used.
pred_val = reg.coef_[0][0]*Xf

In [None]:
# CA-Siamese on cold
X = train_data.toarray()
# Items already seen in training must never be recommended.
pred_val[X.nonzero()] = -np.inf


def _mean_and_stderr(per_user_scores):
    """Return (mean, standard error of the mean) over users, ignoring NaN entries.

    The standard error divides the standard deviation by the square root of the
    number of users with a defined score, not by the number of batches.
    """
    scores = np.asarray(per_user_scores, dtype=float).ravel()
    n_valid = np.count_nonzero(~np.isnan(scores))
    if n_valid == 0:
        return float('nan'), float('nan')
    return np.nanmean(scores), np.nanstd(scores) / np.sqrt(n_valid)


EVAL_KS = (10, 20, 50, 100)  # ranking cut-offs to report

# per-user metric arrays, keyed by cut-off
ndcg_at_k = {k: NDCG_binary_at_k_batch(pred_val, cold_test_data, k=k) for k in EVAL_KS}
recall_at_k = {k: Recall_at_k_batch(pred_val, cold_test_data, k=k) for k in EVAL_KS}

# Report "mean (standard error)": all NDCG lines first, then all Recall lines.
for metric_name, per_k in (("NDCG", ndcg_at_k), ("Recall", recall_at_k)):
    for k in EVAL_KS:
        mean_score, std_error = _mean_and_stderr(per_k[k])
        print("Test %s@%d=%.5f (%.5f)" % (metric_name, k, mean_score, std_error))