In [None]:
import glob
import pandas as pd
from google.colab import files
from google.colab import drive
import numpy as np
import scipy as sc
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
import random
import sys
import math
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns


# Load in data

In [None]:
# Mount Google Drive under /content/drive so the CSV inputs below can be read.
# force_remount=True is passed so the mount is redone even when one already exists.
drive.mount('/content/drive', force_remount=True)

In [None]:
# List the contents of the ALS input folder on the mounted Drive (IPython shell escape).
!ls "/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs"

In [None]:
# Read the interaction pairs and the article metadata from Drive.
# Exact duplicate rows are removed from both pair tables right after loading.
df_read_pairs = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/read_pairs.csv") #['URL','clientid_hashed']
df_read_pairs = df_read_pairs.drop_duplicates()

df_clicked_pairs = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/clicked_pairs.csv")
df_clicked_pairs = df_clicked_pairs.drop_duplicates()

df_article_data = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/clean_article_data.csv")

In [None]:
#df_read_pairs = df_read_pairs.sort_values(['clientid_hashed', 'URL', 'Confidence_level'], axis = 0).drop_duplicates(['clientid_hashed','URL'],keep='last')

In [None]:
# Keep a single row per (client, article) pair; when a pair occurs several times the last row wins.
df_read_pairs = df_read_pairs.drop_duplicates(
    subset=['clientid_hashed', 'URL'],
    keep='last',
)

# Find useful data, to decrease sparsity

In [None]:
# Minimum number of interactions a client must have to be kept.
# In this notebook the bound is applied to the per-client read counts only.
lower_bound_client = 7
# An equivalent per-item bound is currently disabled:
#lower_bound_item = 7

In [None]:
# Number of read / clicked articles per client.
# rename_axis(...).reset_index(name=...) names both output columns explicitly.
# The previous rename({'index': ..., 'clientid_hashed': ...}) relied on the column
# labels produced by value_counts().reset_index(), which changed in pandas 2.0
# (the counts column is now called 'count'), so on newer pandas the client ids
# ended up in the count column.
df_read_count = (
    df_read_pairs['clientid_hashed']
    .value_counts()
    .rename_axis('clientid_hashed')
    .reset_index(name='read_count')
)
df_read_count['read_count'] = df_read_count['read_count'].astype(int)

# value_counts() already yields one row per client, so no de-duplication is needed here.
df_clicked_count = (
    df_clicked_pairs['clientid_hashed']
    .value_counts()
    .rename_axis('clientid_hashed')
    .reset_index(name='clicked_count')
)
df_clicked_count['clicked_count'] = df_clicked_count['clicked_count'].astype(int)

#Part of dataset that meets the requirement
df_read_bound = df_read_count[df_read_count['read_count'] >= lower_bound_client]

#All the clientids that meet the requirement
client_list_read = df_read_bound['clientid_hashed'].values

# Sorted list of retained clients; its order defines the row order of every matrix built later.
client_list = sorted(client_list_read.tolist())

In [None]:
# Restrict both pair tables to the retained clients (those with >= lower_bound_client reads)
df_read_pairs = df_read_pairs[df_read_pairs['clientid_hashed'].isin(client_list)]
df_clicked_pairs = df_clicked_pairs[df_clicked_pairs['clientid_hashed'].isin(client_list)]

# Number of reads per article among the retained clients.
# Columns are named explicitly so the result is the same on pandas 1.x and 2.x
# (the labels produced by value_counts().reset_index() changed in pandas 2.0).
df_read_items_count = (
    df_read_pairs['URL']
    .value_counts()
    .rename_axis('URL')
    .reset_index(name='read_count')
)

# Sorted list of articles that were read at least once; its order defines the
# column order of every matrix built later.
item_list_read = df_read_items_count['URL'].values
item_list = sorted(item_list_read.tolist())

In [None]:
#Filter the data to meet the requirement
df_read_data = df_read_pairs
df_clicked = df_clicked_pairs[
    df_clicked_pairs['clientid_hashed'].isin(client_list)
    & df_clicked_pairs['URL'].isin(item_list)
]

# Leave-one-out split: the last row of every client is held out as test data.
df_test_set = df_read_data.drop_duplicates(['clientid_hashed'], keep = 'last')

# Training data = every read row whose index label is not in the test set.
# This replaces the element-wise `df[~df.isin(test)].dropna()` mask, which selected
# the same rows but pushed every value through NaN masking (upcasting numeric
# columns to float). dropna() is kept so rows with missing values are still
# excluded from training, as before.
df_train_set = df_read_data.drop(index=df_test_set.index).dropna()

# Remove clicked pairs that also occur as a read pair, first against the test
# set and then against the training set. A pair is treated as "not read" when
# the left merge finds no Confidence_level for it.
pair_key = ['URL', 'clientid_hashed']

df_clicked = df_clicked[pair_key]
df_clicked_testing = pd.merge(left = df_clicked, right = df_test_set, on = pair_key, how = 'left')
df_clicked = df_clicked_testing[df_clicked_testing['Confidence_level'].isna()]

df_clicked = df_clicked[pair_key]
df_clicked_training = pd.merge(left = df_clicked, right = df_train_set, on = pair_key, how = 'left')
df_clicked = df_clicked_training[df_clicked_training['Confidence_level'].isna()]

df_clicked = df_clicked[pair_key]
df_read = df_train_set

# Calculate occurrence matrices

In [None]:
# Build the client x item occurrence matrices:
#   df_read_matrix       1 where the client read the article (training data)
#   df_weightRead_matrix Confidence_level of that read
#   df_clicked_matrix    1 where the client clicked but did not read the article
#
# The matrices are filled as NumPy arrays and wrapped in DataFrames at the end.
# Positions come from dictionaries instead of list.index(), which rescanned the
# whole list for every row; a missing client/URL still fails loudly (KeyError).
client_position = {clientid: pos for pos, clientid in enumerate(client_list)}
item_position = {url: pos for pos, url in enumerate(item_list)}

matrix_shape = (len(client_list), len(item_list))
read_values = np.zeros(matrix_shape)
weight_values = np.zeros(matrix_shape)
clicked_values = np.zeros(matrix_shape)

#Loop through all occurrences in read list
for url, clientid, Conf_L in zip(df_read['URL'], df_read['clientid_hashed'], df_read['Confidence_level']):
    client_loc = client_position[clientid]
    item_loc = item_position[url]
    read_values[client_loc, item_loc] = 1
    weight_values[client_loc, item_loc] = Conf_L

#Loop through all occurrences in clicked list
for url, clientid in zip(df_clicked['URL'], df_clicked['clientid_hashed']):
    clicked_values[client_position[clientid], item_position[url]] = 1

#Set values of clicked matrix to 0 where both click and read are 1.
# The previous `df.iloc[row][col] = 0` was chained indexing: the assignment could
# land on a temporary row object and leave the matrix unchanged. Writing through
# a boolean mask on the array always modifies the matrix itself.
read_and_clicked = (clicked_values + read_values) == 2
dup_users, dup_items = np.where(read_and_clicked)
clicked_values[read_and_clicked] = 0

df_read_matrix = pd.DataFrame(read_values)
df_weightRead_matrix = pd.DataFrame(weight_values)
df_clicked_matrix = pd.DataFrame(clicked_values)
 

In [None]:
#Check if matrix has same number of elements in read matrix
# Sanity check: the number of ones in the read matrix must equal the number of
# distinct (URL, client) pairs in the training data.
ones_in_read_matrix = sum(df_read_matrix.sum())
distinct_read_pairs = df_read.drop_duplicates(['URL', 'clientid_hashed'])
ones_in_read_matrix == len(distinct_read_pairs)

# ALS

In [None]:
#Hyper parameters
# Seed both random number generators. The factor matrices are drawn with
# np.random.normal, which is not affected by random.seed(), so seeding only the
# standard-library generator left the initialization non-reproducible.
seed_value = sum([ord(c) for c in "KNAB"])
random.seed(seed_value)
np.random.seed(seed_value)

max_epochs = 50 #number of top-level iterations

initialize_loc = 0.1 #mean of random initialization
intialize_scale = 0.01 #standard deviation of random initialization (name kept as-is; other cells use it)

number_of_clients = len(client_list)
number_of_items = len(item_list)

factors = 32 #number of factors
reg = 2 #Regularization parameter
gamma_1 = 0.35 #Margin constant used in the loss and factor updates
gamma_2 = 0.35 #Margin constant used in the loss and factor updates
S_0 = 800 #Weight parameter negative missing
alpha = 1 #Weight power parameter negative missing
C_0 = 1 #Weight parameter clicked
beta = 1 #Weight power parameter clicked
w_0 = 1 #Weight of read pairs

In [None]:
# Display the number of clients (rows of the occurrence matrices).
number_of_clients

In [None]:
# Display the number of items (columns of the occurrence matrices).
number_of_items

## Initialize values

In [None]:
# Draw the initial latent factors from N(initialize_loc, intialize_scale):
# P holds one row per client, Q one row per item.
normal_params = dict(loc=initialize_loc, scale=intialize_scale)
P = np.random.normal(size=[number_of_clients, factors], **normal_params)
Q = np.random.normal(size=[number_of_items, factors], **normal_params)

# Read occurrences (R) and click occurrences (C) as plain arrays
R = df_read_matrix.values
C = df_clicked_matrix.values

# Scratch buffers for per-user / per-item predictions, used by the update routines
prediction_users = [0 for _ in range(number_of_clients)]
prediction_users_clicked = [0 for _ in range(number_of_clients)]
prediction_items = [0 for _ in range(number_of_items)]

In [None]:
#Uniform weighting
#S_i = [0] * number_of_items
#C_i = [0] * number_of_items

#for i in range(number_of_items):
#    S_i[i] = S_0 / number_of_items
#    C_i[i] = C_0 / number_of_items

## Calculate weights for ALS

In [None]:
def _popularity_weights(interactions, total_weight, power):
    """Spread ``total_weight`` over the items proportionally to popularity**power.

    interactions -- 2-D array (users x items); column sums are the item counts
    total_weight -- value the returned weights add up to
    power        -- exponent applied to every strictly positive popularity share

    Returns ``(share, count_total, share_total, weights)``:
    the powered shares, the sum of the raw counts, the sum of the powered shares
    and the final per-item weights. When there are no interactions at all every
    weight is 0 (instead of the NaN a 0/0 division would produce).
    """
    counts = np.asarray(interactions, dtype=float).sum(axis=0)
    count_total = counts.sum()
    if count_total == 0:
        return np.zeros_like(counts), count_total, 0., np.zeros_like(counts)

    share = counts / count_total
    positive = share > 0
    share[positive] = np.power(share[positive], power)
    share_total = share.sum()
    return share, count_total, share_total, total_weight * share / share_total


#S_i: weight of a missing (unobserved) entry per item, from read popularity
p_values, som, Z, S_values = _popularity_weights(R, S_0, alpha)
p = list(p_values)
S_i = list(S_values)

# Summary statistics of S_i. Wimin is now the actual minimum; it used to start
# at 0 and could therefore never move, because the weights are non-negative.
Wimin = float(S_values.min()) if len(S_values) else 0.
Wimax = float(S_values.max()) if len(S_values) else 0.
N0 = float(np.count_nonzero(S_values == 0))  #number of items with zero weight


#C_i: weight per item for clicked entries, from click popularity
pv_values, som1, Z1, C_values = _popularity_weights(C, C_0, beta)
pv = list(pv_values)
C_i = list(C_values)

## Fast V-ALS

In [None]:
#USER CACHES
def caches(P, Q):
    """Compute the per-user and per-factor caches used by the loss and the updates.

    P -- user factors, one row per client
    Q -- item factors, one row per item

    Reads the module-level matrices R (reads) and C (clicks) and the item
    weights S_i and C_i.

    Returns, in this order:
      C_u  -- per user: sum of C_i over the user's clicked items
      V_u  -- per user: number of clicked items
      R_u  -- per user: row sum of R
      GR   -- per user: zeros (filled in later by update_item)
      GvR  -- per user: sum of predictions P[u].Q[i] over clicked items
      LvR  -- per user: C_i-weighted sum of those predictions
      T    -- P^T LvR
      Told -- independent copy of T
      EU   -- P^T P
      DU   -- P^T C_u
      HU   -- P^T diag(C_u) P
      EV   -- Q^T Q
      DV   -- column sums of Q
      HV   -- Q^T diag(S_i) Q

    The same quantities were previously accumulated with nested Python loops
    (factors x factors x entities); they are now expressed as matrix products,
    which gives the same values up to floating-point rounding.
    """
    click_weights = np.asarray(C_i, dtype=float)
    missing_weights = np.asarray(S_i, dtype=float)
    clicked = (C == 1)

    #USER CACHES
    C_u = clicked.astype(float) @ click_weights  #Sum of weights clicked per user
    V_u = clicked.sum(axis=1).astype(float)      #Number of clicked per user
    R_u = np.asarray(R, dtype=float).sum(axis=1) #Number of read per user
    GR = np.zeros(P.shape[0])

    # Predictions restricted to clicked entries. This is a dense users x items
    # array, i.e. the same size as R and C.
    clicked_predictions = np.where(clicked, P @ Q.T, 0.)
    GvR = clicked_predictions.sum(axis=1)         #Cache sum predictions
    LvR = clicked_predictions @ click_weights     #Cache clicked weighted sum predictions

    T = P.T @ LvR
    Told = np.copy(T)

    DU = P.T @ C_u                 #Importance of clicked factors per user
    EU = P.T @ P                   #E users from paper
    HU = P.T @ (P * C_u[:, None])

    DV = Q.sum(axis=0)             #Column sums of the item factors
    EV = Q.T @ Q                   #E items
    HV = Q.T @ (Q * missing_weights[:, None])

    return C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV

In [None]:
# def loss():
#   L = reg * (sum(sum(P**2)) + sum(sum(Q**2))) #Regularization part
#   for u in range(number_of_clients):
#     l = 0.
#     for i in np.where(R[u] == 1)[0]:
#       pred = np.dot(P[u], np.transpose(Q[i]))
#       l += pow(R[u,i] - pred, 2) - S_i[i]*pow(pred,2)
#     for i in np.where(C[u] == 1)[0]:
#       pred = np.dot(P[u], np.transpose(Q[i]))
#       l -= S_i[i] * pow(pred, 2)
#       for j in np.where(R[u] == 1)[0]:
#         pred_j = np.dot(P[u], np.transpose(Q[j]))
#         l += C_i[i] * pow(gamma_1 - (pred_j - pred),2) - pow(gamma_2 - (pred - pred_j),2)
#       for j in np.where(C[u] == 1)[0]:
#         pred_j = np.dot(P[u], np.transpose(Q[j]))
#         l -= C_i[i] * pow(gamma_2 - (pred - pred_j),2)
#       for j in range(number_of_items):
#         pred_j = np.dot(P[u], np.transpose(Q[j]))
#         l += C_i[i] * pow(gamma_2 - (pred - pred_j),2) 
    
#     for i in range(number_of_items):
#       pred = np.dot(P[u], np.transpose(Q[i]))
#       l += S_i[i] * pow(pred, 2)
    
#     L += l
#   return L

In [None]:
def loss(C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV, P, Q):
    """Evaluate the objective for factors P and Q using the caches from caches().

    Only C_u, V_u, R_u, GvR, LvR, EV, DV and HV are read; the remaining cache
    arguments are accepted so the function can be called with the full cache
    tuple. R, C, the item weights and the hyper parameters are module-level.
    Returns the regularization term plus the sum of the per-user losses.
    """
    read_weights = df_weightRead_matrix.values
    margin_sum = gamma_1 + gamma_2

    total = reg * (sum(sum(P**2)) + sum(sum(Q**2))) #Regularization part

    for u in range(number_of_clients):
        user_vec = P[u]
        not_clicked = number_of_items - V_u[u]
        user_loss = 0.

        # Read items
        for i in np.where(R[u] == 1)[0]:
            score = np.dot(user_vec, np.transpose(Q[i]))
            user_loss += read_weights[u,i] * (R[u,i] - score) ** 2
            user_loss -= S_i[i] * score ** 2
            user_loss -= 2 * C_u[u] * margin_sum * score

        # Clicked items
        for i in np.where(C[u] == 1)[0]:
            score = np.dot(user_vec, np.transpose(Q[i]))
            user_loss -= (S_i[i] + C_u[u]) * score ** 2
            user_loss += not_clicked * C_i[i] * score ** 2

        # Terms that only need the cached aggregates
        dv_coefficient = 2 * gamma_2 * C_u[u] - 2 * LvR[u]
        for k in range(factors):
            user_loss += dv_coefficient * user_vec[k] * DV[k]

        user_loss += np.dot(np.dot(HV, user_vec), user_vec)
        user_loss += C_u[u] * (gamma_1 ** 2 - gamma_2 ** 2) * R_u[u]
        user_loss += 2 * margin_sum * R_u[u] * LvR[u]
        user_loss -= 2 * C_u[u] * gamma_2 * GvR[u]
        user_loss += 2 * GvR[u] * LvR[u]
        user_loss += C_u[u] * not_clicked * gamma_2 ** 2
        user_loss -= 2 * gamma_2 * not_clicked * LvR[u]
        user_loss += C_u[u] * np.dot(np.dot(EV, user_vec), user_vec)

        total += user_loss

    return total

In [None]:
def update_user(P, Q, u, NN, old_loss):
    """Update the latent factors of user ``u`` coordinate by coordinate.

    P, Q     -- current user / item factor matrices (neither is modified)
    u        -- row index of the user to update
    NN       -- when True, negative factor values are clipped to 0
    old_loss -- loss before the update; the update is rolled back if the loss
                increases or becomes negative

    Returns the cache tuple produced by caches() (with the entries of user ``u``
    refreshed) followed by ``P, Q, new_loss``. The returned P is a new array.
    ``new_loss == old_loss`` together with unchanged factors signals that the
    update was skipped or rolled back.

    Fixes compared to the previous version:
      * the function works on a private copy of P; before, ``P_old`` was an alias
        of the array being modified, so the rollback restored nothing;
      * ``old_vector`` is a copy of the row, so the incremental DU/EU/HU updates
        really subtract the old values;
      * a user without reads is detected with ``len(...) == 0`` (``sum`` of the
        indices is also 0 for a user whose only read is item 0) and the function
        returns the usual tuple instead of ``None``.
    """
    retry = True
    new_loss = old_loss + 1
    P_old = P
    Q_old = Q
    P = np.copy(P_old)  # private working copy; the caller's array stays untouched
    learning_rate = 2
    threshold = 1
    read_weights = df_weightRead_matrix.values

    while (new_loss > old_loss and new_loss > 0) or retry:
        retry = False
        learning_rate /= 2

        C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV = caches(P_old, Q_old)

        # With learning_rate starting at 2 and threshold 1 this triggers on the
        # second pass, i.e. after one rolled-back attempt.
        if learning_rate < threshold:
            new_loss = old_loss
            break

        #Get row from R
        rating_items = R[u]
        read_items = np.where(rating_items == 1)[0]
        w_items = w_0 * read_weights[u]

        #Get row from C
        click_items = C[u]
        clicked_items = np.where(click_items == 1)[0]

        #Memory (a copy: P[u] itself is overwritten below)
        old_vector = P[u].copy()

        #If the user has not read any articles, return without update
        if len(read_items) == 0:
            return C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV, P_old, Q, old_loss

        # Prediction cache for the read and clicked items of this user
        for i in read_items:
            prediction_items[i] = np.dot(P[u], np.transpose(Q[i]))

        for i in clicked_items:
            prediction_items[i] = np.dot(P[u], np.transpose(Q[i]))

        for f in range(factors):
            numer = 0.
            denom = 0.
            dv_rest = 0.  # sum over k != f of P[u,k] * DV[k]
            for k in range(factors):
                if k != f:
                    numer += P[u,k] * HV[f,k] + C_u[u] * P[u,k] * EV[f,k]
                    dv_rest += P[u,k] * DV[k]

            for i in read_items:
                # remove the contribution of factor f from the cached prediction
                prediction_items[i] -= P[u,f] * Q[i,f]
                numer += -(w_items[i] * rating_items[i] - (w_items[i] - S_i[i]) * prediction_items[i]) * Q[i,f] - (gamma_1 + gamma_2) * C_u[u] * Q[i,f]
                denom += (w_items[i] - S_i[i]) * Q[i,f] * Q[i,f]

            denom += HV[f,f] + reg

            #Initialize counters
            cq = 0.
            r = 0.
            cr = 0.
            q = 0.

            for i in clicked_items:
                prediction_items[i] -= P[u,f] * Q[i,f]
                numer += -S_i[i] * Q[i,f] * prediction_items[i]
                numer += (number_of_items - V_u[u]) * C_i[i] * Q[i,f] * prediction_items[i]
                numer += ((gamma_1 + gamma_2) * R_u[u] - gamma_2 * (number_of_items - V_u[u])) * C_i[i] * Q[i,f]
                numer -= C_u[u] * Q[i,f] * prediction_items[i]
                numer -= gamma_2 * C_u[u] * Q[i,f]
                numer -= dv_rest * C_i[i] * Q[i,f]
                numer -= C_i[i] * prediction_items[i] * DV[f]

                cq += C_i[i] * Q[i,f]
                r += prediction_items[i]
                cr += C_i[i] * prediction_items[i]
                q += Q[i,f]

                denom += - S_i[i] * Q[i,f] * Q[i,f]
                denom += (number_of_items - V_u[u]) * C_i[i] * Q[i,f] * Q[i,f]
                denom -= C_u[u] * Q[i,f] * Q[i,f]
                denom -= 2 * C_i[i] * Q[i,f] * DV[f]

            numer += cq * r + cr * q + gamma_2 * C_u[u] * DV[f]
            denom += C_u[u] * EV[f,f] + 2 * cq * q

            update = -numer/denom

            #Update Factor
            if NN == True:
                if update >= 0:
                    P[u,f] = update
                else:
                    P[u,f] = 0
            else:
                P[u,f] = update

            #Update Prediction Cache (add back factor f with its new value)
            for i in read_items:
                prediction_items[i] += P[u,f] * Q[i,f]
            for i in clicked_items:
                prediction_items[i] += P[u,f] * Q[i,f]
        #end for f

        #Update Cache
        tmp1 = 0.
        tmp2 = 0.
        for i in clicked_items:
            tmp1 += prediction_items[i]
            tmp2 += C_i[i] * prediction_items[i]

        GvR[u] = tmp1
        LvR[u] = tmp2

        for f in range(factors):
            val = P[u,f] * LvR[u]
            # NOTE(review): for u == 0 the freshly computed T/Told are replaced
            # rather than incremented; kept as in the original, confirm intent.
            if u == 0:
                T[f] = val
                Told[f] = val
            else:
                T[f] += val
                Told[f] += val

            val0 = DU[f] - old_vector[f] * C_u[u] + P[u,f] * C_u[u]
            DU[f] = val0

            for k in range(f+1):
                val1 = EU[f,k] - old_vector[f] * old_vector[k] + P[u,f] * P[u,k]
                EU[f,k] = val1
                EU[k,f] = val1

                val2 = HU[f,k] - old_vector[f] * old_vector[k] * C_u[u] + P[u,f] * P[u,k] * C_u[u]
                HU[f,k] = val2
                HU[k,f] = val2

        new_loss = loss(C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV, P, Q)
        diff_loss = old_loss - new_loss

        # Roll back when the loss got worse or turned negative, then loop once more
        if diff_loss < 0. or new_loss < 0.:
            P = np.copy(P_old)
            retry = True

    return C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV, P, Q, new_loss

In [None]:
def update_item(P, Q, i, NN, old_loss):
    """Update the latent factors of item ``i`` coordinate by coordinate.

    P, Q     -- current user / item factor matrices (neither is modified)
    i        -- row index in Q of the item to update
    NN       -- when True, negative factor values are clipped to 0
    old_loss -- loss before the update; the update is rolled back if the loss
                increases or becomes negative

    Returns the cache tuple produced by caches() (refreshed for item ``i``)
    followed by ``P, Q, new_loss``. The returned Q is a new array.
    ``new_loss == old_loss`` together with unchanged factors signals that the
    update was skipped or rolled back.

    Fixes compared to the previous version:
      * the function works on a private copy of Q and ``old_vector`` is a copy
        of the row, so rollback and the incremental EV/HV updates are effective
        (both used to alias the array being modified);
      * the cross-factor term runs over all k != f; it used ``range(f)`` and so
        dropped every k > f, unlike the matching loop in update_user;
      * the prediction cache and the DV/GR/GvR/LvR/T caches are refreshed after
        every factor, as in update_user; they were dedented out of the factor
        loop and therefore only refreshed for the last factor, while the
        contribution of every factor was still subtracted;
      * an item without reads returns the usual tuple instead of ``None``.
    NOTE(review): the second and third fix follow the structure of update_user;
    please confirm against the derivation of the update rule.
    """
    retry = True
    new_loss = old_loss + 1
    P_old = P
    Q_old = Q
    Q = np.copy(Q_old)  # private working copy; the caller's array stays untouched
    learning_rate = 2
    threshold = 1
    read_weights = df_weightRead_matrix.values

    while (new_loss > old_loss and new_loss > 0) or retry:
        retry = False
        learning_rate /= 2

        C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV = caches(P_old, Q_old)

        # With learning_rate starting at 2 and threshold 1 this triggers on the
        # second pass, i.e. after one rolled-back attempt.
        if learning_rate < threshold:
            new_loss = old_loss
            break

        rating_users = R[:,i]
        w_users = w_0 * read_weights[:,i]
        user_list = np.where(rating_users == 1)[0]

        click_users = C[:,i]
        user_click_list = np.where(click_users == 1)[0]

        R_i = len(user_list)
        V_i = len(user_click_list)

        #If nobody read the item, return without update
        if R_i == 0:
            return C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV, P, Q_old, old_loss

        ## prediction cache for the item
        for u in user_list:
            prediction_users[u] = np.dot(P[u], np.transpose(Q[i]))

        ind_u = np.array(user_click_list, dtype=int)  # users that clicked item i
        GRold = np.zeros(V_i)
        GvRold = np.zeros(V_i)
        LvRold = np.zeros(V_i)
        DVold = 0.

        for u in user_click_list:
            prediction_users[u] = np.dot(P[u], np.transpose(Q[i]))
            GR[u] = 0
            for k in range(factors):
                GR[u] += P[u,k] * DV[k]
                # take this item's contribution out of Told; it is added back
                # with the new predictions after the factor loop
                Told[k] = Told[k] - C_i[i] * P[u,k] * prediction_users[u]

        #Memory (a copy: Q[i] itself is overwritten below)
        old_vector = Q[i].copy()

        for f in range(factors):
            # Caches without the contribution of item i / factor f
            for n in range(V_i):
                GRold[n] = GR[ind_u[n]] - P[ind_u[n],f] * DV[f]
                GvRold[n] = GvR[ind_u[n]] - prediction_users[ind_u[n]]
                LvRold[n] = LvR[ind_u[n]] - C_i[i] * prediction_users[ind_u[n]]
            DVold = DV[f] - Q[i,f]

            numer = 0.
            denom = 0.

            for k in range(factors):
                if k != f:
                    numer += Q[i,k] * EU[f,k] * S_i[i]
                    numer += Q[i,k] * HU[f,k]

            for u in user_list:
                # remove the contribution of factor f from the cached prediction
                prediction_users[u] -= P[u,f] * Q[i,f]
                numer += -(w_users[u] * rating_users[u] - (w_users[u] - S_i[i]) * prediction_users[u]) * P[u,f]
                numer -= (gamma_1 + gamma_2) * C_u[u] * P[u,f]
                denom += (w_users[u] - S_i[i]) * P[u,f] * P[u,f]

            denom += S_i[i] * EU[f,f] + HU[f,f] + reg

            for u in user_click_list:
                prediction_users[u] -= P[u,f] * Q[i,f]
                numer += -S_i[i] * prediction_users[u] * P[u,f]
                numer -= C_i[i] * GR[u] * P[u,f]
                numer += C_i[i] * GvR[u] * P[u,f]
                numer += LvR[u] * P[u,f]
                numer += C_i[i] * (prediction_users[u] * (number_of_items - V_u[u]) + (gamma_1 + gamma_2) * R_u[u] - gamma_2 * (number_of_items - V_u[u])) * P[u,f]
                numer -= (prediction_users[u] + gamma_2) * C_u[u] * P[u,f]
                denom += -S_i[i] * P[u,f] * P[u,f]
                denom += C_i[i] * (number_of_items - V_u[u]) * P[u,f] * P[u,f]
                denom -= C_u[u] * P[u,f] * P[u,f]

            numer += - T[f] + gamma_2 * DU[f]

            update = -numer/denom

            #Update Factor
            if NN == True:
                if update >= 0:
                    Q[i,f] = update
                else:
                    Q[i,f] = 0
            else:
                Q[i,f] = update

            #Update Prediction Cache (add back factor f with its new value)
            for u in user_list:
                prediction_users[u] += P[u,f] * Q[i,f]

            for u in user_click_list:
                prediction_users[u] += P[u,f] * Q[i,f]

            tf = 0.
            tfp1 = 0.

            # index of the next factor, wrapping around to 0 after the last one
            fp1 = f + 1
            if fp1 >= factors:
                fp1 = 0

            DV[f] = DVold + Q[i,f]
            for n in range(V_i):
                GR[ind_u[n]] = GRold[n] + P[ind_u[n],f] * DV[f]
                GvR[ind_u[n]] = GvRold[n] + prediction_users[ind_u[n]]
                LvR[ind_u[n]] = LvRold[n] + C_i[i] * prediction_users[ind_u[n]]
                tf += C_i[i] * P[ind_u[n],f] * prediction_users[ind_u[n]]
                tfp1 += C_i[i] * P[ind_u[n],fp1] * prediction_users[ind_u[n]]

            Told[f] = T[f] - tf
            T[fp1] = Told[fp1] + tfp1
        #end for f

        # Put this item's contribution back into Told with the new predictions
        for u in user_click_list:
            for k in range(factors):
                Told[k] = Told[k] + C_i[i] * P[u,k] * prediction_users[u]

        #Update Cache
        for f in range(factors):
            for k in range(f+1):
                val1 = EV[f,k] - old_vector[f] * old_vector[k] + Q[i,f] * Q[i,k]
                EV[f,k] = val1
                EV[k,f] = val1

                val2 = HV[f,k] - old_vector[f] * old_vector[k] * S_i[i] + Q[i,f] * Q[i,k] * S_i[i]
                HV[f,k] = val2
                HV[k,f] = val2

        new_loss = loss(C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV, P, Q)
        diff_loss = old_loss - new_loss

        # Roll back when the loss got worse or turned negative, then loop once more
        if diff_loss < 0. or new_loss < 0.:
            Q = np.copy(Q_old)
            retry = True

    return C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV, P, Q, new_loss

In [None]:
# Alternate between a sweep over all users and a sweep over all items until the
# loss stops changing by more than `thresh`, or until max_epochs sweeps were done.
thresh = 0.1
diffloss = 1
iterloss = 1  # defined up front; it used to be read in the loop condition before its first assignment

#Top level iteration with stop criterion max epochs
C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV = caches(P, Q)
oldloss = loss(C_u, V_u, R_u, GR, GvR, LvR, T, Told, EU, DU, HU, EV, DV, HV, P, Q)
print(f'Start loss {oldloss}')
j = 0

while (abs(diffloss) > thresh or abs(iterloss) > thresh) and j < max_epochs:
    epochloss = oldloss

    #Update client factors
    for u in range(number_of_clients):
        # Real snapshot (not an alias) so a rejected update can be undone even
        # if the update routine modified P in place.
        P_backup = np.copy(P)
        result = update_user(P, Q, u, NN = False, old_loss = oldloss)
        if result is None:
            # the update routine skipped this user
            P = P_backup
            continue

        P_new, Q_new, newloss = result[-3:]
        if newloss < oldloss:
            P, Q = P_new, Q_new
            oldloss = newloss
        else:
            # no improvement (or a NaN loss): keep the previous factors
            P = P_backup
    print(f'P_{j} loss {oldloss}')
    userloss = oldloss

    #Update item factors
    # This sweep used to call update_user(P, Q, u, ...) with the stale `u` of
    # the loop above, so the item factors were never updated.
    for i in range(number_of_items):
        Q_backup = np.copy(Q)
        result = update_item(P, Q, i, NN = False, old_loss = oldloss)
        if result is None:
            # the update routine skipped this item
            Q = Q_backup
            continue

        P_new, Q_new, newloss = result[-3:]
        if newloss < oldloss:
            P, Q = P_new, Q_new
            oldloss = newloss
        else:
            Q = Q_backup
    print(f'Q_{j} loss {oldloss}')
    itemloss = oldloss
    diffloss = userloss - itemloss
    iterloss = epochloss - itemloss
    j += 1

if abs(diffloss) <= thresh and abs(iterloss) <= thresh:
    print(f'Convergence after {j} iterations')
else:
    print(f'Stopped after {j} iterations without convergence (max_epochs reached)')
  
  

In [None]:
# Predicted score for every (client, item) pair: inner product of the learned factors
R_est = np.dot(P, Q.T)

In [None]:
# Persist the learned user factors to Drive. P is passed positionally, so
# np.savez stores it under its default key `arr_0`.
np.savez("/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/P_matrix_15.npz",P)

In [None]:
# Persist the learned item factors to Drive. Q is passed positionally, so
# np.savez stores it under its default key `arr_0`.
np.savez("/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/Q_matrix_15.npz", Q)

In [None]:
# Items a client already read or clicked in the training data must not be
# recommended again: set their predicted score to 0.
already_seen = (R == 1) | (C == 1)
R_est[already_seen] = 0

In [None]:
# Hit rate @N for N = 3..10: share of test pairs whose held-out article is
# among the client's N highest predicted scores.
top_n_values = range(3,11)
hitrate = np.zeros(len(top_n_values))
n_test_pairs = len(df_test_set)

# Matrix coordinates and predicted score of every test pair (independent of N)
test_item_loc = np.zeros(n_test_pairs, dtype= int) #location of items in R_est
test_client_loc = np.zeros(n_test_pairs, dtype= int) #location of users in R_est
test_R_est = np.zeros(n_test_pairs) #estimated rating of user-item pair
for row, (url, clientid) in enumerate(zip(df_test_set['URL'], df_test_set['clientid_hashed'])):
    test_item_loc[row] = item_list.index(url)
    test_client_loc[row] = client_list.index(clientid)
    test_R_est[row] = R_est[test_client_loc[row], test_item_loc[row]]

for cnt, top_n in enumerate(top_n_values):
    hit = 0
    max_indices_test = [] #list for indices of top-N rated articles

    for client_loc, item_loc in zip(test_client_loc, test_item_loc):
        # indices of the top_n best scored items for this client, best first
        best_items = R_est[client_loc].argsort()[-top_n:][::-1]
        max_indices_test.append(best_items)
        if item_loc in best_items:
            hit += 1

    hitrate[cnt] = hit/n_test_pairs
    max_indices_test = np.array([max_indices_test])[0]

In [None]:
# Display the hit rates; entry k corresponds to N = k + 3 (N runs from 3 to 10).
hitrate