In [26]:
# LIBRARIES
import pandas as pd
import numpy as np
import networkx as nx
import itertools as it
from operator import itemgetter

from sklearn.metrics import silhouette_score
from sklearn.preprocessing import RobustScaler
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans

import sys
import random
sys.path.append('../src')

from _nmf_ACCSLP import NMF_ACCSLP


In [27]:
# CONSTANTS
# Minimum number of shared users required to link two items (passed to createLinks as its threshold).
WEIGHT_THRESHOLD = 1
# Fraction of the target user's interactions that is sampled out as the hidden evaluation set.
TEST_SET_RATIO = 0.4

# Number of entries of range_K that are evaluated; keep it equal to len(range_K).
K = 3
# Recommendation-list lengths (top-k cut-offs) to evaluate.
range_K = [3,5,10]

In [28]:
# Global pandas display/behaviour options used for the rest of the notebook.
pd.options.display.width = 0  # NOTE(review): presumably meant to let pandas choose the display width - confirm
pd.options.mode.chained_assignment = None  # silence the chained-assignment (SettingWithCopy) warning
pd.set_option('display.max_columns', 500)  # show up to 500 columns before truncating
pd.set_option('display.max_rows',500)  # show up to 500 rows before truncating

In [29]:
def build_community_information_matrix(cluster_assignments):
    """
    Build the binary community-membership matrix from cluster labels.

    Entry (i, j) is 1.0 when elements i and j were assigned the same cluster
    label and 0.0 otherwise (the diagonal is therefore always 1.0).

    Parameters
    ----------
    cluster_assignments : 1-D sequence (list, numpy array or pandas Series)
        Cluster label of every element. Elements are taken by POSITION, so a
        pandas Series with a non-default index is handled correctly.

    Returns
    -------
    numpy.ndarray of shape (n, n) and dtype float64.
    """
    # Work on the raw values so that positional order is used, never index labels.
    labels = np.asarray(cluster_assignments)

    # Broadcasting a column against a row compares every pair in one vectorised
    # step instead of an O(n^2) Python loop.
    same_cluster = labels[:, None] == labels[None, :]

    return same_cluster.astype(float)

In [30]:
# Applying k-means algorithm 
def apply_kmeans(features, n_clusters, kmeans_kwargs, x, y, feature_names):
    """
    Cluster `features` with k-means and return them together with their labels.

    Parameters
    ----------
    features : array-like accepted by KMeans.fit and pd.DataFrame.
    n_clusters : number of clusters to fit.
    kmeans_kwargs : dict of extra keyword arguments forwarded to KMeans.
    x, y : not used by this function.
    feature_names : column names given to the feature columns of the result.

    Returns
    -------
    pandas.DataFrame with the feature columns plus a "predicted_cluster"
    column holding the fitted label of each row.
    """
    model = KMeans(n_clusters=n_clusters, **kmeans_kwargs)
    model.fit(features)

    clustered = pd.DataFrame(features, columns=feature_names)
    clustered["predicted_cluster"] = model.labels_
    return clustered

In [31]:
def apply_rbf_to_get_similarity(features, gamma=1):
    """
    Return the pairwise RBF-kernel matrix of the rows of `features`.

    `gamma` is forwarded unchanged to sklearn's rbf_kernel; tune it as needed.
    """
    return rbf_kernel(features, gamma=gamma)

In [32]:
def compareNodes(f_list, s_list):
    """
        Return how many distinct values appear in both lists.

        In this notebook the lists hold the users that interacted with each of
        two items, so the result is the number of users the items share.
    """
    shared_values = np.intersect1d(f_list, s_list)
    return len(shared_values)
    
def createLinks(prob_us_set, nodes, threshold):
    """
        Build the list of graph links between items that share enough users.

        Every unordered pair of `nodes` is considered; a pair is kept when the
        number of users common to both (see compareNodes) is greater than or
        equal to `threshold`.

        prob_us_set: mapping item -> list of users that interacted with it
        nodes: items to pair up
        threshold: minimum number of shared users for a link

        Format of the returned list -> [(Node1, Node2), ......]
        (the weight is only used for filtering; it is not stored in the link).
    """
    return [
        (left, right)
        for left, right in it.combinations(nodes, 2)
        if compareNodes(prob_us_set[left], prob_us_set[right]) >= threshold
    ]

In [33]:
def create_graph_nx(list_nodes, list_links):
    """
        Create an undirected NetworkX Graph from a node list and a link list.

        All nodes are added first, so nodes without any link are still part of
        the graph. Links are passed as-is to Graph.add_edges_from.
        Graph construction: http://networkx.readthedocs.io/en/networkx-1.11/tutorial/tutorial.html#what-to-use-as-nodes-and-edges
    """
    network = nx.Graph()
    network.add_nodes_from(list_nodes)
    network.add_edges_from(list_links)
    return network

def build_adjacency_matrix(graph, list_nodes):
    """Return the adjacency matrix of `graph` as a numpy array, with rows/columns ordered as `list_nodes`."""
    adjacency = nx.to_numpy_array(graph, nodelist=list_nodes)
    return adjacency

In [34]:
def getKrecommendations(index, items_eval, item_index, items, S_prime):
    """
    Collect the candidate items suggested by one row of the prediction matrix.

    Parameters
    ----------
    index : row of `S_prime` to read.
    items_eval : items that must be excluded from the candidates.
    item_index : column to skip (the column of the item the row belongs to).
    items : item ids; items[j] is the item of column j.
    S_prime : 2-D prediction matrix.

    Returns
    -------
    list of (item, score) tuples for every column whose score is strictly
    positive, is not `item_index` and whose item is not in `items_eval`.
    """
    candidates = {
        items[column]: S_prime[index][column]
        for column in range(S_prime.shape[1])
        if column != item_index
        and S_prime[index][column] > 0.00
        and items[column] not in items_eval
    }
    return list(candidates.items())

In [35]:
def getWeighing(item, items_recom_with_values):
    """Return the sum of the values of every (item, value) pair whose item equals `item` (0 when there is none)."""
    return sum(score for candidate, score in items_recom_with_values if candidate == item)

In [36]:
def delRepetitions(lista):
    """
        Helper that removes repeated elements from a list of recommendations.

        The first occurrence of every element is kept and the original order
        is preserved.
    """
    already_seen = set()  # gives a fast membership check
    unique_in_order = []

    for element in lista:
        if element in already_seen:
            continue
        already_seen.add(element)
        unique_in_order.append(element)

    return unique_in_order

In [37]:
def getListRecom(items_train, items_train_indexes, S_prime, k, items, items_eval):
    """
    Rank the items to recommend with a weighted voting scheme.

    Every row of `S_prime` votes for the items of its positive columns (see
    getKrecommendations). The weight of an item is the sum of the scores it
    received divided by the sum of all scores; items are returned sorted by
    decreasing weight.

    Parameters
    ----------
    items_train : items the target user has already interacted with; they are
        never recommended.
    items_train_indexes : for each row of `S_prime`, the column index of the
        training item that row belongs to.
    S_prime : prediction matrix restricted to the rows of the training items.
    k : not used here (the caller truncates the list).
    items : all item ids; items[j] is the item of column j.
    items_eval : not used here.

    Returns
    -------
    list of item ids, best first, without repetitions.
    """
    list_recommendations = []

    # Iterate over the index array, which is what S_prime's rows are aligned
    # with. Looping over items_train instead raised IndexError whenever that
    # list had a different length (e.g. repeated interactions with one item).
    for row_position, item_index in enumerate(items_train_indexes):
        # (item, similarity) tuples; the same item may show up several times
        list_recommendations.extend(
            getKrecommendations(row_position, items_train, item_index, items, S_prime)
        )

    # Sum of every similarity value, used to normalise the votes
    total = sum([x for (_, x) in list_recommendations])

    # Candidate items without repetitions and without values
    items_recom_no_values = list(set([item for (item, _) in list_recommendations]))

    # Weighted voting: (sum of the similarity values of the item) / total.
    # total is only used when there is at least one candidate, and candidates
    # always have strictly positive scores, so it cannot be zero here.
    items_recom = [(item, getWeighing(item, list_recommendations)/total) for item in items_recom_no_values]

    items_recom.sort(key=itemgetter(1), reverse=True)

    print ("Items a recomendar, con su peso: ", items_recom)

    # Keep only the item of each tuple and drop the ones the user already knows
    list_final = [item for (item, _) in items_recom if item not in items_train]

    # Remove repetitions, preserving the ranking order
    list_final = delRepetitions(list_final)

    print ("list_final", list_final)

    return list_final

In [38]:
def apply_getKrecommendations(df_new, items_train, items_train_indexes, S_prime, k, items, items_eval):
    """
    Add the recommendation columns to `df_new` (modified in place) and return it.

    'list_recommendations_original' receives, for every row, the full ranked
    list produced by getListRecom; 'list_recommendations' receives the first
    `k` elements of that list.
    """
    full_rankings = df_new.apply(
        lambda row: getListRecom(items_train, items_train_indexes, S_prime, k, items, items_eval),
        axis=1,
    )
    df_new['list_recommendations_original'] = full_rankings

    top_k_rankings = []
    for _, row in df_new.copy().iterrows():
        top_k_rankings.append(row['list_recommendations_original'][:k])
    df_new['list_recommendations'] = top_k_rankings

    return df_new

In [39]:
def filter_prediction_matrix(S_prime, items_eval_indexes):
    """Return the columns of `S_prime` selected by `items_eval_indexes`, in that order."""
    selected_columns = np.take(S_prime, items_eval_indexes, axis=1)
    return selected_columns

In [40]:
def calculateMetricsResults(list_recom_items, list_recom_items_original, user_list_to_recommend, list_train_items, list_eval_items, k):
    """
    Assemble the dataframe on which the evaluation metrics are computed.

    The train and evaluation item lists are each stored whole in a single
    cell; the recommendation arguments are used directly as columns. The
    dataframe is printed and returned. `k` is not used.
    """
    columns = {
        'user_id': user_list_to_recommend,
        'train_items': [list_train_items],
        'eval_items': [list_eval_items],
        'recom_items': list_recom_items,
        'recom_items_original': list_recom_items_original,
    }
    metric_df = pd.DataFrame(columns)
    print (metric_df)
    return metric_df

In [41]:
def write_results_file(dir, result, k):
    """
    Append one row of evaluation results to a CSV file.

    Parameters
    ----------
    dir : path of the results file (created if it does not exist).
    result : mapping with the keys 'one_hit', 'precision', 'mrr', 'recall'
        and 'f1'.
    k : cut-off the results were computed for; written as the first field.

    The header line is written only when the file is new or empty, so calling
    the function once per k yields a single header followed by one row per
    call. The file is always closed, even if a write fails.
    """
    metric_names = ('one_hit', 'precision', 'mrr', 'recall', 'f1')

    with open(dir, 'a') as results_file:
        # In append mode the stream is positioned at the end of the file, so a
        # position of 0 means there is no content (and no header) yet.
        if results_file.tell() == 0:
            results_file.write('K, one_hit, precision, mrr, recall, f1 \n')

        fields = [k] + [result[name] for name in metric_names]
        results_file.write(','.join(str(value) for value in fields) + '\n')

In [42]:
def one_hit(row):
    """
        One-hit metric for a single user.

        Returns 1 when at least one item of row['recom_items'] is also in
        row['eval_items'] (the user interacted with a recommended item),
        and 0 otherwise.
    """
    hits = np.intersect1d(row['recom_items'], row['eval_items'])
    return 1 if len(hits) >= 1 else 0

In [43]:
def mrr(row):
    """
        Reciprocal-rank metric for a single user:
        mrr = 1/rank, where rank is the 1-based position in
        row['recom_items'] of the first item that is in row['eval_items'].
        Returns 0 when no recommended item is correct.
    """
    recommended = row['recom_items']
    relevant = row['eval_items']

    # Nothing in common: there is no correct item to rank
    if len(np.intersect1d(recommended, relevant)) < 1:
        return 0

    # Walk the recommendations in order and stop at the first correct one
    for position in range(len(recommended)):
        if recommended[position] in relevant:
            return 1 / (position + 1)

    return 0


In [44]:
def precision(row):
    """
        Precision at k for a single user:
        (recommended items the user interacted with) / (all recommended items).
        Returns 0.0 when nothing was recommended.
    """
    recommended = row['recom_items']
    hits = np.intersect1d(recommended, row['eval_items'])

    if not len(recommended):
        return float(0)

    return float(len(hits) / len(recommended))

In [45]:
def recall(row):
    """
        Recall for a single user:
        (recommended items the user interacted with) / (all evaluation items).
        Returns 0 when there are no evaluation items.
    """
    relevant = row['eval_items']
    hits = np.intersect1d(row['recom_items'], relevant)

    if not len(relevant):
        return 0

    return len(hits) / len(relevant)

In [46]:
def f1(row):
    """
        F1 score of a row, computed from row['precision'] and row['recall'].
        Returns 0 when both are zero.
    """
    precision_value = row['precision']
    recall_value = row['recall']
    combined = precision_value + recall_value

    return 0 if combined == 0 else (2 * precision_value * recall_value) / combined

In [47]:
def calculateScoreResults(metric_df):
    """
        Compute every evaluation metric per row and return their averages.

        Adds the columns 'one_hit', 'mrr', 'precision', 'recall' and 'f1' to
        `metric_df` (in place) and returns a dictionary with the mean of each.
    """
    # f1 must come last: it reads the precision and recall columns
    row_metrics = (
        ('one_hit', one_hit),
        ('mrr', mrr),
        ('precision', precision),
        ('recall', recall),
        ('f1', f1),
    )
    for column, metric_function in row_metrics:
        metric_df[column] = metric_df.apply(metric_function, axis=1)

    # dictionary with the averaged results
    reported_order = ('one_hit', 'precision', 'mrr', 'recall', 'f1')
    return {name: metric_df[name].mean() for name in reported_order}

In [48]:
def main_process(items, user_items_train, user_items_train_indexes, df_interactions, user_items_eval, user):
    """
    Run the whole recommendation pipeline for one target user.

    Steps: build the item graph from shared users, build the community
    membership matrix (k-means) and the attribute similarity matrix (RBF)
    from the item features, predict the adjacency matrix with NMF_ACCSLP and
    turn the rows of the user's training items into ranked recommendations
    for every cut-off in `range_K`.

    Parameters
    ----------
    items : list of all item ids.
    user_items_train : items the user has interacted with.
    user_items_train_indexes : positions in `items` of those training items.
    df_interactions : dataframe of all (training) interactions; the columns
        'user' and 'item' are read.
    user_items_eval : items the user interacted with that were hidden from
        the training set.
    user : user id.

    Returns
    -------
    list with one metrics dataframe (see calculateMetricsResults) per value
    of `range_K`, in the same order.

    Notes
    -----
    Reads the notebook-level names `df_items`, `WEIGHT_THRESHOLD` and
    `range_K`, which must be defined before calling.
    """
    # items each user interacted with (index: user, values: list of items)
    grouped_user = df_interactions.groupby('user')['item'].apply(list)

    # BUILDING ADJACENCY MATRIX

    # 1. Every item of the catalogue is a node
    nodes = items

    # 2. For every item, the list of users that interacted with it
    grouped = df_interactions.groupby('item')['user'].apply(list)

    # 3. Only items with at least one interaction can be linked
    nodes_with_interaction = df_interactions.item.unique()

    # 4. Create the links in the format expected by networkx
    links = createLinks(grouped, nodes_with_interaction, WEIGHT_THRESHOLD)

    # 5. Create the graph and its adjacency matrix
    graph = create_graph_nx(nodes, links)
    adjacency_matrix = build_adjacency_matrix(graph, list_nodes=nodes)

    # GETTING DATA TO BUILD THE COMMUNITY MEMBERSHIP INFORMATION MATRIX AND ATTRIBUTE SIMILARITY MATRIX

    column_names = ["Actual_price_log", "Category_name_encoded_log"]
    features = df_items[column_names]

    scaler = RobustScaler()
    scaled_features = scaler.fit_transform(features)

    # BUILDING COMMUNITY MEMBERSHIP INFORMATION MATRIX

    kmeans_kwargs = {
        "init": "k-means++",
        "n_init": 10,
        "max_iter": 300,
    }

    # n_clusters must be selected according to elbow_method and silhouette_score
    n_clusters = 2
    kmeans_df = apply_kmeans(scaled_features, n_clusters, kmeans_kwargs,
            x=column_names[0], y=column_names[1], feature_names=column_names)
    community_matrix = build_community_information_matrix(cluster_assignments=kmeans_df["predicted_cluster"])

    # BUILDING THE ATTRIBUTE SIMILARITY MATRIX
    similarity_matrix = apply_rbf_to_get_similarity(features=scaled_features)

    # APPLYING ACCSLP MODEL FOR ADJACENCY MATRIX PREDICTIONS
    nmf_accslp = NMF_ACCSLP(verbose=1)

    U = nmf_accslp.fit_transform(S=adjacency_matrix, Z=similarity_matrix, X=community_matrix)
    H = nmf_accslp.components_

    S_prime = U @ H

    # Keep only the rows of the items the user interacted with
    S_prime = S_prime[user_items_train_indexes]

    user_list_to_recommend = [user]

    df_users = pd.DataFrame({'user_id':grouped_user.index, 'list_item_id':grouped_user.values})

    # Explicit copy: the frame is extended with new columns below and must not
    # be a view of df_users.
    df_new = df_users[df_users['user_id'].isin(user_list_to_recommend)].copy()

    # One recommendations dataframe per cut-off. apply_getKrecommendations
    # writes into df_new, hence the copy taken after every call.
    recommendations_per_k = [
        apply_getKrecommendations(df_new, user_items_train, user_items_train_indexes, S_prime, k, items, user_items_eval).copy()
        for k in range_K
    ]

    # Pair every cut-off with its own dataframe instead of looping over
    # range(K): K is an independent constant and indexing with it breaks (or
    # silently drops cut-offs) as soon as it differs from len(range_K).
    metrics_results = [
        calculateMetricsResults(
            recommendations['list_recommendations'].tolist(),
            recommendations['list_recommendations_original'].tolist(),
            user_list_to_recommend,
            user_items_train,
            user_items_eval,
            k,
        )
        for k, recommendations in zip(range_K, recommendations_per_k)
    ]

    return metrics_results

In [49]:
# Seed of the train/evaluation split, so the hidden items are the same on every run.
SPLIT_SEED = 42

df_items = pd.read_csv('./../datasets/reduced_electronic_products.csv')
df_interactions = pd.read_csv('./../datasets/interactions.csv')

# Target user: a fraction of their interactions is hidden for evaluation and
# removed from the interactions used for training.
user = 2
user_test_interactions = df_interactions[df_interactions.user == user].sample(frac=TEST_SET_RATIO, random_state=SPLIT_SEED)
df_interactions = df_interactions[~df_interactions.index.isin(user_test_interactions.index)]

items = df_items['id'].tolist()

user_items_eval = sorted(user_test_interactions["item"].tolist())
user_items_train = df_interactions[df_interactions.user == user]["item"].tolist()

# Positions in `items` of the training items (np.where returns a tuple with one array per axis)
user_items_train_indexes = np.where(np.isin(items, user_items_train))[0]

metric_results = main_process(items, user_items_train, user_items_train_indexes, df_interactions, user_items_eval, user)



Initializing  Matrix S
Initializing  Matrix X
Initializing  Matrix Z
Epoch 10 reached after 0.379 seconds, error: 17295636.049920
(previous_error - error) / error_at_init): 0.0015078607324064141
Epoch 20 reached after 0.718 seconds, error: 17295449.831717
(previous_error - error) / error_at_init): 1.075053909973802e-05
Items a recomendar, con su peso:  [(1, 0.2011453683354821), (0, 0.1960755275615854), (8, 0.1721595299126907), (5, 0.17192760209963465), (2, 0.11487511218472488), (6, 0.11342409779976768), (7, 0.030392762106114316)]
list_final [1, 0, 8, 5, 2, 6, 7]
Items a recomendar, con su peso:  [(1, 0.2011453683354821), (0, 0.1960755275615854), (8, 0.1721595299126907), (5, 0.17192760209963465), (2, 0.11487511218472488), (6, 0.11342409779976768), (7, 0.030392762106114316)]
list_final [1, 0, 8, 5, 2, 6, 7]
Items a recomendar, con su peso:  [(1, 0.2011453683354821), (0, 0.1960755275615854), (8, 0.1721595299126907), (5, 0.17192760209963465), (2, 0.11487511218472488), (6, 0.113424097799767

In [50]:
# metrics_results = [calculateScoreResults(metric_results[k]) for k in range(K)]
# [write_results_file("./../results/results.csv", metrics_results[k], range_K[k]) for k in range(K)]