Versión «apificada» del cuaderno `marIA-azure-16ago2024.ipynb`, con carga y guardado de matrices y modelos.

In [1]:
import io
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
#from scipy.sparse import csr_matrix
import scipy.sparse as sps
from tqdm.auto import tqdm

from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from Pipeline.utils import create_mapping, get_mapped_sessions_to_recommend, get_items_to_exclude, predict_my_purchase

# Register tqdm's progress-bar hooks (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [2]:
def create_csr_matrix(df, M, N):
    """Build an ``M x N`` CSR matrix from the interaction triples in ``df``.

    Parameters:
    - df: DataFrame with 'session_id' (row index), 'item_id' (column index)
      and 'score' (cell value) columns.
    - M: number of rows of the resulting matrix.
    - N: number of columns of the resulting matrix.

    Returns:
    - A ``scipy.sparse.csr_matrix`` of shape ``(M, N)``.
    """
    row_idx = df['session_id'].values
    col_idx = df['item_id'].values
    cell_values = df['score'].values
    return sps.csr_matrix((cell_values, (row_idx, col_idx)), shape=(M, N))

def split_dataframes_test(
        item_features_df, candidate_items_df, test_sessions_df,
        unique_interactions=True, view_weight=1,
):
    """Prepare the test interactions for building a session x item matrix.

    The caller's ``test_sessions_df`` is NOT modified: the score column, the
    sorting and the id remapping are all applied to a new DataFrame.

    Parameters:
    - item_features_df: DataFrame with an 'item_id' column; defines the item universe.
    - candidate_items_df: DataFrame with an 'item_id' column listing recommendable items.
    - test_sessions_df: DataFrame with 'session_id', 'item_id' and 'date' columns.
    - unique_interactions: when True, keep only the last row of every
      (session_id, item_id) pair.
    - view_weight: value written to the 'score' column of every interaction.

    Returns:
    - (prepared sessions DataFrame with mapped ids and a 'score' column,
       session mapping, item mapping, mapped ids of items to ignore,
       result of ``get_mapped_sessions_to_recommend``).
    """
    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain a 'score' column (the original wrote into the argument).
    test_sessions_df = (
        test_sessions_df
        .assign(score=view_weight)
        .sort_values(by=['session_id', 'date'], ascending=[True, True])
        .reset_index(drop=True)
    )

    # Both mappings are used below as dict-like lookups (original id -> mapped id).
    item_mapping = create_mapping(item_features_df['item_id'])
    test_session_mapping = create_mapping(test_sessions_df['session_id'])

    # Must run before the ids are remapped below: it receives the original session ids.
    mapped_test_sessions_df = get_mapped_sessions_to_recommend(test_sessions_df, test_session_mapping)

    recommendable_items = candidate_items_df['item_id'].values
    items_to_ignore = get_items_to_exclude(item_features_df, recommendable_items)

    mapped_items_to_ignore = [item_mapping[item] for item in items_to_ignore]

    # NOTE(review): ids missing from a mapping become NaN here; presumably the
    # test data only references known items - confirm against the data source.
    test_sessions_df['session_id'] = test_sessions_df['session_id'].map(test_session_mapping)
    test_sessions_df['item_id'] = test_sessions_df['item_id'].map(item_mapping)

    if unique_interactions:
        # Rows are sorted by date within a session, so 'last' keeps the most recent view.
        test_sessions_df = test_sessions_df.drop_duplicates(subset=['session_id', 'item_id'], keep='last')

    return test_sessions_df, test_session_mapping, item_mapping, mapped_items_to_ignore, mapped_test_sessions_df


def get_URM_test(
        item_features_df, candidate_items_df, test_sessions_df,
        unique_interactions=True, view_weight=1,
):
    """Build the test session x item matrix plus the lookup data that goes with it.

    Delegates the preparation to ``split_dataframes_test`` and turns the
    prepared interactions into a CSR matrix sized by the two mappings.

    Returns:
    - (URM of test views, mapped items to ignore, mapped sessions to recommend,
       session mapping, item mapping).
    """
    prepared = split_dataframes_test(
        item_features_df, candidate_items_df, test_sessions_df,
        unique_interactions=unique_interactions, view_weight=view_weight,
    )
    sessions_df, session_map, item_map, ignored_items, sessions_to_recommend = prepared

    # One row per mapped session, one column per mapped item.
    n_rows, n_cols = len(session_map), len(item_map)
    urm_views = create_csr_matrix(sessions_df, n_rows, n_cols)

    return urm_views, ignored_items, sessions_to_recommend, session_map, item_map

In [3]:
# Module-level placeholders; they stay None until a saved model is loaded,
# and later cells test them with `is None` to decide on fallbacks.
W_sparse = item_features_df = candidate_items_df = None
unique_interactions = purch_weight = view_weight = None

def load_model(folder_path, file_name=None):
    """Load the similarity matrix and its companion parameters from a pickle file.

    Parameters:
    - folder_path: directory containing the file (with or without a trailing separator).
    - file_name: pickle file name; defaults to 'RP3betaRecommender.pkl'.

    Returns:
    - (W_sparse, item_features_df, candidate_items_df, unique_interactions,
       purch_weight, view_weight), read from the keys of the pickled dict.

    Raises:
    - OSError if the file cannot be opened; KeyError if an expected key is missing.
    """
    if file_name is None:
        file_name = 'RP3betaRecommender.pkl'

    # os.path.join works whether or not folder_path ends with a separator
    # (plain string concatenation silently produced a wrong path otherwise).
    model_path = os.path.join(folder_path, file_name)

    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by a trusted source.
    with open(model_path, 'rb') as f:
        parameters = pickle.load(f)

    # The sparse matrix is stored as the raw bytes of an NPZ archive.
    npz_buffer = io.BytesIO(parameters['npz_data'])
    W_sparse = sps.load_npz(npz_buffer)

    # Remaining parameters are stored as-is in the pickled dict.
    candidate_items_df = parameters['candidate_items_df']
    item_features_df = parameters['item_features_df']
    unique_interactions = parameters['unique_interactions']
    purch_weight = parameters['purch_weight']
    view_weight = parameters['view_weight']

    # Strip the extension whatever its length (the original assumed 4 characters).
    model_name = os.path.splitext(file_name)[0]
    print (f"Datos del modelo {model_name} cargados!")

    return W_sparse, item_features_df, candidate_items_df, unique_interactions, purch_weight, view_weight

# Restore the persisted model state from the pickle in the current directory.
(
    W_sparse,
    item_features_df,
    candidate_items_df,
    unique_interactions,
    purch_weight,
    view_weight,
) = load_model("./", "RP3betaRecommender.pkl")
    

Datos del modelo RP3betaRecommender cargados!


In [4]:
# Display the item-feature table returned by load_model.
item_features_df

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75
...,...,...,...
471746,28143,68,351
471747,28143,55,390
471748,28143,11,109
471749,28143,73,91


In [5]:
# Start from an empty URM: the recommender is only used for inference here.
# `csr_matrix` is not imported by name in this notebook, so go through `sps`.
URM_train_dummy = sps.csr_matrix((0, 0), dtype=np.float32)

# Create the RP3betaRecommender instance
rp3_recommender = RP3betaRecommender(URM_train_dummy)

if W_sparse is None:
    # No pickle was loaded: fall back to the recommender's own zip file
    folder_path = "./"  # current directory
    file_name = "RP3betaRecommender.zip"

    # Let the instance load its data (the npz matrix) through its load_model method
    rp3_recommender.load_model(folder_path, file_name)
else:
    # Inject the similarity matrix restored from the pickle
    rp3_recommender.set_W_sparse(W_sparse)
    print("Cargados desde el fichero pkl!")


Cargados desde el fichero pkl!


In [6]:
# Test interactions; the 'date' column is parsed with per-value format inference.
test_sessions_df = (
    pd.read_csv('./datasets/test_full_sessions.csv', sep=',')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='mixed'))
)

test_purchases_df = pd.read_csv('./datasets/test_full_purchases.csv')

print(f'Número de interacciones de prueba: {len(test_sessions_df)}', test_sessions_df, sep='\n')

# Only read the catalogue files when no saved model provided them.
if candidate_items_df is None:
    candidate_items_df = pd.read_csv('./datasets/candidate_items.csv')
    item_features_df = pd.read_csv('./datasets/item_features.csv')

Número de interacciones de prueba: 12
    session_id  item_id                    date
0           26    19185 2021-06-16 09:53:54.158
1           61    27088 2021-06-01 08:12:39.664
2           61     5581 2021-06-01 08:12:40.534
3           96    11693 2021-06-19 17:48:05.227
4           96    18298 2021-06-19 17:49:08.589
5           96     4738 2021-06-19 17:49:15.838
6           96      495 2021-06-19 17:49:20.880
7           96    16052 2021-06-19 17:52:34.781
8           96     6871 2021-06-19 17:56:21.317
9      4439966    19483 2021-06-06 20:05:06.457
10     4439966    20782 2021-06-06 20:06:47.779
11     4439966    20782 2021-06-06 20:07:17.146


In [7]:
# Fall back to hand-picked settings when no saved model supplied them.
if unique_interactions is None:
    unique_interactions, view_weight = True, 0.2
    # purch_weight = 1  (left disabled, as in the original cell)

(
    URM_test_views,
    mapped_items_to_ignore,
    mapped_test_sessions_df,
    test_session_mapping,
    item_mapping,
) = get_URM_test(
    item_features_df,
    candidate_items_df,
    test_sessions_df,
    unique_interactions=unique_interactions,
    view_weight=view_weight,
)

In [8]:
# Hand the test-session matrix to the recommender in place of the empty dummy URM,
# then register the mapped ids of the items that must not be recommended.
rp3_recommender.set_URM_train(URM_test_views)
rp3_recommender.set_items_to_ignore(mapped_items_to_ignore)

In [9]:
# Number of recommendations requested per session.
cutoff = 5

dataframe_list = predict_my_purchase(
    models=[rp3_recommender],
    session_ids=mapped_test_sessions_df,
    add_item_score=True, # Originally False
    cutoff=cutoff, # Originally the top 100 results
)

Recommending...


  0%|          | 0/100 [00:00<?, ?it/s]

Done!


In [10]:
#print(dataframe_list[0][:10])
#print(dataframe_list[0][-10:])
#print(dataframe_list[0][10:20])
# Only one model was passed to predict_my_purchase, so take the first entry
# (presumably one DataFrame per model - confirm against Pipeline.utils).
df = dataframe_list[0]
df

Unnamed: 0,session_id,item_id,item_score
0,0,2735,0.014295
1,0,4520,0.006539
2,0,22360,0.00412
3,0,17349,0.004061
4,0,1843,0.003378
5,1,21878,0.007172
6,1,11740,0.006662
7,1,14241,0.006309
8,1,6074,0.006053
9,1,9807,0.005101


In [13]:
def save_results(prediction_df, item_mapping, session_mapping, save_path="./", cutoff=5, output_format="console"):
    """
    Rank the predictions per session, map ids back to the original ones and
    optionally write the result to disk.

    Parameters:
    - prediction_df: DataFrame with at least 'session_id' and 'item_id' columns
      holding mapped (internal) ids, rows ordered best-first within each session.
      It is not modified.
    - item_mapping: Dictionary mapping original item IDs to internal item IDs
      (inverted here to recover the original IDs).
    - session_mapping: Dictionary mapping original session IDs to internal
      session IDs (inverted here to recover the original IDs).
    - save_path: Directory where the results will be saved (created if missing).
    - cutoff: Maximum number of ranked items kept per session. Sessions with
      fewer rows are ranked 1..k instead of raising a length mismatch.
    - output_format: "csv" or "json" to write a timestamped file, "console" to
      only return the DataFrame.

    Returns:
    - A new DataFrame with columns 'session_id', 'item_id' (original ids) and 'rank'.

    Raises:
    - ValueError: if output_format is not one of "csv", "json", "console".
    """
    # Fail fast, before touching the filesystem or doing any work.
    if output_format not in ("csv", "json", "console"):
        raise ValueError(f"Unsupported output format: {output_format}")

    # Explicit copy: avoids SettingWithCopyWarning and keeps the caller's frame intact.
    ranked_df = prediction_df[['session_id', 'item_id']].copy()

    # Rank = 1-based position of the row inside its session, in the given order.
    # Unlike `list(range(1, cutoff + 1)) * num_sessions`, this does not require
    # every session to contain exactly `cutoff` rows.
    ranked_df['rank'] = ranked_df.groupby('session_id', sort=False).cumcount() + 1
    ranked_df = ranked_df[ranked_df['rank'] <= cutoff].copy()

    # Create inverse mappings (internal id -> original id)
    inv_item_map = {v: k for k, v in item_mapping.items()}
    inv_session_map = {v: k for k, v in session_mapping.items()}

    # Map to original IDs
    ranked_df['item_id'] = ranked_df['item_id'].map(inv_item_map)
    ranked_df['session_id'] = ranked_df['session_id'].map(inv_session_map)

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    if output_format == "console":
        # Nothing to write: the DataFrame is simply returned.
        print("No hacer nada, se devuelve y ya está!")
        return ranked_df

    # Current date and time for file naming
    now = datetime.now()
    final_path = os.path.join(save_path, f'results_{now:%Y_%m_%d_at_%H_%M_%S}.{output_format}')

    if output_format == "csv":
        ranked_df.to_csv(final_path, index=False)
    else:
        # JSON Lines: one record per line
        ranked_df.to_json(final_path, orient="records", lines=True)
    print(f"Resultados salvados en {final_path}")

    return ranked_df



# Rank the predictions and translate ids back to the originals; with
# output_format="console" nothing is written and the frame is shown as cell output.
save_results(
    prediction_df=df,
    item_mapping=item_mapping,
    session_mapping=test_session_mapping,
    save_path='./', cutoff = cutoff,
    output_format="console"
)

No hacer nada, se devuelve y ya está!


Unnamed: 0,session_id,item_id,rank
0,26,3260,1
1,26,5383,2
2,26,26538,3
3,26,20541,4
4,26,2213,5
5,61,25956,1
6,61,13936,2
7,61,16877,3
8,61,7225,4
9,61,11641,5


In [12]:
#import io

# Everything persisted next to the similarity matrix; 'npz_data' is a
# placeholder for the serialized matrix bytes.
data_dict_to_save = dict(
    npz_data=None,
    candidate_items_df=candidate_items_df,
    item_features_df=item_features_df,
    unique_interactions=unique_interactions,
    purch_weight=purch_weight,
    view_weight=view_weight,
)

def save_model(rec, folder_path, data_dict_to_save, file_name=None):
    """Persist the recommender's sparse matrix and extra parameters in one pickle file.

    Parameters:
    - rec: recommender exposing ``RECOMMENDER_NAME`` and ``get_W_sparse()``.
      ``get_W_sparse`` / ``set_W_sparse`` are plain accessors for ``W_sparse``
      that were added to the RP3betaRecommender class for this load/save path,
      instead of using the model's own loader.
    - folder_path: destination directory (with or without a trailing separator).
    - data_dict_to_save: dict of extra parameters to store. It is not modified;
      a shallow copy receives the 'npz_data' entry.
    - file_name: target file name; defaults to ``rec.RECOMMENDER_NAME + '.pkl'``,
      mirroring the optional ``file_name`` of ``load_model``.

    Returns:
    - None. The pickle is written to ``os.path.join(folder_path, file_name)``.
    """
    if file_name is None:
        file_name = rec.RECOMMENDER_NAME + '.pkl'

    # Serialize the sparse matrix as NPZ into an in-memory buffer
    npz_buffer = io.BytesIO()
    sps.save_npz(npz_buffer, rec.get_W_sparse())

    # Shallow copy so the caller's dict does not end up holding the matrix bytes.
    payload = dict(data_dict_to_save)
    # getvalue() returns the whole buffer regardless of the current position.
    payload['npz_data'] = npz_buffer.getvalue()

    # Store every parameter, including the NPZ bytes, in a single PKL file
    with open(os.path.join(folder_path, file_name), 'wb') as f:
        pickle.dump(payload, f)



In [24]:
# Write the recommender's matrix and the parameter dict to a pickle in the current directory.
save_model(rp3_recommender, "./", data_dict_to_save)