In [8]:
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv("data/reviews_cleaned.csv")
df = df.dropna(subset=["Summary"])
df.isnull().sum()

Unnamed: 0      0
UserId          0
ProductId       0
Time            0
Score           0
Summary         0
Text            0
text            0
Summary_stem    0
Text_stem       0
Summary_lem     0
Text_lem        0
dtype: int64

In [10]:
df.columns

Index(['Unnamed: 0', 'UserId', 'ProductId', 'Time', 'Score', 'Summary', 'Text',
       'text', 'Summary_stem', 'Text_stem', 'Summary_lem', 'Text_lem'],
      dtype='object')

In [13]:
useful_columns = [
    "UserId",
    "ProductId",
    "Score",
    "Time",  
]
df = df[useful_columns]
df = df.head(3000)
df.to_csv("data/reviews_sample3000.csv")

In [17]:
import pandas as pd
from pathlib import Path

df = pd.read_csv('data/reviews_sample3000.csv')
df = df.sort_values(by=['UserId', 'Time'])  

train, valid, test = [], [], []

for user, group in df.groupby("UserId"):
    interactions = group.values
    if len(interactions) >= 3:
        train.extend(interactions[:-2])
        valid.append(interactions[-2])
        test.append(interactions[-1])
    elif len(interactions) == 2:
        train.append(interactions[0])
        test.append(interactions[1])
    else:
        train.append(interactions[0])

save_path = Path("data/reviews_sample3000/")
save_path.mkdir(parents=True, exist_ok=True)

for name, data in zip(["train", "valid", "test"], [train, valid, test]):
    pd.DataFrame(data).to_csv(save_path / f"{name}.txt", sep="\t", index=False, header=False)


In [20]:
import json
import os

# Dossier des fichiers .txt
data_txt_dir = "data"

# Dossier de sortie pour les fichiers .json
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Fonction de conversion
def convert_txt_to_json(txt_filename, json_filename):
    data = []
    with open(os.path.join(data_txt_dir, txt_filename), "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split('\t')  # ou change en ',' si besoin
            if len(parts) == 2:
                text, label = parts
                data.append({
                    "text": text.strip(),
                    "label": int(label.strip())
                })
    with open(os.path.join(output_dir, json_filename), "w", encoding="utf-8") as f_out:
        json.dump(data, f_out, indent=2, ensure_ascii=False)

# Conversion des trois fichiers
convert_txt_to_json("reviews_sample3000/train.txt", "reviews_sample3000/train_dataset.json")
convert_txt_to_json("reviews_sample3000/valid.txt", "reviews_sample3000/val_dataset.json")
convert_txt_to_json("reviews_sample3000/test.txt", "reviews_sample3000/test_dataset.json")

print("✅ Conversion terminée.")


✅ Conversion terminée.


In [21]:
import os
import shutil

# Créer la structure attendue par Recbole
os.makedirs("data/recommender_data", exist_ok=True)
os.makedirs("config", exist_ok=True)
os.makedirs("saved_model", exist_ok=True)

# Copie ton fichier CSV dans le dossier dataset
shutil.copy("data/reviews_sample3000.csv", "data/recommender_data/reviews_sample3000.inter")


'data/recommender_data/reviews_sample3000.inter'

In [22]:
config_text = """
# Dataset
dataset: data/recommender_data/reviews_sample3000.inter
field_separator: ","
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp
load_col:
    inter: [user_id, item_id, timestamp]

# Model
model: BERT4Rec

# Training
epochs: 30
train_batch_size: 16
eval_batch_size: 16
learning_rate: 0.001
neg_sampling: ~
eval_args:
    split: {'RS': [0.8, 0.1, 0.1]}
    group_by: user
    order: TO
    mode: full
metrics: [Recall, NDCG]
topk: [3]

# BERT4Rec specific
hidden_size: 64
num_attention_heads: 2
num_hidden_layers: 2
mask_ratio: 0.2

save_model: True
saved_model_file: saved_model/bert4rec_model.pth
"""

with open("config/bert4rec.yaml", "w", encoding="utf-8") as f:
    f.write(config_text)

In [None]:
import torch
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import BERT4Rec
from recbole.trainer import Trainer

config = Config(model='BERT4Rec', config_file_list=['config/bert4rec.yaml'])
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = BERT4Rec(config, train_data.dataset).to(config['device'])
checkpoint = torch.load('saved_model/bert4rec_model.pth', map_location=config['device'])
model.load_state_dict(checkpoint['state_dict'])

trainer = Trainer(config, model)

In [None]:
from recbole.utils.case_study import full_sort_topk

user_id = 1  # à adapter selon tes données
top_scores, top_items = full_sort_topk(user_id, model, test_data, k=5, device=config['device'])

print(f"Top 5 recommandations pour l'utilisateur {user_id}:")
print(top_items)

In [25]:
# Installer RecBole (si nécessaire)
# !pip install recbole

from recbole.quick_start import run_recbole
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import BERT4Rec
from recbole.trainer import Trainer
import torch

  from .autonotebook import tqdm as notebook_tqdm
2025-05-28 09:32:19,311	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-05-28 09:32:21,605	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [38]:
# 1. Config minimaliste pour BERT4Rec
config_dict = {
    'model': 'BERT4Rec',
    'dataset': 'reviews_sample3000',
    'dataset_path': './dataset/data/',
    'epochs': 1,
    'train_batch_size': 32,
    'eval_batch_size': 32,
    'hidden_size': 32,
    'max_seq_length': 20,
    'learning_rate': 0.001,
    'use_gpu': torch.cuda.is_available(),
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'metrics': ['Recall', 'NDCG', 'MRR'],
    'topk': [5],
    'train_neg_sample_args': None,
    'save_model': False,
    'save_dataset': False,
    'save_dataloaders': False,
    'log_wandb': False,
    'debug_mode': True,
    'sampling_rate': 0.2,
}


# 2. Créer la config RecBole
config = Config(model=config_dict['model'], dataset=config_dict['dataset'], config_dict=config_dict)



In [None]:
# 3. Charger dataset + préparation
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

# 4. Instancier le modèle
model = BERT4Rec(config, train_data.dataset).to(config['device'])

# 5. Entraîner le modèle (peu d'epochs)
trainer = Trainer(config, model)
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

# 6. Tester / évaluer
test_result = trainer.evaluate(test_data)
print("Résultats test :", test_result)



# Recommender system

In [47]:
df = pd.read_csv("data/reviews_sample3000.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,UserId,ProductId,Score,Time
0,0,A3SGXH7AUHU8GW,B001E4KFG0,5,1303862400
1,1,A1D87F6ZCVE5NK,B00813GRG4,1,1346976000
2,2,ABXLMWJIXXAIN,B000LQOCH0,4,1219017600
3,3,A395BORC6FGVXV,B000UA0QIQ,2,1307923200
4,4,A1UQRSCLF8GW1T,B006K2ZZ7K,5,1350777600


In [48]:
# 1. Création de la matrice utilisateur-article
user_item_matrix = df.pivot_table(index='UserId', columns='ProductId', values='Score').fillna(0)
user_item_matrix

ProductId,B00002NCJC,B00002Z754,B000084E1U,B0000CGFV4,B0000DC5IY,B0000VLH8S,B00015BQB6,B0001FQVCK,B0001OINNQ,B0001PB9FE,...,B0087HW5E2,B0089PI9OC,B0089SPDUW,B008BEGP9W,B008L19ZQ0,B008MMLXEK,B008YAXFWI,B0093NIWVO,B009HINRX8,B009UOFU20
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10317LUD1C1VJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A103EZCS9H8WW1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A104Y49ZQ4CYJ2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105FP1ZT88EPL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A107MO1RZUQ8V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZOF9E17RGZH8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZS05OYE0XGNF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZTEQJCI6N5WL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZV26LP92E6WU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# 2. Calcul des similarités entre items
from sklearn.metrics.pairwise import cosine_similarity

item_similarity = pd.DataFrame(
    cosine_similarity(user_item_matrix.T),
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns
)
item_similarity

ProductId,B00002NCJC,B00002Z754,B000084E1U,B0000CGFV4,B0000DC5IY,B0000VLH8S,B00015BQB6,B0001FQVCK,B0001OINNQ,B0001PB9FE,...,B0087HW5E2,B0089PI9OC,B0089SPDUW,B008BEGP9W,B008L19ZQ0,B008MMLXEK,B008YAXFWI,B0093NIWVO,B009HINRX8,B009UOFU20
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00002NCJC,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00002Z754,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B000084E1U,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0000CGFV4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0000DC5IY,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B008MMLXEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
B008YAXFWI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
B0093NIWVO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
B009HINRX8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [50]:
#3. Fonction de prédiction item-based
def predict_item_based(user_id, user_item_matrix, item_similarity_df, top_k=5):
    user_ratings = user_item_matrix.loc[user_id]
    scores = {}

    for item in user_item_matrix.columns:
        if user_ratings[item] == 0:
            rated_items = user_ratings[user_ratings > 0].index
            sim_scores = item_similarity_df.loc[item, rated_items]
            ratings = user_ratings[rated_items]

            if not sim_scores.empty:
                weighted_sum = np.dot(sim_scores.values, ratings.values)
                norm = np.sum(np.abs(sim_scores.values))
                if norm > 0:
                    scores[item] = weighted_sum / norm

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]



In [56]:
# Exemple d'utilisation
user_id = 'AZS05OYE0XGNF'  # à remplacer par l'ID souhaité
recommendations = predict_item_based(user_id, user_item_matrix, item_similarity)
print("Recommandations pour l'utilisateur", user_id)
for item_id, score in recommendations:
    print(f"Item {item_id}: score estimé {score:.2f}")

Recommandations pour l'utilisateur AZS05OYE0XGNF
Item B000EH2AMA: score estimé 4.00
Item B001E50UEQ: score estimé 4.00
Item B001EO5QW8: score estimé 4.00
Item B002JX7GVM: score estimé 4.00
Item B0064MEUS6: score estimé 4.00
