In [6]:
cd ..

/home/abdalrhman/Desktop/Graduation Project/AiStore


In [7]:
import os
import sys
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import negative_sampling
from srcs.utils.logger import get_module_logger
from srcs.utils.settings import (
    CLEANED_REVIEWS_PATH_CSV,
    CLEANED_METADATA_PATH_CSV,
    FULL_GRAPH_PATH,
    TRAIN_GRAPH_PATH,
    VAL_GRAPH_PATH,
    TEST_GRAPH_PATH,
    IMAGES_DIR,
    GNN_MODEL_SAVE_PATH,
    PREDICTOR_MODEL_SAVE_PATH
)

# Configure paths
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, os.pardir, os.pardir))
sys.path.append(PROJECT_ROOT)

# Initialize logger
logger = get_module_logger("graph_builder")

In [8]:
import pandas as pd

# Load the full dataset
reviews_df = pd.read_csv(CLEANED_REVIEWS_PATH_CSV)
meta_df = pd.read_csv(CLEANED_METADATA_PATH_CSV)

# Randomly sample 50,000 reviews
reviews_df = reviews_df.sample(n=10_000, random_state=42)

# Filter metadata to only include items in sampled reviews
filtered_item_ids = reviews_df['parent_asin'].unique()
meta_df = meta_df[meta_df['parent_asin'].isin(filtered_item_ids)]

# Logging
logger.info(f"Sampled {len(reviews_df):,} reviews")
logger.info(f"Filtered metadata to {len(meta_df):,} items from {len(meta_df):,}")
logger.info(f"Unique items in sampled reviews: {len(filtered_item_ids):,}")


[2025-05-08 17:59:57] [INFO] graph_builder: Sampled 10,000 reviews
[2025-05-08 17:59:57] [INFO] graph_builder: Filtered metadata to 638 items from 638
[2025-05-08 17:59:57] [INFO] graph_builder: Unique items in sampled reviews: 8,354


In [9]:
reviews_df.head(5)

Unnamed: 0,user_id,parent_asin,rating,timestamp,year,month,day,hour,minute,recency,recency_weight
13162781,AFCO6LEANZBTDWKI4BH6BO7H4PIA,B0BKQWX8ZJ,5.0,2019-06-26 17:12:09.179,2019,6,26,17,12,2137,0.000468
2232846,AH3L645CVARFM3WRPSP3G26WOAEA,B005BH3QOY,4.0,2014-06-17 21:14:07.000,2014,6,17,21,14,3971,0.000252
8284619,AHIAQCSWKTDLBS4AV7TZMMHD5J2Q,B01M4NU4OM,5.0,2018-03-19 17:34:21.529,2018,3,19,17,34,2601,0.000384
6794797,AH7VRATJ52IOBIL3HQPYFKYLHWIQ,B0BYYJPGQB,5.0,2018-06-29 13:18:45.132,2018,6,29,13,18,2499,0.0004
5364485,AFUZ3QNYGXTLGGWTUTXD6PY4GLQA,B00AJFTHX2,5.0,2016-05-16 12:27:58.000,2016,5,16,12,27,3273,0.000305


In [10]:
meta_df.head(5)

Unnamed: 0,main_category,title,average_rating,rating_number,price,store,parent_asin,n_features,n_description_items,first_image,brand,color,date_first_available,primary_category,rating_bin
21,Computers,KHOMO - iPad 2 3 and 4 Generation Case - DUAL ...,4.5,2745,11.95,Khomo,B06XKRXLDR,5,1,https://m.media-amazon.com/images/I/31+mP+y8Uo...,Khomo,Black,2011-05-13,Electronics,Medium
35,All Electronics,"Charger for MacBook Pro 10FT, 96W USB C Charge...",4.5,2141,35.99,Ifeart,B07WZT643Q,5,0,https://m.media-amazon.com/images/I/21QlbdFXAG...,Ifeart,White,2019-09-23,Electronics,Medium
66,Home Audio & Theater,"C&E High Speed HDMI Cable with Ethernet Black,...",4.6,531,8.99,C&E,B07Q1JN792,5,16,https://m.media-amazon.com/images/I/41IfnleVoM...,C&E,1 Pack,2015-01-21,Electronics,High
73,Computers,Laptop Sleeve Elastic Neoprene Case Compatible...,4.6,8298,15.99,Hseok,B071YJFTV4,5,0,https://m.media-amazon.com/images/I/51wJ5C7w1+...,Hseok,Butterfly,2020-07-10,Electronics,High
83,Computers,ProCase 14-15.6 Inch Laptop Bag Messenger Shou...,4.6,1015,25.99,Procase,B07CRQDTKM,5,0,https://m.media-amazon.com/images/I/510HCN8zHb...,Procase,Grey,2016-09-22,Electronics,High


In [11]:
# Merge reviews with metadata
df = pd.merge(reviews_df, meta_df, on='parent_asin', how='inner')

# Drop irrelevant columns
df = df.drop(columns=['title', 'first_image', 'n_description_items'], errors='ignore')

print(f"Final merged dataset: {len(df):,} rows")

Final merged dataset: 832 rows


In [12]:
# Define threshold for positive interaction
POSITIVE_RATING_THRESHOLD = 4.0

# Positive samples: user-item pairs with high ratings
positive_samples = df[df['rating'] >= POSITIVE_RATING_THRESHOLD][['user_id', 'parent_asin']]
positive_samples['label'] = 1

# Generate negative samples: random user-item pairs not in reviews
np.random.seed(42)
all_users = df['user_id'].unique()
all_items = df['parent_asin'].unique()

# Create negative samples
negative_samples = []
seen_pairs = set(zip(df['user_id'], df['parent_asin']))

for user in np.random.choice(all_users, size=len(positive_samples), replace=True):
    for _ in range(1):  # 1 negative per positive
        item = np.random.choice(all_items)
        while (user, item) in seen_pairs:
            item = np.random.choice(all_items)
        negative_samples.append((user, item))

# Convert to DataFrame
negative_samples = pd.DataFrame(negative_samples, columns=['user_id', 'parent_asin'])
negative_samples['label'] = 0

# Combine and shuffle
samples = pd.concat([positive_samples, negative_samples]).sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# User-level features
user_features = df.groupby('user_id').agg(
    user_avg_rating=('rating', 'mean'),
    user_total_interactions=('parent_asin', 'count'),
    user_unique_items=('parent_asin', 'nunique')
).reset_index()

# Item-level features
item_features = df.groupby('parent_asin').agg(
    item_avg_rating=('rating', 'mean'),
    item_popularity=('user_id', 'count'),
    item_price=('price', 'mean'),
    item_category=('main_category', 'first')
).reset_index()

# One-hot encode category
item_features = pd.get_dummies(item_features, columns=['item_category'], prefix='cat')

# Merge samples with features
samples = pd.merge(samples, user_features, on='user_id', how='left')
samples = pd.merge(samples, item_features, on='parent_asin', how='left')

# Fill missing values
samples.fillna({'item_price': item_features['item_price'].mean()}, inplace=True)

# Final feature list
features = [
    'user_avg_rating', 'user_total_interactions', 'user_unique_items',
    'item_avg_rating', 'item_popularity', 'item_price'
] + [c for c in item_features.columns if c.startswith('cat_')]

In [14]:
from sklearn.model_selection import train_test_split

# Train/test split (grouped by user)
train, test = train_test_split(samples, test_size=0.2, stratify=samples['label'], random_state=42)

X_train = train[features]
y_train = train['label']
X_test = test[features]
y_test = test['label']

print("Features:", features)

Features: ['user_avg_rating', 'user_total_interactions', 'user_unique_items', 'item_avg_rating', 'item_popularity', 'item_price', 'cat_All Electronics', 'cat_Amazon Devices', 'cat_Amazon Fashion', 'cat_Amazon Home', 'cat_Automotive', 'cat_Camera & Photo', 'cat_Car Electronics', 'cat_Cell Phones & Accessories', 'cat_Computers', 'cat_Gps & Navigation', 'cat_Health & Personal Care', 'cat_Home Audio & Theater', 'cat_Industrial & Scientific', 'cat_Musical Instruments', 'cat_Office Products', 'cat_Sports & Outdoors', 'cat_Tools & Home Improvement', 'cat_Video Games']


In [18]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, classification_report

# Train model
model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=64,
    class_weight='balanced',
    metric='auc',
    random_state=42
)

model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Evaluate
preds = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, preds)
print(f"\nAUC: {auc:.4f}")
print(classification_report(y_test, (preds > 0.5)))

[LightGBM] [Info] Number of positive: 607, number of negative: 607
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 1214, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

AUC: 0.6363
              precision    recall  f1-score   support

           0       0.56      0.53      0.55       152
           1       0.56      0.59      0.57       152

    accuracy                           0.56       304
   macro avg       0.56      0.56      0.56       304
weighted avg       0.56      0.56      0.56       304



In [None]:
def recommend_items(user_id, top_k=10):
    user_data = pd.DataFrame({
        'user_id': [user_id] * len(item_features),
        'parent_asin': item_features['parent_asin']
    })

    user_data = pd.merge(user_data, user_features, on='user_id', how='left')
    user_data = pd.merge(user_data, item_features, on='parent_asin', how='left')
    user_data.fillna({
        'item_price': item_features['item_price'].mean(),
        'item_avg_rating': item_features['item_avg_rating'].mean(),
        'item_popularity': 0
    }, inplace=True)

    user_data['score'] = model.predict_proba(user_data[features])[:, 1]

    # Exclude items already interacted with
    interacted_items = set(df[df['user_id'] == user_id]['parent_asin'])
    user_data = user_data[~user_data['parent_asin'].isin(interacted_items)]

    return user_data[['parent_asin', 'score']].sort_values('score', ascending=False).head(top_k)

In [None]:
# Get top 10 recommendations for user
recommendations = recommend_items('AFCO6LEANZBTDWKI4BH6BO7H4PIA', top_k=10)
print(recommendations)