In [1]:
import pandas as pd
# Load the raw reviews; the cleaning below reads the 'reviewText' column.
df = pd.read_excel('test.xlsx', sheet_name='Sheet1')

# Normalise the review text before tokenisation:
#   1. fill missing cells with '' and coerce everything to str -- the .str
#      accessor turns missing / non-string cells into NaN, which the tokenizer
#      later rejects when the column is converted to a list;
#   2. lower-case, replace every character outside [a-z0-9 whitespace] by a space;
#   3. collapse whitespace runs and trim both ends.
df['clean_text'] = (
    df['reviewText']
      .fillna('')
      .astype(str)
      .str.lower()
      .str.replace(r'[^a-z0-9\s]', ' ', regex=True)
      .str.replace(r'\s+', ' ', regex=True)
      .str.strip()
)

In [2]:
import torch
# Show the installed PyTorch version and the CUDA version reported by
# torch.version.cuda; both are needed to pick matching PyG wheels.
print(f"Torch: {torch.__version__}")
print(f"CUDA: {torch.version.cuda}")


Torch: 2.0.1
CUDA: 11.8


In [4]:
# Paste into a notebook cell (with the bang) or your terminal.
# Replace cu118 with whatever torch.version.cuda reports (e.g. cu121)
!pip install --no-cache-dir \
    torch-scatter     -f https://data.pyg.org/whl/torch-2.0.1+cu118.html \
    torch-sparse      -f https://data.pyg.org/whl/torch-2.0.1+cu118.html \
    torch-cluster     -f https://data.pyg.org/whl/torch-2.0.1+cu118.html \
    torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.1+cu118.html \
    torch-geometric


Looking in links: https://data.pyg.org/whl/torch-2.0.1+cu118.html, https://data.pyg.org/whl/torch-2.0.1+cu118.html, https://data.pyg.org/whl/torch-2.0.1+cu118.html, https://data.pyg.org/whl/torch-2.0.1+cu118.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_scatter-2.1.2%2Bpt20cu118-cp39-cp39-win_amd64.whl (3.7 MB)
     ---------------------------------------- 0.0/3.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.7 MB 131.3 kB/s eta 0:00:29
     ---------------------------------------- 0.0/3.7 MB 131.3 kB/s eta 0:00:29
     ---------------------------------------- 0.0/3.7 MB 131.3 kB/s eta 0:00:29
     --------------------------------------

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import matplotlib.pyplot as plt

# Use the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
# --- 2) BERT embeddings ---
# Tokenizer and encoder are loaded from the same checkpoint name.
BERT_CHECKPOINT = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = BertModel.from_pretrained(BERT_CHECKPOINT)
bert = bert.to(device)
bert.eval()  # switch the encoder to evaluation mode
def encode(texts, batch=16, maxlen=200):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``bert``.

    The texts are processed in consecutive slices of ``batch`` items. Each
    slice is tokenised with padding and truncation to ``maxlen`` tokens, moved
    to ``device`` and passed through ``bert`` without gradient tracking. The
    hidden state at sequence position 0 of every text is kept.

    Args:
        texts: sliceable sequence of strings (e.g. a list).
        batch: number of texts per forward pass.
        maxlen: ``max_length`` handed to the tokenizer.

    Returns:
        CPU tensor with one row per text, in input order.
    """
    first_token_states = []
    for start in range(0, len(texts), batch):
        chunk = texts[start:start + batch]
        tokens = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=maxlen,
            return_tensors='pt',
        )
        tokens = tokens.to(device)
        with torch.no_grad():
            hidden = bert(**tokens).last_hidden_state
        # Keep only position 0 and move it off the device right away.
        first_token_states.append(hidden[:, 0, :].cpu())
    return torch.cat(first_token_states, dim=0)
# Embed every cleaned review; rows of rev_emb follow the row order of df.
print("Encoding BERT…")
rev_emb = encode(df['clean_text'].to_list())  # (N,768) per the original note

Encoding BERT…


In [43]:
import os
import contextlib
import threadpoolctl

# 1) Ask OpenBLAS / MKL / OpenMP to use a single thread each.
# NOTE(review): pandas, torch and sklearn are already imported by earlier
# cells, so their native thread pools may already be initialised and these
# variables may have no effect at this point -- confirm, or move this to the
# very first cell.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"]     = "1"
os.environ["OMP_NUM_THREADS"]     = "1"

# 2) Monkeypatch threadpoolctl for the whole process:
#    - _ThreadpoolInfo becomes a callable that accepts anything and returns [];
#    - threadpool_limits becomes a callable that accepts anything and returns
#      a no-op context manager, so requested thread limits are ignored by any
#      caller that looks the function up on the threadpoolctl module.
# NOTE(review): _ThreadpoolInfo is a private name; presumably this works
# around a library-introspection error -- confirm it is still required with
# the installed threadpoolctl version.
threadpoolctl._ThreadpoolInfo    = lambda *args, **kwargs: []
threadpoolctl.threadpool_limits  = lambda *args, **kwargs: contextlib.nullcontext()

# KMeans is imported after the patch above.
# NOTE(review): KMeans is not used in the cells visible here -- confirm that
# this import (and the patch) is still needed.
from sklearn.cluster import KMeans


In [45]:
# --- 3) Build a kNN graph with PyG (sparse, GPU‑friendly) ---
from torch_geometric.nn import knn_graph

# Input: rev_emb, the review embedding matrix produced by encode()
# ([N, 768] per the original note); it is moved to `device` for the search.

# Number of nearest neighbours linked to each review (e.g. 10).
k = 10

# loop=False leaves self-loops out of the graph; pass True to include them.
edge_index = knn_graph(rev_emb.to(device), k, loop=False)

print("kNN graph edges:", edge_index.shape[1])


kNN graph edges: 1406254


In [46]:
# --- 4) User / Item Feature Extraction (fixed column names) ---

import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler

# 4.0) (Re)load your DataFrame if needed; for JSON:
# df = pd.read_json('Cell_Phones_and_Accessories.json', lines=True)

user_cols = ['count_u', 'mean_u', 'std_u']
item_cols = ['count_i', 'mean_i', 'std_i']


def _rating_stats(frame, key, suffix):
    """Return count/mean/std of the 'overall' column per value of ``key``.

    The result is indexed by ``key`` and its columns are named
    ``count_<suffix>``, ``mean_<suffix>`` and ``std_<suffix>``. Missing values
    in the result (e.g. the std of a group with a single row) are set to 0.
    """
    return (
        frame
        .groupby(key)['overall']
        .agg(['count', 'mean', 'std'])
        .add_suffix(f'_{suffix}')
        .fillna(0)
    )


# Make the cell safe to re-run: remove statistic columns left by a previous
# execution, otherwise df.join() fails because the column names overlap.
df = df.drop(columns=user_cols + item_cols, errors='ignore')

# 4.1) Per-user statistics (group by reviewerID)
u_stats = _rating_stats(df, 'reviewerID', 'u')
df = df.join(u_stats, on='reviewerID')

# 4.2) Per-item statistics (group by asin)
i_stats = _rating_stats(df, 'asin', 'i')
df = df.join(i_stats, on='asin')

# 4.3) Normalize the new features (one scaler per feature group)
scaler_u = StandardScaler()
scaler_i = StandardScaler()

df[user_cols] = scaler_u.fit_transform(df[user_cols])
df[item_cols] = scaler_i.fit_transform(df[item_cols])

# 4.4) Convert to PyTorch tensors; one row per review, in df row order
user_feats = torch.tensor(df[user_cols].values, dtype=torch.float)
item_feats = torch.tensor(df[item_cols].values, dtype=torch.float)

print("User feature tensor shape:", user_feats.shape)
print("Item feature tensor shape:", item_feats.shape)


User feature tensor shape: torch.Size([140625, 3])
Item feature tensor shape: torch.Size([140625, 3])


In [47]:
# --- 5) Labels & masks ---
# numpy is imported here because no earlier cell of the notebook imports it.
import numpy as np

y = torch.tensor(df['class'].values, dtype=torch.long)
N = y.size(0)        # number of labelled rows; was previously undefined
idx = np.arange(N)   # positional index of every row

# 70% train, 15% val, 15% test stratified
train_idx, tmp_idx = train_test_split(
    idx, stratify=y.numpy(), test_size=0.3, random_state=42
)
# Split the held-out 30% in half, again stratified on the labels.
val_idx, test_idx = train_test_split(
    tmp_idx, stratify=y[tmp_idx].numpy(), test_size=0.5, random_state=42
)


def mask(arr):
    """Return a length-N boolean tensor that is True at the positions in ``arr``."""
    return torch.tensor(np.isin(idx, arr), dtype=torch.bool)


train_mask = mask(train_idx)
val_mask   = mask(val_idx)
test_mask  = mask(test_idx)

In [5]:
# --- 6) Build PyG Data (fixed variable names) ---

from torch_geometric.data import Data

# Inputs expected from the earlier cells (shapes as noted by the author):
#   rev_emb                            [N, 768] review embeddings
#   edge_index                         [2, E]   kNN graph edges
#   user_feats / item_feats            [N, 3]   scaled count / mean / std
#   y                                  [N]      labels
#   train_mask / val_mask / test_mask  [N]      boolean split masks
#   device                             torch.device the tensors are moved to
graph_fields = dict(
    x=rev_emb,
    edge_index=edge_index,
    user_x=user_feats,
    item_x=item_feats,
    y=y,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
)

# Move every tensor to `device` and bundle them into a single Data object.
data = Data(**{name: tensor.to(device) for name, tensor in graph_fields.items()})

print(data)
print("Num nodes:", data.num_nodes)
print("Num edges:", data.num_edges)
split_sizes = [
    getattr(data, split).sum().item()
    for split in ("train_mask", "val_mask", "test_mask")
]
print("Train/Val/Test sizes:", *split_sizes)


NameError: name 'rev_emb' is not defined

In [6]:
# After Step 6, once `data` is fully built:
import torch

DATA_PATH = 'processed_amazon_data.pt'
WEIGHTS_PATH = 'gdfn_efficient_weights.pth'

# 1.1) Save the Data object
torch.save(data, DATA_PATH)
print("Saved processed data to processed_amazon_data.pt")

# 1.2) Later (in a fresh notebook), just do:
# map_location lets a file saved from a GPU session load on a CPU-only machine.
# weights_only=False is stated explicitly because the file holds a pickled
# Data object, not just tensors; only load files you created yourself.
data = torch.load(DATA_PATH, map_location=device, weights_only=False)
data = data.to(device)   # move all tensors to GPU if needed
print(data)

# 2) Model weights. `model` and `GDFN_Efficient` are created by the training
# cell further down, so on a top-to-bottom run they do not exist yet; skip
# this part instead of failing with a NameError.
if 'model' in globals():
    # 2.1) Save only weights
    torch.save(model.state_dict(), WEIGHTS_PATH)
    print("Model weights saved to gdfn_efficient_weights.pth")

    # 2.2) And to reload later: rebuild the architecture with the same sizes
    # used in the training cell, then restore the weights.
    model = GDFN_Efficient(
        data.x.size(1), 64, len(user_cols), len(item_cols), 128
    ).to(device)
    model.load_state_dict(
        torch.load(WEIGHTS_PATH, map_location=device, weights_only=True)
    )
    model.eval()
else:
    print("Skipping weight save/reload: `model` is not defined yet "
          "(run the training cell first).")


NameError: name 'data' is not defined

In [49]:
# --- 7) Define GDFN model with GCNConv ---
class GDFN_PyG(nn.Module):
    """Two GCN layers over the reviews, fused with user and item features.

    The three ``gcn_dim``-wide representations (review, user, item) are
    combined through a per-node outer product, so the fusion layer takes
    ``gcn_dim ** 3`` inputs and the intermediate tensor has
    ``num_nodes * gcn_dim ** 3`` elements.

    Args:
        rev_dim: width of the review node features.
        gcn_dim: width of the GCN layers and of the user/item projections.
        u_dim: width of the user features.
        i_dim: width of the item features.
        fusion_dim: width of the fused representation fed to the classifier.
    """

    def __init__(self, rev_dim, gcn_dim, u_dim, i_dim, fusion_dim):
        super().__init__()
        # Graph branch.
        self.conv1 = GCNConv(rev_dim, gcn_dim)
        self.conv2 = GCNConv(gcn_dim, gcn_dim)
        # User / item projections to the GCN width.
        self.u_lin = nn.Sequential(nn.Linear(u_dim, gcn_dim), nn.ReLU())
        self.i_lin = nn.Sequential(nn.Linear(i_dim, gcn_dim), nn.ReLU())
        # Fusion over the flattened outer product, then a 2-way classifier.
        self.fuse = nn.Sequential(nn.Linear(gcn_dim ** 3, fusion_dim), nn.ReLU())
        self.cls = nn.Linear(fusion_dim, 2)

    def forward(self, x, edge_index, ux, ix):
        """Return one row of 2 logits per node."""
        review = x
        for conv in (self.conv1, self.conv2):
            review = F.relu(conv(review, edge_index))
        user = self.u_lin(ux)
        item = self.i_lin(ix)

        n_nodes, width = review.size()
        # Broadcast to (n_nodes, width, width, width): entry [b, a, j, k] is
        # review[b, a] * user[b, j] * item[b, k].
        outer = (
            review.view(n_nodes, width, 1, 1)
            * user.view(n_nodes, 1, width, 1)
            * item.view(n_nodes, 1, 1, width)
        )
        flat = outer.view(n_nodes, -1)
        return self.cls(self.fuse(flat))

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)

# --- Efficient GDFN: elementwise triple product fusion ---
class GDFN_Efficient(nn.Module):
    """Two GCN layers over the reviews, fused element-wise with user/item features.

    Review, user and item representations all have width ``gcn_dim`` and are
    multiplied element-wise, so the fused tensor keeps shape
    ``(num_nodes, gcn_dim)`` before it is projected to ``fusion_dim``.

    Args:
        rev_dim: width of the review node features.
        gcn_dim: width of the GCN layers and of the user/item projections.
        u_dim: width of the user features.
        i_dim: width of the item features.
        fusion_dim: width of the fused representation fed to the classifier.
        num_classes: number of output logits per node (default 2).
    """

    def __init__(self, rev_dim, gcn_dim, u_dim, i_dim, fusion_dim, num_classes=2):
        super().__init__()
        # Graph branch: two stacked GCN layers.
        self.conv1 = GCNConv(rev_dim, gcn_dim)
        self.conv2 = GCNConv(gcn_dim, gcn_dim)
        # User / item projections to the GCN width.
        self.user_lin = nn.Sequential(nn.Linear(u_dim, gcn_dim), nn.ReLU())
        self.item_lin = nn.Sequential(nn.Linear(i_dim, gcn_dim), nn.ReLU())
        # Fusion projection followed by the classifier head.
        self.fuse_lin = nn.Sequential(nn.Linear(gcn_dim, fusion_dim), nn.ReLU())
        self.cls = nn.Linear(fusion_dim, num_classes)

    def forward(self, x, edge_index, ux, ix):
        """Return one row of ``num_classes`` logits per node."""
        review = x
        for conv in (self.conv1, self.conv2):
            review = F.relu(conv(review, edge_index))
        user = self.user_lin(ux)
        item = self.item_lin(ix)
        # Element-wise product of the three representations: (N, gcn_dim).
        combined = review * user * item
        return self.cls(self.fuse_lin(combined))

# --- Instantiate & train/evaluate as before, just swapping in GDFN_Efficient ---
# Seed PyTorch before the model is built so that the random weight
# initialisation is the same on every run of this cell.
SEED = 42
torch.manual_seed(SEED)

model = GDFN_Efficient(
    rev_dim=data.x.size(1),      # width of the review embeddings (768)
    gcn_dim=64,
    u_dim=len(user_cols),        # user feature dim
    i_dim=len(item_cols),        # item feature dim
    fusion_dim=128,
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

def train():
    """Run one optimisation step on the whole graph.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and ``data``.
    The forward pass covers every node; the loss is computed only on the rows
    selected by ``data.train_mask``.

    Returns:
        The training loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    logits = model(data.x, data.edge_index, data.user_x, data.item_x)
    train_rows = data.train_mask
    step_loss = criterion(logits[train_rows], data.y[train_rows])
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def evaluate(mask):
    """Score the module-level ``model`` on the nodes selected by ``mask``.

    A forward pass over the whole graph is run in eval mode without gradient
    tracking. Predictions are the argmax over the logits; the AUC uses the
    softmax probability of class index 1.

    Args:
        mask: boolean mask over the nodes of ``data``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'f1' and 'auc'.
    """
    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index, data.user_x, data.item_x)
        probs = F.softmax(logits, dim=1)[mask, 1].cpu().numpy()
        preds = logits[mask].argmax(1).cpu().numpy()
        true = data.y[mask].cpu().numpy()

    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    scores = {name: scorer(true, preds) for name, scorer in label_scorers}
    scores['auc'] = roc_auc_score(true, probs)
    return scores

# Training loop
NUM_EPOCHS = 50   # total number of full-graph optimisation steps
LOG_EVERY = 10    # report train/val accuracy every LOG_EVERY epochs

for epoch in range(1, NUM_EPOCHS + 1):
    loss = train()
    if epoch % LOG_EVERY == 0:
        tr = evaluate(data.train_mask)
        vl = evaluate(data.val_mask)
        print(f"Epoch {epoch:02d}  Loss={loss:.4f}  "
              f"TrainAcc={tr['accuracy']:.3f}  ValAcc={vl['accuracy']:.3f}")

# Final test
tm = evaluate(data.test_mask)
print("\nTest Metrics:")
# Descriptive loop names on purpose: the former `k, v` overwrote the global
# `k` that holds the number of kNN neighbours.
for metric_name, metric_value in tm.items():
    print(f"  {metric_name.capitalize():<9}: {metric_value:.4f}")


Epoch 10  Loss=0.6598  TrainAcc=0.657  ValAcc=0.657
Epoch 20  Loss=0.6481  TrainAcc=0.665  ValAcc=0.665
Epoch 30  Loss=0.6326  TrainAcc=0.725  ValAcc=0.726
Epoch 40  Loss=0.6098  TrainAcc=0.790  ValAcc=0.793
Epoch 50  Loss=0.5766  TrainAcc=0.841  ValAcc=0.844

Test Metrics:
  Accuracy : 0.8382
  Precision: 0.8041
  Recall   : 0.9963
  F1       : 0.8899
  Auc      : 0.9603
