In [1]:
!pip install transformers

!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-geometric

!pip install scikit-learn
!pip install sparsemax
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.0+cu117.html


Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_scatter-2.1.2%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (494 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.0/494.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt20cpu
Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_sparse-0.6.18%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt20cpu
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel

In [None]:
from google.colab import files
uploaded = files.upload()

Saving amazon_reviews.csv to amazon_reviews (1).csv


In [None]:
df = pd.read_csv("amazon_reviews.csv")

df = df.dropna(subset=["text", "rating"])
df = df.reset_index(drop=True)

def combine_review_and_rating(row):
    review = str(row["text"])
    rating = str(row["rating"])
    return f"Review: {review}. Rating: {rating} stars."

df["combined_input"] = df.apply(combine_review_and_rating, axis=1)

df["combined_input"].head(2)


Unnamed: 0,combined_input
0,Review: First & most offensive: they reek of g...
1,Review: These didn’t work. Idk if they were da...


In [None]:
class CombinedReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.loc[idx, "combined_input"])
        label = self.data.loc[idx, "label"] if "label" in self.data.columns else 0

        encoded = self.tokenizer(text, padding="max_length", truncation=True,
                                 max_length=self.max_len, return_tensors="pt")

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.float)
        }

dataset = CombinedReviewDataset(df, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [None]:
import torch
import torch.nn as nn
from torch_geometric.nn import GATConv
from sparsemax import Sparsemax
from transformers import RobertaModel

class FullRecommendationModel(nn.Module):
    def __init__(
        self,
        roberta_model_name='roberta-base',
        freeze_roberta=True,
        unfreeze_last_n=0,
        bigru_hidden_dim=256,
        bigru_num_layers=1,
        mhsa_heads=8,
        gat_output_dim=256,
        deepfm_input_dim=256,
        dropout=0.1
    ):
        super().__init__()


        self.roberta = RobertaModel.from_pretrained(roberta_model_name)

        if freeze_roberta:
            for param in self.roberta.parameters():
                param.requires_grad = False

            if unfreeze_last_n > 0:
                for layer in self.roberta.encoder.layer[-unfreeze_last_n:]:
                    for param in layer.parameters():
                        param.requires_grad = True


        self.roberta_output_dim = self.roberta.config.hidden_size


        self.bigru = nn.GRU(
            input_size=self.roberta_output_dim,
            hidden_size=bigru_hidden_dim,
            num_layers=bigru_num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if bigru_num_layers > 1 else 0.0
        )
        self.bigru_layernorm = nn.LayerNorm(bigru_hidden_dim * 2)
        self.bigru_dropout = nn.Dropout(dropout)


        self.mhsa = nn.MultiheadAttention(
            embed_dim=bigru_hidden_dim * 2,
            num_heads=mhsa_heads,
            batch_first=True
        )
        self.mhsa_dropout = nn.Dropout(dropout)
        self.mhsa_layernorm = nn.LayerNorm(bigru_hidden_dim * 2)


        self.swn_weight = nn.Sequential(
            nn.Linear(bigru_hidden_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
        self.swn_dropout = nn.Dropout(dropout)


        self.gat = GATConv(
            in_channels=bigru_hidden_dim * 2,
            out_channels=gat_output_dim,
            heads=1,
            concat=False
        )
        self.gat_dropout = nn.Dropout(dropout)
        self.gat_layernorm = nn.LayerNorm(gat_output_dim)


        self.linear_part = nn.Linear(deepfm_input_dim, 1)
        self.mlp_part = nn.Sequential(
            nn.Linear(deepfm_input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        self.sparsemax = Sparsemax(dim=1)
    def forward(self, input_ids, attention_mask, edge_index):


        roberta_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        x = roberta_out.last_hidden_state


        bigru_out, _ = self.bigru(x)
        bigru_out = self.bigru_dropout(bigru_out)
        bigru_out = self.bigru_layernorm(bigru_out)



        attn_out, _ = self.mhsa(bigru_out, bigru_out, bigru_out, key_padding_mask=(attention_mask==0))
        attn_out = self.mhsa_dropout(attn_out) + bigru_out
        attn_out = self.mhsa_layernorm(attn_out)



        weights = self.swn_weight(attn_out)
        swn_out = attn_out * weights
        swn_out = self.swn_dropout(swn_out)


        node_features = torch.mean(swn_out, dim=1)
        gat_out = self.gat(node_features, edge_index)
        gat_out = self.gat_dropout(gat_out)
        gat_out = self.gat_layernorm(gat_out)


        linear_score = self.linear_part(gat_out)
        mlp_score = self.mlp_part(gat_out)
        total_score = linear_score + mlp_score
        out = self.sparsemax(total_score)

        return out


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from torch_geometric.nn import GATConv
from sparsemax import Sparsemax


device = "cuda" if torch.cuda.is_available() else "cpu"


df = pd.read_csv("amazon_reviews.csv")
df = df.dropna(subset=["review_text"])
df["label"] = df["label"].astype(int)


tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaModel.from_pretrained("roberta-base").to(device).eval()


class AmazonReviewDataset(Dataset):
    def __init__(self, df, text_col="review_text", label_col="label"):
        self.df = df
        self.text_col = text_col
        self.label_col = label_col
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        return {
            "text": str(self.df.loc[idx, self.text_col]),
            "label": int(self.df.loc[idx, self.label_col])
        }


def chunked_tokenize(text, tokenizer, max_length=128, stride=32):
    encoded = tokenizer(
        text, max_length=max_length, truncation=True,
        stride=stride, return_overflowing_tokens=True,
        return_tensors='pt'
    )
    return [
        {"input_ids": encoded["input_ids"][i], "attention_mask": encoded["attention_mask"][i]}
        for i in range(len(encoded["input_ids"]))
    ]

def roberta_chunk_embeddings(chunks, roberta_model):
    all_outputs = []
    with torch.no_grad():
        for chunk in chunks:
            input_ids = chunk["input_ids"].unsqueeze(0).to(device)
            attention_mask = chunk["attention_mask"].unsqueeze(0).to(device)
            output = roberta_model(input_ids=input_ids, attention_mask=attention_mask)
            all_outputs.append(output.last_hidden_state.squeeze(0))
    return torch.cat(all_outputs, dim=0)


def collate_fn(batch):
    all_embeddings, labels = [], []
    for sample in batch:
        chunks = chunked_tokenize(sample["text"], tokenizer)
        emb = roberta_chunk_embeddings(chunks, roberta_model)
        all_embeddings.append(emb)
        labels.append(sample["label"])
    max_len = max(x.shape[0] for x in all_embeddings)
    padded = []
    for x in all_embeddings:
        pad_len = max_len - x.shape[0]
        if pad_len > 0:
            x = torch.cat([x, torch.zeros(pad_len, x.shape[1]).to(device)], dim=0)
        padded.append(x.unsqueeze(0))
    return torch.cat(padded, dim=0), torch.tensor(labels, dtype=torch.float).to(device)


class FullRecommendationModel(nn.Module):
    def __init__(self, hidden_dim=128, mhsa_heads=4, gat_output_dim=128, dropout=0.2):
        super().__init__()
        self.bigru = nn.GRU(input_size=768, hidden_size=hidden_dim, batch_first=True,
                            bidirectional=True, dropout=dropout)
        self.mhsa = nn.MultiheadAttention(embed_dim=hidden_dim*2, num_heads=mhsa_heads, batch_first=True)
        self.swn = nn.Sequential(nn.Linear(hidden_dim*2, 128), nn.ReLU(), nn.Linear(128, 1), nn.Sigmoid())
        self.gat = GATConv(hidden_dim*2, gat_output_dim, heads=1, concat=False)
        self.linear = nn.Linear(gat_output_dim, 1)
        self.mlp = nn.Sequential(nn.Linear(gat_output_dim, 128), nn.ReLU(), nn.Linear(128, 64),
                                 nn.ReLU(), nn.Linear(64, 1))
        self.sparsemax = Sparsemax(dim=1)

    def forward(self, x, edge_index):
        x, _ = self.bigru(x)
        x, _ = self.mhsa(x, x, x)
        w = self.swn(x)
        x = x * w
        x = x.mean(dim=1)
        x = self.gat(x, edge_index)
        out = self.linear(x) + self.mlp(x)
        return self.sparsemax(out)


dataset = AmazonReviewDataset(df)
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)


model = FullRecommendationModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCELoss()


def fully_connected_edge_index(B):
    rows, cols = [], []
    for i in range(B):
        for j in range(B):
            rows.append(i)
            cols.append(j)
    return torch.tensor([rows, cols], dtype=torch.long).to(device)


EPOCHS = 2
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    for xb, yb in loader:
        edge_index = fully_connected_edge_index(xb.shape[0])
        preds = model(xb.to(device), edge_index)
        loss = criterion(preds, yb.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")


torch.save(model.state_dict(), "full_model.pt")
print("✅ Model saved as full_model.pt")


Using device: cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/2] - Loss: 50.0000
Epoch [2/2] - Loss: 50.0000
Training complete!
