In [1]:
import pathlib

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader



In [2]:
# Fetch and unpack the review corpus only when the archive is not already in
# the working directory, so re-running the notebook skips the download.
# NOTE(review): only the .zip is checked; if an earlier extraction was
# interrupted the extracted files may be missing even though this prints
# "Already downloaded" -- confirm before relying on it.
if pathlib.Path('dredze_amazon_reviews.zip').exists():
    print("Already downloaded")
else:
    # IPython shell escapes: download the archive, then extract it in place.
    !wget http://www.cse.chalmers.se/~richajo/waspnlp2024/dredze_amazon_reviews.zip
    !unzip dredze_amazon_reviews.zip

Already downloaded


In [3]:
class BertEncoder(nn.Module):
    """Pretrained transformer wrapper that yields one vector per input sequence."""

    def __init__(self, bert_model_name):
        """Load the pretrained model identified by ``bert_model_name``."""
        super().__init__()
        self.bert_model = AutoModel.from_pretrained(bert_model_name)

    def forward(self, Xbatch, Xmask):
        """Return the last hidden state at position 0 of every sequence.

        ``Xbatch`` is passed to the wrapped model as ``input_ids`` and ``Xmask``
        as ``attention_mask``. The result keeps the batch axis and the hidden
        axis and drops the sequence axis.
        """
        hidden_states = self.bert_model(
            input_ids=Xbatch,
            attention_mask=Xmask,
        ).last_hidden_state
        # Keep only the first position of each sequence.
        first_position = hidden_states[:, 0, :]
        return first_position
    
class DocumentBatcher:
    """A collator that builds a batch from a number of documents.

    Documents are sequences of token ids. Shorter documents are right-padded
    with ``pad_id`` so that the whole batch fits in one rectangular tensor.
    """

    def __init__(self, pad_id):
        """Remember the token id used to fill the padded positions."""
        self.pad_id = pad_id

    def _pad(self, docs):
        """Right-pad every document in ``docs`` to the length of the longest one.

        Returns a tensor of shape [n_docs, max_doc_length]. ``docs`` may be any
        iterable of id sequences; it is materialised once so that generators
        and non-list sequences (e.g. tuples) are handled too.
        Raises ValueError when ``docs`` is empty.
        """
        docs = [list(doc) for doc in docs]

        # How long is the longest document in this batch?
        max_len = max(len(doc) for doc in docs)

        # Pad the shorter documents so that all documents have the same length.
        return torch.as_tensor([doc + [self.pad_id] * (max_len - len(doc)) for doc in docs])

    def make_batch_1(self, X):
        """Build a batch from a number of documents.
        Returns a tensor of shape [n_docs, max_doc_length]."""
        return self._pad(X)

    def make_batch_2(self, XY):
        """Build a batch from a number of documents and their labels.
        Returns two tensors X and Y, where X is the document tensor,
        of shape [n_docs, max_doc_length]

        and

        Y is the label tensor, of shape [n_docs].
        """
        # Materialise the pairs once so the documents and the labels can each
        # be read from them.
        pairs = list(XY)

        # Build the document tensor.
        Xpadded = self._pad(x for x, _ in pairs)

        # Build the label tensor.
        Y = torch.as_tensor([y for _, y in pairs])

        return Xpadded, Y

    def __call__(self, instances):
        """Collate ``instances`` into a batch.

        When the first instance is a tuple, the instances are treated as
        (document, label) pairs and two tensors are returned; otherwise they
        are treated as bare documents and a single tensor is returned.
        """
        if isinstance(instances[0], tuple):
            return self.make_batch_2(instances)
        else:
            return self.make_batch_1(instances)
        
def encoder_loader(data, tokenizer, max_length=512, batch_size=32):
    """Tokenize ``data`` and wrap it in a DataLoader of padded batches.

    A pandas Series is converted to a plain list first; any other input is
    handed to ``tokenizer`` unchanged. Texts are truncated to ``max_length``
    tokens. Each document is paired with its position in ``data``, and batches
    of ``batch_size`` pairs are collated by a DocumentBatcher that pads with
    ``tokenizer.pad_token_id``.
    """
    texts = list(data) if isinstance(data, pd.Series) else data
    token_ids = tokenizer(texts, truncation=True, max_length=max_length).input_ids
    collate = DocumentBatcher(tokenizer.pad_token_id)
    # Pair every encoded document with its running position.
    indexed_docs = [(ids, position) for position, ids in enumerate(token_ids)]
    return DataLoader(indexed_docs, batch_size, collate_fn=collate)

In [4]:
# Load the review corpus: a tab-separated file read without a header row, so
# the three column names are supplied here.
amazon_corpus = pd.read_csv('dredze_amazon_reviews.tsv', sep='\t', header=None, names=['product', 'sentiment', 'text'])

In [5]:
# Name of the pretrained checkpoint, used for both the tokenizer and the encoder.
model_name = 'bert-base-uncased'

# The tokenizer and the BertEncoder wrapper are both loaded from model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = BertEncoder(model_name)

In [6]:
# Embed every review text with the encoder and store one vector per row in a
# new 'text_embedding' column of a copy of the corpus.
emb_amazon_corpus = amazon_corpus.copy()
emb_amazon_corpus['text_embedding'] = None

# Use the GPU when one is available and fall back to the CPU otherwise, so
# the cell also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

bert_model.eval()
bert_model.to(device)
with torch.no_grad():
    for batch, ids in encoder_loader(amazon_corpus.text, tokenizer):
        # Attend to every position that is not padding.
        mask = (batch != tokenizer.pad_token_id).long().to(device)
        batch = batch.to(device)
        bert_output = bert_model(batch, mask)
        # Move the whole batch to host memory once instead of once per row.
        batch_embeddings = bert_output.cpu().numpy()
        # ids are the positions assigned by encoder_loader; they are used as
        # row labels here, which assumes amazon_corpus has the default
        # 0..n-1 index.
        for i, idx in enumerate(ids.tolist()):
            emb_amazon_corpus.at[idx, 'text_embedding'] = batch_embeddings[i]


In [7]:
# Inspect the corpus with the new text_embedding column; the rendered table
# abbreviates the middle rows and long cell values.
emb_amazon_corpus

Unnamed: 0,product,sentiment,text,text_embedding
0,music,neg,i bought this album because i loved the title ...,"[0.17977853, -0.18256074, -0.3078683, 0.090951..."
1,music,neg,i was misled and thought i was buying the enti...,"[-0.08570315, -0.16243924, 0.14063138, 0.26266..."
2,books,neg,"i have introduced many of my ell , high school...","[-0.042635042, -0.29660478, 0.17880535, -0.525..."
3,books,pos,anything you purchase in the left behind serie...,"[0.18510504, 0.033130858, 0.17519405, -0.29427..."
4,dvd,pos,"i loved these movies , and i cant wiat for the...","[-0.011192498, -0.039981, -0.025789035, -0.035..."
...,...,...,...,...
11909,dvd,neg,the story here dose n't matter . the main char...,"[0.08249482, 0.30193207, -0.0017076576, 0.0646..."
11910,software,pos,i liked everything about this product except i...,"[0.014326348, -0.3186368, 0.26025128, -0.46153..."
11911,camera,pos,this flash is the perfect back-up for a studio...,"[-0.23396452, -0.028989198, 0.41473043, -0.179..."
11912,health,neg,i had boughten this as a gift which turned out...,"[0.22810979, 0.027687859, 0.12501115, -0.54732..."


In [8]:
# Write the frame, including the embedding arrays, to a pickle file in the
# working directory.
pd.to_pickle(emb_amazon_corpus, 'emb_amazon_corpus.pkl')

In [9]:
# Read the embedded corpus back from the pickle file, replacing the in-memory frame.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# 'emb_amazon_corpus.pkl' that this notebook produced.
emb_amazon_corpus = pd.read_pickle('emb_amazon_corpus.pkl')