In [2]:
import json
from sklearn.model_selection import train_test_split
import pandas as pd
from IPython.core.display import HTML, display
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader
from torchtext.vocab import Vocab
import torch
from torch import nn
from collections import Counter

In [3]:
# Path to the All_Beauty reviews file you downloaded (read below as one JSON object per line)
reviews_file_path = 'All_Beauty.jsonl'

# Single seed shared by the stochastic steps of the notebook.
SEED = 0
# Seed PyTorch's global RNG too, so that weight initialisation, DataLoader
# shuffling and the Bernoulli LIME perturbations repeat when the notebook is
# run top to bottom on a fresh kernel.
torch.manual_seed(SEED);

In [4]:
# Load the review texts and their star ratings from the JSON Lines file.
texts, ratings = [], []
# Be explicit about UTF-8 so the platform's default encoding is never used.
with open(reviews_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file) instead of crashing inside json.loads('').
            continue
        review = json.loads(line)
        texts.append(review['text'])
        ratings.append(review['rating'])

In [5]:
# Keep only reviews whose whitespace-delimited word count lies strictly
# between 5 and 100, and keep the ratings aligned with the surviving texts.
indices = [
    position
    for position, word_count in enumerate(len(text.split()) for text in texts)
    if 5 < word_count < 100
]
texts = [texts[position] for position in indices]
ratings = [ratings[position] for position in indices]

In [6]:
# Binarise the star ratings: 4 and above -> 1 (positive), anything lower -> 0 (negative).
sentiments = [int(rating >= 4) for rating in ratings]

# Hold out 20% of the reviews for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    sentiments,
    test_size=0.2,
    random_state=SEED,
)

In [7]:
# Persist each split as a pickled two-column DataFrame (text, sentiment).
train_df = pd.DataFrame({'text': X_train, 'sentiment': y_train})
train_df.to_pickle('beauty-train.pkl')

test_df = pd.DataFrame({'text': X_test, 'sentiment': y_test})
test_df.to_pickle('beauty-test.pkl')

In [8]:
# Sanity check: number of examples in the train and test splits.
len(X_train), len(X_test)

(434663, 108666)

In [12]:
tokenizer = get_tokenizer('basic_english')

# Count token frequencies over the training texts only, so the vocabulary is
# built without looking at the test split.  The labels are not needed here.
word_counter = Counter()
for line in X_train:
    word_counter.update(tokenizer(line))

# NOTE(review): `Vocab(counter, min_freq=...)` is the legacy torchtext
# constructor — confirm the installed torchtext version still accepts it.
# min_freq=10 presumably drops tokens seen fewer than 10 times.
voc = Vocab(word_counter, min_freq=10)

print('Vocabulary size:', len(voc))

num_class = len(set(y_train))
print('Num of classes:', num_class)

Vocabulary size: 14635
Num of classes: 2


In [13]:
class EmbeddingBagModel(nn.Module):
    """Bag-of-embeddings text classifier.

    An ``nn.EmbeddingBag`` pools the token embeddings of every example into a
    single vector, and one linear layer maps that vector to per-class scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the embedding table and the output layer.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_dim: size of each embedding vector.
            num_class: number of output scores produced per example.
        """
        super(EmbeddingBagModel, self).__init__()
        self.embedding = nn.EmbeddingBag(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear = nn.Linear(in_features=embed_dim, out_features=num_class)

    def forward(self, inputs, offsets):
        """Return un-normalised class scores, one row per bag.

        ``inputs`` and ``offsets`` are handed to ``nn.EmbeddingBag`` unchanged.
        """
        return self.linear(self.embedding(inputs, offsets))

In [45]:
# Number of (text, label) examples per mini-batch, used by both data loaders below.
BATCH_SIZE = 64

def collate_batch(batch):
    """Turn a list of ``(text, label)`` pairs into EmbeddingBag-ready tensors.

    Args:
        batch: sequence of ``(raw_text, label)`` pairs.

    Returns:
        A 4-tuple ``(labels, text, offsets, tokenized_list)``:
          * labels: 1-D tensor holding one label per example;
          * text: 1-D long tensor with the vocabulary ids of all examples,
            concatenated in batch order;
          * offsets: 1-D tensor with the start position of each example
            inside ``text``;
          * tokenized_list: list with one 1-D long tensor of ids per example.
    """
    labels = torch.tensor([label for _, label in batch])

    # Tokenize and look every token up exactly once; both the flat tensor and
    # the per-example list are derived from these id tensors.  The explicit
    # dtype keeps an example without tokens as a long tensor (a bare
    # torch.tensor([]) would be float and unusable as embedding indices).
    tokenized_list = [
        torch.tensor([voc[token] for token in tokenizer(line)], dtype=torch.long)
        for line, _ in batch
    ]

    # flatten token ids across the whole batch
    if tokenized_list:
        text = torch.cat(tokenized_list)
    else:
        text = torch.empty(0, dtype=torch.long)

    # the offset of each example = total length of all examples before it
    lengths = [ids.numel() for ids in tokenized_list]
    offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)

    return labels, text, offsets, tokenized_list

# Training batches are reshuffled; evaluation batches keep the original order.
train_loader = DataLoader(
    dataset=list(zip(X_train, y_train)),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)
test_loader = DataLoader(
    dataset=list(zip(X_test, y_test)),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch,
)

In [19]:
# Number of full passes over the training loader.
EPOCHS = 2
# Width of the embedding vectors passed to EmbeddingBagModel.
EMB_SIZE = 64
# File the trained model is saved to and, when USE_PRETRAINED is set, loaded from.
CHECKPOINT = './models/embedding_bag_beauty.pt'
USE_PRETRAINED = False  # set to True to load the saved checkpoint instead of retraining

def train_model(train_loader, val_loader):
    """Train an ``EmbeddingBagModel`` for ``EPOCHS`` epochs and save it.

    Args:
        train_loader: iterable of ``(labels, text, offsets, _)`` batches used
            for optimisation.
        val_loader: iterable of batches in the same format, scored after
            every epoch.

    Returns:
        The trained model, which is also written to ``CHECKPOINT`` with
        ``torch.save``.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    model = EmbeddingBagModel(len(voc), EMB_SIZE, num_class)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    for epoch in range(1, EPOCHS + 1):
        # training
        model.train()
        total_acc, total_count = 0, 0

        for idx, (label, text, offsets, _) in enumerate(train_loader):
            optimizer.zero_grad()
            predicted_label = model(text, offsets)
            criterion(predicted_label, label).backward()
            optimizer.step()
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

            # Report the running accuracy every 500 batches, then reset it so
            # each printed figure covers only the most recent window.
            if (idx + 1) % 500 == 0:
                print('epoch {:3d} | {:5d}/{:5d} batches | accuracy {:8.3f}'.format(
                    epoch, idx + 1, len(train_loader), total_acc / total_count
                ))
                total_acc, total_count = 0, 0

        # evaluation
        model.eval()
        total_acc, total_count = 0, 0

        with torch.no_grad():
            for label, text, offsets, _ in val_loader:
                predicted_label = model(text, offsets)
                total_acc += (predicted_label.argmax(1) == label).sum().item()
                total_count += label.size(0)

        # An empty validation loader would otherwise divide by zero.
        valid_acc = total_acc / total_count if total_count else float('nan')

        print('-' * 59)
        print('end of epoch {:3d} | valid accuracy {:8.3f} '.format(epoch, valid_acc))
        print('-' * 59)

    # torch.save does not create missing directories; make sure the target
    # folder exists so a finished training run is not lost at the last step.
    checkpoint_dir = os.path.dirname(CHECKPOINT)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    torch.save(model, CHECKPOINT)
    return model
        
# Either load the model object saved by a previous run or train from scratch.
# NOTE: the test split doubles as the validation set passed to train_model.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
eb_model = torch.load(CHECKPOINT) if USE_PRETRAINED else train_model(train_loader, test_loader)

epoch   1 |   500/ 6792 batches | accuracy    0.729
epoch   1 |  1000/ 6792 batches | accuracy    0.834
epoch   1 |  1500/ 6792 batches | accuracy    0.862
epoch   1 |  2000/ 6792 batches | accuracy    0.875
epoch   1 |  2500/ 6792 batches | accuracy    0.876
epoch   1 |  3000/ 6792 batches | accuracy    0.881
epoch   1 |  3500/ 6792 batches | accuracy    0.885
epoch   1 |  4000/ 6792 batches | accuracy    0.886
epoch   1 |  4500/ 6792 batches | accuracy    0.888
epoch   1 |  5000/ 6792 batches | accuracy    0.887
epoch   1 |  5500/ 6792 batches | accuracy    0.888
epoch   1 |  6000/ 6792 batches | accuracy    0.888
epoch   1 |  6500/ 6792 batches | accuracy    0.894
-----------------------------------------------------------
end of epoch   1 | valid accuracy    0.891 
-----------------------------------------------------------
epoch   2 |   500/ 6792 batches | accuracy    0.896
epoch   2 |  1000/ 6792 batches | accuracy    0.898
epoch   2 |  1500/ 6792 batches | accuracy    0.897
epoc

In [46]:
from torch.nn import functional as F
# Hand-written review used as the running example for the explanations below,
# labelled as positive (1).
test_label = 1 
test_line = ('I sort of like this product, its really not my favorite, but it really isnt the best. could be way better.')

# Reuse the training collate function on a one-element batch.
test_labels, test_text, test_offsets, _ = collate_batch([(test_line, test_label)])

# Softmax over the class dimension; squeeze(0) drops the batch dimension.
probs = F.softmax(eb_model(test_text, test_offsets), dim=1).squeeze(0)
# Probability the model assigns to the class given in test_label.
print('Prediction probability:', round(probs[test_labels[0]].item(), 4))

Prediction probability: 0.757


In [47]:
# Class probabilities for the example, indexed by class id (0 = negative, 1 = positive).
probs

tensor([0.2430, 0.7570], grad_fn=<SqueezeBackward1>)

In [48]:
from captum.attr import Lime, LimeBase
from captum._utils.models.linear_model import SkLearnLinearRegression, SkLearnLasso

In [49]:
def forward_func(text, offsets):
    """Forward wrapper handed to Captum.

    Captum passes the token ids with a leading batch dimension, while the
    embedding-bag model is called with a flat id tensor plus offsets, so the
    first dimension is squeezed away before calling ``eb_model``.
    """
    flat_text = text.squeeze(0)
    return eb_model(flat_text, offsets)

def exp_embedding_cosine_distance(original_inp, perturbed_inp, _, **kwargs):
    """Similarity kernel for LIME computed in the model's embedding space.

    Both id tensors are encoded with ``eb_model.embedding`` (no offsets), the
    cosine distance ``d`` between the two encodings is taken along dim 1, and
    ``exp(-d**2 / 2)`` is returned.  The third positional argument and any
    keyword arguments are ignored.
    """
    encode = eb_model.embedding
    cosine = F.cosine_similarity(
        encode(original_inp, None),
        encode(perturbed_inp, None),
        dim=1,
    )
    distance = 1 - cosine
    return torch.exp(-1 * (distance ** 2) / 2)

def bernoulli_perturb(text, **kwargs):
    """Sample a random 0/1 mask with the same shape as ``text``.

    Every position is drawn independently from a Bernoulli(0.5) distribution
    and the result is returned as a long tensor.  Keyword arguments are ignored.
    """
    keep_probability = 0.5 * torch.ones_like(text)
    return torch.bernoulli(keep_probability).to(torch.long)

def interp_to_input(interp_sample, original_input, **kwargs):
    """Map an interpretable 0/1 sample back to a model input.

    Tokens whose entry in ``interp_sample`` is zero are treated as absent and
    removed from ``original_input``; the surviving ids are reshaped to
    ``(original_input.size(0), -1)``.  Keyword arguments are ignored.
    """
    keep_mask = interp_sample.bool()
    leading_dim = original_input.size(0)
    kept_tokens = original_input[keep_mask]
    return kept_tokens.view(leading_dim, -1)

# LIME explainer with a Lasso surrogate model.  Samples are drawn directly in
# the interpretable (binary mask) space by bernoulli_perturb and mapped to
# model inputs by interp_to_input, so no input -> interpretable transform is
# supplied (to_interp_rep_transform=None).
lasso_lime_base = LimeBase(
    forward_func, 
    interpretable_model=SkLearnLasso(alpha=0.08),
    similarity_func=exp_embedding_cosine_distance,
    perturb_func=bernoulli_perturb,
    perturb_interpretable_space=True,
    from_interp_rep_transform=interp_to_input,
    to_interp_rep_transform=None
)

In [71]:
# Vocabulary ids and label tensor of the running example, as produced by collate_batch.
test_text, test_labels

(tensor([   3, 1526,   16,   34,   11,   31,    7,  150,   46,   18,   12,  312,
            7,   17,    5,   46, 2860,    4,  120,    2,  179,   45,  128,  112,
            2]),
 tensor([1]))

In [57]:
# Take one training batch and keep the per-example id tensors
# (the fourth element returned by collate_batch).
batch_tokenized_sents = next(iter(train_loader))[3]

In [60]:
# Example of a random 0/1 mask drawn for the first sentence of that batch.
bernoulli_perturb(batch_tokenized_sents[0])

tensor([0, 1, 0, 0, 1, 1, 1, 1, 0, 1])

In [75]:
# Quick smoke run of the explainer with only 16 perturbed samples.
attrs = lasso_lime_base.attribute(
    test_text.unsqueeze(0), # add batch dimension for Captum
    target=test_labels,
    additional_forward_args=(test_offsets,),
    n_samples=16,
    show_progress=True
).squeeze(0)

# Smallest and largest per-token attribution.
print('Attribution range:', attrs.min().item(), 'to', attrs.max().item())

Lime Base attribution:   0%|          | 0/16 [00:00<?, ?it/s]

Attribution range: -0.8227580189704895 to 1.4743865728378296


In [80]:
def show_text_attr(attrs, text=None):
    """Render a sentence with every token highlighted by its attribution.

    Negative attributions are drawn in red, non-negative ones in green, and
    the opacity grows with the square root of the absolute attribution.

    Args:
        attrs: tensor of per-token attributions, paired with the tokens in order.
        text: sentence the attributions belong to.  Defaults to the
            notebook-level ``test_line`` so existing calls keep working.
    """
    import html  # local import: escapes tokens before they are embedded in markup

    if text is None:
        text = test_line

    def rgb(value):
        return '255,0,0' if value < 0 else '0,255,0'

    def alpha(value):
        # The square root boosts small attributions; clamp the result because
        # a CSS alpha channel is only meaningful within [0, 1].
        return min(1.0, abs(value) ** 0.5)

    token_marks = [
        f'<mark style="background-color:rgba({rgb(attr)},{alpha(attr)})">'
        f'{html.escape(token, quote=False)}</mark>'
        for token, attr in zip(tokenizer(text), attrs.tolist())
    ]

    display(HTML('<p>' + ' '.join(token_marks) + '</p>'))

In [100]:
# Positive class: explain the labelled class (target=test_labels, i.e. 1),
# this time with 5000 perturbed samples.
attrs = lasso_lime_base.attribute(
    test_text.unsqueeze(0), # add batch dimension for Captum
    target=test_labels,
    additional_forward_args=(test_offsets,),
    n_samples=5000,
    show_progress=True
).squeeze(0)

# Smallest and largest per-token attribution.
print('Attribution range:', attrs.min().item(), 'to', attrs.max().item())

Lime Base attribution:   0%|          | 0/5000 [00:00<?, ?it/s]

Attribution range: -0.8754809498786926 to 1.2592147588729858


In [101]:
# Positive class: highlight each token by its attribution from the previous run.
show_text_attr(attrs)

In [97]:
# Negative class: test_labels is 1, so target=test_labels-1 explains class 0.
attrs = lasso_lime_base.attribute(
    test_text.unsqueeze(0), # add batch dimension for Captum
    target=test_labels-1,
    additional_forward_args=(test_offsets,),
    n_samples=5000,
    show_progress=True
).squeeze(0)

# Smallest and largest per-token attribution, followed by the highlighted sentence.
print('Attribution range:', attrs.min().item(), 'to', attrs.max().item())
show_text_attr(attrs)

Lime Base attribution:   0%|          | 0/5000 [00:00<?, ?it/s]

Attribution range: -1.2927688360214233 to 0.8326038718223572


In [41]:
import numpy as np

def generate_ordered_sentence_neighborhood(sentence, num_samples=5000):
    """Build a neighbourhood of ``sentence`` by randomly deleting words.

    The first entry is always the unmodified sentence.  Every other entry
    keeps the remaining words in their original order after removing a random
    subset of at least one word and at most all but one word.

    Args:
        sentence: text to perturb; words are split on whitespace.
        num_samples: desired number of sentences, including the original.

    Returns:
        A list of ``max(num_samples, 1)`` strings.  When the sentence has
        fewer than two words there is no way to drop some words and keep
        others, so the list is filled with copies of the original sentence.
    """
    words = sentence.split()
    num_words = len(words)

    # Include the original sentence as the first sample
    neighborhood_sentences = [sentence]
    num_perturbed = num_samples - 1  # the original is already in the list

    if num_words < 2:
        # np.random.randint(1, num_words) requires low < high and would raise
        # here; fall back to unperturbed copies instead of crashing.
        neighborhood_sentences.extend([sentence] * max(num_perturbed, 0))
        return neighborhood_sentences

    for _ in range(num_perturbed):
        # Remove between 1 and num_words - 1 words (upper bound is exclusive).
        num_words_to_remove = np.random.randint(1, num_words)
        # A set gives O(1) membership tests while filtering the words.
        words_to_remove = set(
            np.random.choice(num_words, size=num_words_to_remove, replace=False).tolist()
        )
        perturbed_sentence = ' '.join(
            word for idx, word in enumerate(words) if idx not in words_to_remove
        )
        neighborhood_sentences.append(perturbed_sentence)

    return neighborhood_sentences

# Example usage: perturb a fixed sentence and print a few of the results.
original_sentence = "This is an example sentence to demonstrate how to randomly remove words."

# Generating 5000 sentences (the default num_samples, including the original)
neighborhood = generate_ordered_sentence_neighborhood(original_sentence)
print(f"Generated {len(neighborhood)} sentences. Showing the first 10:")
# Sample 0 is the unmodified sentence; the rest are random deletions.
for idx, sentence in enumerate(neighborhood[:10]):
    print(f"Sample {idx}: {sentence}")


Generated 5000 sentences. Showing the first 10:
Sample 0: This is an example sentence to demonstrate how to randomly remove words.
Sample 1: is an sentence to demonstrate to remove
Sample 2: This demonstrate
Sample 3: This demonstrate remove words.
Sample 4: demonstrate how remove words.
Sample 5: This is an example to demonstrate how to randomly remove words.
Sample 6: This an example sentence demonstrate how randomly remove words.
Sample 7: example sentence demonstrate how randomly words.
Sample 8: is an to demonstrate how to randomly remove words.
Sample 9: is to to remove
