In [4]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Checkpoint identifier handed to both from_pretrained calls below.
checkmodel = 'sentence-transformers/bert-base-nli-mean-tokens'

# Tokenizer and encoder loaded from the same checkpoint so their vocabularies match;
# both are passed to compute_similarity further down.
tokenizer45 = AutoTokenizer.from_pretrained(checkmodel)
model45 = AutoModel.from_pretrained(checkmodel)

In [7]:
def compute_tokens(sentences, tokenizer, max_length=128):
    """Tokenize sentences into fixed-length model inputs.

    Args:
        sentences: An iterable of strings. A single string is treated as one
            sentence rather than being iterated character by character.
        tokenizer: A callable Hugging Face tokenizer (e.g. the result of
            ``AutoTokenizer.from_pretrained``).
        max_length: Every sentence is truncated or padded to this many tokens.

    Returns:
        A dict with ``'input_ids'`` and ``'attention_mask'`` tensors, each of
        shape ``(number_of_sentences, max_length)``. For empty input both
        tensors have shape ``(0, max_length)``.
    """
    # A bare string would otherwise be split into one "sentence" per character.
    if isinstance(sentences, str):
        sentences = [sentences]
    sentences = list(sentences)

    # Nothing to encode: return correctly shaped empty tensors instead of
    # handing an empty batch to the tokenizer.
    if not sentences:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return {
            'input_ids': empty_ids,
            'attention_mask': empty_ids.clone()
        }

    # One batched call replaces the per-sentence encode_plus loop + torch.stack;
    # padding to max_length guarantees every row has the same width.
    encoding = tokenizer(
        sentences,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

    # Only these two entries are forwarded, as before; any other field the
    # tokenizer returns (e.g. token_type_ids) is intentionally dropped.
    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask']
    }

In [8]:
def compute_sentence_vector(tokens, model):
    """Mean-pool a model's last hidden state into one vector per sentence.

    Args:
        tokens: Mapping of keyword inputs for ``model``; it must contain an
            ``'attention_mask'`` tensor of shape ``(batch, seq_len)`` with 1
            for real tokens and 0 for padding.
        model: Callable encoder. With ``return_dict=False`` its first output
            must be the last hidden state of shape
            ``(batch, seq_len, hidden)``.

    Returns:
        Tensor of shape ``(batch, hidden)``: the average of each sentence's
        token embeddings over non-padding positions. The result carries no
        autograd graph.
    """
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**tokens, return_dict=False)

    # Index instead of unpacking: encoders without a pooler return a 1-tuple,
    # and the pooled output is not needed here anyway.
    last_hidden_state = outputs[0]

    # Broadcast the mask to (batch, seq_len, hidden) so that multiplying zeroes
    # out the embeddings of padding tokens.
    attention_mask = tokens['attention_mask'].unsqueeze(-1).expand(last_hidden_state.shape).float()
    masked_embeddings = last_hidden_state * attention_mask

    # Mean pooling: sum the surviving embeddings and divide by the number of
    # real tokens. A plain mean over seq_len would be diluted by padding.
    summed = torch.sum(masked_embeddings, dim=1)  # (batch, hidden)
    # Clamp so a row with no real tokens divides by a tiny number, not zero.
    counts = torch.clamp(attention_mask.sum(dim=1), min=1e-9)  # (batch, hidden)
    mean_pooled_embedding = summed / counts  # (batch, hidden)

    return mean_pooled_embedding

In [9]:
def compute_similarity(sentences, tokenizer, model):
    """Score every sentence against the first one using cosine similarity.

    Args:
        sentences: Iterable of at least two strings. The first is the
            reference; each remaining sentence is compared against it.
        tokenizer: Tokenizer forwarded to ``compute_tokens``.
        model: Encoder forwarded to ``compute_sentence_vector``.

    Returns:
        A ``pandas.DataFrame`` with one row per compared sentence and the
        columns ``'column-1'`` (the reference, repeated), ``'column-2'`` (the
        compared sentence) and ``'scores'`` (their cosine similarity).

    Raises:
        ValueError: If fewer than two sentences are supplied.
    """
    # Materialize once so len() and slicing work for any iterable.
    sentences = list(sentences)
    if len(sentences) < 2:
        # Fail early with a clear message; otherwise the error surfaces deep
        # inside torch or scikit-learn with an unrelated description.
        raise ValueError(
            "compute_similarity needs a reference sentence plus at least one "
            f"sentence to compare against; got {len(sentences)}."
        )

    reference, others = sentences[0], sentences[1:]

    sentences_tokens = compute_tokens(sentences, tokenizer)
    sentences_embeddings = compute_sentence_vector(sentences_tokens, model)
    sentences_embeddings_detached = sentences_embeddings.detach().numpy()

    # Row 0 is the reference embedding; the result has shape (1, len(others)).
    similarity_scores = cosine_similarity(
        sentences_embeddings_detached[:1],
        sentences_embeddings_detached[1:]
    )

    d = {
        'column-1': [reference] * len(others),
        'column-2': others,
        'scores': similarity_scores[0]
    }

    output = pd.DataFrame(data=d)

    return output

In [10]:
# Sample sentence set. NOTE: the following cell rebinds `sentences`, so this
# list is not the one that gets scored unless that cell is skipped.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "Finishing the meal took him 3 weeks.",
]

In [11]:
# The first entry is the reference; compute_similarity compares each of the
# remaining sentences against it.
sentences = [
    "We're sorry to hear about your experience",
    "We're sorry to hear that you had a negative experience with our food delivery service.",
    "We take all feedback seriously and are committed to providing the best possible service to our customers.",
    "We would like to apologize for any inconvenience or frustration that you may have experienced.",
    "We understand that mistakes can happen, but we are dedicated to making things right for our customers.",
    "Please let us know more about the issue you faced so that we can investigate and take appropriate action.",
    "We value your feedback and want to ensure that we are meeting your expectations.",
]

In [13]:
# Build the similarity table: every sentence after the first is scored
# against the first one.
output = compute_similarity(
    sentences=sentences,
    tokenizer=tokenizer45,
    model=model45,
)

In [12]:
# Display the sentence list currently bound to `sentences`.
sentences

["We're sorry to hear about your experience",
 "We're sorry to hear that you had a negative experience with our food delivery service.",
 'We take all feedback seriously and are committed to providing the best possible service to our customers.',
 'We would like to apologize for any inconvenience or frustration that you may have experienced.',
 'We understand that mistakes can happen, but we are dedicated to making things right for our customers.',
 'Please let us know more about the issue you faced so that we can investigate and take appropriate action.',
 'We value your feedback and want to ensure that we are meeting your expectations.']

In [14]:
# Display the DataFrame returned by compute_similarity (reference, compared sentence, score).
output

Unnamed: 0,column-1,column-2,scores
0,We're sorry to hear about your experience,We're sorry to hear that you had a negative ex...,0.718817
1,We're sorry to hear about your experience,We take all feedback seriously and are committ...,0.253664
2,We're sorry to hear about your experience,We would like to apologize for any inconvenien...,0.769503
3,We're sorry to hear about your experience,"We understand that mistakes can happen, but we...",0.534049
4,We're sorry to hear about your experience,Please let us know more about the issue you fa...,0.494491
5,We're sorry to hear about your experience,We value your feedback and want to ensure that...,0.435926
