In [None]:
# Uncomment the line below to install exlib into this kernel's environment
# %pip install exlib

In [2]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import numpy as np
import pandas as pd
import tqdm
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
import sentence_transformers

import exlib
from exlib.utils.politeness_helper import load_lexica

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load Data and Model

In [3]:
# Repository identifiers passed to `load_dataset` / `from_pretrained` in this notebook.
DATASET_REPO = "BrachioLab/multilingual_politeness"  # dataset repository
MODEL_REPO = "BrachioLab/xlm-roberta-politeness"  # sequence-classification checkpoint
TOKENIZER_REPO = "xlm-roberta-base"  # tokenizer used to encode utterances

def load_data():
    """Return the dataset obtained from ``load_dataset(DATASET_REPO)``."""
    return load_dataset(DATASET_REPO)

def load_model():
    """Load the classifier checkpoint from ``MODEL_REPO`` and move it to ``device``."""
    classifier = XLMRobertaForSequenceClassification.from_pretrained(MODEL_REPO)
    classifier.to(device)  # nn.Module.to moves the parameters in place
    return classifier

class PolitenessDataset(torch.utils.data.Dataset):
    """Torch dataset over one language of one split of ``DATASET_REPO``.

    Each item is a dict with:
      * ``input_ids`` / ``attention_mask``: tokenizer output padded/truncated
        to 512 tokens, with the leading batch dimension removed,
      * ``label``: the example's ``politeness`` value as a tensor,
      * ``word_list``: the whitespace-split utterance, right-padded with ``''``
        up to the longest utterance (in words) of the filtered split, so that
        every item has a word list of the same length.
    """

    def __init__(self, split, language="english"):
        """Load ``split``, keep rows whose ``language`` column equals ``language``."""
        dataset = load_dataset(DATASET_REPO)[split]
        dataset = dataset.filter(lambda x: x["language"] == language)
        dataset = dataset.rename_column("politeness", "label")
        self.dataset = dataset
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(TOKENIZER_REPO)
        # Longest utterance measured in whitespace-separated words; `default`
        # keeps construction from failing when the filter leaves no rows.
        self.max_len = max(
            (len(text.split()) for text in dataset['Utterance']),
            default=0,
        )

    def __len__(self):
        """Number of examples left after the language filter."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` together with its padded word list."""
        # Fetch the single row instead of pulling out an entire column and
        # then indexing into it on every access.
        example = self.dataset[idx]
        text = example["Utterance"]
        label = example["label"]
        encoding = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        word_list = text.split()
        # Right-pad with empty strings (a negative count yields no padding).
        word_list += [''] * (self.max_len - len(word_list))
        return {
            # return_tensors="pt" adds a leading batch dimension of 1; drop it.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label),
            'word_list': word_list
        }

class PolitenessClassifier(nn.Module):
    """Thin ``nn.Module`` wrapper that exposes only the logits of the loaded model."""

    def __init__(self):
        super().__init__()
        self.model = load_model()

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the ``logits`` attribute of its output."""
        return self.model(input_ids, attention_mask).logits


### Sample Inference on Dataset

In [4]:
dataset = PolitenessDataset("train")
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
model = PolitenessClassifier()
model.to(device)
model.eval()

# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # Decode the token ids back to text so the printed utterance is exactly
        # what the model saw (after truncation).
        utterances = [dataset.tokenizer.decode(input_id, skip_special_tokens=True) for input_id in input_ids]
        # Each row of `output` is the model's prediction for one utterance.
        for utterance, score in zip(utterances, output):
            print("Text: {}\nPoliteness: {}\n".format(utterance, score.item()))
        break  # a single batch is enough for the demonstration

Filter:   0%|          | 0/18238 [00:00<?, ? examples/s]

  0% 0/1140 [00:00<?, ?it/s]

Text: That is why he is a ‘pretender’. He has never claimed to be a King - or a Kaiser, for that matter. He is in the same class as the Comte de Paris, who is not the King of France, but would be if the Bourbons were placed on a restored French throne.
Politeness: -0.11096354573965073

Text: Let's knock any 'EngVar' shenanigans on the head right away, shall we? The Manual of Style, as I understand it, makes it clear that the subject's national ties and own language set the course.
Politeness: -0.4824686050415039

Text: Thank you for your contributions. There are some conventions that apply to articles, and medical articles in particular. Secondary sources were available for the material, and should be cited to validate the medical information from the studies.
Politeness: 1.592445731163025

Text: The conversion of tacit to explicit knowledge is seen in for example the bread making machine's case. In response to your question, culture is a broad term. I would like to narrow it down to o




### Define Alignment Metric

In [6]:
class Metric(nn.Module):
    """Alignment score between groups of words and politeness-lexicon categories.

    A sentence-embedding model encodes every lexicon word; the mean embedding
    of each lexicon category is kept as that category's centroid. A group of
    words is scored by the cosine similarity of its words to every centroid,
    averaged over the words, taking the best-matching category.
    """

    def __init__(self, model_name:str="distiluse-base-multilingual-cased"):
        """Load the sentence-embedding model and precompute all centroids."""
        super(Metric, self).__init__()
        self.model = sentence_transformers.SentenceTransformer(model_name)
        self.centroids = self.get_centroids()

    def get_centroids(self):
        """Build ``{language: {category: centroid}}`` from the lexica.

        For every language the lexicon returned by ``load_lexica`` is split by
        its ``CATEGORY`` column; the centroid of a category is the mean of the
        embeddings of the entries in its ``word`` column.
        """
        # read lexica files
        languages = ["english", "spanish", "chinese", "japanese"]
        lexica = {language: load_lexica(language) for language in languages}

        # create centroids
        all_centroids = {}
        for language in languages:
            lexicon = lexica[language]
            categories = lexicon["CATEGORY"].unique()
            centroids = {}
            for category in categories:
                words = lexicon[lexicon["CATEGORY"] == category]["word"].tolist()
                embeddings = self.model.encode(words)
                centroids[category] = np.mean(embeddings, axis=0)
            assert len(categories) == len(centroids.keys())
            all_centroids[language] = centroids
            print(f"Centroids for {language} created.")
        return all_centroids

    def calculate_single_group_alignment(self, group:list, language:str="english"):
        """Score one group of words against the centroids of ``language``.

        Args:
            group: list of words (strings) forming the group.
            language: key into ``self.centroids``.

        Returns:
            float: the maximum, over categories, of the mean cosine similarity
            between the group's word embeddings and the category centroid.
        """
        centroids = self.centroids[language]

        # One embedding per word, stacked into an (n, d) matrix. The stacked
        # tensor is used directly; wrapping it in torch.tensor() again would
        # only copy it and emit a UserWarning.
        word_emb_pt = torch.stack(
            [torch.as_tensor(self.model.encode(word)) for word in group]
        ).to(device)

        # Stack the centroids into one (m, d) array first: building a tensor
        # from a Python list of ndarrays is the slow path.
        centroid_emb_pt = torch.as_tensor(np.stack(list(centroids.values()))).to(device)

        # Row-wise L2 norms, kept as column vectors: (n, 1) and (m, 1).
        norm_word = torch.norm(word_emb_pt, dim=1, keepdim=True)
        norm_centroid = torch.norm(centroid_emb_pt, dim=1, keepdim=True)

        # Pairwise dot products (n, m) and the matching products of norms (n, m).
        dot_product = torch.mm(word_emb_pt, centroid_emb_pt.T)
        norms_product = torch.mm(norm_word, norm_centroid.T)

        # Cosine similarity of every word with every centroid.
        cosine_similarity = dot_product / norms_product

        # Average over the words of the group, then keep the best category.
        return cosine_similarity.mean(0).max().item()

    def calculate_group_alignment(self, groups:list, language:str="english"):
        """Return the single-group alignment of every group in ``groups``, in order."""
        return [
            self.calculate_single_group_alignment(group, language)
            for group in groups
        ]

    def forward(self, group_masks:list, original_data:PolitenessDataset, language="english"):
        """Mean alignment of the word groups selected by ``group_masks``.

        ``group_masks[i]`` is indexed position by position against
        ``original_data[i]['word_list']``; the words at positions where the
        mask equals 1 form the i-th group.

        Returns:
            The mean of the per-group alignment scores.
        """
        #create groups
        groups = []
        for i in range(len(group_masks)):
            word_list_ex = original_data[i]['word_list']
            mask = group_masks[i]
            groups.append(
                [word_list_ex[j] for j in range(len(mask)) if mask[j] == 1]
            )
        return np.mean(self.calculate_group_alignment(groups, language))

### Example Expert Alignment Calculation

In [7]:
# Instantiate the metric once; construction loads the embedding model and
# builds the lexicon centroids.
metric = Metric()

# Four small hand-picked word groups to score.
sample_groups = [
    ["hello", "goodbye", "please"],
    ["computer", "laptop", "phone"],
    ["idiot", "stupid", "dumb"],
    ["thank you", "grateful", "thanks"],
]

alignments = metric.calculate_group_alignment(sample_groups)

# Show each group next to its alignment score.
for group, alignment in zip(sample_groups, alignments):
    print(f"Group: {group}, Alignment: {alignment}")



FileNotFoundError: [Errno 2] No such file or directory: '/home/runai-home/.local/lib/python3.10/site-packages/exlib/utils/politeness_lexica/english_politelex.csv'

### Baselines

In [24]:
for batch in dataloader:
    # batch['word_list'] is indexed by word position first; transpose it so
    # that every inner list holds the words of one utterance.
    word_lists = batch['word_list']
    word_lists = list(map(list, zip(*word_lists)))

    # Drop the '' entries that pad each word list to a common length.
    processed_word_lists = [
        [word for word in word_list if word != '']
        for word_list in word_lists
    ]

    print("---- Word Level Groups ----")
    for word_list in processed_word_lists:
        # every word is its own group
        word_groups = [[word] for word in word_list]
        print(word_groups)
        alignments = metric.calculate_group_alignment(word_groups)
        print(np.mean(alignments))

    print("\n---- Phrase Level Groups ----")
    for word_list in processed_word_lists:
        # each group is 3 consecutive words (the last one may be shorter)
        phrase_groups = [
            word_list[start:start + 3]
            for start in range(0, len(word_list), 3)
        ]
        print(phrase_groups)
        alignments = metric.calculate_group_alignment(phrase_groups)
        print(np.mean(alignments))

    print("\n---- Sentence Level Groups ----")
    for word_list in processed_word_lists:
        sentence_groups = []

        # close a sentence after any word ending in '.', '!' or '?'
        pending_words = []
        for word in word_list:
            pending_words.append(word)
            if word[-1] in (".", "!", "?"):
                sentence_groups.append(pending_words)
                pending_words = []
        # trailing words without closing punctuation form a final group
        if pending_words:
            sentence_groups.append(pending_words)

        print(sentence_groups)
        alignments = metric.calculate_group_alignment(sentence_groups)
        print(np.mean(alignments))

    break
    

---- Word Level Groups ----
[['That'], ['is'], ['why'], ['he'], ['is'], ['a'], ['‘pretender’.'], ['He'], ['has'], ['never'], ['claimed'], ['to'], ['be'], ['a'], ['King'], ['-'], ['or'], ['a'], ['Kaiser,'], ['for'], ['that'], ['matter.'], ['He'], ['is'], ['in'], ['the'], ['same'], ['class'], ['as'], ['the'], ['Comte'], ['de'], ['Paris,'], ['who'], ['is'], ['not'], ['the'], ['King'], ['of'], ['France,'], ['but'], ['would'], ['be'], ['if'], ['the'], ['Bourbons'], ['were'], ['placed'], ['on'], ['a'], ['restored'], ['French'], ['throne.']]


  word_emb_pt = torch.tensor(word_embs).to(device)


0.7166317347085701
[["Let's"], ['knock'], ['any'], ["'EngVar'"], ['shenanigans'], ['on'], ['the'], ['head'], ['right'], ['away,'], ['shall'], ['we?'], ['The'], ['Manual'], ['of'], ['Style,'], ['as'], ['I'], ['understand'], ['it,'], ['makes'], ['it'], ['clear'], ['that'], ['the'], ["subject's"], ['national'], ['ties'], ['and'], ['own'], ['language'], ['set'], ['the'], ['course.']]
0.712827032103258
[['Thank'], ['you'], ['for'], ['your'], ['contributions.'], ['There'], ['are'], ['some'], ['conventions'], ['that'], ['apply'], ['to'], ['articles,'], ['and'], ['medical'], ['articles'], ['in'], ['particular.'], ['Secondary'], ['sources'], ['were'], ['available'], ['for'], ['the'], ['material,'], ['and'], ['should'], ['be'], ['cited'], ['to'], ['validate'], ['the'], ['medical'], ['information'], ['from'], ['the'], ['studies.']]
0.6841673021380966
[['The'], ['conversion'], ['of'], ['tacit'], ['to'], ['explicit'], ['knowledge'], ['is'], ['seen'], ['in'], ['for'], ['example'], ['the'], ['bread']