In [1]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import TextClassificationPipeline
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm
import sys, os
from transformers import Pipeline
from torch import Tensor 
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch.nn as nn
import sentence_transformers

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load Data and Model

In [2]:
# Repository identifiers used throughout this notebook.
# DATASET_REPO is passed to `load_dataset`.
DATASET_REPO = "shreyahavaldar/multilingual_politeness"
# MODEL_REPO is passed to `XLMRobertaForSequenceClassification.from_pretrained`.
MODEL_REPO = "shreyahavaldar/xlm-roberta-politeness"
# TOKENIZER_REPO is passed to `XLMRobertaTokenizer.from_pretrained`.
TOKENIZER_REPO = "xlm-roberta-base"

def load_data():
    """Load and return the dataset identified by ``DATASET_REPO``."""
    return load_dataset(DATASET_REPO)

def load_model():
    """Load the sequence-classification model from ``MODEL_REPO`` and place it on ``device``."""
    classifier = XLMRobertaForSequenceClassification.from_pretrained(MODEL_REPO)
    # `Module.to` moves the parameters in place and returns the module itself.
    return classifier.to(device)

class PolitenessDataset(torch.utils.data.Dataset):
    """Torch dataset over one split/language of the politeness dataset.

    Each item is a dict with ``input_ids``, ``attention_mask`` (both padded or
    truncated to ``max_length`` tokens) and ``label`` (the value of the
    original ``politeness`` column, renamed to ``label``).

    Args:
        split: Name of the split to select from the loaded dataset.
        language: Only rows whose ``language`` field equals this value are kept.
        max_length: Token length every example is padded/truncated to.
    """

    def __init__(self, split, language="english", max_length=512):
        dataset = load_dataset(DATASET_REPO)[split]
        dataset = dataset.filter(lambda x: x["language"] == language)
        dataset = dataset.rename_column("politeness", "label")
        self.dataset = dataset
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(TOKENIZER_REPO)
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows left after filtering by language."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize the utterance at ``idx`` and return it together with its label."""
        # Index by row first: column-first access (dataset["Utterance"][idx])
        # can materialize the entire column on every single lookup.
        row = self.dataset[idx]
        encoding = self.tokenizer(
            row["Utterance"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        return {
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors="pt"; a bare squeeze() would also drop the
            # sequence axis whenever its length happens to be 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(row["label"]),
        }

class PolitenessClassifier(nn.Module):
    """Thin ``nn.Module`` wrapper around the model returned by ``load_model``.

    ``forward`` returns only the ``logits`` attribute of the wrapped model's output.
    """

    def __init__(self):
        super().__init__()
        self.model = load_model()

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model on the batch and return its logits."""
        return self.model(input_ids, attention_mask).logits


### Sample Inference on Dataset

In [3]:
# Build the English training split and iterate over it in fixed order.
dataset = PolitenessDataset("train")
dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

model = PolitenessClassifier()
model.eval()

# Inference only: disable autograd so no computation graph is built or kept
# in memory for the forward pass.
with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # Recover the text from the token ids; special tokens (incl. padding) are dropped.
        utterances = [dataset.tokenizer.decode(input_id, skip_special_tokens=True) for input_id in input_ids]
        # Each row of `output` is the model's prediction for one utterance.
        for utterance, score in zip(utterances, output):
            print("Text: {}\nPoliteness: {}\n".format(utterance, score.item()))
        # Only the first batch is shown as a sample.
        break

  0%|          | 0/570 [00:01<?, ?it/s]

Text: That is why he is a ‘pretender’. He has never claimed to be a King - or a Kaiser, for that matter. He is in the same class as the Comte de Paris, who is not the King of France, but would be if the Bourbons were placed on a restored French throne.
Politeness: -0.11096464097499847

Text: Let's knock any 'EngVar' shenanigans on the head right away, shall we? The Manual of Style, as I understand it, makes it clear that the subject's national ties and own language set the course.
Politeness: -0.4824700355529785

Text: Thank you for your contributions. There are some conventions that apply to articles, and medical articles in particular. Secondary sources were available for the material, and should be cited to validate the medical information from the studies.
Politeness: 1.5924450159072876

Text: The conversion of tacit to explicit knowledge is seen in for example the bread making machine's case. In response to your question, culture is a broad term. I would like to narrow it down to 




### Define Alignment Metric

In [4]:
class Metric(nn.Module):
    """Group-alignment metric based on politeness lexica.

    For every language a lexicon CSV (columns ``CATEGORY`` and ``word``) is
    read, the words of each category are embedded with a sentence-embedding
    model, and their mean embedding is stored as that category's centroid.
    A group of words is then scored by the highest average cosine similarity
    between its word embeddings and any category centroid of the language.

    Args:
        model_name: Name passed to ``sentence_transformers.SentenceTransformer``.
        lexica_dir: Directory containing the ``{language}_politelex.csv`` files.
        languages: Languages for which centroids are built.
    """

    def __init__(self, model_name:str="distiluse-base-multilingual-cased",
                 lexica_dir:str="../src/exlib/utils/politeness_lexica",
                 languages=("english", "spanish", "chinese", "japanese")):
        super(Metric, self).__init__()
        self.model = sentence_transformers.SentenceTransformer(model_name)
        self.lexica_dir = lexica_dir
        self.languages = list(languages)
        self.centroids = self.get_centroids()

    def get_centroids(self):
        """Build ``{language: {category: centroid_embedding}}`` from the lexica files."""
        # Read every lexicon up front so a missing file fails before any encoding work.
        lexica = {
            language: pd.read_csv(f"{self.lexica_dir}/{language}_politelex.csv")
            for language in self.languages
        }

        all_centroids = {}
        for language, lexicon in lexica.items():
            centroids = {}
            # groupby keeps first-appearance order with sort=False and skips rows
            # whose CATEGORY is missing, so every centroid has at least one word.
            for category, rows in lexicon.groupby("CATEGORY", sort=False):
                embeddings = self.model.encode(rows["word"].tolist())
                centroids[category] = np.mean(embeddings, axis=0)
            all_centroids[language] = centroids
            print(f"Centroids for {language} created.")
        return all_centroids

    def calculate_single_group_alignment(self, group:list, language:str="english"):
        """Return the highest average cosine similarity between ``group`` and any centroid.

        Args:
            group: Words (strings) forming one group.
            language: Key into the centroids built by ``get_centroids``.

        Returns:
            The maximum, over the language's categories, of the mean cosine
            similarity between the group's word embeddings and the category
            centroid. NaN when ``group`` is empty.

        Raises:
            KeyError: If no centroids exist for ``language``.
        """
        centroids = self.centroids[language]
        words = list(group)
        if not words:
            # No words means there is nothing to average; report that explicitly.
            return float("nan")

        # Embed the group once; the embeddings do not depend on the category,
        # so re-encoding inside the category loop would only repeat work.
        word_embs = np.atleast_2d(np.asarray(self.model.encode(words)))
        word_embs = word_embs / np.linalg.norm(word_embs, axis=1, keepdims=True)

        category_similarities = {}
        for category, centroid_emb in centroids.items():
            unit_centroid = centroid_emb / np.linalg.norm(centroid_emb)
            # Dot products of unit vectors are the cosine similarities.
            category_similarities[category] = (word_embs @ unit_centroid).mean()
        return max(category_similarities.values())

    def calculate_group_alignment(self, groups:list, language:str="english"):
        """Return one alignment score per group, in the order of ``groups``."""
        return [
            self.calculate_single_group_alignment(group, language)
            for group in groups
        ]

### Example Group Alignment Calculation

In [5]:
# Build the metric (this loads the lexica and computes the centroids).
metric = Metric()

# A handful of hand-picked word groups to score.
sample_groups = [
    ["dog", "cat", "fish"],
    ["hello", "goodbye", "please"],
    ["computer", "laptop", "phone"],
    ["idiot", "stupid", "dumb"],
    ["thank you", "grateful", "thanks"],
]

# One score per group, in the same order as `sample_groups`.
alignments = metric.calculate_group_alignment(sample_groups)
for position in range(min(len(sample_groups), len(alignments))):
    group = sample_groups[position]
    alignment = alignments[position]
    print(f"Group: {group}, Alignment: {alignment}")

Centroids for english created.
Centroids for spanish created.
Centroids for chinese created.
Centroids for japanese created.
Group: ['dog', 'cat', 'fish'], Alignment: 0.5292773842811584
Group: ['hello', 'goodbye', 'please'], Alignment: 0.7011184692382812
Group: ['computer', 'laptop', 'phone'], Alignment: 0.4826013147830963
Group: ['idiot', 'stupid', 'dumb'], Alignment: 0.7102837562561035
Group: ['thank you', 'grateful', 'thanks'], Alignment: 0.9256609082221985
