In [1]:
import sys

# Add the parent directory (relative to the working directory) to the module
# search path so that modules located there can be imported below.
sys.path.append('../')

In [2]:
import json
import os

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

import metrics
from dialog import DialogTriplet, dialog_from_file
from model import Embedder

In [3]:
# Confidence cut-off handed to evaluate_metrics, which forwards it to
# metrics.get_metric_agreement.
# NOTE(review): presumably triplets whose annotation confidence is below this
# value are excluded from the agreement score — confirm in the metrics module.
CONFIDENCE_THRESHOLD = 0.75

In [4]:
def get_dialog_filepath(root_dir: str, dialog_id: str) -> str:
    """Return the path of the JSON file that stores the dialog ``dialog_id``.

    Args:
        root_dir: Directory that contains the dialog files.
        dialog_id: Identifier of the dialog; the file is ``<dialog_id>.json``.

    Returns:
        ``root_dir`` joined with ``<dialog_id>.json``.
    """
    filename = f'{dialog_id}.json'
    return os.path.join(root_dir, filename)

In [5]:
def load_dialog_triplets(
    metadata: pd.DataFrame, root_dir: str,
) -> list[DialogTriplet]:
    """Build one ``DialogTriplet`` per row of the annotation table.

    Args:
        metadata: Table with the columns ``anchor_conv``, ``conv_1``,
            ``conv_2``, ``more_similar_conv`` and
            ``more_similar_conv_confidence``.
        root_dir: Directory holding one ``<dialog_id>.json`` file per dialog.

    Returns:
        The triplets, in the same order as the rows of ``metadata``.
    """
    dialog_columns = ('anchor_conv', 'conv_1', 'conv_2')

    def build_triplet(row):
        # The stored label is shifted down by one.
        # NOTE(review): presumably maps annotations 1/2 to 0/1 — confirm against the CSV.
        target = row['more_similar_conv'] - 1
        confidence = row['more_similar_conv_confidence']

        # Resolve all three paths before touching the disk.
        paths = [
            get_dialog_filepath(root_dir, row[column])
            for column in dialog_columns
        ]
        anchor, first, second = map(dialog_from_file, paths)

        return DialogTriplet(
            anchor_dialog=anchor,
            dialog_1=first,
            dialog_2=second,
            label=target,
            confidence_score=confidence,
        )

    return [build_triplet(row) for row in metadata.to_dict('records')]

In [6]:
def evaluate_metrics(
    metrics_to_evaluate: dict[str, metrics.BaseMetric],
    dialog_triplets: list[DialogTriplet],
    confidence_threshold: float,
) -> pd.DataFrame:
    """Score every metric with ``metrics.get_metric_agreement``.

    Args:
        metrics_to_evaluate: Mapping from a display name to the metric object.
        dialog_triplets: Triplets the metrics are evaluated on.
        confidence_threshold: Forwarded unchanged to
            ``metrics.get_metric_agreement``.

    Returns:
        A frame with one row per metric and the columns ``Metric`` and
        ``Score``, in the iteration order of ``metrics_to_evaluate``.
    """
    agreement_by_name = {
        name: metrics.get_metric_agreement(
            dialog_triplets=dialog_triplets,
            metric=candidate,
            confidence_threshold=confidence_threshold,
        )
        for name, candidate in metrics_to_evaluate.items()
    }
    return pd.DataFrame({
        'Metric': list(agreement_by_name.keys()),
        'Score': list(agreement_by_name.values()),
    })

In [7]:
# Input/output locations, relative to the notebook's working directory.
# CSV with one row per annotated triplet (read with pd.read_csv below).
metadata_filepath = '../conversation-similarity/conved.csv'
# Directory with one '<dialog_id>.json' file per dialog.
dialogs_dir = '../conversation-similarity/dialogs'
# Directory passed to DialogTriplet.compute_embeddings.
# NOTE(review): presumably an on-disk embedding cache — confirm in dialog.py.
embeddings_dir = '../conversation-similarity/cache'

# Create the embeddings directory if it does not exist yet.
os.makedirs(embeddings_dir, exist_ok=True)

In [8]:
# Read the annotation table and load every referenced dialog from disk.
triplets = load_dialog_triplets(
    metadata=pd.read_csv(metadata_filepath),
    root_dir=dialogs_dir,
)

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
# Metrics that are evaluated before any embeddings have been computed.
# Keys are the display names used in the results table.
metrics_without_embeddings = {
    # NOTE(review): presumably a trivial baseline — confirm in metrics.ExampleMetric.
    'Constant distance': metrics.ExampleMetric(is_inverted=False),
    'StructED': metrics.StructuralEditDistance(
        is_inverted=False, normalize=False, substitution_weight=2.0,
    ),
    'StructED (normalized)': metrics.StructuralEditDistance(
        is_inverted=False, normalize=True, substitution_weight=2.0,
    ),
    # Fixed: this entry used to pass transpositions=False, which contradicted
    # its label and the normalized variant below. Re-run the evaluation cells
    # to refresh the recorded score for this row.
    'StructED (transpositions)': metrics.StructuralEditDistance(
        is_inverted=False, normalize=False, substitution_weight=2.0,
        transpositions=True,
    ),
    'StructED (transpositions, normalized)': metrics.StructuralEditDistance(
        is_inverted=False, normalize=True, substitution_weight=2.0,
        transpositions=True,
    ),
}

# Metrics evaluated after DialogTriplet.compute_embeddings has been run.
# Keys are the display names used in the results tables.
metrics_with_embeddings = {
    # Edit-distance metrics.
    'ConvED': metrics.ConversationalEditDistance(
        is_inverted=False, normalize=False, substitution_weight=2.2,
    ),
    'ConvED (normalized)': metrics.ConversationalEditDistance(
        is_inverted=False, normalize=True, substitution_weight=2.2,
    ),
    # Vector metrics with embedding_type='turn'.
    'Cosine distance (average embedding)': metrics.CosineDistance(
        is_inverted=False, embedding_type='turn',
    ),
    'Lp distance p=1 (average embedding)': metrics.LpDistance(
        is_inverted=False, embedding_type='turn', p=1,
    ),
    'Lp distance p=2 (average embedding)': metrics.LpDistance(
        is_inverted=False, embedding_type='turn', p=2,
    ),
    'Dot product similarity (average embedding)': metrics.DotProductSimilarity(
        is_inverted=True, embedding_type='turn',
    ),
    # Vector metrics with embedding_type='dialog'.
    'Cosine distance (dialog embedding)': metrics.CosineDistance(
        is_inverted=False, embedding_type='dialog',
    ),
    'Lp distance p=1 (dialog embedding)': metrics.LpDistance(
        is_inverted=False, embedding_type='dialog', p=1,
    ),
    'Lp distance p=2 (dialog embedding)': metrics.LpDistance(
        is_inverted=False, embedding_type='dialog', p=2,
    ),
    'Dot product similarity (dialog embedding)': metrics.DotProductSimilarity(
        is_inverted=True, embedding_type='dialog',
    ),
}

### Metrics without embeddings

In [11]:
# Score the embedding-free metrics on the loaded triplets.
results = evaluate_metrics(
    metrics_to_evaluate=metrics_without_embeddings,
    dialog_triplets=triplets,
    confidence_threshold=CONFIDENCE_THRESHOLD,
)

In [12]:
# Display the Metric/Score table for the metrics without embeddings.
results

Unnamed: 0,Metric,Score
0,Constant distance,0.521739
1,StructED,0.743478
2,StructED (normalized),0.708696
3,StructED (transpositions),0.743478
4,"StructED (transpositions, normalized)",0.708696


### Metrics with embeddings

#### all-MiniLM-L12-v2

In [13]:
# Load the Sentence-Transformers checkpoint directly onto the chosen device.
# model_name is also passed to compute_embeddings further down.
model_name = 'all-MiniLM-L12-v2'
model = SentenceTransformer(model_name, device=device)
# No separate tokenizer is supplied for this model.
# NOTE(review): presumably Embedder relies on the model's own encoding when
# tokenizer is None — confirm in model.Embedder.
embedder = Embedder(model=model, tokenizer=None, device=device)

In [14]:
# Compute embeddings for every triplet with the current embedder.
# NOTE(review): embeddings_dir and model_name are presumably used to cache the
# embeddings on disk per model — confirm in DialogTriplet.compute_embeddings.
for triplet in tqdm(triplets):
    triplet.compute_embeddings(embeddings_dir, embedder, model_name)

100%|███████████████████████████████████████████████████████████████████████████████| 502/502 [00:02<00:00, 187.15it/s]


In [15]:
# Score the embedding-based metrics on the triplets.
# NOTE(review): relies on the embeddings computed for the current model in the
# preceding cell — run the cells in order.
results = evaluate_metrics(
    metrics_to_evaluate=metrics_with_embeddings,
    dialog_triplets=triplets,
    confidence_threshold=CONFIDENCE_THRESHOLD,
)

In [16]:
# Display the Metric/Score table for the current embedding model.
results

Unnamed: 0,Metric,Score
0,ConvED,0.721739
1,ConvED (normalized),0.726087
2,Cosine distance (average embedding),0.526087
3,Lp distance p=1 (average embedding),0.491304
4,Lp distance p=2 (average embedding),0.5
5,Dot product similarity (average embedding),0.452174
6,Cosine distance (dialog embedding),0.434783
7,Lp distance p=1 (dialog embedding),0.426087
8,Lp distance p=2 (dialog embedding),0.434783
9,Dot product similarity (dialog embedding),0.434783


#### DSE

In [17]:
# model_name is the short label passed to compute_embeddings; the weights and
# the tokenizer are loaded from the 'aws-ai/dse-bert-base' checkpoint.
model_name = 'dse-bert-base'
model = AutoModel.from_pretrained('aws-ai/dse-bert-base')
tokenizer = AutoTokenizer.from_pretrained('aws-ai/dse-bert-base')
# NOTE(review): the model is not moved to `device` in this cell; presumably
# Embedder does that with the device it receives — confirm in model.Embedder.
embedder = Embedder(model=model, tokenizer=tokenizer, device=device)

Some weights of the model checkpoint at aws-ai/dse-bert-base were not used when initializing BertModel: ['contrast_head.0.weight', 'contrast_head.2.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Compute embeddings for every triplet with the current embedder.
# NOTE(review): presumably replaces the embeddings of the previous model on the
# triplet objects — confirm in DialogTriplet.compute_embeddings.
for triplet in tqdm(triplets):
    triplet.compute_embeddings(embeddings_dir, embedder, model_name)

100%|███████████████████████████████████████████████████████████████████████████████| 502/502 [00:03<00:00, 164.21it/s]


In [19]:
# Score the embedding-based metrics on the triplets.
# NOTE(review): relies on the embeddings computed for the current model in the
# preceding cell — run the cells in order.
results = evaluate_metrics(
    metrics_to_evaluate=metrics_with_embeddings,
    dialog_triplets=triplets,
    confidence_threshold=CONFIDENCE_THRESHOLD,
)

In [20]:
# Display the Metric/Score table for the current embedding model.
results

Unnamed: 0,Metric,Score
0,ConvED,0.717391
1,ConvED (normalized),0.713043
2,Cosine distance (average embedding),0.613043
3,Lp distance p=1 (average embedding),0.491304
4,Lp distance p=2 (average embedding),0.486957
5,Dot product similarity (average embedding),0.513043
6,Cosine distance (dialog embedding),0.421739
7,Lp distance p=1 (dialog embedding),0.413043
8,Lp distance p=2 (dialog embedding),0.421739
9,Dot product similarity (dialog embedding),0.378261


In [21]:
# model_name is the short label passed to compute_embeddings; the weights and
# the tokenizer are loaded from the 'aws-ai/dse-roberta-large' checkpoint.
model_name = 'dse-roberta-large'
model = AutoModel.from_pretrained('aws-ai/dse-roberta-large')
tokenizer = AutoTokenizer.from_pretrained('aws-ai/dse-roberta-large')
# NOTE(review): the model is not moved to `device` in this cell; presumably
# Embedder does that with the device it receives — confirm in model.Embedder.
embedder = Embedder(model=model, tokenizer=tokenizer, device=device)

Some weights of the model checkpoint at aws-ai/dse-roberta-large were not used when initializing RobertaModel: ['contrast_head.0.weight', 'contrast_head.2.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Compute embeddings for every triplet with the current embedder.
# NOTE(review): presumably replaces the embeddings of the previous model on the
# triplet objects — confirm in DialogTriplet.compute_embeddings.
for triplet in tqdm(triplets):
    triplet.compute_embeddings(embeddings_dir, embedder, model_name)

100%|███████████████████████████████████████████████████████████████████████████████| 502/502 [00:03<00:00, 163.29it/s]


In [23]:
# Score the embedding-based metrics on the triplets.
# NOTE(review): relies on the embeddings computed for the current model in the
# preceding cell — run the cells in order.
results = evaluate_metrics(
    metrics_to_evaluate=metrics_with_embeddings,
    dialog_triplets=triplets,
    confidence_threshold=CONFIDENCE_THRESHOLD,
)

In [24]:
# Display the Metric/Score table for the current embedding model.
results

Unnamed: 0,Metric,Score
0,ConvED,0.704348
1,ConvED (normalized),0.678261
2,Cosine distance (average embedding),0.66087
3,Lp distance p=1 (average embedding),0.491304
4,Lp distance p=2 (average embedding),0.513043
5,Dot product similarity (average embedding),0.504348
6,Cosine distance (dialog embedding),0.469565
7,Lp distance p=1 (dialog embedding),0.504348
8,Lp distance p=2 (dialog embedding),0.513043
9,Dot product similarity (dialog embedding),0.413043
