# Validation Baselines

This notebook implements and evaluates the baseline similarity measures that we compare against our ConDynS measure; the ConDynS results themselves are presented in the companion notebook. It computes SBERT cosine similarity, BERTScore, and naive LLM-prompted similarity on both the transcript and the SCD (summary of conversation dynamics) representation of each conversation. These baselines serve as reference points in the validation experiment, allowing us to assess the unique contribution of ConDynS in capturing conversational dynamics beyond topic or surface-level features. A detailed discussion can be found in Section 5 of our paper, [A Similarity Measure for Comparing Conversational Dynamics](https://arxiv.org/abs/2507.18956).

In [None]:
import json
from convokit import Corpus, download
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm import tqdm

from convokit.convo_similarity.utils import get_human_summary_pair_lst

In [None]:
# Load the CMV "conversations gone awry" corpus through ConvoKit and print its summary statistics.
corpus = Corpus(filename=download("conversations-gone-awry-cmv-corpus"))
corpus.print_summary_stats()

# Every pair returned by get_human_summary_pair_lst is evaluated in both
# directions: first as given, then with its two conversation ids swapped.
human_pair_lst = get_human_summary_pair_lst(corpus)
convo_pairs = list(human_pair_lst)
convo_pairs.extend((second, first) for first, second in human_pair_lst)

In [None]:
# Root directory holding the pre-computed inputs read by this notebook and the
# results it writes. File paths are built by plain string concatenation
# (ARTEFACTS_DIR + "..."), so the trailing slash is required.
ARTEFACTS_DIR = "./artefacts/"

# Running All Baseline Methods

The following cell runs every baseline similarity metric (SBERT cosine similarity, BERTScore, and naive LLM prompting) on the conversation pairs, using two input types for each conversation: its raw transcript and its SCD.

In [None]:
# Load the pre-generated simulations. Each entry is looked up by conversation id
# and provides a 'generated_transcript' plus a 'summary' dict with 'summary_text'.
# The encoding is pinned to UTF-8 (the JSON default) so that loading does not
# depend on the platform's locale encoding.
with open(ARTEFACTS_DIR + "validation_gpt/transcript_simulations.json", "r", encoding="utf-8") as f:
    transcript_simulations = json.load(f)

# NOTE(review): the topic-shuffled simulations are loaded here but are not
# referenced by any of the cells visible in this notebook -- confirm whether
# they are still needed.
with open(ARTEFACTS_DIR + "validation_gpt/transcript_simulations_topic_shuffled.json", "r", encoding="utf-8") as f:
    transcript_simulations_topic_shuffled = json.load(f)

In [None]:
### Calling GPT to make naive comparisons, run with caution.
import os

from convokit.convo_similarity.utils import format_transcript_from_convokit, get_human_summary
from convokit.convo_similarity.baseline import ConDynSBaselines
from convokit.genai.genai_config import GenAIConfigManager

config = GenAIConfigManager() ## make sure to set your own config if this is never set before
MODEL_PROVIDER = "gpt"
# NOTE(review): MODEL is defined but never passed to ConDynSBaselines below --
# confirm which model the naive-prompting baseline actually uses.
MODEL = "gpt-4o-mini"
config.set_api_key(MODEL_PROVIDER, "YOUR API KEY")
baselines = ConDynSBaselines(model_provider=MODEL_PROVIDER, config=config)

# Make sure the output directory exists *before* spending time and API budget
# on the scoring loops; otherwise the final write can fail with
# FileNotFoundError and every computed score is lost.
BASELINE_RESULTS_PATH = ARTEFACTS_DIR + "validation_gpt/baseline/baseline_results.json"
os.makedirs(os.path.dirname(BASELINE_RESULTS_PATH), exist_ok=True)


def _score_against_simulation(real_convo_id, simulated_convo_id):
    """Score one real conversation against one simulated conversation.

    The real side uses the corpus transcript and the human-written summary of
    ``real_convo_id``; the simulated side uses the generated transcript and its
    summary stored in ``transcript_simulations[simulated_convo_id]``.

    Returns a dict with one entry per (input type, metric) combination:
    ``{transcript,scd}_{bertscore,cos_sim,naive_gpt}``.
    """
    simulation = transcript_simulations[simulated_convo_id]

    transcript1 = "\n\n".join(format_transcript_from_convokit(corpus, real_convo_id))
    transcript2 = simulation['generated_transcript']

    scd1 = get_human_summary(corpus, real_convo_id)['summary_text']
    scd2 = simulation['summary']['summary_text']

    # Transcript-level baselines. The second value returned by the naive GPT
    # comparison is not used.
    transcript_bertscore = baselines.get_bertscore(transcript1, transcript2)['f1'][0]
    transcript_cos_sim = baselines.get_cosine_similarity(transcript1, transcript2)
    transcript_naive_gpt, _ = baselines.get_naive_gpt_compare_score_Transcripts(transcript1, transcript2)

    # SCD-level baselines.
    scd_bertscore = baselines.get_bertscore(scd1, scd2)['f1'][0]
    scd_cos_sim = baselines.get_cosine_similarity(scd1, scd2)
    scd_naive_gpt, _ = baselines.get_naive_gpt_compare_score_SCDs(scd1, scd2)

    return {"transcript_bertscore" : transcript_bertscore,
            "transcript_cos_sim" : transcript_cos_sim,
            "transcript_naive_gpt" : transcript_naive_gpt,
            "scd_bertscore" : scd_bertscore,
            "scd_cos_sim" : scd_cos_sim,
            "scd_naive_gpt" : scd_naive_gpt}


# "Self" condition: each conversation is compared with its OWN simulation.
# Only the first id of every pair is used here; convo_id2 is intentionally ignored.
self_results = {}
self_scores = []
for convo_id1, convo_id2 in tqdm(convo_pairs, desc="Calculating self sim similarity"):
    results = _score_against_simulation(convo_id1, convo_id1)
    self_results[str(convo_id1)] = results
    self_scores.append(results)

# "Pair" condition: each conversation is compared with the simulation of the
# OTHER conversation in its pair. Both loops iterate convo_pairs in the same
# order, so self_scores[k] and pair_scores[k] refer to the same real conversation.
# NOTE(review): pair_results is keyed by the simulated conversation's id
# (convo_id2), whereas self_results is keyed by the real conversation's id.
pair_results = {}
pair_scores = []
for convo_id1, convo_id2 in tqdm(convo_pairs, desc="Calculating pair sim similarity"):
    results = _score_against_simulation(convo_id1, convo_id2)
    pair_results[str(convo_id2)] = results
    pair_scores.append(results)


all_baseline_scores = {"self_results" : self_results,
                       "self_scores" : self_scores,
                       "pair_results" : pair_results,
                       "pair_scores" : pair_scores}

with open(BASELINE_RESULTS_PATH, "w", encoding="utf-8") as f:
    json.dump(all_baseline_scores, f, indent=4)

In [None]:
# Read back the scores saved by the scoring cell, so that the analysis below
# can be run from the stored results file.
with open(ARTEFACTS_DIR + "validation_gpt/baseline/baseline_results.json") as f:
    all_baseline_scores = json.loads(f.read())

# Only the index-aligned score lists are needed from here on.
self_scores, pair_scores = (
    all_baseline_scores[key] for key in ("self_scores", "pair_scores")
)

In [None]:
def plot_baseline_scores(self_scores, pair_scores, input_type, score_method):
    """
    Print summary statistics and plot the score distributions of one baseline.

    Input:
        self_scores: list of per-conversation result dicts for the self simulation
        pair_scores: list of per-conversation result dicts for the pair simulation,
            index-aligned with self_scores
        input_type: transcript, scd
        score_method: cos_sim, bertscore, naive_gpt

    Prints, in order: the fraction of conversations whose self score is strictly
    greater than its pair score, the mean self score, the mean pair score, and the
    result of a Wilcoxon signed-rank test on the paired scores. Then shows
    overlaid histograms of the two score distributions.
    """
    score_key = f"{input_type}_{score_method}"
    self_raw_scores = [x[score_key] for x in self_scores]
    pair_raw_scores = [x[score_key] for x in pair_scores]
    accuracy = [x > y for x, y in zip(self_raw_scores, pair_raw_scores)]

    print(sum(accuracy) / len(accuracy))
    print(np.mean(self_raw_scores))
    print(np.mean(pair_raw_scores))
    print(stats.wilcoxon(self_raw_scores, pair_raw_scores))

    plt.hist(self_raw_scores, alpha = 0.6, label = "self simulation")
    plt.hist(pair_raw_scores, alpha = 0.6, label = "pair simulation")
    # A histogram has the binned values (scores) on the x-axis and the
    # per-bin counts (conversations) on the y-axis.
    plt.xlabel("similarity scores")
    plt.ylabel("number of conversations")
    plt.title(f"Baseline ({score_method}) Score distribution of sim vs sim ({input_type})")
    plt.legend()
    plt.show()

def get_baseline_acc(self_scores, pair_scores, input_type, score_method):
    """
    Return the fraction of conversations whose self-simulation score is strictly
    greater than its pair-simulation score (ties do not count as wins).

    Input:
        self_scores: per-conversation result dicts for the self simulation
        pair_scores: per-conversation result dicts for the pair simulation,
            index-aligned with self_scores
        input_type: transcript, scd
        score_method: cos_sim, bertscore, naive_gpt

    Raises:
        ValueError: if the two inputs are empty or differ in length, since the
            comparison is only meaningful on aligned, non-empty score lists.
        KeyError: if a result dict has no "{input_type}_{score_method}" entry.
    """
    score_key = f"{input_type}_{score_method}"
    self_raw_scores = [x[score_key] for x in self_scores]
    pair_raw_scores = [x[score_key] for x in pair_scores]

    # zip() would silently truncate to the shorter list and hide misaligned data.
    if len(self_raw_scores) != len(pair_raw_scores):
        raise ValueError(
            f"self_scores and pair_scores must have the same length "
            f"(got {len(self_raw_scores)} and {len(pair_raw_scores)})"
        )
    if not self_raw_scores:
        raise ValueError("cannot compute accuracy from empty score lists")

    wins = sum(x > y for x, y in zip(self_raw_scores, pair_raw_scores))
    return wins / len(self_raw_scores)

In [None]:
### Outputting the results from all baseline methods ###
input_types = ["transcript", "scd"]
score_methods = ["cos_sim", "bertscore", "naive_gpt"]

# One report per (metric, input type) combination, grouped by metric:
# a header line, the accuracy to four decimals, then two blank lines.
for score_method in score_methods:
    for input_type in input_types:
        acc = get_baseline_acc(self_scores, pair_scores, input_type, score_method)
        print(
            f"###### {score_method} + {input_type} ######",
            f"Acc: {acc:.4f}\n",
            "",
            sep="\n",
        )