# Application of ConDynS on Friends Dataset

Dataset information can be found at: https://convokit.cornell.edu/documentation/friends.html

In [None]:
import json
from convokit import Corpus, download
from tqdm import tqdm
import matplotlib.pyplot as plt
import ast
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import numpy as np
import re
import scipy.stats as stats
from itertools import combinations
import random
import string
import math
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.feature_extraction.text import CountVectorizer as CV
import string

from convokit.genai.genai_config import GenAIConfigManager
from convokit.convo_similarity.summary import SCDWriter
from convokit.convo_similarity.condyns import ConDynS

In [None]:
# Fetch the Friends corpus into the chosen data directory, then load it.
friends_corpus_path = download("friends-corpus", data_dir="YOUR DATA PATH")
corpus = Corpus(filename=friends_corpus_path)

Downloading friends-corpus to /reef/sj597_kz88/scd-sim/wiki_exploration/friends-corpus
Downloading friends-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/friends-corpus/friends-corpus.zip (6.1MB)... Done


In [59]:
# Preview one randomly chosen conversation: print the first line spoken by each
# speaker and build an anonymised transcript ("Speaker<k> : <text>", one line
# per utterance). Utterances whose speaker id contains "TRANSCRIPT_NOTE" are
# skipped.
convo = corpus.random_conversation()
utt_lst = convo.get_utterance_ids()
speaker_ids = {}
transcript_lines = []
for utt_id in utt_lst:
    utt = corpus.get_utterance(utt_id)
    speaker = utt.speaker.id
    if "TRANSCRIPT_NOTE" in speaker:
        continue
    if speaker not in speaker_ids:
        # First appearance of this speaker: show their opening line and
        # assign the next speaker number.
        print(speaker, ":", utt.text)
        speaker_ids[speaker] = 1 + len(speaker_ids)
    transcript_lines.append("Speaker" + str(speaker_ids[speaker]) + " : " + utt.text + "\n")
transcript = "".join(transcript_lines)

Monica Geller : Here you go. You can wear this.
Phoebe Buffay : Thanks!
Hold Voice : Please, stay on the line. Your call is important to us.
Chandler Bing : Hey! Can you take a duck and a chick to the theatre?


In [None]:
### Setup path for data and corpus ###
import os

DATA_PATH = "./data"
# Build the path with os.path.join so a separator is always inserted between
# the data directory and the file name (plain "+" would yield "./data<name>").
# NOTE(review): this placeholder refers to Wiki German data, and `dataset` is
# not referenced again in the visible part of this notebook -- presumably a
# leftover from another notebook; confirm before relying on it.
filepath = os.path.join(DATA_PATH, "PATH TO WIKI GERMAN DATA")

### Set up config for GenAI ###
config = GenAIConfigManager() ### make sure to set your own config if this is never set before

### Select which model provider to use for ConDynS ###
MODEL_PROVIDER = "gemini"
MODEL = "gemini-2.0-flash-001"
config.set_google_cloud_config("YOUR PROJECT", "YOUR LOCATION")

with open(filepath, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Fixed seed so the 100-item subsample is reproducible across runs.
random.seed(4300)
dataset = random.sample(dataset, 100)
len(dataset)

In [39]:
# Prompt template asking the model for a "trajectory summary" of a casual
# conversation: it should describe sentiments, intentions and conversational
# strategies rather than concrete topics, in at most 80 words.
# `{transcript}` is the only placeholder and is filled in with str.format().
friends_summary_prompt = """
Write a short summary capturing the trajectory of a casual conversation.
Do not include specific topics, events, or arguments from the conversation. The style you should avoid:
Example Sentence 1: “Speaker1 said they had a difficult day at work, and mentioned that their boss was unfair. Speaker2 listened and agreed that bosses can be tough, then suggested they go out for dinner to forget about it..”

Instead, do include indicators of sentiments (e.g., warmth, empathy, humor, nostalgia, vulnerability, support), individual intentions (e.g., building rapport, offering reassurance, seeking validation, self-disclosure, active listening, gentle disagreement, creating distance), and conversational strategies (if any) such as 'collaborative storytelling', 'inside jokes', 'mirroring emotions,' and 'affectionate teasing.'
The following sentences demonstrate the style you should follow:
* Example Sentence 2: “Both speakers have similar feelings and appeared mutually supportive. Speaker1 initiates with a moment of self-disclosure, and Speaker2 responds with empathy and validation. Both speakers build on this exchange, strengthening their rapport.”
* Example Sentence 3: “The two speakers connected with back-and-forth affectionate teasing. Throughout the conversation, they kept building on each other’s humor with playful remarks, creating a lighthearted and comfortable discussion.”

Overall, the trajectory summary should capture the key moments where the emotional connection of the conversation notably changes. Here is an example of a complete trajectory summary.
Trajectory Summary: The conversation begins with two speakers exchanging neutral, surface-level comments. Speaker1 then shifts the tone by sharing a personal anecdote, prompting Speaker2 to respond with warmth and empathy. Speaker1 elaborates on their story and their need, but Speaker2 does not extend their support but retracts it.
Now, provide the trajectory summary for the following conversation.
Conversation Transcript:{transcript}
Now, summarize this conversation. Remember, do not include specific topics, claims, or arguments from the conversation. Instead, try to capture the speakers' sentiments, intentions, and conversational/persuasive strategies. Limit the trajectory summary to 80 words.
Trajectory Summary:"""

In [None]:
# The summary writer and the similarity scorer use the same provider, model
# and config, so those settings are declared once and shared.
genai_settings = {
    "model_provider": MODEL_PROVIDER,
    "model": MODEL,
    "config": config,
}

# Summary writer, configured with the Friends-specific summary prompt.
scd_writer = SCDWriter(
    custom_scd_prompt=friends_summary_prompt,
    custom_prompt_dir="friends_prompts",
    **genai_settings,
)

# Conversation-dynamics similarity scorer.
condyns = ConDynS(**genai_settings)

In [None]:
def format_friends_transcript_from_convokit(corpus, convo_id):
    """Render one conversation as an anonymised transcript string.

    Speakers are numbered in order of first appearance ("Speaker1",
    "Speaker2", ...). Utterances whose speaker id contains "TRANSCRIPT_NOTE"
    are left out. Every kept utterance is rendered as
    "Speaker<k> : <text>" followed by a blank line.

    Args:
        corpus: object providing ``get_conversation(id)`` and
            ``get_utterance(id)``.
        convo_id: id of the conversation to render.

    Returns:
        The transcript as a single string ("" when nothing is kept).
    """
    speaker_numbers = {}
    rendered = []
    for utterance_id in corpus.get_conversation(convo_id).get_utterance_ids():
        utterance = corpus.get_utterance(utterance_id)
        speaker = utterance.speaker.id
        if "TRANSCRIPT_NOTE" in speaker:
            continue
        # setdefault hands out the next number only on first sight of a speaker.
        number = speaker_numbers.setdefault(speaker, len(speaker_numbers) + 1)
        rendered.append("Speaker" + str(number) + " : " + utterance.text + "\n\n")
    return "".join(rendered)

def count_real_utterance_num(convo_id):
    """Count the utterances of a conversation that are not transcript notes.

    The conversation is looked up in the module-level ``corpus``. An utterance
    is ignored when its speaker id contains "TRANSCRIPT_NOTE".

    Args:
        convo_id: id of the conversation to inspect.

    Returns:
        Number of remaining utterances as an int.
    """
    utterance_ids = corpus.get_conversation(convo_id).get_utterance_ids()
    return sum(
        1
        for utterance_id in utterance_ids
        if "TRANSCRIPT_NOTE" not in corpus.get_utterance(utterance_id).speaker.id
    )

# Generating SCDs and SoPs

In [None]:
# Sample distinct conversations that contain enough real utterances.
# The seed and the sequence of random.choice draws are the same as before;
# the only difference is that a conversation drawn a second time is skipped,
# so `convo_ids` never contains duplicates (duplicates would later produce
# self-pairs and colliding keys in the pairwise score dictionary).
random.seed(4300)
target_count = 100
min_real_utterances = 4

# Fetch the id list once instead of on every loop iteration.
all_convo_ids = corpus.get_conversation_ids()
distinct_total = len(set(all_convo_ids))

convo_ids = []
examined = set()
# Stop early once every conversation has been examined, so the loop cannot
# spin forever when fewer than `target_count` conversations qualify.
while len(convo_ids) < target_count and len(examined) < distinct_total:
    convo_id = random.choice(all_convo_ids)
    if convo_id in examined:
        continue
    examined.add(convo_id)
    if count_real_utterance_num(convo_id) >= min_real_utterances:
        convo_ids.append(convo_id)

In [None]:
# Generate a summary (SCD) and its bullet-point form (SoP) for every sampled
# conversation; both dictionaries are keyed by conversation id.
time_analysis_scd = {}
bulletpoints = {}
for convo_id in tqdm(convo_ids, desc="Generating SCDs and SoPs for conversations"):
    transcript = format_friends_transcript_from_convokit(corpus, convo_id)
    # NOTE(review): the prompt is formatted here AND was passed to SCDWriter as
    # custom_scd_prompt; if get_scd_and_sop() formats its own template with the
    # argument, the prompt ends up nested -- confirm against the SCDWriter API.
    scd, sop = scd_writer.get_scd_and_sop(friends_summary_prompt.format(transcript=transcript))
    time_analysis_scd[convo_id] = scd
    bulletpoints[convo_id] = sop

Generating SCDs for conversations: 100%|██████| 100/100 [01:47<00:00,  1.07s/it]


In [None]:
import os

# Persist the generated summaries and bullet points inside DATA_PATH.
# os.path.join inserts the separator that plain concatenation omitted
# ("./data" + "friends_100_scd.json" -> "./datafriends_100_scd.json").
os.makedirs(DATA_PATH, exist_ok=True)

with open(os.path.join(DATA_PATH, "friends_100_scd.json"), 'w') as out_file:
    json.dump(time_analysis_scd, out_file, indent=4)

with open(os.path.join(DATA_PATH, "friends_100_sop.json"), 'w') as out_file:
    json.dump(bulletpoints, out_file, indent=4)

# Calculate scores

In [None]:
# Score every unordered pair among the first `num` sampled conversations.
num = 50
selected_ids = convo_ids[:num]
all_combos = list(combinations(selected_ids, 2))

# Format each transcript once instead of once per pair. The formatter already
# returns a single string, so it is used as-is: joining it with "\n\n" would
# insert a blank line between every character.
transcripts = {
    convo_id: format_friends_transcript_from_convokit(corpus, convo_id)
    for convo_id in selected_ids
}

# Maps "<id1>_<id2>" -> {"result": raw bidirectional result, "score": score}.
convo_scores = {}
for convo_id1, convo_id2 in tqdm(all_combos, desc="Calculating pairs similarity"):
    pair_key = convo_id1 + "_" + convo_id2
    # Skip pairs that were already scored in either order.
    if pair_key in convo_scores or convo_id2 + "_" + convo_id1 in convo_scores:
        continue

    sop1 = bulletpoints[convo_id1]
    sop2 = bulletpoints[convo_id2]

    result = condyns.compute_bidirectional_similarity(
        transcripts[convo_id1], transcripts[convo_id2], sop1, sop2
    )
    score = condyns.compute_score_from_results(result)

    # Create the entry in one step; assigning into convo_scores[pair_key][...]
    # before the entry exists raises KeyError.
    convo_scores[pair_key] = {"result": result, "score": score}

Calculating pairs similarity: 100%|█████████| 1225/1225 [11:05<00:00,  1.84it/s]


In [None]:
import os

# Persist the pairwise scores inside DATA_PATH. os.path.join inserts the
# separator that plain concatenation omitted ("./data" + "friends_50_scores.json"
# -> "./datafriends_50_scores.json").
os.makedirs(DATA_PATH, exist_ok=True)
with open(os.path.join(DATA_PATH, "friends_50_scores.json"), 'w') as out_file:
    json.dump(convo_scores, out_file, indent=4)

In [None]:
def get_similarity(convo_id1, convo_id2):
    """Look up the stored score for a pair of conversations.

    Scores live in the module-level ``convo_scores`` under "<id1>_<id2>";
    the pair is tried in the given order first and then reversed.

    Returns:
        The stored "score" value, or None (after printing a notice) when the
        pair is not present in either order.
    """
    pair_key = convo_id1 + "_" + convo_id2
    if pair_key not in convo_scores:
        pair_key = convo_id2 + "_" + convo_id1
        if pair_key not in convo_scores:
            print("Did not find the score")
            return None
    return convo_scores[pair_key]["score"]

### Clustering

In [None]:
# defaultdict is used below but was never imported in this notebook.
from collections import defaultdict

# Step 1: Create the distance matrix
n = len(convo_ids[:num])
distance_matrix = np.zeros((n, n))

# Fill the distance matrix (upper triangle, mirrored; the diagonal stays 0)
for i in range(n):
    for j in range(i + 1, n):
        convo1, convo2 = convo_ids[i], convo_ids[j]
        # np.sum collapses the stored score into one number.
        # NOTE(review): "2 - similarity" presumes the summed score lies in
        # [0, 2] (e.g. two directional scores in [0, 1]) -- confirm.
        similarity = np.sum(get_similarity(convo1, convo2))
        distance = 2 - similarity  # Convert similarity to distance
        distance_matrix[i, j] = distance_matrix[j, i] = distance  # Symmetric matrix

# Convert to condensed format for linkage function
condensed_dist_matrix = squareform(distance_matrix)

# Step 2: Perform hierarchical clustering
# NOTE(review): SciPy documents Ward linkage as well-defined only for Euclidean
# distances; these similarity-derived distances may not satisfy that -- confirm
# the method choice.
linkage_matrix = linkage(condensed_dist_matrix, method="ward")  # Ward's method minimizes variance

# Cut the tree into at most two flat clusters.
top_level_clusters = fcluster(linkage_matrix, t=2, criterion='maxclust')

# Group conversation indices (positions in convo_ids[:num]) by cluster label.
clusters = defaultdict(list)
for idx, label in enumerate(top_level_clusters):
    clusters[label].append(idx)

In [None]:
# from https://github.com/jmhessel/FightingWords/blob/master/fighting_words_py3.py
exclude = set(string.punctuation)

def basic_sanitize(in_string):
    '''Roughly normalise text: drop ASCII punctuation, lowercase, and collapse
    every run of whitespace into a single space (leading/trailing removed).'''
    without_punctuation = ''.join(ch for ch in in_string if ch not in exclude)
    return ' '.join(without_punctuation.lower().split())

def bayes_compare_language(l1, l2, ngram = 1, prior=.01, cv = None):
    '''
    Compare term usage between two samples of text and score every vocabulary
    item with a z-score of the difference in (prior-smoothed) log-odds.

    Arguments:
    - l1, l2; a list of strings from each language sample
    - ngram; an int describing up to what n gram you want to consider (1 is unigrams,
    2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    - prior; either a float describing a uniform prior, or a vector describing a prior
    over vocabulary items. If you're using a predefined vocabulary, make sure to specify that
    when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.

    Returns:
    - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple.
      The list is sorted by ascending z-score; positive scores come from terms
      whose smoothed log-odds are higher in l1 than in l2, negative scores
      from the opposite.'''
    # A non-float prior needs a caller-supplied vectorizer, presumably so the
    # prior vector lines up with a known vocabulary.
    if cv is None and type(prior) is not float:
        print("If using a non-uniform prior:")
        print("Please also pass a count vectorizer with the vocabulary parameter set.")
        # NOTE(review): quit() ends the session instead of raising a regular
        # exception; in a notebook this presumably stops the kernel -- confirm
        # whether raising ValueError would be preferable.
        quit()
    # Normalise both samples the same way before counting.
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        # Default vectorizer: n-grams from 1 to `ngram`, min_df=2, max_df=0.9,
        # capped at 15000 features.
        cv = CV(decode_error = 'ignore', min_df=2, max_df=0.9, ngram_range=(1,ngram),
                binary = False,
                max_features = 15000)
    # Document-term counts over both samples, l1 documents first; densified.
    counts_mat = cv.fit_transform(l1+l2).toarray()
    # Now sum over languages...
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    # Expand a scalar prior into one value per vocabulary item.
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    # Row 0: per-term totals over the l1 documents; row 1: over the l2 documents.
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis = 0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis = 0)
    # a0: total prior mass; n1/n2: total term counts in each sample.
    a0 = np.sum(priors)
    n1 = 1.*np.sum(count_matrix[0,:])
    n2 = 1.*np.sum(count_matrix[1,:])
    print("Comparing language...")
    for i in range(vocab_size):
        #compute delta
        # term1/term2: log of (smoothed count of term i) over (smoothed count
        # of everything else) in sample 1 / sample 2.
        term1 = np.log((count_matrix[0,i] + priors[i])/(n1 + a0 - count_matrix[0,i] - priors[i]))
        term2 = np.log((count_matrix[1,i] + priors[i])/(n2 + a0 - count_matrix[1,i] - priors[i]))
        delta = term1 - term2
        #compute variance on delta
        var = 1./(count_matrix[0,i] + priors[i]) + 1./(count_matrix[1,i] + priors[i])
        #store final score
        z_scores[i] = delta/np.sqrt(var)
    # Invert the vocabulary mapping (term -> column) to label each score.
    index_to_term = {v:k for k,v in cv.vocabulary_.items()}
    # Ascending order: most l2-associated terms first, most l1-associated last.
    sorted_indices = np.argsort(z_scores)
    return_list = []
    for i in sorted_indices:
        return_list.append((index_to_term[i], z_scores[i]))
    return return_list

In [None]:
def _collect_matched_bullets(cluster, similarity_result, bulletpoints_by_convo, threshold=0.5):
    """Collect the bullet points that matched within one cluster.

    For every pair of conversations in ``cluster`` the stored bidirectional
    result is inspected, and each bullet point whose match score exceeds
    ``threshold`` is appended to the returned list.
    """
    matched = []
    for convo_id1, convo_id2 in combinations(cluster, 2):
        forward_key = f"{convo_id1}_{convo_id2}"
        if forward_key in similarity_result:
            key, owners = forward_key, (convo_id1, convo_id2)
        else:
            # The pair was stored in the opposite order. Direction 0 is taken
            # to index the bullet points of the conversation named first in
            # the key (as in the forward case), so the owners swap as well.
            key, owners = f"{convo_id2}_{convo_id1}", (convo_id2, convo_id1)
        for direction, result in enumerate(similarity_result[key]["result"]):
            owner = owners[0] if direction == 0 else owners[1]
            for index, match in result.items():
                if match['score'] > threshold:
                    try:
                        matched.append(bulletpoints_by_convo[owner][index])
                    except (KeyError, IndexError, TypeError):
                        # Best effort: the result refers to a bullet point
                        # that is not available for this conversation.
                        continue
    return matched


def get_fighting_words_matching_bullets(cluster1, cluster2, similarity_result=None, summaries_and_bullets=None):
    """Contrast the matched bullet points of two clusters with Fighting Words.

    Args:
        cluster1, cluster2: lists of conversation ids, one list per cluster.
        similarity_result: mapping "<id1>_<id2>" -> {"result": ..., "score": ...}.
            Defaults to the module-level ``convo_scores``, resolved at call time.
        summaries_and_bullets: mapping with a 'bulletpoints' entry that maps
            conversation id -> bullet points. Defaults to
            ``{'bulletpoints': bulletpoints}`` built from the module-level
            ``bulletpoints`` (the previous default referred to a name that is
            not defined in this notebook).

    Returns:
        (matched_cluster1, matched_cluster2): the bullet points with a match
        score above 0.5 inside each cluster. The 15 most distinctive n-grams
        (up to trigrams) per cluster are printed as a side effect.

    Raises:
        KeyError: if a pair of conversations has no stored result in either order.
    """
    # Resolve defaults at call time so the current globals are used rather
    # than whatever they were bound to when the function was defined.
    if similarity_result is None:
        similarity_result = convo_scores
    if summaries_and_bullets is None:
        summaries_and_bullets = {'bulletpoints': bulletpoints}
    bulletpoints_by_convo = summaries_and_bullets['bulletpoints']

    matched_cluster1 = _collect_matched_bullets(cluster1, similarity_result, bulletpoints_by_convo)
    matched_cluster2 = _collect_matched_bullets(cluster2, similarity_result, bulletpoints_by_convo)

    # z_scores is sorted ascending: the tail is most typical of cluster 1,
    # the head most typical of cluster 2.
    z_scores = bayes_compare_language(matched_cluster1, matched_cluster2, ngram = 3)
    top_k = 15
    top_k_class1 = list(reversed([(x[0], round(x[1],2)) for x in z_scores[-top_k:]]))
    top_k_class2 = [(x[0], round(x[1],2)) for x in z_scores[:top_k]]
    print("Fighting Words Comments between:")
    print("Cluster1: ", top_k_class1)
    print("Cluster2: ", top_k_class2)
    return matched_cluster1, matched_cluster2

In [None]:
# Translate the member indices of cluster labels 1 and 2 back into
# conversation ids.
cluster1_ids, cluster2_ids = (
    [convo_ids[member] for member in clusters[label]] for label in (1, 2)
)

In [None]:
# Print the most distinctive n-grams of each cluster and keep the matched
# bullet points that were compared.
cluster1_bulletpoints, cluster2_bulletpoints = get_fighting_words_matching_bullets(cluster1_ids, cluster2_ids)

In [None]:
# Display the matched bullet points collected for cluster 1.
cluster1_bulletpoints

In [None]:
# Display the matched bullet points collected for cluster 2.
cluster2_bulletpoints