In [41]:
import spacy
import numpy as np
from collections import defaultdict

In [42]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [48]:
# loading spacy's small model for pos tagging
nlp = spacy.load("en_core_web_sm")

# loading spacy's large model for senetence embedding
nlp_lg = spacy.load("en_core_web_lg")

In [44]:
pos_tags = nlp.get_pipe("tagger").labels
for tag in pos_tags:
    print(f"{tag}: {spacy.explain(tag)}")

$: symbol, currency
'': closing quotation mark
,: punctuation mark, comma
-LRB-: left round bracket
-RRB-: right round bracket
.: punctuation mark, sentence closer
:: punctuation mark, colon or ellipsis
ADD: email
AFX: affix
CC: conjunction, coordinating
CD: cardinal number
DT: determiner
EX: existential there
FW: foreign word
HYPH: punctuation mark, hyphen
IN: conjunction, subordinating or preposition
JJ: adjective (English), other noun-modifier (Chinese)
JJR: adjective, comparative
JJS: adjective, superlative
LS: list item marker
MD: verb, modal auxiliary
NFP: superfluous punctuation
NN: noun, singular or mass
NNP: noun, proper singular
NNPS: noun, proper plural
NNS: noun, plural
PDT: predeterminer
POS: possessive ending
PRP: pronoun, personal
PRP$: pronoun, possessive
RB: adverb
RBR: adverb, comparative
RBS: adverb, superlative
RP: adverb, particle
SYM: symbol
TO: infinitival "to"
UH: interjection
VB: verb, base form
VBD: verb, past tense
VBG: verb, gerund or present participle
VBN:

### functions

In [64]:
def extract_pos_tags(sentence):
    doc = nlp(sentence)
    return [token.pos_ for token in doc]

In [65]:
def compute_sentence_embedding(sentence):
    doc = nlp_lg(sentence)
    return doc.vector  # Returns a 300-dimensional vector

In [66]:
def compute_pos_distribution(pos_tags):
    total_tags = len(pos_tags)
    pos_freq = defaultdict(int)
    for tag in pos_tags:
        pos_freq[tag] += 1
    return {tag: count / total_tags for tag, count in pos_freq.items()}

In [67]:
def kl_divergence(p, q):
    p = np.clip(p, 1e-10, 1)  # Clip to avoid zero values
    q = np.clip(q, 1e-10, 1)  # Clip to avoid zero values
    return np.sum(p * np.log(p / q))

In [68]:
def process_transcript(transcript):
    pos_a = []
    pos_b = []
    embeddings_a = []
    embeddings_b = []
    
    for turn in transcript:
        speaker, sentence = turn
        pos_tags = extract_pos_tags(sentence)
        embedding = compute_sentence_embedding(sentence)
        
        if speaker == "A":
            pos_a.extend(pos_tags)
            embeddings_a.append(embedding)
        elif speaker == "B":
            pos_b.extend(pos_tags)
            embeddings_b.append(embedding)
    
    distribution_a = compute_pos_distribution(pos_a)
    distribution_b = compute_pos_distribution(pos_b)
    
    avg_embedding_a = np.mean(embeddings_a, axis=0)
    avg_embedding_b = np.mean(embeddings_b, axis=0)
    
    return distribution_a, distribution_b, avg_embedding_a, avg_embedding_b


In [69]:
def compute_jsd_pos_distributions(distribution_a, distribution_b):
    
    # Combine all unique keys from both distributions
    all_tags = set(distribution_a.keys()).union(set(distribution_b.keys()))
    
    # Create aligned probability vectors
    vec_a = np.array([distribution_a.get(tag, 0) for tag in all_tags])
    vec_b = np.array([distribution_b.get(tag, 0) for tag in all_tags])
    
    # Normalize the vectors to ensure they are probability distributions
    vec_a /= vec_a.sum()
    vec_b /= vec_b.sum()
    
    # Compute the average distribution M
    M = (vec_a + vec_b) / 2
    
    # Compute the Jensen-Shannon Divergence
    jsd = np.sqrt(0.5 * kl_divergence(vec_a, M) + 0.5 * kl_divergence(vec_b, M))
    
    return jsd

In [70]:
def compute_jsd_sentence_embeddings(embedding_a, embedding_b):
    
    # Normalize embeddings
    embedding_a = embedding_a / np.linalg.norm(embedding_a)
    embedding_b = embedding_b / np.linalg.norm(embedding_b)
    
    # Add epsilon to avoid zero values
    epsilon = 1e-10
    embedding_a = np.where(embedding_a == 0, epsilon, embedding_a)
    embedding_b = np.where(embedding_b == 0, epsilon, embedding_b)
    
    # Compute the average embedding M
    M = (embedding_a + embedding_b) / 2
    
    # Compute the Jensen-Shannon Divergence
    jsd = np.sqrt(0.5 * kl_divergence(embedding_a, M) + 0.5 * kl_divergence(embedding_b, M))
    
    return jsd


### within and between synchrony (to be experimented)

In [32]:
def validate_pseudosynchrony(transcript):

    # Within-Conversation Test (shuffle Speaker B's responses)
    shuffled_transcript = []
    speaker_b_sentences = [turn[1] for turn in transcript if turn[0] == "B"]
    np.random.shuffle(speaker_b_sentences)
    shuffled_idx = 0
    
    for turn in transcript:
        speaker, sentence = turn
        if speaker == "B":
            shuffled_transcript.append(("B", speaker_b_sentences[shuffled_idx]))
            shuffled_idx += 1
        else:
            shuffled_transcript.append((speaker, sentence))
    
    jsd_shuffled = compute_linguistic_synchrony(shuffled_transcript)
    
    # Between-Conversations Test (pair Speaker A with an unrelated Speaker C)
    # For simplicity, assume Speaker C's sentences are randomly generated or from another transcript
    unrelated_transcript = [
        ("A", "Hello, how are you?"),
        ("C", "The weather is nice today."),
        ("A", "Iâ€™m doing great!"),
        ("C", "Do you like coffee?"),
    ]
    
    jsd_unrelated = compute_linguistic_synchrony(unrelated_transcript)
    
    return jsd_shuffled, jsd_unrelated

### demo

In [61]:
transcript = [
    ("A", "Hello, how are you? I hope you're doing well."),
    ("B", "I'm doing great, thanks for asking! How about you?"),
    ("A", "I'm good too. Just working on some code."),
    ("B", "That sounds interesting! What are you working on?")
]

In [72]:
distribution_a, distribution_b, avg_embedding_a, avg_embedding_b = process_transcript(transcript)

print("POS Distribution for Speaker A:", distribution_a)
print("POS Distribution for Speaker B:", distribution_b)
print("Average Sentence Embedding for Speaker A:", avg_embedding_a)
print("Average Sentence Embedding for Speaker B:", avg_embedding_b)

POS Distribution for Speaker A: {'INTJ': 0.041666666666666664, 'PUNCT': 0.20833333333333334, 'SCONJ': 0.041666666666666664, 'AUX': 0.125, 'PRON': 0.16666666666666666, 'VERB': 0.125, 'ADV': 0.125, 'ADJ': 0.041666666666666664, 'ADP': 0.041666666666666664, 'DET': 0.041666666666666664, 'NOUN': 0.041666666666666664}
POS Distribution for Speaker B: {'PRON': 0.21739130434782608, 'AUX': 0.08695652173913043, 'VERB': 0.17391304347826086, 'ADJ': 0.08695652173913043, 'PUNCT': 0.21739130434782608, 'NOUN': 0.043478260869565216, 'ADP': 0.13043478260869565, 'SCONJ': 0.043478260869565216}
Average Sentence Embedding for Speaker A: [-5.69377318e-02  2.49313548e-01 -3.16121608e-01 -1.59784883e-01
  8.01380277e-02  8.70480984e-02  7.68873319e-02 -2.61855364e-01
 -3.09461672e-02  2.17093873e+00 -2.89643228e-01  9.02478769e-02
  8.43014270e-02 -9.96730402e-02 -1.71613261e-01 -5.42834848e-02
 -7.56438449e-02  1.21766973e+00 -2.25790292e-01  3.35530681e-03
  4.65768240e-02  5.09702563e-02 -8.64678994e-03 -4.33

In [73]:
# Process the transcript to get distributions and embeddings
distribution_a, distribution_b, avg_embedding_a, avg_embedding_b = process_transcript(transcript)

# Compute JSD for POS distributions
jsd_pos = compute_jsd_pos_distributions(distribution_a, distribution_b)
print("JSD between POS distributions:", jsd_pos)

# Compute JSD for sentence embeddings
jsd_embeddings = compute_jsd_sentence_embeddings(avg_embedding_a, avg_embedding_b)
print("JSD between sentence embeddings:", jsd_embeddings)

JSD between POS distributions: 0.30621430496699265
JSD between sentence embeddings: 0.988402890646616
