In [10]:
%pip install datasets convokit ipykernel pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
from convokit import Corpus, download
import pandas as pd
import re
from collections import Counter


In [3]:
corpus = Corpus(filename=download("switchboard-corpus"))

Downloading switchboard-corpus to /Users/byc324/.convokit/downloads/switchboard-corpus
Downloading switchboard-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/switchboard-corpus/switchboard-corpus.zip (5.8MB)... Done


In [4]:
corpus.print_summary_stats()

Number of Speakers: 440
Number of Utterances: 122646
Number of Conversations: 1155


In [5]:
convo = corpus.random_conversation()
print(convo)

Conversation('id': '3196-0', 'utterances': ['3196-0', '3196-1', '3196-2', '3196-3', '3196-4', '3196-5', '3196-6', '3196-7', '3196-8', '3196-9', '3196-10', '3196-11', '3196-12', '3196-13', '3196-14', '3196-15', '3196-16', '3196-17', '3196-18', '3196-19', '3196-20', '3196-21', '3196-22', '3196-23', '3196-24', '3196-25', '3196-26', '3196-27', '3196-28', '3196-29', '3196-30', '3196-31', '3196-32', '3196-33', '3196-34', '3196-35', '3196-36', '3196-37', '3196-38', '3196-39', '3196-40', '3196-41', '3196-42', '3196-43', '3196-44', '3196-45', '3196-46', '3196-47', '3196-48', '3196-49', '3196-50', '3196-51', '3196-52', '3196-53', '3196-54'], 'meta': {'filename': './swda/sw06utt/sw_0662_3196.utt.csv', 'talk_day': '1991-10-3', 'topic_description': 'CREDIT CARD USE', 'length': 5, 'prompt': 'PLEASE DISCUSS CREDIT CARDS.  FIND OUT HOW THE OTHER CALLER MAKES USE OF CREDIT CARDS.  HOW DO THEY COMPARE TO YOUR OWN?', 'from_caller': '1361', 'to_caller': '1366'})


In [None]:
convo = corpus.random_conversation()
print(convo)

In [6]:
speaker = corpus.random_speaker()
print(speaker)

Speaker(id: 1589, vectors: [], meta: {'sex': 'FEMALE', 'education': 2, 'birth_year': 1968, 'dialect_area': 'NORTHERN'})


In [21]:
utt = corpus.random_utterance()
print(utt)

Utterance(id: '3340-77', conversation_id: 3340-0, reply-to: 3340-76, speaker: Speaker(id: 1124, vectors: [], meta: {'sex': 'MALE', 'education': 3, 'birth_year': 1960, 'dialect_area': 'NORTH MIDLAND'}), timestamp: None, text: 'Another thing that we saw here recently.   [ One of, + some of ] our high school kids were out goofing around in a park area that [ a, + a ] train track ran through -- /', vectors: [], meta: {'tag': [['Another thing that we saw here recently.   [ One of, + some of ] our high school kids were out goofing around in a park area that [ a, + a ] train track ran through -- /', 'sd']]})


In [25]:
# -------- tag helpers --------
def normalize_code(code: str) -> str:
    # e.g., 'b^m' -> 'b', 'bh^r' -> 'bh', '^h' -> '^h'
    m = re.match(r"^\^?[a-z]+", str(code).lower().strip())
    return m.group(0) if m else str(code).lower().strip()

def get_all_tags(utt):
    """
    Extract ALL dialog-act codes for an utterance (handles list-of-lists).
    Returns a de-duplicated list of normalized codes, e.g. ['b','sd'].
    """
    raw = utt.meta.get("tag") or ""
    tags = []
    if isinstance(raw, list):
        for item in raw:
            if isinstance(item, list) and len(item) > 1:
                tags.append(normalize_code(item[-1]))
            elif isinstance(item, str):
                tags.append(normalize_code(item))
    else:
        tags.append(normalize_code(raw))
    # dedupe, keep order
    seen, out = set(), []
    for t in tags:
        if t not in seen:
            seen.add(t); out.append(t)
    return out

In [26]:
BACKCHANNEL = {"b","bh"}
CLOSING = {"fc"}


In [27]:
# -------- collect backchannel utterances as a DataFrame (clean columns) --------
bc_rows = []
for conv in corpus.iter_conversations():
    for utt in conv.iter_utterances():
        tags = get_all_tags(utt)
        if any(t in BACKCHANNEL for t in tags):
            bc_rows.append({
                "conv_id": conv.id,
                "utt_id": utt.id,
                "speaker": utt.speaker.id,
                "tags": tags,
                "text": utt.text
            })

bc_df = pd.DataFrame(bc_rows)
print(f"Total backchannel utterances: {len(bc_df)}")
print(bc_df.head(3))  # nicer than printing raw tuples


Total backchannel utterances: 37265
  conv_id   utt_id speaker     tags                text
0  4325-0   4325-7    1632  [b, sd]       Okay  /[ I, +
1  4325-0  4325-14    1519      [b]           Uh-huh. /
2  4325-0  4325-17    1632     [bh]  {F Oh, } really? /


In [28]:
# -------- conversation-level filtering --------
# 1) conversations that contain b/bh
conv_has_bc = sorted(bc_df["conv_id"].unique().tolist())

# 2) conversations that also contain fc
conv_has_fc = set()
for conv in corpus.iter_conversations():
    tags = []
    for utt in conv.iter_utterances():
        tags.extend(get_all_tags(utt))
    if any(t in CLOSING for t in tags):
        conv_has_fc.add(conv.id)

conv_bc_and_fc = [cid for cid in conv_has_bc if cid in conv_has_fc]

print("\n[Counts]")
print("Conversations with b/bh :", len(conv_has_bc))
print("Conversations with b/bh AND fc :", len(conv_bc_and_fc))


[Counts]
Conversations with b/bh : 1155
Conversations with b/bh AND fc : 316


In [30]:
# -------- pretty printing for assignment E --------
def print_tail_window(conv_id, window_n=12):
    conv = corpus.get_conversation(conv_id)
    seq = [(u.speaker.id, get_all_tags(u), u.text) for u in conv.iter_utterances()]
    tags_only = [t for _, tg, _ in seq for t in [tg[-1]] if tg]  # last code per utt (for locating fc)

    # find last fc if present
    last_fc_idx = None
    for i, (_, tg_list, _) in enumerate(seq):
        if "fc" in tg_list:
            last_fc_idx = i
    if last_fc_idx is not None:
        start = max(0, last_fc_idx - window_n + 1)
        window = list(enumerate(seq[start:last_fc_idx+1], start=start))
        header = f"[Conversation {conv_id}] Tail window ending at <fc> (size={len(window)})"
    else:
        window = list(enumerate(seq[-window_n:], start=len(seq)-window_n))
        header = f"[Conversation {conv_id}] Tail window (no <fc>) (size={len(window)})"

    print("\n" + "="*len(header))
    print(header)
    print("="*len(header))
    for idx, (spk, tg_list, text) in window:
        marks = []
        if any(t in BACKCHANNEL for t in tg_list): marks.append("BACKCHANNEL")
        if "fc" in tg_list: marks.append("CLOSING")
        flag = ("  <-- " + ", ".join(marks)) if marks else ""
        print(f"{idx:>4d}  [{spk}]  <{','.join(tg_list)}>  {text}{flag}")


In [40]:
def tail_bc_count(conv, window_n=12):
    """Count b/bh in the tail window ending at the last fc (or last N if no fc)."""
    seq = [(u.speaker.id, get_all_tags(u), u.text) for u in conv.iter_utterances()]
    last_fc_idx = None
    for i, (_, tg, _) in enumerate(seq):
        if "fc" in tg: last_fc_idx = i
    if last_fc_idx is not None:
        start = max(0, last_fc_idx - window_n + 1)
        window = seq[start:last_fc_idx+1]
    else:
        window = seq[-window_n:]
    return sum(any(t in BACKCHANNEL for t in tg) for _, tg, _ in window)

def pick_examples_by_tail_bc(window_n=12, top_k=2, low_k=2):
    # build (conv_id, tail_bc, has_fc) list only for conversations that have fc
    stats = []
    for cid in corpus.conversations:
        conv = corpus.get_conversation(cid)
        # 只考虑有 fc 的对话，和你的任务更相关
        has_fc = any("fc" in get_all_tags(u) for u in conv.iter_utterances())
        if not has_fc: continue
        stats.append((cid, tail_bc_count(conv, window_n=window_n)))
    # sort by tail backchannels
    stats.sort(key=lambda x: x[1], reverse=True)
    highs = stats[:top_k]
    lows  = list(reversed(stats))[:low_k]  # 末尾的就是尾段最少的
    # 去重并打印
    seen = set()
    for label, bunch in [("High-bc near closing", highs), ("Low-bc near closing", lows)]:
        for cid, _ in bunch:
            if cid in seen: continue
            seen.add(cid)
            print(f"\n=== {label}: conv_id={cid} ===")
            print_tail_window(cid, window_n=window_n)


In [41]:
pick_examples_by_tail_bc(window_n=12, top_k=2, low_k=2)


=== High-bc near closing: conv_id=2465-0 ===

[Conversation 2465-0] Tail window ending at <fc> (size=12)
 144  [1107]  <ba>  Man that sounds really nice. /
 145  [1090]  <b,sd>  Yeah.  /It is real nice, {D you know, }  /I just deal with them on the telephone,  /half the people don't even know who they're talking to. /  <-- BACKCHANNEL
 146  [1107]  <b>  Uh-huh. /  <-- BACKCHANNEL
 147  [1090]  <sd,h>  {C But } I, {D you know, } worked, - /I know most of them, {D well, }  /I can't say that,  /a lot of people changed around there  /{C but, }  most people know who I am,  /{C but } I don't know who they are, {D you know. } /
 148  [1107]  <b>  Uh-huh. /  <-- BACKCHANNEL
 149  [1090]  <sd>  <Laughter> A lot of them just know me by name and phone voice. /
 150  [1107]  <b,sd>  Right.  /Yeah,  /I've got a lot of folks that I've worked with like that as well. /  <-- BACKCHANNEL
 151  [1090]  <b,sd,fc,sv>  Yeah,  /you just know them by the telephone,  /{C so. }    {C But, }  {D well, } it's be

In [34]:
pick_three_examples(window_n=12)


=== High-bc near closing: conv_id=2768-0 ===

[Conversation 2768-0] Tail window ending at <fc> (size=12)
 231  [1074]  <sd>  It would be a last resort. /
 232  [1098]  <sd>  {C And } I would hate it if anyone put me in one. /
 233  [1074]  <ba,sd,nn,qh>  {F Oh, } I know.  /I'm always afraid like, {F oh, } - /no,  /what if I lose my mind. /
 234  [1098]  <b>  Yeah. /  <-- BACKCHANNEL
 235  [1074]  <qy>  Am I going to end up in a place like that <laughter>? /
 236  [1098]  <ad>  <Laughter>  Somebody take care of me. /
 237  [1074]  <b>  Yeah,  /that's right. /  <-- BACKCHANNEL
 238  [1098]  <b>  Okay. /  <-- BACKCHANNEL
 239  [1074]  <%>  {D Well. } -/
 240  [1098]  <b,fc>  Okay,  /{D well, } nice to talk to you. /  <-- BACKCHANNEL, CLOSING
 241  [1074]  <fc>  Yeah.  /You, too,  (( Tonya )) .  /Thanks for calling. /  <-- CLOSING
 242  [1098]  <fc>  Uh-huh.  /Bye-bye. /  <-- CLOSING

=== Low-bc near closing: conv_id=2450-0 ===

[Conversation 2450-0] Tail window ending at <fc> (size=12)
 

In [33]:
# Optional: save conversation ID lists
pd.Series(conv_has_bc, name="conv_id").to_csv("conversations_with_backchannel.csv", index=False)
pd.Series(conv_bc_and_fc, name="conv_id").to_csv("conversations_with_backchannel_and_fc.csv", index=False)
print("\nSaved:")
print(" - conversations_with_backchannel.csv")
print(" - conversations_with_backchannel_and_fc.csv")


Saved:
 - conversations_with_backchannel.csv
 - conversations_with_backchannel_and_fc.csv


In [35]:
import random

def pick_three_examples_random(conv_candidates, window_n=12, k=3):
    # conv_candidates = conversations that contain both bc (b/bh) and closing (fc)
    selected = random.sample(conv_candidates, k)
    examples = []
    for conv in selected:
        tail_utts = list(conv.iter_utterances())[-window_n:]
        formatted = []
        for utt in tail_utts:
            tags = [t for t in get_all_tags(utt)]
            formatted.append((utt.id, utt.speaker.id, tags, utt.text))
        examples.append((conv.id, formatted))
    return examples

# 用法
new_examples = pick_three_examples_random(conv_candidates, window_n=12, k=3)
for conv_id, utts in new_examples:
    print(f"=== Conversation {conv_id} ===")
    for utt in utts:
        print(utt)
    print()


NameError: name 'conv_candidates' is not defined