import

In [19]:
%pip install datasets convokit ipykernel


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [20]:
import pandas as pd
import os, re, json, shutil, datetime, zipfile
from pathlib import Path

DATA_CSV = "./QAEvasion.csv"
OUT_DIR = "./QuestionEvasion-convokit"
ZIP_BASE = "./QuestionEvasion-convokit"

# Always start from an empty output directory so files from a previous run
# cannot end up in the exported corpus.
if Path(OUT_DIR).exists():
    shutil.rmtree(OUT_DIR)
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)


1) read the CSV

In [55]:
# Load the raw QA-evasion table; the python engine with an explicit quote
# character is used for the quoted, comma-separated fields.
df = pd.read_csv(DATA_CSV, sep=",", quotechar='"', engine="python")
print("rows:", len(df))
df.head(2)


rows: 3448


Unnamed: 0,title,date,president,url,question_order,interview_question,interview_answer,gpt3.5_summary,gpt3.5_prediction,question,label,annotator_id,inaudible,multiple_questions,affirmative_questions,index
0,"The President's News Conference in Hanoi, Vietnam","September 10, 2023",Joseph R. Biden,https://www.presidency.ucsb.edu/documents/the-...,1,Q. Of the Biden administration. And accused th...,"Well, look, first of all, theI am sincere abou...",The question consists of 2 parts: \n1. How wou...,Question part: 1. How would you respond to the...,How would you respond to the accusation that t...,Explicit,85,False,False,False,0
1,"The President's News Conference in Hanoi, Vietnam","September 10, 2023",Joseph R. Biden,https://www.presidency.ucsb.edu/documents/the-...,1,Q. Of the Biden administration. And accused th...,"Well, look, first of all, theI am sincere abou...",The question consists of 2 parts: \n1. How wou...,Question part: 1. How would you respond to the...,Do you think President Xi is being sincere abo...,General,85,False,False,False,1


2) clean up

In [56]:
def norm_id(x):
    """Turn an arbitrary value into an identifier-safe string.

    Missing values map to "NA". Otherwise the value is stringified, trimmed,
    whitespace runs become a single underscore, and every character outside
    ``[A-Za-z0-9_]`` is dropped. An empty result also maps to "NA".
    """
    if pd.isna(x):
        return "NA"
    underscored = re.sub(r"\s+", "_", str(x).strip())
    cleaned = re.sub(r"[^A-Za-z0-9_]+", "", underscored)
    return cleaned if cleaned else "NA"

def to_boolish(v):
    """Return True when ``v`` stringifies (trimmed, case-insensitive) to a truthy token.

    Accepted tokens are "true", "t", "1", "y" and "yes"; anything else is False.
    """
    token = str(v).strip().lower()
    return token in ("true", "t", "1", "y", "yes")

# these three come in as TRUE/FALSE; a column absent from the CSV defaults to False
for flag_col in ("inaudible", "multiple_questions", "affirmative_questions"):
    df[flag_col] = df[flag_col].apply(to_boolish) if flag_col in df.columns else False

# per-row id used to build utterance ids: the CSV's "index" column when present,
# otherwise the DataFrame's own index
df["_row_id"] = (df["index"] if "index" in df.columns else df.index).astype(str)

# numeric-ish: unparseable or missing orders become -1
df["question_order"] = (
    pd.to_numeric(df["question_order"], errors="coerce").fillna(-1).astype(int)
    if "question_order" in df.columns
    else -1
)


3) build speakers (dict per spec: name to meta)

In [57]:
# One generic interviewer speaker plus one speaker per distinct president name.
interviewer_id = "interviewer_generic"
speakers = {interviewer_id: {"role": "interviewer", "name": "Interviewer"}}

speakers.update(
    (f"PRES_{norm_id(pres)}", {"role": "interviewee", "name": str(pres)})
    for pres in sorted(df["president"].dropna().unique())
)

print("speakers:",len(speakers))


speakers: 5


4) make conversations + utterances

In [58]:
def _clean_text(value, default=""):
    """Return ``value`` as a string, or ``default`` when it is missing (None/NaN).

    ``str(NaN)`` yields the literal ``"nan"`` and a raw NaN is written by
    ``json.dumps`` as the non-standard token ``NaN``. Routing every CSV cell
    through this helper keeps both out of the exported corpus.
    """
    missing = value is None or (pd.api.types.is_scalar(value) and pd.isna(value))
    return default if missing else str(value)


conversations = {}
utterance_rows = []

# Every CSV row becomes its own two-utterance conversation: question -> answer.
for _, row in df.iterrows():
    # A missing president becomes "Unknown" instead of a "PRES_nan" speaker.
    pres_name = _clean_text(row.get("president"), "Unknown")
    pres_id = f"PRES_{norm_id(pres_name)}"
    if pres_id not in speakers:
        speakers[pres_id] = {"role": "interviewee", "name": pres_name}

    # ids for this QA pair
    q_uid = f"Q_{row['_row_id']}"     # question utterance id
    a_uid = f"A_{row['_row_id']}"     # answer utterance id
    conv_id = q_uid                   # conversation_id == id of first utterance

    # normalize question_order
    q_order = int(row.get("question_order", -1))

    # Fields shared by the conversation meta and the question meta.
    title = _clean_text(row.get("title"))
    date = _clean_text(row.get("date"))
    url = _clean_text(row.get("url"))
    raw_question = _clean_text(row.get("interview_question"))
    raw_answer = _clean_text(row.get("interview_answer"))

    # (1) conversation meta
    conversations[conv_id] = {
        "title": title,
        "date": date,
        "url": url,
        "president": pres_name,

        "question_order": q_order
    }

    # (2) question text: prefer the extracted sub-question, fall back to the
    # full interview question when it is missing or blank
    q_text = _clean_text(row.get("question"))
    if not q_text.strip():
        q_text = raw_question

    # question utterance meta
    q_meta = {
        "type": "question",
        "question_order": q_order,
        "interview_question_raw": raw_question,
        "gpt35_summary": _clean_text(row.get("gpt3.5_summary")),
        "title": title,
        "date": date,
        "url": url
    }

    utterance_rows.append({
        "id": q_uid,
        "speaker": interviewer_id,
        "conversation_id": conv_id,
        "reply_to": None,
        "timestamp": None,
        "text": q_text,
        "meta": q_meta
    })

    # (3) answer side
    a_meta = {
        "type": "answer",
        "question_order": q_order,
        "label": _clean_text(row.get("label")),
        "annotator_id": _clean_text(row.get("annotator_id")),
        "inaudible": bool(row.get("inaudible", False)),
        "multiple_questions": bool(row.get("multiple_questions", False)),
        "affirmative_questions": bool(row.get("affirmative_questions", False)),
        "interview_answer_raw": raw_answer,
        "gpt35_prediction": _clean_text(row.get("gpt3.5_prediction"))
    }

    utterance_rows.append({
        "id": a_uid,
        "speaker": pres_id,
        "conversation_id": conv_id,
        "reply_to": q_uid,
        "timestamp": None,
        "text": raw_answer,
        "meta": a_meta
    })

print("made utterances:",len(utterance_rows), "conversations:", len(conversations))


made utterances: 6896 conversations: 3448


5) write files

In [59]:
# utterances.jsonl: one JSON object per line
utt_path = os.path.join(OUT_DIR, "utterances.jsonl")
with open(utt_path, "w", encoding="utf-8") as f:
    for r in utterance_rows:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# speakers.json
spk_path = os.path.join(OUT_DIR, "speakers.json")
with open(spk_path, "w", encoding="utf-8") as f:
    json.dump(speakers, f, indent=2, ensure_ascii=False)

# conversations.json
conv_path = os.path.join(OUT_DIR, "conversations.json")
with open(conv_path, "w", encoding="utf-8") as f:
    json.dump(conversations, f, indent=2, ensure_ascii=False)

# corpus.json
# datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC "now"
# and drop the tzinfo so the string keeps its "<naive ISO timestamp>Z" form.
created_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
corpus_meta = {
    "name": "Question Evasion (ConvoKit)",
    "source": "QAEvasion.csv",
    "created": created_utc.isoformat() + "Z"
}

corp_path = os.path.join(OUT_DIR, "corpus.json")

with open(corp_path, "w", encoding="utf-8") as f:
    json.dump(corpus_meta, f, indent=2, ensure_ascii=False)

#index.json
def infer_type_str(val):
    """Return the ``"<class '...'>"`` type string for ``val``.

    Recognises bool, int and float; every other value is reported as str.
    """
    # bool has to be tested before int because bool is a subclass of int.
    for py_type in (bool, int, float):
        if isinstance(val, py_type):
            return f"<class '{py_type.__name__}'>"
    return "<class 'str'>"

def _build_meta_index(meta_dicts):
    """Map every metadata key found in ``meta_dicts`` to a type string.

    The type is taken from the first non-missing value seen for the key
    (via ``infer_type_str``), so int/bool fields such as ``question_order``
    or ``inaudible`` are no longer declared as str. Keys that only ever hold
    missing values fall back to ``"<class 'str'>"``.
    """
    inferred = {}
    all_keys = set()
    for meta in meta_dicts:
        for key, value in (meta or {}).items():
            all_keys.add(key)
            # NaN is the only float that is not equal to itself.
            is_missing = value is None or (isinstance(value, float) and value != value)
            if key not in inferred and not is_missing:
                inferred[key] = infer_type_str(value)
    return {key: inferred.get(key, "<class 'str'>") for key in sorted(all_keys)}


utt_index = _build_meta_index(r.get("meta") for r in utterance_rows)
spk_index = _build_meta_index(speakers.values())
conv_index = _build_meta_index(conversations.values())

# Key sets kept under their original names for any cell that inspects them.
utt_meta_keys = set(utt_index)
spk_meta_keys = set(spk_index)
conv_meta_keys = set(conv_index)

index_obj = {
    "utterances-index": utt_index,
    "speakers-index": spk_index,
    "conversations-index": conv_index,
    "overall-index": {kk: infer_type_str(v) for kk, v in corpus_meta.items()},
    "version": 1
}
idx_path = os.path.join(OUT_DIR, "index.json")
with open(idx_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(index_obj, indent=2, ensure_ascii=False))

# Quick size check of everything that was written.
for p in [utt_path, spk_path, conv_path, corp_path, idx_path]:
    print(Path(p).name, "bytes=", Path(p).stat().st_size)


utterances.jsonl bytes= 22483052
speakers.json bytes= 436
conversations.json bytes= 989337
corpus.json bytes= 116
index.json bytes= 954


6) print stats

In [60]:
# Corpus-level sizes.
n_convos, n_utts, n_speaks = len(conversations), len(utterance_rows), len(speakers)

# Fine-grained label distribution and the number of rows with each flag set.
label_counts = df["label"].astype(str).str.strip().value_counts().sort_index()
inaudible_true, multi_true, affirm_true = (
    int(df[flag].sum())
    for flag in ("inaudible", "multiple_questions", "affirmative_questions")
)

print("==== Basic Stats (Question Evasion, ConvoKit) ====")
print("Conversations:", n_convos)
print("Utterances:", n_utts)
print("Speakers:", n_speaks)
print("\nLabel distribution:")
print(label_counts.to_string())
print("\nFlags:")
print("inaudible =", inaudible_true)
print("multiple_questions =", multi_true)
print("affirmative_questions =", affirm_true)
print("===============================================")


==== Basic Stats (Question Evasion, ConvoKit) ====
Conversations: 3448
Utterances: 6896
Speakers: 5

Label distribution:
label
Claims ignorance        119
Clarification            92
Declining to answer     145
Deflection              381
Dodging                 706
Explicit               1052
General                 386
Implicit                488
Partial/half-answer      79

Flags:
inaudible = 45
multiple_questions = 86
affirmative_questions = 772


7) zip it

In [61]:
# Archive the corpus directory, then list the archive as a sanity check.
zip_path = shutil.make_archive(ZIP_BASE, "zip", OUT_DIR)
print("zipped here:", zip_path)
with zipfile.ZipFile(zip_path, 'r') as zf:
    member_names = zf.namelist()
print("zip members:")
for member in member_names:
    print("  -", member)


zipped here: /Users/bolinsong/Desktop/NLP and Social Interaction/Question-Evasion Dataset/QuestionEvasion-convokit.zip
zip members:
  - utterances.jsonl
  - conversations.json
  - corpus.json
  - speakers.json
  - index.json


In [4]:
from convokit import Corpus, Speaker, Utterance, Conversation

In [5]:
corpus = Corpus(filename="./QuestionEvasion-convokit")
print(len(list(corpus.iter_conversations())))

3448


In [6]:
corpus.print_summary_stats()

Number of Speakers: 5
Number of Utterances: 6896
Number of Conversations: 3448


In [7]:
convo = corpus.random_conversation()
print(convo)

Conversation('id': 'Q_2622', 'utterances': ['Q_2622', 'A_2622'], 'meta': {'title': "The President's News Conference With President Felipe de Jesus Calderon Hinojosa of Mexico and Prime Minister Stephen Harper of Canada in Guadalajara, Mexico", 'date': 'August 10, 2009', 'url': 'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-with-president-felipe-de-jesus-calderon-hinojosa-mexico-and', 'president': 'Barack Obama', 'question_order': 1})


In [8]:
convo.print_conversation_structure()

interviewer_generic
    PRES_Barack_Obama


In [9]:
speaker = corpus.random_speaker()
print(speaker)

Speaker(id: PRES_Barack_Obama, vectors: [], meta: {'role': 'interviewee', 'name': 'Barack Obama'})


In [10]:
for utt in corpus.iter_utterances():
    print(utt.text)
    break

How would you respond to the accusation that the United States is containing China while pushing for diplomatic talks?


### Annotating the Corpus with bag-of-words vectors

In [3]:
from convokit import BoWTransformer

In [18]:
bow_transformer = BoWTransformer(obj_type="utterance", vector_name="bow_A")

Initializing default unigram CountVectorizer...Done.


In [19]:
# before transformation
corpus.get_utterance('Q_1503').vectors

[]

In [20]:
bow_transformer.fit_transform(corpus)

<convokit.model.corpus.Corpus at 0x149c08320>

In [21]:
# after transformation
corpus.get_utterance('Q_1503').vectors

['bow_A']

In [22]:
corpus.vectors

{'bow_A'}

In [24]:
corpus.get_vector_matrix('bow_A')

ConvoKitMatrix('name': bow_A, 'matrix': <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 452700 stored elements and shape (6896, 4679)>)

In [25]:
corpus.random_utterance().meta

{'type': 'answer',
 'question_order': 3,
 'label': 'Explicit',
 'annotator_id': '89',
 'inaudible': False,
 'multiple_questions': False,
 'affirmative_questions': True,
 'interview_answer_raw': "We are looking very seriously at going back to Poland. And I don't know what the President has in store for us, but we're thinking about going back sometime in September. Yes. Thank you.",
 'gpt35_prediction': 'Question part: 1 - Confirmation of meeting in September in Warsaw\nVerdict: 1.2 Implicit - The information is given without being explicitly stated in the requested form.\nExplanation: The response states that they are thinking about going back to Poland in September, which implies the possibility of meeting in Warsaw. However, it does not explicitly confirm the meeting in Warsaw in September.'}

In [26]:
corpus.random_utterance().meta

{'type': 'question',
 'question_order': 6,
 'interview_question_raw': "Q. ——got the Cleveland connection, so I appreciate that. You cited the Mayo Clinic and the Cleveland Clinics as models for the delivery of health care in the past. The Mayo Clinic, though, has some problems with the House proposal, saying they're not focused enough on patients and on results. What do you expect to achieve tomorrow by going to the Cleveland Clinic, which hasn't stated an opinion, and are you expecting some form of endorsement from the Cleveland Clinic?",
 'gpt35_summary': "The question consists of 2 parts:\n\n1. Mayo Clinic's problems with the House proposal:\n- What problems does the Mayo Clinic have with the House proposal?\n- Why do they believe the proposal is not focused enough on patients and on results?\n\n2. Expectations from visiting the Cleveland Clinic:\n- What does the speaker expect to achieve by going to the Cleveland Clinic?\n- Is the speaker expecting some form of endorsement from the

In [27]:
corpus.random_utterance()

Utterance({'obj_type': 'utterance', 'meta': {'type': 'question', 'question_order': 2, 'interview_question_raw': "Q. Tajiri, Kyodo News. At Camp David with history. I do have a question to each of the leaders.Mr. President Biden—President Biden, it was mentioned at this summit meeting that Russia's aggression of Ukraine is continuing. So what role do you expect of Japan?Prime Minister Kishida has mentioned that as China's threat in Asia is rising, Ukraine may be East Asia tomorrow. What do you think about this comment, President Biden?And the situation in Asia, where China's threat is rising, what is the meaning and significance of the trilateral relationship with Japan, U.S., R.O.K. becoming stronger in multiple layers?", 'gpt35_summary': "The question consists of 6 parts: \n1. Russia's aggression in Ukraine and Japan's role\n2. China's threat in Asia and the comment made by Prime Minister Kishida \n3. The significance of the trilateral relationship between Japan, U.S., and R.O.K. \n4.

In [28]:
corpus.random_conversation().meta

{'title': "The President's News Conference in Osaka, Japan",
 'date': 'June 29, 2019',
 'url': 'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-osaka-japan',
 'president': 'Donald J. Trump',
 'question_order': 22}

In [29]:
corpus.random_conversation()

Conversation({'obj_type': 'conversation', 'meta': {'title': "The President's News Conference", 'date': 'August 04, 2020', 'url': 'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-1249', 'president': 'Donald J. Trump', 'question_order': 1}, 'vectors': [], 'tree': None, 'owner': <convokit.model.corpus.Corpus object at 0x149c08320>, 'id': 'Q_693'})

In [31]:
from collections import Counter
# How many utterances of each "type" (question / answer) the corpus holds.
type_counts = Counter(utt.meta.get("type") for utt in corpus.iter_utterances())
print(type_counts)

Counter({'question': 3448, 'answer': 3448})


In [32]:
# Question utterances carry no "label" key in their meta, so only None is expected.
q_labels = [
    utt.meta.get("label")
    for utt in corpus.iter_utterances()
    if utt.meta.get("type") == "question"
]
print("Question labels:", set(q_labels))

Question labels: {None}


In [33]:
# Fine-grained label distribution over the answer utterances.
a_labels = [
    utt.meta.get("label")
    for utt in corpus.iter_utterances()
    if utt.meta.get("type") == "answer"
]
print("Answer labels:", Counter(a_labels))

Answer labels: Counter({'Explicit': 1052, 'Dodging': 706, 'Implicit': 488, 'General': 386, 'Deflection': 381, 'Declining to answer': 145, 'Claims ignorance': 119, 'Clarification': 92, 'Partial/half-answer': 79})


In [34]:
import re

def norm(s):
    """Lower-case ``s``, trim it, and collapse every whitespace run to one space."""
    lowered = s.strip().lower()
    return re.sub(r"\s+", " ", lowered)

# Fine-grained annotation labels grouped into the three coarse classes.
CLEAR_REPLY = {"Explicit"}
AMBIV_REPLY = {"Implicit", "Dodging", "General", "Deflection", "Partial/half-answer"}
CLEAR_NON   = {"Declining to answer", "Claims ignorance", "Clarification"}

def map_label(lbl):
    """Map a fine-grained label to its coarse class name.

    Returns "clear-reply", "ambivalent-reply" or "clear-nonreply", or None
    when ``lbl`` belongs to none of the three label sets.
    """
    for coarse_name, fine_labels in (
        ("clear-reply", CLEAR_REPLY),
        ("ambivalent-reply", AMBIV_REPLY),
        ("clear-nonreply", CLEAR_NON),
    ):
        if lbl in fine_labels:
            return coarse_name
    return None

# Attach the 3-way coarse label to every answer utterance that has a fine label.
for u in corpus.iter_utterances():
    if u.meta.get("type") != "answer" or "label" not in u.meta:
        continue
    u.meta["coarse_label"] = map_label(u.meta["label"])

# Distribution of coarse labels over all answers (None if an answer got no label).
coarse_counts = Counter(
    [
        u.meta.get("coarse_label")
        for u in corpus.iter_utterances()
        if u.meta.get("type") == "answer"
    ]
)
print(coarse_counts)

Counter({'ambivalent-reply': 2040, 'clear-reply': 1052, 'clear-nonreply': 356})


### Classifier: Answer Only

In [35]:
from convokit import VectorClassifier

In [36]:
convo = corpus.random_conversation()
print(convo)

Conversation('id': 'Q_1517', 'utterances': ['Q_1517', 'A_1517'], 'meta': {'title': "The President's News Conference on Sentosa Island, Singapore", 'date': 'June 12, 2018', 'url': 'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-sentosa-island-singapore', 'president': 'Donald J. Trump', 'question_order': 10})


In [41]:
from collections import Counter

def labeller(u):
    """Return the utterance's coarse label if it is an answer, otherwise None."""
    if u.meta.get("type") != "answer":
        return None
    return u.meta.get("coarse_label")

# 1) Inspect what the labeller returns over the whole corpus (None counted explicitly)
vals = list(map(labeller, corpus.iter_utterances()))
print("含 None 的统计：", Counter("__NONE__" if x is None else x for x in vals))

# 2) Keep only usable samples: answers whose coarse_label is a str or an int
usable = [
    u
    for u in corpus.iter_utterances()
    if u.meta.get("type")=="answer" and isinstance(u.meta.get("coarse_label"), (str, int))
]
print("可用样本数：", len(usable))
print("类别分布：", Counter(u.meta["coarse_label"] for u in usable) )

# 3) Look for answers whose coarse_label has an unexpected type
bad = [
    u.meta.get("coarse_label")
    for u in corpus.iter_utterances()
    if u.meta.get("type")=="answer" and not isinstance(u.meta.get("coarse_label"), (str, int))
]
print("异常类型样例（前5个）：", bad[:5])

含 None 的统计： Counter({'__NONE__': 3448, 'ambivalent-reply': 2040, 'clear-reply': 1052, 'clear-nonreply': 356})
可用样本数： 3448
类别分布： Counter({'ambivalent-reply': 2040, 'clear-reply': 1052, 'clear-nonreply': 356})
异常类型样例（前5个）： []


In [42]:
def is_ans_labeled(u):
    """Selector: True for answer utterances whose meta has a "coarse_label" key.

    Only the key's presence is checked, so a key holding None still counts.
    """
    if u.meta.get("type") != "answer":
        return False
    return "coarse_label" in u.meta

In [43]:
# Classifier over the answer-only bag-of-words vectors ("bow_A"). The labeller
# yields the coarse label for labelled answers and None for everything else.
clf = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_A",
    labeller=lambda u: u.meta.get("coarse_label") if is_ans_labeled(u) else None
)

Initialized default classification model (standard scaled logistic regression).


In [44]:
clf.fit_transform(corpus, selector=is_ans_labeled)   # <- the selector filter is required here



<convokit.model.corpus.Corpus at 0x149c08320>

In [47]:
df = clf.summarize(corpus)

In [48]:
print(len(df))        # total number of rows
print(df.shape)       # (rows, columns)

6896
(6896, 2)


In [49]:
df

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A_3229,clear-nonreply,1.000000
A_2722,clear-nonreply,0.999686
A_2723,clear-nonreply,0.999686
A_2676,clear-nonreply,0.999330
A_1966,clear-nonreply,0.999114
...,...,...
Q_3443,,
Q_3444,,
Q_3445,,
Q_3446,,


In [50]:
print(df['prediction'].value_counts(dropna=False))

prediction
None                3448
ambivalent-reply    2134
clear-reply          994
clear-nonreply       320
Name: count, dtype: int64


In [55]:
clf     # NOTE(review): shows only the classifier's repr; "(n_classes, n_features)" presumably described a coefficient matrix not displayed here


<convokit.classifier.vectorClassifier.VectorClassifier at 0x14b7e30e0>

In [58]:
from collections import Counter
# Majority-class baseline, computed over the labelled answers only.
y_true, _ = clf.get_y_true_pred(corpus, selector=is_ans_labeled)
cnt = Counter(y_true)
# most_common(1) returns the first label with the highest count.
maj_label, maj_count = cnt.most_common(1)[0]
base_acc = maj_count / sum(cnt.values())
print("Majority label:", maj_label)
print("Base accuracy:", base_acc)


Majority label: ambivalent-reply
Base accuracy: 0.5916473317865429


In [57]:
clf.accuracy(corpus, selector=is_ans_labeled)

np.float64(0.8474477958236659)

In [90]:
print(clf.classification_report(corpus, selector=is_ans_labeled))

                  precision    recall  f1-score   support

ambivalent-reply       0.86      0.90      0.88      2040
  clear-nonreply       0.88      0.79      0.83       356
     clear-reply       0.81      0.76      0.78      1052

        accuracy                           0.85      3448
       macro avg       0.85      0.82      0.83      3448
    weighted avg       0.85      0.85      0.85      3448



### Questions as Context

In [70]:
# Pre-build a lookup from each answer id ("A_<n>") to the text of its question ("Q_<n>")
aq_text = {}
for conv in corpus.iter_conversations():
    for uid in conv.get_utterance_ids():
        if not uid.startswith("A_"):
            continue
        question = corpus.get_utterance("Q_" + uid.split("_", 1)[1])
        # A missing utterance or empty text is stored as "".
        aq_text[uid] = question.text if question and question.text else ""


In [None]:
# aq_text is a dict and cannot be sliced directly; preview its first five (answer id, question text) pairs
list(aq_text.items())[:5]

In [77]:
corpus.get_utterance('Q_1000').text

'Expectations for further actions regarding FETÖ'

In [76]:
list(aq_text.items())[1000]

('A_1000', 'Expectations for further actions regarding FETÖ')

In [78]:
re_tok = re.compile(r"\w+")

def qa_prefixed(u):
    """Build the prefixed question+answer token string for an answer utterance.

    Returns None for non-answer utterances. For an answer, the question text
    is looked up in ``aq_text`` by the answer's id; every lower-cased ``\\w+``
    token of the question gets a ``Q_`` prefix and every token of the answer
    an ``A_`` prefix. Question tokens come first, joined by single spaces.
    """
    # Vectors are generated for answers only.
    if u.meta.get("type") != "answer":
        return None

    question_text = aq_text.get(u.id, "") or ""
    answer_text = u.text or ""

    # Prefix each token with its source so the two vocabularies stay separate.
    prefixed = [f"Q_{tok.lower()}" for tok in re_tok.findall(question_text)]
    prefixed += [f"A_{tok.lower()}" for tok in re_tok.findall(answer_text)]

    return " ".join(prefixed)

bow_QA = BoWTransformer(
    obj_type="utterance",
    vector_name="bow_QA_prefixed",
    text_func=qa_prefixed
)

Initializing default unigram CountVectorizer...Done.


In [80]:
bow_QA.fit_transform(corpus, selector=is_ans_labeled)

<convokit.model.corpus.Corpus at 0x149c08320>

In [81]:
corpus.vectors

{'bow_A', 'bow_QA_prefixed'}

In [82]:
# Sanity check: every answer utterance should carry a coarse label.
answers = [
    utt
    for utt in corpus.iter_utterances()
    if utt.meta.get("type") == "answer"
]
labeled = [utt for utt in answers if "coarse_label" in utt.meta]

print(f"Answers: {len(answers)}, labeled: {len(labeled)}")


Answers: 3448, labeled: 3448


In [83]:
u = corpus.get_utterance("A_2622")   # replace with any answer ID you want to inspect
vec = u.get_vector("bow_QA_prefixed")
print(vec)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 17 stored elements and shape (1, 5091)>
  Coords	Values
  (0, 188)	1
  (0, 758)	1
  (0, 1145)	1
  (0, 1740)	1
  (0, 1754)	1
  (0, 1761)	1
  (0, 1880)	1
  (0, 2362)	1
  (0, 2766)	1
  (0, 3196)	1
  (0, 3721)	1
  (0, 4259)	1
  (0, 4505)	1
  (0, 4687)	1
  (0, 4790)	1
  (0, 5016)	1
  (0, 5088)	1


In [84]:
# Fetch the vocabulary learned by the BoW vectorizer
vocab = bow_QA.vectorizer.get_feature_names_out()

# Pick one random answer utterance to look at
import random
answer_pool = [utt for utt in corpus.iter_utterances() if utt.meta.get("type")=="answer"]
u = random.choice(answer_pool)
vec = u.get_vector("bow_QA_prefixed")

print("Utterance ID:", u.id)
print("Original text:", u.text)
print("QA text_func:", qa_prefixed(u))   # the prefixed Q+A string built by qa_prefixed
print("\nVector tokens:")

# Print every non-zero feature as "token: count"
for col, count in zip(vec.indices, vec.data):
    print(f"  {vocab[col]}: {count}")


Utterance ID: A_928
Original text: No, I think you have to always—look, I do it a lot anyway, as you've probably heard. Wash your hands, stay clean. [] You don't have to necessarily grab every handrail unless you have to. You know, you do certain things that you do when you have the flu.I mean, view this the same as the flu. When somebody sneezes—I mean, I try and bail out as much as possible when they're sneezing. [] I had a man come up to me a week ago. I hadn't seen him in a long time, and I said, How you doing? He said, Fine, fine. And he hugs me, kiss. I said, Are you well? He says, No. [] He said, I have the worst fever and the worst flu. And he's hugging and kissing me. So I said, Excuse me. I went, and I started washing my hands. [] So you have to do that.You know, this is—I really think, Doctor, you ought to treat this like you treat the flu, right? And, you know, it's going to be—it's going to be fine.
QA text_func: Q_are Q_you Q_telling Q_the Q_americans Q_except Q_for Q_the

In [92]:
# Same classifier setup as for the answer-only vectors, but reading the
# prefixed question+answer vectors ("bow_QA_prefixed").
clf_QA = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_QA_prefixed",
    labeller=lambda u: u.meta.get("coarse_label") if is_ans_labeled(u) else None
)
clf_QA.fit_transform(corpus, selector=is_ans_labeled)   # <- the selector filter is required here

Initialized default classification model (standard scaled logistic regression).




<convokit.model.corpus.Corpus at 0x149c08320>

In [None]:
df = clf_QA.summarize(corpus)
df

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A_1967,clear-nonreply,1.000000
A_1966,clear-nonreply,1.000000
A_1964,clear-nonreply,1.000000
A_2063,clear-nonreply,1.000000
A_825,clear-nonreply,0.999999
...,...,...
Q_3443,,
Q_3444,,
Q_3445,,
Q_3446,,


In [87]:
print(len(df))        # total number of rows
print(df.shape) 

6896
(6896, 2)


In [94]:
print(df['prediction'].value_counts(dropna=False))

prediction
None                3448
ambivalent-reply    2048
clear-reply         1050
clear-nonreply       350
Name: count, dtype: int64


In [95]:
clf_QA.accuracy(corpus, selector=is_ans_labeled)

np.float64(0.9904292343387471)

In [96]:

print(clf_QA.classification_report(corpus, selector=is_ans_labeled))

                  precision    recall  f1-score   support

ambivalent-reply       0.99      0.99      0.99      2040
  clear-nonreply       1.00      0.98      0.99       356
     clear-reply       0.99      0.99      0.99      1052

        accuracy                           0.99      3448
       macro avg       0.99      0.99      0.99      3448
    weighted avg       0.99      0.99      0.99      3448



### Generalizability Test

In [97]:
from sklearn.model_selection import train_test_split

In [98]:
# Pool for the train/test split: answer utterances that carry a coarse label.
answers = [
    utt
    for utt in corpus.iter_utterances()
    if utt.meta.get("type") == "answer" and "coarse_label" in utt.meta
]
print(len(answers))

3448


In [107]:
# Stratified sampling: split 80/20 while stratifying on the coarse label.
y = [u.meta["coarse_label"] for u in answers]
train_utts, test_utts = train_test_split(
    answers, test_size=0.2, random_state=42, stratify=y
)

# NOTE(review): the first two CSV rows previewed above appear to share one
# interview answer; if so, a row-level split can put the same answer text in
# both train and test. A grouped split may be needed - TODO confirm.
for split_name, split_utts in (("train", train_utts), ("test", test_utts)):
    for u in split_utts:
        u.meta["split"] = split_name

In [104]:
# selectors
def is_answer(u):
    """Selector: True when the utterance's meta "type" is "answer"."""
    return u.meta.get("type") == "answer"


def is_train_u(u):
    """Selector: True for answer utterances whose meta "split" is "train"."""
    if not is_answer(u):
        return False
    return u.meta.get("split") == "train"


def is_test_u(u):
    """Selector: True for answer utterances whose meta "split" is "test"."""
    if not is_answer(u):
        return False
    return u.meta.get("split") == "test"

In [109]:
bow_QA_split = BoWTransformer(
    obj_type="utterance",
    vector_name="bow_QA_prefixed_split",   # new vector name, distinct from "bow_QA_prefixed"
    text_func=qa_prefixed                  # the Q+A prefix-concatenation function
)

# learn the vocabulary on the training answers only
bow_QA_split.fit(corpus, selector=is_train_u)

# vectorize both the train and the test answers with that vocabulary
bow_QA_split.transform(corpus, selector=is_answer)

Initializing default unigram CountVectorizer...Done.


<convokit.model.corpus.Corpus at 0x149c08320>

In [110]:
# Classifier over the split-aware Q+A vectors; the labeller returns the
# coarse label for answers and None for every other utterance.
clf_QA_split = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_QA_prefixed_split",
    labeller=lambda u: u.meta.get("coarse_label") if is_answer(u) else None
)

# fit on the training answers only
clf_QA_split.fit(corpus, selector=is_train_u)

# then annotate every answer (train and test) with a prediction
clf_QA_split.transform(corpus, selector=is_answer)



Initialized default classification model (standard scaled logistic regression).




<convokit.model.corpus.Corpus at 0x149c08320>

In [111]:
# Evaluation: accuracy and per-class report, first on train and then on test
for split_label, split_selector in (("Train", is_train_u), ("Test", is_test_u)):
    print(f"{split_label} accuracy:", clf_QA_split.accuracy(corpus, selector=split_selector))
    print(clf_QA_split.classification_report(corpus, selector=split_selector))


Train accuracy: 0.994198694706309
                  precision    recall  f1-score   support

ambivalent-reply       0.99      1.00      1.00      1632
  clear-nonreply       1.00      0.99      0.99       285
     clear-reply       0.99      0.99      0.99       841

        accuracy                           0.99      2758
       macro avg       1.00      0.99      0.99      2758
    weighted avg       0.99      0.99      0.99      2758

Test accuracy: 0.5507246376811594
                  precision    recall  f1-score   support

ambivalent-reply       0.65      0.65      0.65       408
  clear-nonreply       0.44      0.38      0.41        71
     clear-reply       0.41      0.42      0.41       211

        accuracy                           0.55       690
       macro avg       0.50      0.48      0.49       690
    weighted avg       0.55      0.55      0.55       690



### GenAI 

In [112]:
df_test_result = clf_QA_split.summarize(corpus, selector=is_test_u)

In [113]:
df_test_result

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A_1533,clear-nonreply,9.999426e-01
A_1729,clear-nonreply,9.999145e-01
A_1731,clear-nonreply,9.999145e-01
A_1965,clear-nonreply,9.998740e-01
A_1616,clear-nonreply,9.996702e-01
...,...,...
A_1817,ambivalent-reply,4.333080e-24
A_1865,ambivalent-reply,3.361673e-24
A_2399,ambivalent-reply,1.899436e-24
A_1796,clear-reply,7.919053e-25
