In [2]:
import pandas as pd
import os, re, json, shutil, datetime, zipfile
from pathlib import Path
from convokit import Corpus, Speaker, Utterance, Conversation
from collections import Counter
import re


  import pkg_resources


### Load Corpus

In [3]:
# Read the ConvoKit corpus from disk and report how many conversations it holds.
corpus = Corpus(filename="./QuestionEvasion-convokit")
n_conversations = sum(1 for _ in corpus.iter_conversations())
print(n_conversations)

3448


In [4]:
# stats
corpus.print_summary_stats()

Number of Speakers: 5
Number of Utterances: 6896
Number of Conversations: 3448


In [5]:
# conversation
convo = corpus.random_conversation()
print(convo)

Conversation('id': 'Q_3010', 'utterances': ['Q_3010', 'A_3010'], 'meta': {'title': "The President's News Conference With Prime Minister Ehud Olmert of Israel in Jerusalem", 'date': 'January 09, 2008', 'url': 'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-with-prime-minister-ehud-olmert-israel-jerusalem', 'president': 'George W. Bush', 'question_order': 3})


In [6]:
# utterance
# Show the text of the first utterance the corpus yields (nothing is printed if it is empty).
utt = next(iter(corpus.iter_utterances()), None)
if utt is not None:
    print(utt.text)

How would you respond to the accusation that the United States is containing China while pushing for diplomatic talks?


## Hypothesis: Adding the question as context to the answer improves a classifier's accuracy at assigning the answer to one of three clarity classes (clear-reply, ambivalent-reply, or clear-nonreply).

### Pre-process Labels According to the Original Paper

In [7]:
# Labels are only included in the answer utterances
# Collect the fine-grained "label" meta value of every answer utterance, then tally them.
a_labels = []
for answer in corpus.iter_utterances():
    if answer.meta.get("type") != "answer":
        continue
    a_labels.append(answer.meta.get("label"))
print("Answer labels:", Counter(a_labels))

Answer labels: Counter({'Explicit': 1052, 'Dodging': 706, 'Implicit': 488, 'General': 386, 'Deflection': 381, 'Declining to answer': 145, 'Claims ignorance': 119, 'Clarification': 92, 'Partial/half-answer': 79})


In [8]:

def norm(s):
    """Trim ``s``, lower-case it, and squeeze every whitespace run down to one space."""
    trimmed = s.strip()
    lowered = trimmed.lower()
    return re.sub(r"\s+", " ", lowered)

# Fine-grained annotation labels, grouped by the coarse class each one collapses into.
CLEAR_REPLY = {"Explicit"}
AMBIV_REPLY = {"Implicit", "Dodging", "General", "Deflection", "Partial/half-answer"}
CLEAR_NON   = {"Declining to answer", "Claims ignorance", "Clarification"}

# (group, coarse name) pairs, checked in this order by map_label.
_COARSE_GROUPS = (
    (CLEAR_REPLY, "clear-reply"),
    (AMBIV_REPLY, "ambivalent-reply"),
    (CLEAR_NON, "clear-nonreply"),
)


def map_label(lbl):
    """Translate a fine-grained label into its coarse class.

    Returns "clear-reply", "ambivalent-reply" or "clear-nonreply", or None
    when ``lbl`` belongs to none of the three groups.
    """
    for members, coarse_name in _COARSE_GROUPS:
        if lbl in members:
            return coarse_name
    return None

# Attach a coarse three-way label to every answer utterance that has a fine-grained label.
# Labels that map_label does not recognise are tallied and skipped instead of being stored
# as None: the selectors used later test `"coarse_label" in u.meta`, so a stored None would
# still be selected and would reach the classifier as if it were a class.
unmapped_labels = Counter()
for u in corpus.iter_utterances():
    if u.meta.get("type") != "answer" or "label" not in u.meta:
        continue
    coarse = map_label(u.meta["label"])
    if coarse is None:
        unmapped_labels[u.meta["label"]] += 1
        continue
    u.meta["coarse_label"] = coarse

if unmapped_labels:
    print("Unmapped fine-grained labels (no coarse_label assigned):", dict(unmapped_labels))

# Distribution of coarse labels over all answers (answers without one are counted under None).
coarse_counts = Counter(
    u.meta.get("coarse_label")
    for u in corpus.iter_utterances()
    if u.meta.get("type") == "answer"
)
print(coarse_counts)

Counter({'ambivalent-reply': 2040, 'clear-reply': 1052, 'clear-nonreply': 356})


### Condition 1: Answer alone

In [9]:
#  bag-of-words vectors
from convokit import BoWTransformer
# Bag-of-words transformer whose utterance vectors are stored under the name "bow_A".
bow_transformer = BoWTransformer(obj_type="utterance", vector_name="bow_A")
# No selector is passed here, unlike the later question+answer transformer.
# NOTE(review): this presumably fits on and vectorizes question utterances as well — confirm.
bow_transformer.fit_transform(corpus)


Initializing default unigram CountVectorizer...Done.


<convokit.model.corpus.Corpus at 0x150a75670>

In [10]:
# classifier
from convokit import VectorClassifier


def is_ans_labeled(u):
    """Selector: true only for answer utterances that carry a ``coarse_label`` meta entry."""
    return u.meta.get("type") == "answer" and "coarse_label" in u.meta


def _coarse_label_if_labeled(u):
    """Labeller: the utterance's coarse label, or None when the selector rejects it."""
    if not is_ans_labeled(u):
        return None
    return u.meta.get("coarse_label")


# Answer-only condition: classify from the "bow_A" vectors.
clf_A = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_A",
    labeller=_coarse_label_if_labeled,
)

# Fit and predict over the same selection of labelled answers.
clf_A.fit_transform(corpus, selector=is_ans_labeled)

Initialized default classification model (standard scaled logistic regression).




<convokit.model.corpus.Corpus at 0x150a75670>

In [11]:
# Evaluation
clf_A.summarize(corpus)

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A_3229,clear-nonreply,1.000000
A_2722,clear-nonreply,0.999686
A_2723,clear-nonreply,0.999686
A_2676,clear-nonreply,0.999330
A_1966,clear-nonreply,0.999114
...,...,...
Q_3443,,
Q_3444,,
Q_3445,,
Q_3446,,


In [None]:
# Base Accuracy
# Majority-class baseline: the share of labelled answers that fall in the most frequent class.
y_true, _ = clf_A.get_y_true_pred(corpus, selector=is_ans_labeled)
cnt = Counter(y_true)
n_labelled = sum(cnt.values())
maj_label = max(cnt, key=cnt.get)
maj_count = cnt[maj_label]
base_acc = maj_count / n_labelled
print("Majority label:", maj_label)
print("Base accuracy:", base_acc)

Majority label: ambivalent-reply
Base accuracy: 0.5916473317865429


In [13]:
clf_A.accuracy(corpus, selector=is_ans_labeled)

np.float64(0.8474477958236659)

In [14]:
print(clf_A.classification_report(corpus, selector=is_ans_labeled))

                  precision    recall  f1-score   support

ambivalent-reply       0.86      0.90      0.88      2040
  clear-nonreply       0.88      0.79      0.83       356
     clear-reply       0.81      0.76      0.78      1052

        accuracy                           0.85      3448
       macro avg       0.85      0.82      0.83      3448
    weighted avg       0.85      0.85      0.85      3448



Analysis (Condition 1: Pure Answer, No Train/Test Split)

-- Using only the answer text as features, our classifier achieves an accuracy of 0.85, which is notably higher than the base accuracy of 0.59. Note that this accuracy is measured on the same utterances the classifier was trained on, so it is an optimistic figure rather than an estimate of held-out performance. Precision and recall are relatively balanced across the three classes, though performance is strongest on ambivalent-reply and weakest on clear-reply. This suggests that answer content alone carries useful signals for classification.

-- As a next step, I plan to explore a Condition 2 setting where the question text is concatenated with the answer (Q + A). Adding the question as context may provide richer semantic cues, potentially improving disambiguation between clear-reply and clear-nonreply.

## Condition 2: Question + Answer

In [15]:
# Map each answer utterance id ("A_<n>") to the text of its paired question ("Q_<n>").
# An answer whose question is missing, or has no text, maps to the empty string.
aq_text = {}
for conv in corpus.iter_conversations():
    for uid in conv.get_utterance_ids():
        if not uid.startswith("A_"):
            continue
        qid = "Q_" + uid.split("_", 1)[1]
        try:
            q = corpus.get_utterance(qid)
        except KeyError:
            # NOTE(review): get_utterance is expected to raise KeyError for an id that is
            # not in the corpus; treat that as "no question" instead of aborting the loop.
            q = None
        aq_text[uid] = q.text if q is not None and q.text else ""


In [16]:
# bag-of-words vectors processing: add prefix Q and A to the tokens

re_tok = re.compile(r"\w+")


def qa_prefixed(u):
    """Render an answer utterance as one string of side-tagged, lower-cased tokens.

    Tokens of the paired question (looked up in ``aq_text`` by utterance id) come
    first and are prefixed with "Q_"; tokens of the answer text follow, prefixed
    with "A_". Returns None for any utterance whose ``type`` meta is not "answer".
    """
    if u.meta.get("type") != "answer":
        return None

    question = aq_text.get(u.id, "") or ""
    answer = u.text or ""

    # The pattern has no capture groups, so findall yields the full matched words.
    tagged = [f"Q_{word.lower()}" for word in re_tok.findall(question)]
    tagged.extend(f"A_{word.lower()}" for word in re_tok.findall(answer))
    return " ".join(tagged)

# Bag-of-words transformer for the question+answer condition: each utterance's text is
# produced by qa_prefixed, and the resulting vectors are stored under "bow_QA_prefixed".
bow_QA = BoWTransformer(
    obj_type="utterance",
    vector_name="bow_QA_prefixed",
    text_func=qa_prefixed
)

Initializing default unigram CountVectorizer...Done.


In [17]:
bow_QA.fit_transform(corpus, selector=is_ans_labeled)

<convokit.model.corpus.Corpus at 0x150a75670>

In [18]:
corpus.vectors

{'bow_A', 'bow_QA_prefixed'}

In [19]:
# check the vector of a random answer
import random

# Sanity check: show the prefixed question+answer text built for one answer utterance.
vocab = bow_QA.vectorizer.get_feature_names_out()
answer_utts = [utt for utt in corpus.iter_utterances() if utt.meta.get("type") == "answer"]
# A dedicated seeded generator picks the same utterance on every run (given a stable
# iteration order) and leaves the global random state untouched.
u = random.Random(42).choice(answer_utts)
vec = u.get_vector("bow_QA_prefixed")

print("Utterance ID:", u.id)
print("Original answer text:", u.text)
print("QA text_func:", qa_prefixed(u))   # the concatenated, prefixed question + answer text

Utterance ID: A_685
Original answer text: Yes, please, go ahead.
QA text_func: Q_but Q_is Q_it Q_right Q_that Q_they Q_re Q_virtually Q_immune A_yes A_please A_go A_ahead


In [20]:
# classifier
# Same classifier setup as the answer-only condition, but reading the "bow_QA_prefixed" vectors.
clf_QA = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_QA_prefixed",
    labeller=lambda u: u.meta.get("coarse_label") if is_ans_labeled(u) else None
)
# Fit and predict over the same selection of labelled answers (no held-out data in this cell).
clf_QA.fit_transform(corpus, selector=is_ans_labeled)  

Initialized default classification model (standard scaled logistic regression).




<convokit.model.corpus.Corpus at 0x150a75670>

In [21]:
clf_QA.summarize(corpus)

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A_1967,clear-nonreply,1.000000
A_1966,clear-nonreply,1.000000
A_1964,clear-nonreply,1.000000
A_2063,clear-nonreply,1.000000
A_825,clear-nonreply,0.999999
...,...,...
Q_3443,,
Q_3444,,
Q_3445,,
Q_3446,,


In [22]:
clf_QA.accuracy(corpus, selector=is_ans_labeled)

np.float64(0.9904292343387471)

In [23]:
print(clf_QA.classification_report(corpus, selector=is_ans_labeled))

                  precision    recall  f1-score   support

ambivalent-reply       0.99      0.99      0.99      2040
  clear-nonreply       1.00      0.98      0.99       356
     clear-reply       0.99      0.99      0.99      1052

        accuracy                           0.99      3448
       macro avg       0.99      0.99      0.99      3448
    weighted avg       0.99      0.99      0.99      3448



Analysis (Condition 2: Question + Answer, No Train/Test Split)

Using concatenated question–answer representations, the classifier achieves extremely high performance (precision, recall, and F1 of roughly 0.99 for all three classes). At face value, this suggests that incorporating the question as context substantially boosts the discriminative signal compared to using the answer alone. However, because the evaluation was conducted on the same data used for training, the near-perfect scores are more likely a sign of overfitting than of a genuine improvement. To assess generalizability more rigorously, I therefore introduce Condition 3, where I split the dataset into separate train and test sets.

## Condition 3: Question + Answer & Split Train and Test Datasets

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# stratified sampling and creating new metadata indicating train and test; still, only answers have labels
# Gather the labelled answers together with their coarse labels (used for stratification).
answers = []
y = []
for utt in corpus.iter_utterances():
    if utt.meta.get("type") == "answer" and "coarse_label" in utt.meta:
        answers.append(utt)
        y.append(utt.meta["coarse_label"])

train_utts, test_utts = train_test_split(answers, test_size=0.2, random_state=42, stratify=y)

# Record each answer's partition in its "split" meta field.
for split_name, members in (("train", train_utts), ("test", test_utts)):
    for u in members:
        u.meta["split"] = split_name


In [26]:
# selectors
def is_answer(u):
    """Selector: true when the utterance's ``type`` meta is "answer"."""
    return u.meta.get("type") == "answer"


def _answer_in_split(u, split_name):
    """True when ``u`` is an answer whose ``split`` meta equals ``split_name``."""
    return is_answer(u) and u.meta.get("split") == split_name


def is_train_u(u):
    """Selector: answer utterances assigned to the training split."""
    return _answer_in_split(u, "train")


def is_test_u(u):
    """Selector: answer utterances assigned to the test split."""
    return _answer_in_split(u, "test")

In [27]:
# bag-of-words vectors processing: add prefix Q and A to the tokens
# Transformer for the train/test condition; its vectors are stored under their own name,
# "bow_QA_prefixed_split", and the text again comes from qa_prefixed.
bow_QA_split = BoWTransformer(
    obj_type="utterance",
    vector_name="bow_QA_prefixed_split",  
    text_func=qa_prefixed                  
)

# Fit using only the utterances selected by is_train_u, so test answers play no part in fitting.
bow_QA_split.fit(corpus, selector=is_train_u)

# Vectorize every answer utterance (train and test) with the transformer fitted above.
bow_QA_split.transform(corpus, selector=is_answer)

Initializing default unigram CountVectorizer...Done.


<convokit.model.corpus.Corpus at 0x150a75670>

In [28]:
# classifier
# Classifier over the split-specific vectors; the labeller returns coarse_label for answer
# utterances and None for everything else.
clf_QA_split = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_QA_prefixed_split",
    labeller=lambda u: u.meta.get("coarse_label") if is_answer(u) else None
)

# Fit on the training answers only.
clf_QA_split.fit(corpus, selector=is_train_u)

# Predict for every answer utterance, train and test alike.
# NOTE(review): a later cell reads the "prediction" and "pred_score" meta fields; if the
# earlier classifiers write to those same fields, this call overwrites their outputs — confirm.
clf_QA_split.transform(corpus, selector=is_answer)



Initialized default classification model (standard scaled logistic regression).




<convokit.model.corpus.Corpus at 0x150a75670>

In [29]:
# Evaluation
# Report accuracy and the per-class report for the training split first, then the test split.
for accuracy_caption, split_selector in (
    ("Train accuracy:", is_train_u),
    ("Test accuracy:", is_test_u),
):
    print(accuracy_caption, clf_QA_split.accuracy(corpus, selector=split_selector))
    print(clf_QA_split.classification_report(corpus, selector=split_selector))


Train accuracy: 0.994198694706309
                  precision    recall  f1-score   support

ambivalent-reply       0.99      1.00      1.00      1632
  clear-nonreply       1.00      0.99      0.99       285
     clear-reply       0.99      0.99      0.99       841

        accuracy                           0.99      2758
       macro avg       1.00      0.99      0.99      2758
    weighted avg       0.99      0.99      0.99      2758

Test accuracy: 0.5507246376811594
                  precision    recall  f1-score   support

ambivalent-reply       0.65      0.65      0.65       408
  clear-nonreply       0.44      0.38      0.41        71
     clear-reply       0.41      0.42      0.41       211

        accuracy                           0.55       690
       macro avg       0.50      0.48      0.49       690
    weighted avg       0.55      0.55      0.55       690



Analysis (Condition 3:  Question + Answer, Train/Test Split)

When evaluated under a proper train–test split, the classifier shows extremely high performance on the training set (accuracy 0.99) but drops sharply on the test set (accuracy 0.55), even underperforming the base accuracy of 0.59. This indicates severe overfitting: the model memorizes training data but fails to generalize to unseen examples. Note, however, that the answer-only model (Condition 1) has not yet been evaluated on the same held-out split, so this result alone does not show whether adding the question helps or hurts; that comparison is still needed to test the hypothesis. The gap nevertheless points to the limitation of the bag-of-words approach and suggests that richer contextual representations or more powerful models (e.g., LLMs) are needed to capture the nuances of question–answer interactions.

### ChatGPT 5

In [36]:
clf_QA_split.summarize(corpus,selector=is_test_u)

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A_1533,clear-nonreply,9.999426e-01
A_1729,clear-nonreply,9.999145e-01
A_1731,clear-nonreply,9.999145e-01
A_1965,clear-nonreply,9.998740e-01
A_1616,clear-nonreply,9.996702e-01
...,...,...
A_1817,ambivalent-reply,4.333080e-24
A_1865,ambivalent-reply,3.361673e-24
A_2399,ambivalent-reply,1.899436e-24
A_1796,clear-reply,7.919053e-25


In [38]:
corpus.random_utterance().meta

{'type': 'answer',
 'question_order': 2,
 'label': 'Explicit',
 'annotator_id': '86',
 'inaudible': False,
 'multiple_questions': False,
 'affirmative_questions': False,
 'interview_answer_raw': "A couple of points on that—one, the Turks, the Americans, and the Iraqis, including the Iraqi Kurds, share a common enemy in the PKK. And secondly, it's in nobody's interests that there be safe haven for people who are—have the willingness to kill innocent people.A second point I want to make to you, Matt [Matt Spetalnick, Reuters], is that there is a Special Forces presence in northern Iraq, in Kurdistan, now, apart from what you're referring to. In other words— so there is a presence, and there has been a presence for a while.Thirdly, I strongly agree with the sentiments of Secretary Gates, who said that the incursion must be limited and must be temporary in nature. In other words, it shouldn't be long lasting. But the Turks need to move, move quickly, achieve their objective, and get out.",

In [39]:
def _prediction_record(utt):
    """Flatten one utterance's gold label and stored prediction into a plain dict.

    NOTE(review): in the displayed summaries ``pred_score`` is close to 1 only for
    "clear-nonreply" predictions, so it looks like the probability of one fixed
    class rather than the confidence of the predicted class — confirm before use.
    """
    gold_label = utt.meta.get("coarse_label")
    predicted = str(utt.meta.get("prediction"))
    return {
        "id": utt.id,
        "text": utt.text,
        "gold": gold_label,
        "pred": predicted,
        "pred_score": float(utt.meta.get("pred_score")),
        "correct": gold_label == predicted,
    }


# One record per answer utterance in the test split.
rows = [
    _prediction_record(utt)
    for utt in corpus.iter_utterances()
    if utt.meta.get("type") == "answer" and utt.meta.get("split") == "test"
]

In [40]:
df_pred = pd.DataFrame(rows)

In [42]:
df_pred

Unnamed: 0,id,text,gold,pred,pred_score,correct
0,A_2,"Look, I think China has a difficult economic p...",ambivalent-reply,ambivalent-reply,8.970231e-05,True
1,A_10,We talked about what we talked about at the co...,clear-nonreply,ambivalent-reply,1.837310e-02,False
2,A_14,"First of all, this trilateral cooperation amon...",ambivalent-reply,ambivalent-reply,4.188193e-10,True
3,A_17,Let me be clear: I didn't say we didn't guaran...,clear-reply,ambivalent-reply,3.812730e-02,False
4,A_21,"Like I told, we are discussing on DCA, the def...",ambivalent-reply,clear-nonreply,9.906355e-01,False
...,...,...,...,...,...,...
685,A_3424,understand full well that the world expects th...,ambivalent-reply,ambivalent-reply,1.353469e-01,True
686,A_3435,Because I know Prime Minister Maliki; I know h...,clear-reply,ambivalent-reply,2.266968e-12,False
687,A_3438,I think the coming election is a referendum on...,clear-reply,ambivalent-reply,7.055957e-04,False
688,A_3442,"See, that's that hypothetical Keil is trying t...",ambivalent-reply,ambivalent-reply,1.454134e-01,True


In [43]:
df_correct = df_pred[df_pred["correct"]]
df_wrong = df_pred[~df_pred["correct"]]

In [46]:
sample_correct = df_correct.sample(5, random_state=42)
sample_wrong = df_wrong.sample(5, random_state=42)

In [48]:
test_llm = pd.concat([sample_correct, sample_wrong])
test_llm

Unnamed: 0,id,text,gold,pred,pred_score,correct
486,A_2550,It feels bad. [] The toughest thing over the l...,ambivalent-reply,ambivalent-reply,1.916811e-08,True
477,A_2518,"Well, look, I've got a whole bunch of lines in...",clear-reply,clear-reply,8.035658e-06,True
484,A_2543,"Well, I think this is going to be an important...",ambivalent-reply,ambivalent-reply,1.627012e-08,True
94,A_527,"And if you ask him, he will tell you, and he'l...",ambivalent-reply,ambivalent-reply,0.00862036,True
83,A_432,"Well, I have real questions about it. Look, th...",ambivalent-reply,ambivalent-reply,0.08614237,True
645,A_3203,"Ed, going into Iraq, we were warned about a lo...",ambivalent-reply,clear-reply,0.006847671,False
16,A_87,Let me answer the first question—the first par...,clear-nonreply,clear-reply,6.373325e-07,False
98,A_559,"I consider firing everybody. [] At some point,...",ambivalent-reply,clear-nonreply,0.5227444,False
101,A_566,I haven't been told of it yet. I heard there's...,clear-nonreply,clear-reply,0.470469,False
40,A_242,"Okay. Whoa, whoa, whoa. Hang on, guys. We've o...",ambivalent-reply,clear-nonreply,0.8191754,False


In [55]:
aq_text['A_2518']

'Where is your line in the sand?'

In [49]:
def build_prompt(q: str, a: str) -> str:
    """Build the three-way classification prompt for one question/answer pair.

    Args:
        q: Question text. A missing, non-string, empty or whitespace-only
            question is rendered as "(none provided)".
        a: Answer text. It is stripped; a missing or non-string answer (for
            example None or NaN from a DataFrame) is rendered as an empty string.

    Returns:
        The full prompt, ending with the "Question:" and "Answer:" lines.
    """
    # Strip first and test afterwards, so a whitespace-only question gets the placeholder too.
    question = q.strip() if isinstance(q, str) else ""
    answer = a.strip() if isinstance(a, str) else ""
    question_line = question if question else "(none provided)"
    return f"""You are a careful political Q&A analyst.

Task: Classify the *answer* with respect to the *question* into exactly one of three classes:
- "clear-reply": the answer clearly and directly addresses the question.
- "clear-nonreply": the answer clearly does not address the question (for example, it may refuse to answer, express ignorance, or shift to asking for clarification instead of answering).
- "ambivalent-reply": the answer partially addresses the question or is indirectly related, showing ambiguity or incompleteness (for example, it may be vague, evasive, or only partially relevant).

Return your decision in **strict JSON** with two fields and nothing else:
{{"label": "<one of: clear-reply | clear-nonreply | ambivalent-reply>", "rationale": "<one-sentence explanation>"}}

Question: {question_line}
Answer: {answer}"""

In [51]:
def make_prompts_df(test_llm, aq_text, build_prompt, csv_path="llm_prompts.csv"):
    """Attach an LLM prompt to every sampled prediction row and optionally save the result.

    Args:
        test_llm: DataFrame with at least the columns "id", "text", "gold",
            "pred" and "correct". It is not modified.
        aq_text: Mapping from answer utterance id to question text; ids that are
            absent are passed to ``build_prompt`` as an empty question.
        build_prompt: Callable ``(question, answer) -> str`` producing the prompt.
        csv_path: Destination CSV path; pass None to skip writing a file.

    Returns:
        A DataFrame with the columns "id", "prompt", "gold", "pred", "correct".
    """
    out = test_llm.copy()
    # Build the column from a plain list rather than DataFrame.apply(axis=1): on an empty
    # frame apply returns a DataFrame, which cannot be assigned to a single column.
    out["prompt"] = [
        build_prompt(aq_text.get(utt_id, ""), answer_text)
        for utt_id, answer_text in zip(out["id"], out["text"])
    ]

    keep_cols = ["id", "prompt", "gold", "pred", "correct"]
    out_to_save = out[keep_cols]

    if csv_path is not None:
        out_to_save.to_csv(csv_path, index=False, encoding="utf-8")

    return out_to_save

In [52]:
prompts_df = make_prompts_df(test_llm, aq_text, build_prompt, csv_path="llm_prompts.csv")


{'A_0': 'How would you respond to the accusation that the United States is containing China while pushing for diplomatic talks?',
 'A_1': 'Do you think President Xi is being sincere about getting the relationship back on track as he bans Apple in China?',
 'A_2': " Do you believe the country's slowdown and growth could risk destabilizing the global economy or cause China to be more aggressive defensively, including with Taiwan?",
 'A_3': ' Are you worried about the meeting between President Putin and Kim Jong Un, if that could mean Russia has more gains in the war in Ukraine?',
 'A_4': " Is the President's engagement with Asian countries a sign of a cold war mentality?",
 'A_5': ' Is there a danger of a cold war?',
 'A_6': 'When will the President meet Mr. Xi?',
 'A_7': ' How concerned are you about this lack of consensus?',
 'A_8': ' Concerns about the lack of communication between the interviewee and President Xi of China, and the potential destabilization of the U.S.-China relations