In [1]:
import pandas as pd
import os, re, json, shutil, datetime, zipfile
from pathlib import Path
from convokit import Corpus, Speaker, Utterance, Conversation
from collections import Counter
import re


  import pkg_resources


### Load Corpus

In [2]:
# Load the ConvoKit corpus from the local directory, then report how many
# conversations it contains (counted lazily, without building a list).
corpus = Corpus(filename="./QuestionEvasion-convokit")
print(sum(1 for _ in corpus.iter_conversations()))

3448


In [3]:
# stats
# Sanity check after loading: print the corpus-level counts of speakers,
# utterances and conversations.
corpus.print_summary_stats()

Number of Speakers: 5
Number of Utterances: 6896
Number of Conversations: 3448


In [4]:
# conversation
# Inspect one sampled conversation to see its utterance IDs and metadata fields.
# NOTE(review): no seed is set here, so the sampled conversation presumably
# differs between runs -- confirm if a reproducible example is needed.
convo = corpus.random_conversation()
print(convo)

Conversation('id': 'Q_3036', 'utterances': ['Q_3036', 'A_3036'], 'meta': {'title': "The President's News Conference", 'date': 'December 04, 2007', 'url': 'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-1132', 'president': 'George W. Bush', 'question_order': 7})


In [5]:
# utterance
# Peek at the text of the first utterance the corpus yields (nothing is
# printed when the corpus has no utterances).
utt = next(iter(corpus.iter_utterances()), None)
if utt is not None:
    print(utt.text)

How would you respond to the accusation that the United States is containing China while pushing for diplomatic talks?


## Hypothesis: Adding the question to the answer as context improves a classifier's accuracy at predicting the answer's clarity class (clear-reply, ambivalent-reply, or clear-nonreply).

### Pre-process Labels According to the Original Paper

In [6]:
# Labels are only included in the answer utterances
# Gather the fine-grained label of every answer and show the distribution.
a_labels = []
for u in corpus.iter_utterances():
    if u.meta.get("type") == "answer":
        a_labels.append(u.meta.get("label"))
print("Answer labels:", Counter(a_labels))

Answer labels: Counter({'Explicit': 1052, 'Dodging': 706, 'Implicit': 488, 'General': 386, 'Deflection': 381, 'Declining to answer': 145, 'Claims ignorance': 119, 'Clarification': 92, 'Partial/half-answer': 79})


In [7]:

def norm(s):
    """Return ``s`` trimmed, lower-cased, with each whitespace run collapsed to one space."""
    cleaned = s.strip().lower()
    return re.sub(r"\s+", " ", cleaned)

# Fine-grained annotation labels, grouped into the three coarse classes.
CLEAR_REPLY = {"Explicit"}
AMBIV_REPLY = {"Implicit", "Dodging", "General", "Deflection", "Partial/half-answer"}
CLEAR_NON   = {"Declining to answer", "Claims ignorance", "Clarification"}

def map_label(lbl):
    """Map a fine-grained label to its coarse class.

    Returns "clear-reply", "ambivalent-reply" or "clear-nonreply"; returns
    ``None`` when ``lbl`` belongs to none of the three groups.
    """
    groups = (
        ("clear-reply", CLEAR_REPLY),
        ("ambivalent-reply", AMBIV_REPLY),
        ("clear-nonreply", CLEAR_NON),
    )
    for coarse_name, fine_labels in groups:
        if lbl in fine_labels:
            return coarse_name
    return None

# Attach the coarse three-way label to every answer utterance that has a
# fine-grained label.
for u in corpus.iter_utterances():
    if u.meta.get("type") != "answer" or "label" not in u.meta:
        continue
    coarse = map_label(u.meta["label"])
    # Labels outside the three groups map to None. Writing that None would still
    # make '"coarse_label" in u.meta' true, so the utterance would be selected as
    # "labelled" and handed to a classifier with a None class. Leave the key
    # absent instead.
    if coarse is not None:
        u.meta["coarse_label"] = coarse

# Distribution of coarse labels over all answers (unmapped answers count as None).
coarse_counts = Counter(
    u.meta.get("coarse_label")
    for u in corpus.iter_utterances()
    if u.meta.get("type") == "answer"
)
print(coarse_counts)

Counter({'ambivalent-reply': 2040, 'clear-reply': 1052, 'clear-nonreply': 356})


### Condition 1: Answer alone

In [8]:
# bag-of-words vectors
# Build unigram count vectors and store them on the corpus under "bow_A".
# NOTE(review): no selector is passed, so the vocabulary is presumably learned
# from every utterance (questions as well as answers) -- confirm this is intended
# for the "answer alone" condition.
from convokit import BoWTransformer
bow_transformer = BoWTransformer(obj_type="utterance", vector_name="bow_A")
bow_transformer.fit_transform(corpus)


Initializing default unigram CountVectorizer...Done.


<convokit.model.corpus.Corpus at 0x14fbe6b70>

In [9]:
# classifier
from convokit import VectorClassifier


def is_ans_labeled(u):
    """Selector: True only for answer utterances that carry a ``coarse_label`` entry.

    Only answer utterances have labels, so questions are always excluded.
    """
    return u.meta.get("type") == "answer" and "coarse_label" in u.meta


def _label_for_clf_A(u):
    """Labeller: the coarse label for labelled answers, ``None`` for anything else."""
    if is_ans_labeled(u):
        return u.meta.get("coarse_label")
    return None


# Condition 1 model: classify answers from their own bag-of-words vector.
clf_A = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_A",
    labeller=_label_for_clf_A,
)

# Fit on, and then predict for, the same labelled answers.
clf_A.fit_transform(corpus, selector=is_ans_labeled)

Initialized default classification model (standard scaled logistic regression).




<convokit.model.corpus.Corpus at 0x14fbe6b70>

In [10]:
# Evaluation
# Display the per-utterance prediction table. No selector is passed, and the
# displayed table also lists question utterances, with empty prediction/score cells.
clf_A.summarize(corpus)

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A_3229,clear-nonreply,1.000000
A_2722,clear-nonreply,0.999686
A_2723,clear-nonreply,0.999686
A_2676,clear-nonreply,0.999330
A_1966,clear-nonreply,0.999114
...,...,...
Q_3443,,
Q_3444,,
Q_3445,,
Q_3446,,


In [11]:
# Base Accuracy
# Majority-class baseline: the share of labelled answers that carry the most
# frequent gold label (what always predicting that label would score).
y_true, _ = clf_A.get_y_true_pred(corpus, selector=is_ans_labeled)
cnt = Counter(y_true)
(maj_label, maj_count), = cnt.most_common(1)
n_labelled = sum(cnt.values())
base_acc = maj_count / n_labelled
print("Majority label:", maj_label)
print("Base accuracy:", base_acc)

Majority label: ambivalent-reply
Base accuracy: 0.5916473317865429


In [12]:
# Accuracy over the labelled answers. These are the same utterances clf_A was
# fitted on, so this is training accuracy, not held-out accuracy.
clf_A.accuracy(corpus, selector=is_ans_labeled)

np.float64(0.8474477958236659)

In [13]:
# Per-class precision/recall/F1 on the labelled answers (the data clf_A was fitted on).
print(clf_A.classification_report(corpus, selector=is_ans_labeled))

                  precision    recall  f1-score   support

ambivalent-reply       0.86      0.90      0.88      2040
  clear-nonreply       0.88      0.79      0.83       356
     clear-reply       0.81      0.76      0.78      1052

        accuracy                           0.85      3448
       macro avg       0.85      0.82      0.83      3448
    weighted avg       0.85      0.85      0.85      3448



Analysis (Condition 1: Pure Answer, No Train/Test Split)

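-- Using only the answer text as features, our classifier achieves an accuracy of 0.85, which is notably higher than the majority-class base accuracy of 0.59. Note that this accuracy is measured on the same data the classifier was trained on, so it is an optimistic figure rather than an estimate of performance on unseen answers. Precision and recall are relatively balanced across the three classes, though performance is strongest on ambivalent-reply and weakest on clear-reply. This suggests that answer content alone carries useful signals for classification.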

-- As a next step, I plan to explore a Condition 2 setting where the question text is concatenated with the answer (Q + A). Adding the question as context may provide richer semantic cues, potentially improving disambiguation between clear-reply and clear-nonreply.

## Condition 2: Question + Answer

In [14]:
# mapping question text to answer text
# Keys are answer utterance IDs ("A_<n>"); each value is the text of the paired
# question ("Q_<n>"), or "" when that question is missing or has no text.
aq_text = {}
for conv in corpus.iter_conversations():
    for uid in conv.get_utterance_ids():
        if not uid.startswith("A_"):
            continue
        qid = "Q_" + uid.split("_", 1)[1]
        # Probe before fetching: get_utterance is expected to fail on an unknown
        # ID, which would abort the loop before the "" fallback is ever reached.
        q = corpus.get_utterance(qid) if corpus.has_utterance(qid) else None
        aq_text[uid] = q.text if q is not None and q.text else ""


In [15]:
# bag-of-words vectors processing: add prefix Q and A to the tokens

# A token is any maximal run of word characters.
re_tok = re.compile(r"\w+")

def qa_prefixed(u):
    """Build the prefixed question+answer token string for an answer utterance.

    Question tokens (looked up in ``aq_text`` by the answer's ID) are lower-cased
    and tagged ``Q_``; the answer's own tokens are lower-cased and tagged ``A_``.
    The result is all question tokens followed by all answer tokens, joined by
    single spaces. Returns ``None`` for any utterance whose type is not "answer".
    """
    if u.meta.get("type") != "answer":
        return None   # vectors are only generated for answer utterances

    def tagged(prefix, text):
        # Lower-cased tokens of ``text``, each prefixed with "<prefix>_".
        return [f"{prefix}_{tok.lower()}" for tok in re_tok.findall(text)]

    question = aq_text.get(u.id, "") or ""
    answer = u.text or ""
    return " ".join(tagged("Q", question) + tagged("A", answer))

# Vectorizer for the question+answer condition; vectors are stored under
# "bow_QA_prefixed". text_func presumably supplies the string that gets
# vectorized in place of the raw utterance text -- confirm against the
# BoWTransformer documentation.
bow_QA = BoWTransformer(
    obj_type="utterance",
    vector_name="bow_QA_prefixed",
    text_func=qa_prefixed
)

Initializing default unigram CountVectorizer...Done.


In [16]:
# Learn the vocabulary and store "bow_QA_prefixed" vectors, restricted by the
# selector to labelled answer utterances.
bow_QA.fit_transform(corpus, selector=is_ans_labeled)

<convokit.model.corpus.Corpus at 0x14fbe6b70>

In [17]:
# Show the names of the vector matrices currently attached to the corpus.
corpus.vectors

{'bow_A', 'bow_QA_prefixed'}

In [None]:
# check the vector of a random answer
import random

# Feature names learned by the Q+A vectorizer.
vocab = bow_QA.vectorizer.get_feature_names_out()

# Pick one answer utterance at random and fetch its stored Q+A vector.
answer_utts = [utt for utt in corpus.iter_utterances() if utt.meta.get("type") == "answer"]
u = random.choice(answer_utts)
vec = u.get_vector("bow_QA_prefixed")

# Show the raw answer next to the prefixed string that the vectorizer received.
for caption, shown in (
    ("Utterance ID:", u.id),
    ("Original answer text:", u.text),
    ("QA text_func:", qa_prefixed(u)),
):
    print(caption, shown)

Utterance ID: A_2310
Original answer text: Okay. Good. I think we've just answered the question.
QA text_func: Q_asking Q_for Q_the Q_president Q_s Q_understanding Q_of Q_people Q_s Q_perception Q_of Q_legitimizing Q_a Q_regime Q_and Q_oppressing Q_its Q_people Q_by Q_meeting Q_and Q_shaking Q_hands Q_with Q_the Q_leader Q_of Q_north Q_korea A_okay A_good A_i A_think A_we A_ve A_just A_answered A_the A_question


In [19]:
# classifier
def _label_for_clf_QA(u):
    """Labeller: the coarse label for labelled answers, ``None`` for anything else."""
    if is_ans_labeled(u):
        return u.meta.get("coarse_label")
    return None


# Condition 2 model: same classifier setup, fed the prefixed question+answer vectors.
clf_QA = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_QA_prefixed",
    labeller=_label_for_clf_QA,
)
# Fit on, and then predict for, the same labelled answers.
clf_QA.fit_transform(corpus, selector=is_ans_labeled)

Initialized default classification model (standard scaled logistic regression).




<convokit.model.corpus.Corpus at 0x14fbe6b70>

In [20]:
# Display the per-utterance prediction table for the Q+A classifier. The
# displayed table also lists question utterances, with empty prediction cells.
clf_QA.summarize(corpus)

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A_1967,clear-nonreply,1.000000
A_1966,clear-nonreply,1.000000
A_1964,clear-nonreply,1.000000
A_2063,clear-nonreply,1.000000
A_825,clear-nonreply,0.999999
...,...,...
Q_3443,,
Q_3444,,
Q_3445,,
Q_3446,,


In [21]:
# Accuracy over the labelled answers. These are the same utterances clf_QA was
# fitted on, so this is training accuracy, not held-out accuracy.
clf_QA.accuracy(corpus, selector=is_ans_labeled)

np.float64(0.9904292343387471)

In [22]:
# Per-class precision/recall/F1 on the labelled answers (the data clf_QA was fitted on).
print(clf_QA.classification_report(corpus, selector=is_ans_labeled))

                  precision    recall  f1-score   support

ambivalent-reply       0.99      0.99      0.99      2040
  clear-nonreply       1.00      0.98      0.99       356
     clear-reply       0.99      0.99      0.99      1052

        accuracy                           0.99      3448
       macro avg       0.99      0.99      0.99      3448
    weighted avg       0.99      0.99      0.99      3448



Analysis (Condition 2: Question + Answer, No Train/Test Split)

Using concatenated question–answer representations, the classifier achieved extremely high performance (0.99 precision/recall/F1 across all three classes). This suggests that incorporating the question as context can substantially boost discriminative signal compared to using the answer alone. However, the near-perfect scores are also indicative of potential overfitting, since evaluation was conducted on the same data used for training. To more rigorously assess generalizability, I therefore introduce Condition 3, where I split the dataset into separate train and test sets.

## Condition 3: Question + Answer & Split Train and Test Datasets

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# stratified sampling and creating new metadata indicating train and test; still, only answers have labels
answers = []
for u in corpus.iter_utterances():
    if u.meta.get("type") == "answer" and "coarse_label" in u.meta:
        answers.append(u)
y = [ans.meta["coarse_label"] for ans in answers]

# 80/20 split, stratified on the coarse label so both parts keep the class mix.
train_utts, test_utts = train_test_split(answers, test_size=0.2, random_state=42, stratify=y)

# Record each answer's side of the split in its metadata.
for split_name, members in (("train", train_utts), ("test", test_utts)):
    for u in members:
        u.meta["split"] = split_name


In [25]:
# selectors
def is_answer(u):
    """True when the utterance's ``type`` metadata is "answer"."""
    return u.meta.get("type") == "answer"


def is_train_u(u):
    """True for answer utterances assigned to the training split."""
    return is_answer(u) and u.meta.get("split") == "train"


def is_test_u(u):
    """True for answer utterances assigned to the test split."""
    return is_answer(u) and u.meta.get("split") == "test"

In [26]:
# bag-of-words vectors processing: add prefix Q and A to the tokens
# A separate transformer and vector name are used so that the vocabulary can be
# learned from the training answers only.
bow_QA_split = BoWTransformer(
    obj_type="utterance",
    vector_name="bow_QA_prefixed_split",  
    text_func=qa_prefixed                  
)

# learn vocabulary only in train dataset
bow_QA_split.fit(corpus, selector=is_train_u)

# vectorize both train and test dataset
# NOTE(review): test-only tokens are presumably dropped because they are not in
# the fitted vocabulary -- confirm against the vectorizer's behaviour.
bow_QA_split.transform(corpus, selector=is_answer)

Initializing default unigram CountVectorizer...Done.


<convokit.model.corpus.Corpus at 0x14fbe6b70>

In [27]:
# classifier
def _label_for_clf_QA_split(u):
    """Labeller: the coarse label for answer utterances, ``None`` for anything else."""
    if is_answer(u):
        return u.meta.get("coarse_label")
    return None


# Condition 3 model: question+answer vectors whose vocabulary came from the training split.
clf_QA_split = VectorClassifier(
    obj_type="utterance",
    vector_name="bow_QA_prefixed_split",
    labeller=_label_for_clf_QA_split,
)

# only fit on train dataset
clf_QA_split.fit(corpus, selector=is_train_u)

# create prediction on both train and test dataset
clf_QA_split.transform(corpus, selector=is_answer)



Initialized default classification model (standard scaled logistic regression).




<convokit.model.corpus.Corpus at 0x14fbe6b70>

In [28]:
# Evaluation
# Report accuracy and the per-class report for the training split first, then
# for the held-out test split.
for split_title, split_selector in (("Train", is_train_u), ("Test", is_test_u)):
    print(f"{split_title} accuracy:", clf_QA_split.accuracy(corpus, selector=split_selector))
    print(clf_QA_split.classification_report(corpus, selector=split_selector))


Train accuracy: 0.994198694706309
                  precision    recall  f1-score   support

ambivalent-reply       0.99      1.00      1.00      1632
  clear-nonreply       1.00      0.99      0.99       285
     clear-reply       0.99      0.99      0.99       841

        accuracy                           0.99      2758
       macro avg       1.00      0.99      0.99      2758
    weighted avg       0.99      0.99      0.99      2758

Test accuracy: 0.5507246376811594
                  precision    recall  f1-score   support

ambivalent-reply       0.65      0.65      0.65       408
  clear-nonreply       0.44      0.38      0.41        71
     clear-reply       0.41      0.42      0.41       211

        accuracy                           0.55       690
       macro avg       0.50      0.48      0.49       690
    weighted avg       0.55      0.55      0.55       690



Analysis (Condition 3:  Question + Answer, Train/Test Split)

When evaluated under a proper train–test split, the classifier shows extremely high performance on the training set (accuracy 0.99) but drops sharply on the test set (accuracy 0.55), even underperforming the majority-class base accuracy (0.59). This indicates severe overfitting: the model memorizes training data but fails to generalize to unseen examples. Note that this result does not yet settle the hypothesis: the answer-only classifier (Condition 1) was never evaluated on held-out data, so a fair comparison requires running it on the same train/test split. The gap nevertheless demonstrates the limitation of the bag-of-words approach and suggests that richer contextual representations or more powerful models (e.g., LLMs) are needed to capture the nuances of question–answer interactions.