In [1]:
import pandas as pd
from datasets import Dataset

arrow_path = "dataset/train/train/data-00000-of-00001.arrow"
train_ds = Dataset.from_file(arrow_path)

train_df = train_ds.to_pandas()

print(f"Train dataset shape: {train_df.shape}")
train_df.head()

Train dataset shape: (3448, 8)


Unnamed: 0,question,interview_question,interview_answer,label,url,inaudible,multiple_questions,affirmative_questions
0,How would you respond to the accusation that t...,Q. Of the Biden administration. And accused th...,"Well, look, first of all, theI am sincere abou...",Explicit,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
1,Do you think President Xi is being sincere abo...,Q. Of the Biden administration. And accused th...,"Well, look, first of all, theI am sincere abou...",General,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
2,Do you believe the country's slowdown and gro...,Q. No worries. Do you believe the country's sl...,"Look, I think China has a difficult economic p...",Partial/half-answer,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
3,Are you worried about the meeting between Pre...,Q. No worries. Do you believe the country's sl...,"Look, I think China has a difficult economic p...",Dodging,https://www.presidency.ucsb.edu/documents/the-...,False,False,False
4,Is the President's engagement with Asian coun...,"Q. I can imagine. It is evening, I'd like to r...","Well, I hope I get to see Mr. Xi sooner than l...",Explicit,https://www.presidency.ucsb.edu/documents/the-...,False,False,False


In [2]:
import numpy as np
import re
from collections import Counter
from tqdm import tqdm
tqdm.pandas()
import spacy
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt_tab', quiet=True)
print("Loading spaCy model...")
nlp = spacy.load("en_core_web_sm")
print("Loading Sentence-BERT model...")
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Loading VADER sentiment analyzer...")
sia = SentimentIntensityAnalyzer()

print("All models loaded!")

  warn(



Loading spaCy model...
Loading Sentence-BERT model...
Loading VADER sentiment analyzer...
All models loaded!


In [3]:
# ============================================
# LOAD PROFESSIONAL NLP LEXICONS
# ============================================
# Using established academic lexicons:
# - AFINN: Sentiment scores (Nielsen, 2011)
# - NRC EmoLex: Emotion associations (Mohammad & Turney, 2013)
# - Brysbaert: Concreteness ratings (Brysbaert et al., 2014)
# - MPQA: Subjectivity classification (Wilson et al., 2005)

from lexicon_loader import lexicons

# Print status of loaded lexicons
lexicons.print_status()

# Export lexicon variables for feature functions
HEDGE_WORDS = lexicons.hedge_words
MODAL_VERBS = lexicons.modal_verbs
NEGATION_WORDS = lexicons.negation_words
FILLER_WORDS_SET = lexicons.filler_words
FILLER_PHRASES = lexicons.filler_phrases
VAGUE_WORDS_SET = lexicons.vague_words
VAGUE_PHRASES = lexicons.vague_phrases
PIVOT_PHRASES = lexicons.pivot_phrases
THANKS_STARTERS = lexicons.thanks_starters

# Professional lexicons (for enhanced features)
NRC_EMOTIONS = lexicons.nrc_emotions
CONCRETENESS_RATINGS = lexicons.concreteness
MPQA_STRONG = lexicons.mpqa_strong_subjective
MPQA_WEAK = lexicons.mpqa_weak_subjective

print(f"\nHedge words loaded: {len(HEDGE_WORDS)}")
print(f"NRC emotions available: {bool(NRC_EMOTIONS.get('anger'))}")
print(f"Concreteness ratings: {len(CONCRETENESS_RATINGS)}")
print(f"MPQA subjective words: {len(MPQA_STRONG) + len(MPQA_WEAK)}")

LEXICON STATUS
  ✓ AFINN
  ✗ NRC EmoLex
  ✗ Brysbaert Concreteness
  ✗ MPQA Subjectivity
  ✓ Hedge Words

Hedge words loaded: 162
NRC emotions available: False
Concreteness ratings: 0
MPQA subjective words: 0


In [4]:
def compute_semantic_features(questions, answers):
    print("Encoding questions...")
    q_embeddings = sbert_model.encode(questions.tolist(), show_progress_bar=True)
    print("Encoding answers...")
    a_embeddings = sbert_model.encode(answers.tolist(), show_progress_bar=True)
    qa_similarity = np.array([
        np.dot(q, a) / (np.linalg.norm(q) * np.linalg.norm(a) + 1e-8)
        for q, a in zip(q_embeddings, a_embeddings)
    ])
    topic_shift_score = 1 - qa_similarity
    return qa_similarity, topic_shift_score

def compute_keyword_overlap(question, answer):
    q_doc = nlp(question.lower())
    a_doc = nlp(answer.lower())
    q_words = {token.lemma_ for token in q_doc if not token.is_stop and token.is_alpha}
    a_words = {token.lemma_ for token in a_doc if not token.is_stop and token.is_alpha}
    if len(q_words) == 0:
        return 0.0
    overlap = len(q_words & a_words)
    return overlap / len(q_words)

def compute_entity_overlap(question, answer):
    q_doc = nlp(question)
    a_doc = nlp(answer)
    q_entities = {ent.text.lower() for ent in q_doc.ents}
    a_entities = {ent.text.lower() for ent in a_doc.ents}
    return len(q_entities & a_entities)

In [5]:
def compute_structure_features(question, answer):
    a_doc = nlp(answer)
    answer_length_tokens = len([t for t in a_doc if not t.is_space])
    answer_length_chars = len(answer)
    q_len = len(question)
    answer_to_question_len_ratio = answer_length_chars / q_len if q_len > 0 else 0
    sentences = list(a_doc.sents)
    num_sentences = len(sentences)
    return {
        'answer_length_tokens': answer_length_tokens,
        'answer_length_chars': answer_length_chars,
        'answer_to_question_len_ratio': answer_to_question_len_ratio,
        'num_sentences': num_sentences
    }

def compute_first_sentence_similarity(questions, answers):
    first_sentences = []
    for answer in answers:
        sents = sent_tokenize(answer)
        first_sentences.append(sents[0] if sents else "")
    print("Encoding first sentences...")
    q_emb = sbert_model.encode(questions.tolist(), show_progress_bar=True)
    fs_emb = sbert_model.encode(first_sentences, show_progress_bar=True)
    similarities = np.array([
        np.dot(q, f) / (np.linalg.norm(q) * np.linalg.norm(f) + 1e-8)
        for q, f in zip(q_emb, fs_emb)
    ])
    return similarities

In [6]:
def compute_hedging_features(answer):
    answer_lower = answer.lower()
    tokens = word_tokenize(answer_lower)
    hedge_score = sum(1 for t in tokens if t in HEDGE_WORDS)
    filler_score = sum(1 for t in tokens if t in FILLER_WORDS_SET)
    for phrase in FILLER_PHRASES:
        filler_score += answer_lower.count(phrase)
    vague_word_count = sum(1 for t in tokens if t in VAGUE_WORDS_SET)
    for phrase in VAGUE_PHRASES:
        vague_word_count += answer_lower.count(phrase)
    modal_verb_count = sum(1 for t in tokens if t in MODAL_VERBS)
    return {
        'hedge_score': hedge_score,
        'filler_score': filler_score,
        'vague_word_count': vague_word_count,
        'modal_verb_count': modal_verb_count
    }

In [7]:
def compute_specificity_features(answer):
    doc = nlp(answer)
    num_numbers = sum(1 for token in doc if token.like_num or token.pos_ == 'NUM')
    num_named_entities = len(doc.ents)
    total_tokens = len([t for t in doc if t.is_alpha])
    content_tokens = len([t for t in doc if t.is_alpha and not t.is_stop])
    specificity_score = content_tokens / total_tokens if total_tokens > 0 else 0
    return {
        'num_numbers': num_numbers,
        'num_named_entities': num_named_entities,
        'specificity_score': specificity_score
    }

def compute_concreteness_score(answer):
    """Compute concreteness using Brysbaert ratings (1=abstract, 5=concrete)."""
    if CONCRETENESS_RATINGS:
        # Use professional Brysbaert concreteness ratings
        tokens = word_tokenize(answer.lower())
        scores = [CONCRETENESS_RATINGS.get(t, 0) for t in tokens if t.isalpha()]
        valid_scores = [s for s in scores if s > 0]
        return sum(valid_scores) / len(valid_scores) if valid_scores else 2.5  # 2.5 = neutral
    else:
        # Fallback: proxy based on NER and numbers
        doc = nlp(answer)
        concrete_indicators = len(doc.ents) + sum(1 for t in doc if t.like_num)
        total = len([t for t in doc if t.is_alpha])
        return (concrete_indicators / total * 5) if total > 0 else 2.5  # Scale to 1-5

In [8]:
def compute_sentiment_features(answer):
    """Compute sentiment using VADER + AFINN (professional lexicon)."""
    # VADER scores
    vader_scores = sia.polarity_scores(answer)
    
    # AFINN score (normalized to -1 to 1 range)
    afinn_score = lexicons.get_afinn_score(answer)
    afinn_normalized = max(-1, min(1, afinn_score / 10))  # Normalize
    
    return {
        'sentiment_compound': vader_scores['compound'],
        'sentiment_positive': vader_scores['pos'],
        'sentiment_negative': vader_scores['neg'],
        'sentiment_neutral': vader_scores['neu'],
        'afinn_score': afinn_normalized
    }

def compute_emotion_features(answer):
    """Compute emotion features using NRC EmoLex (professional lexicon)."""
    tokens = word_tokenize(answer.lower())
    token_count = len([t for t in tokens if t.isalpha()])
    
    if token_count == 0:
        return {f'nrc_{e}': 0 for e in ['anger', 'fear', 'joy', 'sadness', 'trust', 'disgust']}
    
    # Count emotion words using NRC lexicon
    emotion_counts = {}
    for emotion, words in NRC_EMOTIONS.items():
        count = sum(1 for t in tokens if t in words)
        emotion_counts[f'nrc_{emotion}'] = count / token_count  # Normalized
    
    return emotion_counts

def compute_emotion_confidence(answer):
    """Sentiment extremity as confidence proxy."""
    scores = sia.polarity_scores(answer)
    return abs(scores['compound'])

In [9]:
def compute_syntactic_features(answer):
    doc = nlp(answer)
    total_tokens = len([t for t in doc if t.is_alpha])
    if total_tokens == 0:
        return {
            'pos_ratio_verbs': 0,
            'pos_ratio_nouns': 0,
            'pos_ratio_pronouns': 0,
            'num_clauses': 0
        }
    verbs = len([t for t in doc if t.pos_ == 'VERB'])
    nouns = len([t for t in doc if t.pos_ == 'NOUN'])
    pronouns = len([t for t in doc if t.pos_ == 'PRON'])
    num_clauses = 1  
    for token in doc:
        if token.dep_ in ('ccomp', 'advcl', 'relcl', 'acl'):
            num_clauses += 1
    return {
        'pos_ratio_verbs': verbs / total_tokens,
        'pos_ratio_nouns': nouns / total_tokens,
        'pos_ratio_pronouns': pronouns / total_tokens,
        'num_clauses': num_clauses
    }

In [10]:
def compute_evasion_features(answer):
    answer_lower = answer.lower()
    starts_with_thanks = int(any(answer_lower.strip().startswith(phrase) for phrase in THANKS_STARTERS))
    pivot_score = sum(1 for phrase in PIVOT_PHRASES if phrase in answer_lower)
    tokens = word_tokenize(answer_lower)
    negation_count = sum(1 for t in tokens if t in NEGATION_WORDS or "n't" in t)
    return {
        'starts_with_thanks': starts_with_thanks,
        'pivot_score': pivot_score,
        'negation_count': negation_count
    }

def compute_deflection_score(question, answer):
    q_doc = nlp(question)
    a_doc = nlp(answer)
    q_entities = {ent.text.lower() for ent in q_doc.ents}
    a_entities = {ent.text.lower() for ent in a_doc.ents}
    if len(a_entities) == 0:
        return 0.0
    new_entities = a_entities - q_entities
    return len(new_entities) / len(a_entities)

In [11]:
def compute_lexical_diversity(answer):
    tokens = word_tokenize(answer.lower())
    tokens = [t for t in tokens if t.isalpha()]
    if len(tokens) == 0:
        return {'ttr': 0, 'entropy_score': 0}
    unique_tokens = set(tokens)
    ttr = len(unique_tokens) / len(tokens)
    word_counts = Counter(tokens)
    total = len(tokens)
    probs = [count / total for count in word_counts.values()]
    entropy_score = -sum(p * np.log2(p) for p in probs if p > 0)
    return {
        'ttr': ttr,
        'entropy_score': entropy_score
    }

In [12]:
def extract_all_features(row):
    """Extract all linguistic features for a Q/A pair."""
    question = str(row['question'])
    answer = str(row['interview_answer'])
    features = {}
    
    # Structure features
    features.update(compute_structure_features(question, answer))
    
    # Hedging features (using hedge lexicon)
    features.update(compute_hedging_features(answer))
    
    # Specificity features
    features.update(compute_specificity_features(answer))
    features['concreteness_score'] = compute_concreteness_score(answer)
    
    # Sentiment features (VADER + AFINN)
    features.update(compute_sentiment_features(answer))
    features['emotion_confidence'] = compute_emotion_confidence(answer)
    
    # NRC Emotion features (if available)
    if NRC_EMOTIONS.get('anger'):
        features.update(compute_emotion_features(answer))
    
    # Syntactic features
    features.update(compute_syntactic_features(answer))
    
    # Evasion pattern features
    features.update(compute_evasion_features(answer))
    features['deflection_score'] = compute_deflection_score(question, answer)
    
    # Lexical diversity
    features.update(compute_lexical_diversity(answer))
    
    # Semantic overlap
    features['keyword_overlap'] = compute_keyword_overlap(question, answer)
    features['entity_overlap'] = compute_entity_overlap(question, answer)
    
    return pd.Series(features)

In [13]:
print("=" * 50)
print("EXTRACTING FEATURES")
print("=" * 50)
print("\n[1/2] Computing semantic similarity features (batched)...")
qa_similarity, topic_shift_score = compute_semantic_features(
    train_df['question'], 
    train_df['interview_answer']
)
train_df['qa_similarity'] = qa_similarity
train_df['topic_shift_score'] = topic_shift_score
print("\n[1.5/2] Computing first sentence similarity (batched)...")
train_df['first_sentence_similarity'] = compute_first_sentence_similarity(
    train_df['question'],
    train_df['interview_answer']
)
print("\n[2/2] Extracting row-wise features (this may take a few minutes)...")
feature_df = train_df.progress_apply(extract_all_features, axis=1)
train_df = pd.concat([train_df, feature_df], axis=1)
print("\n" + "=" * 50)
print("FEATURE EXTRACTION COMPLETE!")
print("=" * 50)
print(f"\nFinal dataset shape: {train_df.shape}")
print(f"New features added: {feature_df.shape[1]}")

EXTRACTING FEATURES

[1/2] Computing semantic similarity features (batched)...
Encoding questions...


Batches:   0%|          | 0/108 [00:00<?, ?it/s]

Encoding answers...


Batches:   0%|          | 0/108 [00:00<?, ?it/s]


[1.5/2] Computing first sentence similarity (batched)...
Encoding first sentences...


Batches:   0%|          | 0/108 [00:00<?, ?it/s]

Batches:   0%|          | 0/108 [00:00<?, ?it/s]


[2/2] Extracting row-wise features (this may take a few minutes)...


100%|██████████| 3448/3448 [23:25<00:00,  2.45it/s]  


FEATURE EXTRACTION COMPLETE!

Final dataset shape: (3448, 41)
New features added: 30





In [14]:
# ============================================
# VIEW EXTRACTED FEATURES
# ============================================

# List all new feature columns
feature_columns = [
    # Semantic
    'qa_similarity', 'topic_shift_score', 'keyword_overlap', 'entity_overlap', 'first_sentence_similarity',
    # Structure
    'answer_length_tokens', 'answer_length_chars', 'answer_to_question_len_ratio', 'num_sentences',
    # Hedging
    'hedge_score', 'filler_score', 'vague_word_count', 'modal_verb_count',
    # Specificity
    'num_numbers', 'num_named_entities', 'specificity_score', 'concreteness_score',
    # Sentiment
    'sentiment_compound', 'sentiment_positive', 'sentiment_negative', 'sentiment_neutral', 'emotion_confidence',
    # Syntactic
    'pos_ratio_verbs', 'pos_ratio_nouns', 'pos_ratio_pronouns', 'num_clauses',
    # Evasion
    'starts_with_thanks', 'pivot_score', 'negation_count', 'deflection_score',
    # Lexical diversity
    'ttr', 'entropy_score'
]

print(f"Total feature columns: {len(feature_columns)}\n")
print("Feature columns:")
for i, col in enumerate(feature_columns, 1):
    print(f"  {i:2d}. {col}")

# Show sample of features
train_df[feature_columns].head()

Total feature columns: 32

Feature columns:
   1. qa_similarity
   2. topic_shift_score
   3. keyword_overlap
   4. entity_overlap
   5. first_sentence_similarity
   6. answer_length_tokens
   7. answer_length_chars
   8. answer_to_question_len_ratio
   9. num_sentences
  10. hedge_score
  11. filler_score
  12. vague_word_count
  13. modal_verb_count
  14. num_numbers
  15. num_named_entities
  16. specificity_score
  17. concreteness_score
  18. sentiment_compound
  19. sentiment_positive
  20. sentiment_negative
  21. sentiment_neutral
  22. emotion_confidence
  23. pos_ratio_verbs
  24. pos_ratio_nouns
  25. pos_ratio_pronouns
  26. num_clauses
  27. starts_with_thanks
  28. pivot_score
  29. negation_count
  30. deflection_score
  31. ttr
  32. entropy_score


Unnamed: 0,qa_similarity,topic_shift_score,keyword_overlap,entity_overlap,first_sentence_similarity,answer_length_tokens,answer_length_chars,answer_to_question_len_ratio,num_sentences,hedge_score,...,pos_ratio_verbs,pos_ratio_nouns,pos_ratio_pronouns,num_clauses,starts_with_thanks,pivot_score,negation_count,deflection_score,ttr,entropy_score
0,0.52322,0.47678,0.555556,2.0,0.125558,429.0,2010.0,17.033898,21.0,21.0,...,0.142857,0.131868,0.093407,40.0,0.0,0.0,4.0,0.875,0.465714,6.722073
1,0.45599,0.54401,0.7,2.0,0.325216,429.0,2010.0,17.631579,21.0,21.0,...,0.142857,0.131868,0.093407,40.0,0.0,0.0,4.0,0.875,0.465714,6.722073
2,0.679849,0.320151,0.357143,2.0,0.745094,263.0,1246.0,7.506024,19.0,10.0,...,0.162162,0.13964,0.099099,21.0,0.0,0.0,8.0,0.5,0.528302,6.30339
3,0.183232,0.816768,0.0,0.0,0.214009,263.0,1246.0,8.964029,19.0,10.0,...,0.162162,0.13964,0.099099,21.0,0.0,0.0,8.0,1.0,0.528302,6.30339
4,0.333183,0.666817,0.375,0.0,0.123043,525.0,2502.0,30.144578,33.0,14.0,...,0.13964,0.164414,0.101351,26.0,0.0,0.0,6.0,1.0,0.497685,7.100189


In [15]:
# ============================================
# ANALYZE FEATURES BY LABEL
# ============================================

print("Feature statistics by label:\n")
print(train_df.groupby('label')[feature_columns].mean().T.round(3))

Feature statistics by label:

label                         Claims ignorance  Clarification  \
qa_similarity                            0.327          0.179   
topic_shift_score                        0.673          0.821   
keyword_overlap                          0.237          0.054   
entity_overlap                           0.218          0.065   
first_sentence_similarity                0.234          0.160   
answer_length_tokens                   199.924        105.620   
answer_length_chars                    902.269        496.076   
answer_to_question_len_ratio            16.063          7.612   
num_sentences                           12.403          5.511   
hedge_score                              8.613          4.413   
filler_score                             2.597          1.217   
vague_word_count                         2.513          1.326   
modal_verb_count                         2.899          0.989   
num_numbers                              1.361          1.01

In [16]:
# ============================================
# SAVE ENRICHED DATASET (OPTIONAL)
# ============================================

# Uncomment to save the feature-enriched dataset
train_df.to_parquet('dataset/train_with_features.parquet', index=False)
train_df.to_csv('dataset/train_with_features.csv', index=False)
print("Saved feature-enriched dataset!")

Saved feature-enriched dataset!
