DATA LOADING and CLEANING

In [17]:
# Uncomment to Install all required packages using conda (safer on Mac & Anaconda)
#!conda install -y -c conda-forge pyarrow fsspec huggingface_hub datasets
#!conda install -y -c conda-forge pyarrow pandas --force-reinstall
#!pip install numpy pandas matplotlib scikit-learn statsmodels

In [27]:
import ast
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
import statsmodels.formula.api as smf

%matplotlib inline

In [29]:
from datasets import load_dataset
from huggingface_hub import HfFileSystem
import pandas as pd

# Generation config: one row per question, loaded through the `datasets` API.
ds = load_dataset("truthfulqa/truthful_qa", "generation")
df1 = ds["validation"].to_pandas()
print("[generation]", df1.shape, df1.columns.tolist())

# Multiple-choice config: read the validation parquet file straight from the Hub.
fs = HfFileSystem()
mc_parquet_path = "datasets/truthfulqa/truthful_qa/multiple_choice/validation-00000-of-00001.parquet"
with fs.open(mc_parquet_path, "rb") as parquet_file:
    df2 = pd.read_parquet(parquet_file)
print("[multiple_choice]", df2.shape, df2.columns.tolist())

# Snapshot both raw frames to CSV before any derived column is added.
for raw_frame, csv_name in (
    (df1, "truthful_qa_generation.csv"),
    (df2, "truthful_qa_multiple_choice.csv"),
):
    raw_frame.to_csv(csv_name, index=False)

# Flag rows whose best answer is one of the listed correct answers.
df1["ai_correct"] = df1.apply(
    lambda record: record["best_answer"] in record["correct_answers"],
    axis=1,
)
print("[ok] df1 ready:", df1.shape)


In [30]:
# Lexicons used to count linguistic markers in answers.
# The dataset itself is already loaded into `ds` by the data-loading cell,
# so it is not downloaded a second time here.

# Words/phrases signalling uncertainty.
hedging_words = [
    'probable','possible','likely','probably','possibly','perhaps','maybe','may','might','could',
    'apparently','seemingly','sometimes','suggest','appear','seem','often','usually',
    'primarily','generally','largely','tend','tendency'
]

# Words/phrases signalling high confidence (multi-word entries are matched as phrases).
certainty_words = [
    'certainly','undoubtedly','obviously','definitely','surely','absolutely',
    'clearly','sure','in fact','guaranteed','proven','conclusively','evidently',
    'beyond doubt','no doubt'
]

In [None]:
# ensure list columns
def to_list_if_needed(x):
    """Coerce a cell value into a Python list.

    Parameters
    ----------
    x : list, numpy.ndarray, str, None or float
        A list is returned unchanged; an ndarray is converted with ``list``;
        ``None`` / NaN become ``[]``; anything else is stringified and parsed.

    Returns
    -------
    list
        The parsed list. A string that is a Python literal is parsed with
        ``ast.literal_eval`` (tuples/sets become lists, a scalar literal is
        wrapped in a one-element list). If the string is not a valid literal,
        every non-empty single- or double-quoted fragment is returned instead.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, np.ndarray):
        return list(x)
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    s = str(x).strip()
    try:
        value = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: fall back to pulling out quoted fragments.
        # Only parse failures are caught, so a genuine programming error
        # (e.g. a missing import) is no longer silently hidden.
        parts = re.findall(r"'(.*?)'|\"(.*?)\"", s)
        parts = [p[0] or p[1] for p in parts]
        return [p for p in parts if p]
    # Always hand back a list so downstream `isinstance(x, list)` filters hold.
    if isinstance(value, list):
        return value
    if isinstance(value, (tuple, set, frozenset)):
        return list(value)
    if value is None:
        return []
    return [value]

# Convert each answer column to Python lists, but only when its first row is not a list already.
for answers_col in ('correct_answers', 'incorrect_answers'):
    if not isinstance(df1.iloc[0][answers_col], list):
        df1[answers_col] = df1[answers_col].apply(to_list_if_needed)

# remove incomplete observations: missing best answer, or an empty/non-list answer column
has_answers = lambda cell: isinstance(cell, list) and len(cell) > 0
before = len(df1)
df1 = df1.dropna(subset=['best_answer'])
df1 = df1[df1['correct_answers'].apply(has_answers)]
df1 = df1[df1['incorrect_answers'].apply(has_answers)]
after = len(df1)
print(f"[clean] dropped {before - after} rows")

# recompute ai_correct after cleaning, then persist the question-level frame
df1['ai_correct'] = df1.apply(
    lambda record: record['best_answer'] in record['correct_answers'],
    axis=1,
)
df1.to_csv("truthful_qa_generation_clean_question_level.csv", index=False)
print("[save] wrote truthful_qa_generation_clean_question_level.csv")

In [None]:
# Derived question-level columns.
# How many reference answers of each kind the question has.
df1['n_correct'] = df1['correct_answers'].map(len)
df1['n_incorrect'] = df1['incorrect_answers'].map(len)

# 1 when the best answer is among the correct answers, else 0.
df1['ai_correct'] = df1.apply(
    lambda record: int(record['best_answer'] in record['correct_answers']),
    axis=1,
)

# Whitespace-delimited word count of the best answer.
best_answer_tokens = df1['best_answer'].astype(str).str.split()
df1['answer_length_best'] = best_answer_tokens.map(len)

df1.to_csv("truthful_qa_generation_prepared.csv", index=False)
print("[save] wrote truthful_qa_generation_prepared.csv")

In [None]:
# feature engineering
# One whole-word regex per lexicon entry: the entry is escaped and wrapped in \b anchors.
hedge_patterns = [rf"\b{re.escape(entry)}\b" for entry in hedging_words]
certainty_patterns = [rf"\b{re.escape(entry)}\b" for entry in certainty_words]

def count_matches(text, patterns):
    """Return the total number of regex matches of `patterns` in `text`.

    `text` is converted to a string and lower-cased first; the counts of
    non-overlapping matches for each pattern are then added together.
    """
    lowered = str(text).lower()
    total = 0
    for pattern in patterns:
        total += sum(1 for _ in re.finditer(pattern, lowered))
    return total

# Count lexicon hits in each best answer (keyword form passes the pattern list to count_matches).
best_answers = df1['best_answer']
df1['hedging_words_count'] = best_answers.apply(count_matches, patterns=hedge_patterns)
df1['certainty_markers_count'] = best_answers.apply(count_matches, patterns=certainty_patterns)

df1.to_csv("truthful_qa_generation_features.csv", index=False)
print("[save] wrote truthful_qa_generation_features.csv")

In [None]:
# restructuring to answer-level: one output row per reference answer
keep_cols = ['type', 'category', 'question', 'source']
rows = []
for _, record in df1.iterrows():
    # Question-level fields repeated on every answer row (None when a column is absent).
    shared = {col: record.get(col, None) for col in keep_cols}
    # Correct answers first (label 1), then incorrect answers (label 0).
    labelled = [(ans, 1) for ans in record['correct_answers']]
    labelled += [(ans, 0) for ans in record['incorrect_answers']]
    rows.extend({**shared, 'answer': ans, 'correctness': flag} for ans, flag in labelled)

answers_df = pd.DataFrame(rows)

# Same features as at question level, computed per answer.
answer_text = answers_df['answer']
answers_df['answer_length'] = answer_text.astype(str).str.split().map(len)
answers_df['hedge_count'] = answer_text.apply(count_matches, patterns=hedge_patterns)
answers_df['certainty_count'] = answer_text.apply(count_matches, patterns=certainty_patterns)

answers_df.to_csv("truthful_qa_generation_answer_level.csv", index=False)
print("[save] wrote truthful_qa_generation_answer_level.csv")

In [None]:
# exclusion criteria & final variables
# question-level: keep rows with a best answer and non-empty correct/incorrect lists.
# .copy() detaches the filtered frame from df1, so the guarded column
# assignments below never write into a slice of df1.
q = df1.dropna(subset=['best_answer'])
q = q[q['correct_answers'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
q = q[q['incorrect_answers'].apply(lambda x: isinstance(x, list) and len(x) > 0)].copy()

# Each derived column is computed only if an earlier cell has not already created it.
if 'n_correct' not in q.columns:  q['n_correct'] = q['correct_answers'].apply(len)
if 'n_incorrect' not in q.columns: q['n_incorrect'] = q['incorrect_answers'].apply(len)
if 'answer_length_best' not in q.columns: q['answer_length_best'] = q['best_answer'].astype(str).str.split().apply(len)
if 'hedging_words_count' not in q.columns: q['hedging_words_count'] = q['best_answer'].apply(lambda x: count_matches(x, hedge_patterns))
if 'certainty_markers_count' not in q.columns: q['certainty_markers_count'] = q['best_answer'].apply(lambda x: count_matches(x, certainty_patterns))
if 'ai_correct' not in q.columns: q['ai_correct'] = q.apply(lambda r: 1 if r['best_answer'] in r['correct_answers'] else 0, axis=1)

# Final question-level column set and order.
q = q[['type','category','question','best_answer','source',
       'n_correct','n_incorrect','answer_length_best',
       'hedging_words_count','certainty_markers_count','ai_correct']]
q.to_csv("truthful_qa_generation_final_question_level.csv", index=False)
print("[save] wrote truthful_qa_generation_final_question_level.csv")

# answer-level: drop missing and blank answers (copy for the same reason as above).
a = answers_df.dropna(subset=['answer'])
a = a[a['answer'].astype(str).str.strip().ne('')].copy()

if 'answer_length' not in a.columns:   a['answer_length'] = a['answer'].astype(str).str.split().apply(len)
if 'hedge_count' not in a.columns:     a['hedge_count'] = a['answer'].apply(lambda x: count_matches(x, hedge_patterns))
if 'certainty_count' not in a.columns: a['certainty_count'] = a['answer'].apply(lambda x: count_matches(x, certainty_patterns))

# Final answer-level column set and order.
a = a[['type','category','question','source','answer',
       'correctness','answer_length','hedge_count','certainty_count']]
a.to_csv("truthful_qa_generation_final_answer_level.csv", index=False)
print("[save] wrote truthful_qa_generation_final_answer_level.csv")

In [21]:
def count_words(text, word_list):
    """Count whole-word occurrences of the entries of `word_list` in `text`.

    Parameters
    ----------
    text : any
        Converted with ``str`` and lower-cased before matching.
    word_list : iterable of str
        Words or phrases to look for; matching is case-insensitive.

    Returns
    -------
    int
        Sum, over all entries, of the number of whole-word matches.
    """
    text = str(text).lower()  # lowercase to avoid case mismatches
    # Word-boundary matching, so 'may' is not counted inside 'maybe' or
    # 'mayor', nor 'sure' inside 'surely' (plain substring counting did both).
    return sum(
        len(re.findall(r"\b" + re.escape(word.lower()) + r"\b", text))
        for word in word_list
    )
    
# Capitalised count columns on the best answer, computed with count_words.
best_answer_text = df1['best_answer']
df1['Hedging_words_count'] = best_answer_text.apply(count_words, word_list=hedging_words)
df1['Certainty_markers_count'] = best_answer_text.apply(count_words, word_list=certainty_words)


EXPLORING THE DATA

We create a new binary variable called 'ai_correct' to indicate whether the answer the LLM gives is correct (1 if the AI answer matches a correct answer and 0 if it doesn't). This variable captures exactly what we are interested in: whether the AI provides correct, truthful answers, and specifically whether linguistic and structural features relate to factual accuracy. Since correctness is one of the main aspects we are analyzing, this binary measure is precise and straightforward. We expect this variable to reveal clear patterns in AI performance, such as differences across question types, categories, or response characteristics. In particular, we expect longer AI-generated answers with more certainty cues to be more truthful and correct.

Our key explanatory variables are: 
a). Hedging_words_count, which measures the frequency of uncertainty phrases, such as 'might', 'possibly', and 'perhaps'.
b). Certainty_markers_count, which measures the frequency of high-certainty phrases, such as 'definitely', 'clearly', and 'certainly'.
c). answer_length_best, which measures the word count of the LLM-generated best answer (the answer-level table stores the same measure as answer_length).
d). Category, which captures the domain of each question, such as Science, Politics, Economics, and Education. It indicates contextual differences in model performance, revealing whether certain knowledge areas are more prone to factual errors or overconfident falsehoods.

In [37]:
# create a table of summary statistics of our key variables
# (the question-level word-count column on df1 is 'answer_length_best';
#  'answer_length' only exists on the answer-level table)
df1[['ai_correct', 'Hedging_words_count', 'Certainty_markers_count', 'answer_length_best']].describe()

DATA VISUALIZATION

In [None]:
#Outcome Variable Plots
# ai_correct is stored as True/False by some cells and as 1/0 by others.
# Cast to bool so the reindex below finds both labels; boolean labels are
# not matched against an integer index, which would zero out every count.
counts = df1['ai_correct'].astype(bool).value_counts()
counts = counts.reindex([True, False], fill_value=0)  # add False with 0 if missing
percentages = counts / counts.sum() * 100
plt.figure(figsize=(6,4))
plt.bar(['True', 'False'], percentages.values, color=['skyblue', 'salmon'], label=['Correct', 'Incorrect'])
plt.ylabel('Percentage (%)')
plt.xlabel('AI Correct?')
plt.title('Percentage of Correct vs Incorrect AI Responses')
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.15))
plt.show()

#Key Explanatory Variable Plots
type_counts = df1['type'].value_counts()
# value_counts() orders bars by frequency, so legend labels and colours are
# derived from the index instead of being hard-coded in a fixed order.
type_labels = [str(t) for t in type_counts.index]
type_palette = {'Non-Adversarial': 'lightgreen', 'Adversarial': 'salmon'}
type_colors = [type_palette.get(t, 'lightgray') for t in type_labels]
plt.figure(figsize=(6,4))
plt.bar(type_labels, type_counts.values, color=type_colors, label=type_labels)
plt.xlabel('Question Type')
plt.ylabel('Number of Questions')
plt.title('Number of Adversarial vs Non-Adversarial Questions')
plt.legend(loc='upper right', bbox_to_anchor=(1.4, 1.15))
plt.show()

In [None]:
# answer length vs correctness
# Draw on an explicit Axes: without `ax`, DataFrame.boxplot(by=...) opens its
# own figure, leaving the plt.figure() one empty and ignoring its figsize.
fig, ax = plt.subplots(figsize=(6, 4))
df1.boxplot(column='answer_length_best', by='ai_correct', ax=ax, patch_artist=True,
            boxprops=dict(facecolor='lightblue'), medianprops=dict(color='red'))
ax.set_title('Answer Length by AI Correctness')
fig.suptitle('')  # remove the automatic "Boxplot grouped by ..." super-title
ax.set_xlabel('AI Correct (1 = Correct, 0 = Incorrect)')
ax.set_ylabel('Answer Length (words)')
plt.show()

In [None]:
# linguistic features by category
# Mean marker counts per category, ordered from most to least hedging.
category_means = df1.groupby('category')[['hedging_words_count','certainty_markers_count']].mean()
agg = category_means.sort_values('hedging_words_count', ascending=False)

ax = agg.plot(kind='bar', figsize=(8,5))
ax.set_title('Average Hedging and Certainty Counts by Question Category')
ax.set_xlabel('Question Category')
ax.set_ylabel('Average Count')
ax.legend(['Hedging Words','Certainty Markers'])
plt.tight_layout()
plt.show()