DATA LOADING and CLEANING

In [17]:
# Uncomment to Install all required packages using conda (safer on Mac & Anaconda)
#!conda install -y -c conda-forge pyarrow fsspec huggingface_hub datasets
#!conda install -y -c conda-forge pyarrow pandas --force-reinstall
#!pip install numpy pandas matplotlib scikit-learn statsmodels

In [27]:
import re

import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf

In [29]:
# Read the TruthfulQA validation split in both of its configurations directly
# from the Hugging Face Hub (the hf:// paths need network access).
df1 = pd.read_parquet("hf://datasets/truthfulqa/truthful_qa/generation/validation-00000-of-00001.parquet")
df2 = pd.read_parquet("hf://datasets/truthfulqa/truthful_qa/multiple_choice/validation-00000-of-00001.parquet")

# Keep a local CSV snapshot of each frame. Every frame gets its own file:
# writing both to one shared path made the second export silently overwrite
# the first, so the generation data was lost on disk.
df1.to_csv("truthful_qa_generation_validation.csv", index=False)
df2.to_csv("truthful_qa_multiple_choice_validation.csv", index=False)

# Flag each row whose best_answer appears verbatim among that row's correct_answers.
df1['ai_correct'] = df1.apply(lambda row: row['best_answer'] in row['correct_answers'], axis=1)


In [30]:
from datasets import load_dataset

# Pull the "generation" configuration of TruthfulQA via the Hugging Face `datasets` library.
ds = load_dataset("truthfulqa/truthful_qa", name="generation")

# Terms treated as signals of uncertainty (hedging) in an answer.
hedging_words = [
    'maybe', 'might', 'could', 'possibly',
    'seems', 'appears', 'suggests', 'may',
    'perhaps', 'probably', 'likely', 'presumably',
    'assume', 'assumes', 'indicates', 'apparently',
    'arguably', 'tends', 'potentially', 'often',
    'sometimes', 'can', 'seem', 'think',
    'estimated', 'considered', 'speculate', 'roughly',
]

# Terms and short phrases treated as signals of certainty in an answer.
certainty_words = [
    'definitely', 'certainly', 'absolutely', 'clearly',
    'undoubtedly', 'surely', 'yes', 'no',
    'nothing', 'never', 'is', 'are',
    'will', 'must', 'cannot', 'always',
    'every', 'all', 'without a doubt', 'it is clear',
    'proven', 'true', 'fact', 'obviously',
]

In [21]:
def count_words(text, word_list):
    """Count whole-word occurrences of the given terms in ``text``.

    Matching is case-insensitive and anchored on word boundaries, so a term
    only counts when it stands alone: 'is' does not match inside "this",
    'no' does not match inside "know", and 'can' does not match inside
    "cannot". Multi-word phrases such as 'without a doubt' are matched as a
    unit. Each term is counted independently and the counts are summed.

    Parameters
    ----------
    text : object
        The text to scan; it is converted with ``str()`` before matching.
    word_list : iterable of str
        Terms or phrases to look for.

    Returns
    -------
    int
        Total number of non-overlapping whole-word matches over all terms.
    """
    text = str(text).lower()  # lowercase to avoid case mismatches
    total = 0
    for word in word_list:
        term = word.lower()
        if not term:
            # An empty term would match at every position; ignore it.
            continue
        # Lookarounds (rather than \b) also behave for terms that start or
        # end with a non-word character.
        pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
        total += len(re.findall(pattern, text))
    return total
    
# Per-answer lexical features, computed on the answer text in `best_answer`.
df1['Hedging_words_count'] = df1['best_answer'].apply(count_words, args=(hedging_words,))
df1['Certainty_markers_count'] = df1['best_answer'].apply(count_words, args=(certainty_words,))

# The summary-statistics cell reads an `answer_length` column (described in the
# notebook text as the word count of the answer). Create it here if no other
# cell has done so, so that a fresh Restart & Run All does not hit a KeyError.
if 'answer_length' not in df1.columns:
    df1['answer_length'] = df1['best_answer'].astype(str).str.split().str.len()


EXPLORING THE DATA

We create a new binary variable, 'ai_correct', to indicate whether the answer the LLM gives is correct (1 if the AI answer matches a correct answer and 0 if it doesn’t). This variable captures exactly what we are interested in: whether the AI provides correct, truthful answers, and specifically whether linguistic and structural features relate to factual accuracy. Since correctness is one of the main aspects we are analyzing, this binary measure is precise and straightforward. We expect this variable to show clear patterns in AI performance, such as differences across question types, categories, or response characteristics. In particular, we expect longer AI-generated answers with more certainty cues to be more truthful and correct.

Our key explanatory variables are: 
a). Hedging_words_count, which measures the frequency of uncertainty phrases, such as 'might', 'possibly', and 'perhaps'.
b). Certainty_markers_count, which measures the frequency of high-certainty phrases, such as 'definitely', 'clearly', and 'without a doubt'.
c). answer_length, which measures the word count of the LLM-generated answer.
d). Category, which captures the domain of each question (such as Science, Politics, Economics, and Education). It indicates contextual differences in model performance, revealing whether certain knowledge areas are more prone to factual errors or overconfident falsehoods.

In [37]:
# Create a table of summary statistics of our key variables.
# describe() keeps only numeric columns by default, so the boolean 'ai_correct'
# flag would be dropped from the table without warning. Casting it to 0/1 keeps
# it in, and its mean then reads as the share of correct answers.
summary_cols = ['ai_correct', 'Hedging_words_count', 'Certainty_markers_count', 'answer_length']
df1[summary_cols].astype({'ai_correct': 'int64'}).describe()

DATA VISUALIZATION

In [None]:
# Outcome variable: percentage of correct vs. incorrect AI responses
# Reindexing guarantees both outcomes are present (a missing one shows as 0).
counts = df1['ai_correct'].value_counts().reindex([True, False], fill_value=0)
percentages = counts / counts.sum() * 100

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(
    ['True', 'False'],
    percentages.values,
    color=['skyblue', 'salmon'],
    label=['Correct', 'Incorrect'],
)
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('AI Correct?')
ax.set_title('Percentage of Correct vs Incorrect AI Responses')
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.15))
plt.show()

# Key explanatory variable: number of questions per question type
# value_counts() orders the bars by frequency, not by name, so the legend
# labels and colours are derived from the index instead of being hard-coded by
# position. A fixed label list could attach "Non-Adversarial" to the
# Adversarial bar (and vice versa) whenever the counts sort the other way.
type_counts = df1['type'].value_counts()
type_colors = {'Non-Adversarial': 'lightgreen', 'Adversarial': 'salmon'}
type_labels = [str(t) for t in type_counts.index]
bar_colors = [type_colors.get(t, 'lightgray') for t in type_labels]  # gray for unexpected types

plt.figure(figsize=(6,4))
plt.bar(type_labels, type_counts.values, color=bar_colors, label=type_labels)
plt.xlabel('Question Type')
plt.ylabel('Number of Questions')
plt.title('Number of Adversarial vs Non-Adversarial Questions')
plt.legend(loc='upper right', bbox_to_anchor=(1.4, 1.15))
plt.show()