In [127]:
from __future__ import annotations

import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [128]:
# Import and load the LIAR dataset
# Column names assigned to the header-less TSV files, in file order.
COLS = [
    "id","label","statement","subject","speaker","speaker_job","state","party",
    "barely_true_counts","false_counts","half_true_counts","mostly_true_counts","pants_on_fire_counts",
    "context"
]

def load_tsv(path):
    """Read one header-less, tab-separated split into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the TSV file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, all values read as strings, with the
        columns named after ``COLS``.

    Raises
    ------
    ValueError
        If the file does not have exactly ``len(COLS)`` columns.

    Notes
    -----
    Quote characters are read as ordinary text (``csv.QUOTE_NONE``). With the
    parser's default quote handling, a field that *starts* with an unmatched
    ``"`` absorbs the following tabs and newlines up to the next quote, which
    silently merges several records into one row.
    NOTE(review): re-run the shape check after this change and confirm the row
    counts against the dataset's published split sizes.
    """
    df = pd.read_csv(
        path,
        sep="\t",
        header=None,
        dtype=str,
        quoting=csv.QUOTE_NONE,  # statements contain free-text quotes; never treat them as field delimiters
    )
    df.columns = COLS
    return df

# Read the three splits; paths are relative to the working directory.
train_df, valid_df, test_df = map(load_tsv, ("train.tsv", "valid.tsv", "test.tsv"))

# (rows, columns) of each split, then a preview of the first training rows.
print(*(split.shape for split in (train_df, valid_df, test_df)))
train_df.head()


(10240, 14) (1284, 14) (1267, 14)


Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state,party,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0,0,0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN


In [129]:
# Number of training rows per value of the "label" column, most frequent first.
train_df["label"].value_counts()


label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64

In [130]:
# Feature engineering
def make_text(df, use_meta=True):
    """Build the model input string for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``statement`` column and, when ``use_meta`` is true,
        ``subject``, ``context``, ``speaker`` and ``party`` columns.
    use_meta : bool, default True
        When false, only the statement text is returned.

    Returns
    -------
    pandas.Series
        The statement, optionally followed by each metadata field introduced
        by its bracketed tag. Missing values are replaced by empty strings.
    """
    text = df["statement"].fillna("")

    if use_meta:
        # Append the metadata fields one at a time, in a fixed order,
        # each behind its own tag.
        tagged_columns = (
            (" [SUBJECT] ", "subject"),
            (" [CONTEXT] ", "context"),
            (" [SPEAKER] ", "speaker"),
            (" [PARTY] ", "party"),
        )
        for tag, column in tagged_columns:
            text = text + tag + df[column].fillna("")

    return text

In [131]:
# Multiclass speaker-based split
# Speakers are used as groups, so a speaker's statements are never divided
# between the training part and the holdout part.
df = train_df.copy()
df["speaker"] = df["speaker"].fillna("UNKNOWN_SPEAKER")  # rows without a speaker share one group

X, y, groups = make_text(df, use_meta=True), df["label"], df["speaker"]

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, holdout_idx = next(gss.split(X, y, groups=groups))

# The split yields positional indices, hence .iloc.
X_train_sp, y_train_sp = X.iloc[train_idx], y.iloc[train_idx]
X_holdout_sp, y_holdout_sp = X.iloc[holdout_idx], y.iloc[holdout_idx]

print("Speaker-split sizes:", len(X_train_sp), len(X_holdout_sp))
print("Unique speakers in train:", groups.iloc[train_idx].nunique())
print("Unique speakers in holdout:", groups.iloc[holdout_idx].nunique())

Speaker-split sizes: 8299 1941
Unique speakers in train: 2328
Unique speakers in holdout: 583


In [132]:
# Train and evaluate speaker-based split model
# TF-IDF over word uni- and bigrams feeding a linear SVM.
model = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        ngram_range=(1,2),
        min_df=2,       # drop terms that occur in fewer than 2 documents
        max_df=0.9,     # drop terms that occur in more than 90% of documents
        max_features=30000
    )),
    # Seeded (same seed as the group split) so repeated runs give the same fit;
    # the dual solver otherwise shuffles the data with an unseeded generator.
    ("clf", LinearSVC(random_state=42))
])

model.fit(X_train_sp, y_train_sp)

# Score on speakers that were held out of training.
pred_holdout = model.predict(X_holdout_sp)

print("Speaker-holdout accuracy:", round(accuracy_score(y_holdout_sp, pred_holdout), 4))
print("Speaker-holdout macro-F1:", round(f1_score(y_holdout_sp, pred_holdout, average="macro"), 4))
print("\nClassification report (speaker-holdout):")
print(classification_report(y_holdout_sp, pred_holdout, digits=3))

Speaker-holdout accuracy: 0.2375
Speaker-holdout macro-F1: 0.2293

Classification report (speaker-holdout):
              precision    recall  f1-score   support

 barely-true      0.165     0.190     0.177       306
       false      0.247     0.306     0.273       379
   half-true      0.266     0.279     0.272       409
 mostly-true      0.240     0.223     0.231       363
  pants-fire      0.280     0.140     0.186       186
        true      0.253     0.221     0.236       298

    accuracy                          0.238      1941
   macro avg      0.242     0.226     0.229      1941
weighted avg      0.241     0.238     0.236      1941



In [133]:
# Multiclass standard split model
# Features and labels for the provided train / valid / test files.
X_train, X_valid, X_test = (
    make_text(split, use_meta=True) for split in (train_df, valid_df, test_df)
)
y_train, y_valid, y_test = train_df["label"], valid_df["label"], test_df["label"]

# This re-fits the same `model` pipeline object, now on the full training file;
# from here on `model` no longer holds the speaker-split fit.
model.fit(X_train, y_train)

pred_valid, pred_test = model.predict(X_valid), model.predict(X_test)

print(
    "VALID accuracy:", round(accuracy_score(y_valid, pred_valid), 4),
    "macro-F1:", round(f1_score(y_valid, pred_valid, average="macro"), 4),
)
print(
    "TEST  accuracy:", round(accuracy_score(y_test, pred_test), 4),
    "macro-F1:", round(f1_score(y_test, pred_test, average="macro"), 4),
)


VALID accuracy: 0.2687 macro-F1: 0.2774
TEST  accuracy: 0.2636 macro-F1: 0.2613


In [134]:
# Confusion matrix for validation set
# Rows and columns both follow the alphabetically sorted training labels.
labels = sorted(y_train.drop_duplicates())
confusion_matrix(y_true=y_valid, y_pred=pred_valid, labels=labels)


array([[50, 54, 61, 31, 12, 29],
       [48, 72, 48, 34, 28, 33],
       [43, 51, 73, 43,  7, 31],
       [35, 31, 57, 66,  8, 54],
       [16, 29, 10,  8, 41, 12],
       [16, 28, 39, 42,  1, 43]])

In [135]:
# Binary classification conversion
# The six labels, partitioned into the two binary targets.
TRUEISH  = {"true", "mostly-true", "half-true"}
FALSEISH = {"false", "barely-true", "pants-fire"}

def to_binary(labels: pd.Series) -> pd.Series:
    """Collapse six-way labels into 1 (true-ish) / 0 (false-ish).

    Parameters
    ----------
    labels : pandas.Series
        Label strings; every value must belong to ``TRUEISH`` or ``FALSEISH``.

    Returns
    -------
    pandas.Series
        int64 series with the same index and name as ``labels``:
        1 where the label is in ``TRUEISH``, 0 where it is in ``FALSEISH``.

    Raises
    ------
    ValueError
        If any value (including a missing value) is in neither set. Without
        this check such values would silently be counted as false-ish.
    """
    unknown = labels[~labels.isin(TRUEISH | FALSEISH)]
    if not unknown.empty:
        listed = ", ".join(sorted({repr(value) for value in unknown}))
        raise ValueError(f"Unrecognized label(s): {listed}")

    return labels.isin(TRUEISH).astype("int64")

# Binary (true-ish = 1 / false-ish = 0) targets for each split.
y_train_bin, y_valid_bin, y_test_bin = (
    to_binary(split["label"]) for split in (train_df, valid_df, test_df)
)


In [136]:
# Binary classification model
# Inputs: statement text plus tagged metadata, one series per split.
X_train_bin, X_valid_bin, X_test_bin = (
    make_text(split, use_meta=True) for split in (train_df, valid_df, test_df)
)

# TF-IDF features into a class-weighted logistic regression, which exposes
# predict_proba for the thresholding done further down.
bin_model = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=2,
        max_df=0.9,
        max_features=30000,
        lowercase=True,
    )),
    ("clf", LogisticRegression(
        solver="liblinear",
        class_weight="balanced",
        max_iter=800,
    )),
])
bin_model.fit(X_train_bin, y_train_bin)

# Column 1 of predict_proba belongs to class 1, i.e. P(true-ish).
probs_valid = bin_model.predict_proba(X_valid_bin)[:, 1]
pred_valid_bin = (probs_valid >= 0.5).astype(int)

print("Binary VALID accuracy:", round(accuracy_score(y_valid_bin, pred_valid_bin), 4))
print("Binary VALID F1:", round(f1_score(y_valid_bin, pred_valid_bin), 4))
print("\nBinary classification report (VALID):")
print(classification_report(y_valid_bin, pred_valid_bin, digits=3))


Binary VALID accuracy: 0.6433
Binary VALID F1: 0.6602

Binary classification report (VALID):
              precision    recall  f1-score   support

           0      0.631     0.619     0.625       616
           1      0.654     0.666     0.660       668

    accuracy                          0.643      1284
   macro avg      0.643     0.642     0.642      1284
weighted avg      0.643     0.643     0.643      1284



In [137]:
# Evaluation with abstention
def evaluate_with_abstention(y_true, probs, threshold=0.7):
    """Score binary predictions on the confidently predicted examples only.

    An example counts as confident when its positive-class probability is
    ``>= threshold`` or ``<= 1 - threshold``.

    Parameters
    ----------
    y_true : array-like
        True 0/1 labels (converted to an int array).
    probs : array-like
        Positive-class probabilities (converted to a float array).
    threshold : float, default 0.7
        Confidence cut-off.

    Returns
    -------
    tuple
        ``(coverage, accuracy, f1)`` where ``coverage`` is the fraction of
        confident examples and the other two are computed on that subset.
        Accuracy and F1 are ``nan`` when no example is confident.
    """
    truth = np.asarray(y_true, dtype=int)
    scores = np.asarray(probs, dtype=float)

    # Mask of examples far enough from 0.5 on either side.
    keep = (scores <= 1 - threshold) | (scores >= threshold)
    coverage = keep.mean()

    if not keep.any():
        return coverage, np.nan, np.nan

    # Hard decisions always use the 0.5 cut; the threshold only selects rows.
    hard = (scores >= 0.5).astype(int)
    kept_truth, kept_pred = truth[keep], hard[keep]

    return (
        coverage,
        accuracy_score(kept_truth, kept_pred),
        f1_score(kept_truth, kept_pred),
    )

# Sweep the confidence threshold on the validation split: for each value,
# report the share of examples kept (coverage) and accuracy / F1 on those kept.
for t in [0.55, 0.60, 0.65, 0.70, 0.75, 0.80]:
    coverage, acc, f1 = evaluate_with_abstention(y_valid_bin, probs_valid, threshold=t)
    print(f"threshold={t:.2f} | coverage={coverage:.2f} | acc={acc:.3f} | f1={f1:.3f}")


threshold=0.55 | coverage=0.77 | acc=0.682 | f1=0.690
threshold=0.60 | coverage=0.56 | acc=0.711 | f1=0.722
threshold=0.65 | coverage=0.38 | acc=0.753 | f1=0.759
threshold=0.70 | coverage=0.24 | acc=0.806 | f1=0.804
threshold=0.75 | coverage=0.14 | acc=0.835 | f1=0.824
threshold=0.80 | coverage=0.07 | acc=0.915 | f1=0.907


In [138]:
# Test set evaluation with abstention
t = 0.70  # one of the thresholds tried in the validation sweep

# P(true-ish) for every test row (column 1 of predict_proba).
probs_test = bin_model.predict_proba(X_test_bin)[:, 1]

# Coverage plus accuracy / F1 restricted to the confident test rows.
coverage, acc, f1 = evaluate_with_abstention(y_test_bin, probs_test, threshold=t)
print(f"TEST with abstention (t={t}): coverage={coverage:.2f} | acc={acc:.3f} | f1={f1:.3f}")


TEST with abstention (t=0.7): coverage=0.24 | acc=0.799 | f1=0.826


In [139]:
# Prediction function with abstention
def predict_sentence(text, model, threshold=0.70):
    """Label a single text as true-ish / false-ish, or abstain.

    Parameters
    ----------
    text : str
        Passed to the model as a one-element batch.
    model : object
        Anything with ``predict_proba`` whose column 1 is P(true-ish).
    threshold : float, default 0.70
        The model must give P(true-ish) ``>= threshold`` or
        ``<= 1 - threshold``; otherwise the function abstains.

    Returns
    -------
    dict
        ``decision`` ("TRUE-ISH", "FALSE-ISH" or "ABSTAIN (not confident)"),
        ``p_trueish`` and ``p_falseish`` (plain floats summing to 1).
    """
    p_true = model.predict_proba([text])[0, 1]  # P(true-ish)
    p_false = 1 - p_true

    is_confident = (p_true <= 1 - threshold) or (p_true >= threshold)

    # Pick the decision first so the result dict is built in one place.
    if not is_confident:
        decision = "ABSTAIN (not confident)"
    elif p_true >= 0.5:
        decision = "TRUE-ISH"
    else:
        decision = "FALSE-ISH"

    return {
        "decision": decision,
        "p_trueish": float(p_true),
        "p_falseish": float(p_false),
    }


In [140]:
# Example predictions
threshold = 0.70

# Hand-written sentences, passed to the binary model as bare text.
samples = [
    "The Earth orbits the Sun once every year.",
    "Vaccines cause autism.",
    "The unemployment rate fell last month according to the Bureau of Labor Statistics.",
    "The President was born on Mars.",
    "Drinking bleach cures infections."
]

# One block per sentence: the text, the prediction dict, then a separator rule.
for s in samples:
    print(s, predict_sentence(s, bin_model, threshold=threshold), "-" * 60, sep="\n")

The Earth orbits the Sun once every year.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.6076728739748772, 'p_falseish': 0.39232712602512276}
------------------------------------------------------------
Vaccines cause autism.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.3769723661386222, 'p_falseish': 0.6230276338613778}
------------------------------------------------------------
The unemployment rate fell last month according to the Bureau of Labor Statistics.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.37455311422243326, 'p_falseish': 0.6254468857775668}
------------------------------------------------------------
The President was born on Mars.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.421302881909816, 'p_falseish': 0.578697118090184}
------------------------------------------------------------
Drinking bleach cures infections.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.4505471595313009, 'p_falseish': 0.5494528404686991}
-------

In [141]:
# Analyze confident test set examples
t = 0.70
X_test_bin = make_text(test_df, use_meta=True)
y_test_bin = to_binary(test_df["label"])

probs_test = bin_model.predict_proba(X_test_bin)[:, 1]
# Confident = probability at least `t` away from the undecided middle on either side.
conf = (probs_test >= t) | (probs_test <= 1 - t)

# np.where returns row POSITIONS, so every lookup below is positional (.iloc / array
# indexing). Label-based .loc would only agree while test_df keeps a default 0..n-1 index.
conf_idx = np.where(conf)[0]
print("Confident examples:", len(conf_idx), "out of", len(test_df))

# Show the first ten confident rows with predicted vs. true class.
for i in conf_idx[:10]:
    s = test_df["statement"].iloc[i]
    true_lab = "TRUE-ISH" if y_test_bin.iloc[i] == 1 else "FALSE-ISH"
    p = probs_test[i]
    pred = "TRUE-ISH" if p >= 0.5 else "FALSE-ISH"
    print("\nSTATEMENT:", s)
    print("p_trueish:", round(float(p), 3), "| PRED:", pred, "| TRUE:", true_lab)


Confident examples: 298 out of 1267

STATEMENT: Building a wall on the U.S.-Mexico border will take literally years.
p_trueish: 0.253 | PRED: FALSE-ISH | TRUE: TRUE-ISH

STATEMENT: Suzanne Bonamici supports a plan that will cut choice for Medicare Advantage seniors.
p_trueish: 0.225 | PRED: FALSE-ISH | TRUE: TRUE-ISH

STATEMENT: When asked by a reporter whether hes at the center of a criminal scheme to violate campaign laws, Gov. Scott Walker nodded yes.
p_trueish: 0.297 | PRED: FALSE-ISH | TRUE: FALSE-ISH

STATEMENT: We know there are more Democrats in Georgia than Republicans. We know that for a fact.
p_trueish: 0.703 | PRED: TRUE-ISH | TRUE: FALSE-ISH

STATEMENT: Denali is the Kenyan word for black power.
p_trueish: 0.209 | PRED: FALSE-ISH | TRUE: FALSE-ISH

STATEMENT: Unfortunately we have documented instances where people defecated in the (Statehouse) building.
p_trueish: 0.179 | PRED: FALSE-ISH | TRUE: FALSE-ISH

STATEMENT: Says Charlie Crist is embroiled in a fraud case for stee