In [98]:
from __future__ import annotations

import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [123]:
# Column names assigned to the header-less TSV splits, in file order.
COLS = [
    "id","label","statement","subject","speaker","speaker_job","state","party",
    "barely_true_counts","false_counts","half_true_counts","mostly_true_counts","pants_on_fire_counts",
    "context"
]

def load_tsv(path):
    """Read one header-less, tab-separated split into an all-string DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the TSV file.

    Returns
    -------
    pandas.DataFrame
        One row per input line, every column read as text, columns named
        after ``COLS``.

    Raises
    ------
    ValueError
        If the file does not have exactly ``len(COLS)`` columns (raised by
        the ``df.columns`` assignment).
    """
    df = pd.read_csv(
        path,
        sep="\t",
        header=None,
        dtype=str,
        # Free-text fields may contain unbalanced double quotes. With the
        # default quoting mode the parser treats them as CSV quote characters
        # and can swallow following tabs/newlines, merging or dropping rows.
        # QUOTE_NONE keeps every physical line as exactly one record.
        quoting=csv.QUOTE_NONE,
    )
    df.columns = COLS
    return df

# Read the three splits from the working directory, in train/valid/test order.
train_df, valid_df, test_df = [
    load_tsv(split_path) for split_path in ("train.tsv", "valid.tsv", "test.tsv")
]

# Print (rows, columns) for each split, then preview the first training rows.
print(*(frame.shape for frame in (train_df, valid_df, test_df)))
train_df.head()


(10240, 14) (1284, 14) (1267, 14)


Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state,party,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0,0,0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN


In [100]:
# Frequency of each truthfulness label in the training split (most common first).
label_counts = train_df["label"].value_counts()
label_counts


label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64

In [101]:
def make_text(df, use_meta=True):
    """Build the text fed to the vectorizer, one string per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``statement`` column; when ``use_meta`` is true it must
        also contain ``subject``, ``context``, ``speaker`` and ``party``.
    use_meta : bool, default True
        If false, return only the statement text. If true, append each
        metadata field after a bracketed marker such as `` [SUBJECT] ``.

    Returns
    -------
    pandas.Series
        Strings aligned with ``df``'s index; missing values are treated as
        empty strings.
    """
    text = df["statement"].fillna("")

    if use_meta:
        # (marker, source column) pairs, appended in this exact order.
        tagged_fields = (
            (" [SUBJECT] ", "subject"),
            (" [CONTEXT] ", "context"),
            (" [SPEAKER] ", "speaker"),
            (" [PARTY] ", "party"),
        )
        for marker, column in tagged_fields:
            text = text + marker + df[column].fillna("")

    return text

In [102]:
# Six-way baseline: word uni-/bi-gram TF-IDF features fed to a linear SVM.
model = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        ngram_range=(1,2),
        min_df=2,      # ignore terms seen in fewer than 2 documents
        max_df=0.9,    # ignore terms seen in more than 90% of documents
        max_features=30000
    )),
    ("clf", LinearSVC())
])

# Build the training inputs in this cell: the fit below previously relied on
# X_train / y_train being left in the kernel by a cell that appears later in
# the notebook, which raises NameError on a fresh Restart & Run All.
X_train = make_text(train_df, use_meta=True)
y_train = train_df["label"]

model.fit(X_train, y_train)


0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('tfidf', ...), ('clf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"penalty  penalty: {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse.",'l2'
,"loss  loss: {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. The combination of ``penalty='l1'`` and ``loss='hinge'`` is not supported.",'squared_hinge'
,"dual  dual: ""auto"" or bool, default=""auto"" Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. `dual=""auto""` will choose the value of the parameter automatically, based on the values of `n_samples`, `n_features`, `loss`, `multi_class` and `penalty`. If `n_samples` < `n_features` and optimizer supports chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, otherwise it will be set to False. .. versionchanged:: 1.3  The `""auto""` option is added in version 1.3 and will be the default  in version 1.5.",'auto'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"multi_class  multi_class: {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``""ovr""`` trains n_classes one-vs-rest classifiers, while ``""crammer_singer""`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``""crammer_singer""`` is chosen, the options loss, penalty and dual will be ignored.",'ovr'
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit an intercept. If set to True, the feature vector is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where 1 corresponds to the intercept. If set to False, no intercept will be used in calculations (i.e. data is expected to be already centered).",True
,"intercept_scaling  intercept_scaling: float, default=1.0 When `fit_intercept` is True, the instance vector x becomes ``[x_1, ..., x_n, intercept_scaling]``, i.e. a ""synthetic"" feature with a constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that liblinear internally penalizes the intercept, treating it like any other term in the feature vector. To reduce the impact of the regularization on the intercept, the `intercept_scaling` parameter can be set to a value greater than 1; the higher the value of `intercept_scaling`, the lower the impact of regularization on it. Then, the weights become `[w_x_1, ..., w_x_n, w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent the feature weights and the intercept weight is scaled by `intercept_scaling`. This scaling allows the intercept term to have a different regularization behavior compared to the other features.",1
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",
,"verbose  verbose: int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context.",0


In [103]:
# Text inputs (statement + metadata) and six-way labels for each split.
X_train, X_valid, X_test = (
    make_text(frame, use_meta=True) for frame in (train_df, valid_df, test_df)
)
y_train, y_valid, y_test = (
    frame["label"] for frame in (train_df, valid_df, test_df)
)


In [104]:
# Derived copy of the training frame in which rows without a speaker share a
# single placeholder value, so they can be used as a grouping key.
df = train_df.assign(speaker=train_df["speaker"].fillna("UNKNOWN_SPEAKER"))

X, y, groups = make_text(df, use_meta=True), df["label"], df["speaker"]

# One grouped shuffle split: all rows of a given speaker land on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, holdout_idx = next(gss.split(X, y, groups=groups))

X_train_sp, y_train_sp = X.iloc[train_idx], y.iloc[train_idx]
X_holdout_sp, y_holdout_sp = X.iloc[holdout_idx], y.iloc[holdout_idx]

print("Speaker-split sizes:", len(X_train_sp), len(X_holdout_sp))
print("Unique speakers in train:", groups.iloc[train_idx].nunique())
print("Unique speakers in holdout:", groups.iloc[holdout_idx].nunique())


Speaker-split sizes: 8299 1941
Unique speakers in train: 2328
Unique speakers in holdout: 583


In [105]:
# Refit the six-way pipeline on the speaker-disjoint training part only.
model.fit(X_train_sp, y_train_sp)
pred_holdout = model.predict(X_holdout_sp)

holdout_acc = accuracy_score(y_holdout_sp, pred_holdout)
holdout_macro_f1 = f1_score(y_holdout_sp, pred_holdout, average="macro")

print("Speaker-holdout accuracy:", round(holdout_acc, 4))
print("Speaker-holdout macro-F1:", round(holdout_macro_f1, 4))
print("\nClassification report (speaker-holdout):")
print(classification_report(y_holdout_sp, pred_holdout, digits=3))


Speaker-holdout accuracy: 0.2375
Speaker-holdout macro-F1: 0.2293

Classification report (speaker-holdout):
              precision    recall  f1-score   support

 barely-true      0.165     0.190     0.177       306
       false      0.247     0.306     0.273       379
   half-true      0.266     0.279     0.272       409
 mostly-true      0.240     0.223     0.231       363
  pants-fire      0.280     0.140     0.186       186
        true      0.253     0.221     0.236       298

    accuracy                          0.238      1941
   macro avg      0.242     0.226     0.229      1941
weighted avg      0.241     0.238     0.236      1941



In [106]:
# Rebuild inputs and labels for the provided train/valid/test splits.
X_train, y_train = make_text(train_df, use_meta=True), train_df["label"]
X_valid, y_valid = make_text(valid_df, use_meta=True), valid_df["label"]
X_test,  y_test  = make_text(test_df, use_meta=True),  test_df["label"]

# Refit on the full training split (replaces the speaker-split fit).
model.fit(X_train, y_train)

pred_valid, pred_test = model.predict(X_valid), model.predict(X_test)

# Same two metrics for both evaluation splits.
for tag, truth, guess in (
    ("VALID accuracy:", y_valid, pred_valid),
    ("TEST  accuracy:", y_test, pred_test),
):
    print(tag, round(accuracy_score(truth, guess), 4),
          "macro-F1:", round(f1_score(truth, guess, average="macro"), 4))


VALID accuracy: 0.2687 macro-F1: 0.2774
TEST  accuracy: 0.2636 macro-F1: 0.2613


In [107]:
# Sorted class names fix the row/column order of the matrix
# (rows = actual label, columns = predicted label).
labels = sorted(y_train.drop_duplicates())
confusion_matrix(y_true=y_valid, y_pred=pred_valid, labels=labels)


array([[50, 54, 61, 31, 12, 29],
       [48, 72, 48, 34, 28, 33],
       [43, 51, 73, 43,  7, 31],
       [35, 31, 57, 66,  8, 54],
       [16, 29, 10,  8, 41, 12],
       [16, 28, 39, 42,  1, 43]])

In [108]:
# Six-way labels collapsed into two groups for the binary task.
TRUEISH  = {"true", "mostly-true", "half-true"}
FALSEISH = {"false", "barely-true", "pants-fire"}

def to_binary(labels: pd.Series) -> pd.Series:
    """Collapse six-way labels into 1 (true-ish) / 0 (false-ish).

    Parameters
    ----------
    labels : pandas.Series
        Label strings; every value must be a member of ``TRUEISH`` or
        ``FALSEISH``.

    Returns
    -------
    pandas.Series
        Integer series with the same index as ``labels``: 1 where the label is
        in ``TRUEISH``, 0 where it is in ``FALSEISH``.

    Raises
    ------
    ValueError
        If any value (including a missing value) belongs to neither set.
        Previously such values were silently counted as false-ish.
    """
    unexpected = set(labels.unique()) - TRUEISH - FALSEISH
    if unexpected:
        raise ValueError(
            f"Unrecognised label(s): {sorted(unexpected, key=str)}; "
            f"expected one of {sorted(TRUEISH | FALSEISH)}"
        )
    return labels.isin(TRUEISH).astype("int64")

# Binary (1 = true-ish, 0 = false-ish) targets for each split.
y_train_bin, y_valid_bin, y_test_bin = (
    to_binary(frame["label"]) for frame in (train_df, valid_df, test_df)
)


In [109]:
# Binary baseline: same TF-IDF settings as the six-way model, but with a
# class-weighted logistic regression so that probabilities are available.
bin_model = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            lowercase=True, ngram_range=(1, 2), min_df=2, max_df=0.9, max_features=30000
        ),
    ),
    (
        "clf",
        LogisticRegression(solver="liblinear", class_weight="balanced", max_iter=800),
    ),
])

X_train_bin, X_valid_bin, X_test_bin = (
    make_text(frame, use_meta=True) for frame in (train_df, valid_df, test_df)
)

bin_model.fit(X_train_bin, y_train_bin)

# Second probability column is class 1, i.e. P(true-ish).
probs_valid = bin_model.predict_proba(X_valid_bin)[:, 1]
# Hard decision at the 0.5 cut-off.
pred_valid_bin = (probs_valid >= 0.5).astype(int)

valid_acc = accuracy_score(y_valid_bin, pred_valid_bin)
valid_f1 = f1_score(y_valid_bin, pred_valid_bin)

print("Binary VALID accuracy:", round(valid_acc, 4))
print("Binary VALID F1:", round(valid_f1, 4))
print("\nBinary classification report (VALID):")
print(classification_report(y_valid_bin, pred_valid_bin, digits=3))


Binary VALID accuracy: 0.6433
Binary VALID F1: 0.6602

Binary classification report (VALID):
              precision    recall  f1-score   support

           0      0.631     0.619     0.625       616
           1      0.654     0.666     0.660       668

    accuracy                          0.643      1284
   macro avg      0.643     0.642     0.642      1284
weighted avg      0.643     0.643     0.643      1284



In [110]:
def evaluate_with_abstention(y_true, probs, threshold=0.7):
    """Score binary predictions only on examples the model is confident about.

    An example counts as confident when its positive-class probability is at
    least ``threshold`` or at most ``1 - threshold``; all other examples are
    treated as abstentions and excluded from the metrics.

    Parameters
    ----------
    y_true : array-like of int
        Ground-truth 0/1 labels.
    probs : array-like of float
        Predicted probability of class 1 for each example.
    threshold : float, default 0.7
        Confidence cut-off applied symmetrically to both classes.

    Returns
    -------
    tuple
        ``(coverage, accuracy, f1)`` where ``coverage`` is the fraction of
        confident examples. Accuracy and F1 are ``nan`` when no example is
        confident.
    """
    targets = np.asarray(y_true, dtype=int)
    scores = np.asarray(probs, dtype=float)

    keep = (scores <= 1 - threshold) | (scores >= threshold)
    coverage = keep.mean()

    # Nothing passed the confidence filter: metrics are undefined.
    if not keep.any():
        return coverage, np.nan, np.nan

    # Hard labels always use the 0.5 cut-off; the threshold only filters.
    hard_preds = (scores >= 0.5).astype(int)
    kept_true, kept_pred = targets[keep], hard_preds[keep]

    return coverage, accuracy_score(kept_true, kept_pred), f1_score(kept_true, kept_pred)

# Coverage/quality trade-off on the validation split for a range of
# confidence thresholds.
for t in (0.55, 0.60, 0.65, 0.70, 0.75, 0.80):
    coverage, acc, f1 = evaluate_with_abstention(y_valid_bin, probs_valid, t)
    print(f"threshold={t:.2f} | coverage={coverage:.2f} | acc={acc:.3f} | f1={f1:.3f}")


threshold=0.55 | coverage=0.77 | acc=0.682 | f1=0.690
threshold=0.60 | coverage=0.56 | acc=0.711 | f1=0.722
threshold=0.65 | coverage=0.38 | acc=0.753 | f1=0.759
threshold=0.70 | coverage=0.24 | acc=0.806 | f1=0.804
threshold=0.75 | coverage=0.14 | acc=0.835 | f1=0.824
threshold=0.80 | coverage=0.07 | acc=0.915 | f1=0.907


In [118]:
# Operating threshold; pick it from the validation sweep before running this.
t = 0.70

# P(true-ish) for every test row.
probs_test = bin_model.predict_proba(X_test_bin)[:, 1]

test_scores = evaluate_with_abstention(y_test_bin, probs_test, t)
coverage, acc, f1 = test_scores
print(f"TEST with abstention (t={t}): coverage={coverage:.2f} | acc={acc:.3f} | f1={f1:.3f}")


TEST with abstention (t=0.7): coverage=0.24 | acc=0.799 | f1=0.826


In [115]:
def predict_sentence(text, model, threshold=0.70):
    """Classify a single piece of text, abstaining when the model is unsure.

    Parameters
    ----------
    text : str
        Raw text to score; passed to the model as a one-element list.
    model : object
        Fitted estimator exposing ``predict_proba`` whose second output column
        is the probability of the true-ish class (e.g. ``bin_model``).
    threshold : float, default 0.70
        A decision is returned only if P(true-ish) is at least ``threshold``
        or at most ``1 - threshold``.

    Returns
    -------
    dict
        ``decision`` ("TRUE-ISH", "FALSE-ISH" or "ABSTAIN (not confident)"),
        plus ``p_trueish`` and ``p_falseish`` as plain floats.
    """
    p_true = model.predict_proba([text])[0, 1]
    p_false = 1 - p_true

    if p_true >= threshold or p_true <= 1 - threshold:
        # Confident either way: side is decided by the 0.5 cut-off.
        verdict = "TRUE-ISH" if p_true >= 0.5 else "FALSE-ISH"
    else:
        verdict = "ABSTAIN (not confident)"

    return {
        "decision": verdict,
        "p_trueish": float(p_true),
        "p_falseish": float(p_false),
    }


In [122]:
threshold = 0.70

# Hand-written probes; these are bare statements without the metadata
# markers that make_text appends to the training text.
samples = [
    "The Earth orbits the Sun once every year.",
    "Vaccines cause autism.",
    "The unemployment rate fell last month according to the Bureau of Labor Statistics.",
    "The President was born on Mars.",
    "Drinking bleach cures infections."
]

separator = "-" * 60
for s in samples:
    # One line each: the sentence, the prediction dict, then a rule.
    print(s, predict_sentence(s, bin_model, threshold=threshold), separator, sep="\n")

The Earth orbits the Sun once every year.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.6076728739748772, 'p_falseish': 0.39232712602512276}
------------------------------------------------------------
Vaccines cause autism.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.3769723661386222, 'p_falseish': 0.6230276338613778}
------------------------------------------------------------
The unemployment rate fell last month according to the Bureau of Labor Statistics.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.37455311422243326, 'p_falseish': 0.6254468857775668}
------------------------------------------------------------
The President was born on Mars.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.421302881909816, 'p_falseish': 0.578697118090184}
------------------------------------------------------------
Drinking bleach cures infections.
{'decision': 'ABSTAIN (not confident)', 'p_trueish': 0.4505471595313009, 'p_falseish': 0.5494528404686991}
-------

In [124]:
# Inspect the test statements on which the binary model is confident at
# threshold t (P(true-ish) >= t or <= 1 - t).
t = 0.70
X_test_bin = make_text(test_df, use_meta=True)
y_test_bin = to_binary(test_df["label"])

probs_test = bin_model.predict_proba(X_test_bin)[:, 1]
conf = (probs_test >= t) | (probs_test <= 1 - t)

# Positions (not index labels) of the confident rows.
conf_idx = np.where(conf)[0]
print("Confident examples:", len(conf_idx), "out of", len(test_df))

# show 10 confident examples
for i in conf_idx[:10]:
    # `i` is positional, so use .iloc for the frame as well as for the labels;
    # label-based .loc only agreed with it while test_df kept a default
    # 0..n-1 index.
    s = test_df["statement"].iloc[i]
    true_lab = "TRUE-ISH" if y_test_bin.iloc[i] == 1 else "FALSE-ISH"
    p = probs_test[i]
    pred = "TRUE-ISH" if p >= 0.5 else "FALSE-ISH"
    print("\nSTATEMENT:", s)
    print("p_trueish:", round(float(p), 3), "| PRED:", pred, "| TRUE:", true_lab)


Confident examples: 298 out of 1267

STATEMENT: Building a wall on the U.S.-Mexico border will take literally years.
p_trueish: 0.253 | PRED: FALSE-ISH | TRUE: TRUE-ISH

STATEMENT: Suzanne Bonamici supports a plan that will cut choice for Medicare Advantage seniors.
p_trueish: 0.225 | PRED: FALSE-ISH | TRUE: TRUE-ISH

STATEMENT: When asked by a reporter whether hes at the center of a criminal scheme to violate campaign laws, Gov. Scott Walker nodded yes.
p_trueish: 0.297 | PRED: FALSE-ISH | TRUE: FALSE-ISH

STATEMENT: We know there are more Democrats in Georgia than Republicans. We know that for a fact.
p_trueish: 0.703 | PRED: TRUE-ISH | TRUE: FALSE-ISH

STATEMENT: Denali is the Kenyan word for black power.
p_trueish: 0.209 | PRED: FALSE-ISH | TRUE: FALSE-ISH

STATEMENT: Unfortunately we have documented instances where people defecated in the (Statehouse) building.
p_trueish: 0.179 | PRED: FALSE-ISH | TRUE: FALSE-ISH

STATEMENT: Says Charlie Crist is embroiled in a fraud case for stee