In [1]:
import pandas as pd
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report

In [2]:
# Load every split of the LIAR dataset; the bare `dataset` expression below
# renders the split/feature overview as the cell output.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script run locally -- confirm this is still required by the installed `datasets` version.
dataset = load_dataset(path="liar", trust_remote_code=True)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 10269
    })
    test: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 1283
    })
    validation: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 1284
    })
})

In [3]:
# Inspect the training split's schema (column types and the label encoding).
dataset["train"].features

{'id': Value(dtype='string', id=None),
 'label': ClassLabel(names=['false', 'half-true', 'mostly-true', 'true', 'barely-true', 'pants-fire'], id=None),
 'statement': Value(dtype='string', id=None),
 'subject': Value(dtype='string', id=None),
 'speaker': Value(dtype='string', id=None),
 'job_title': Value(dtype='string', id=None),
 'state_info': Value(dtype='string', id=None),
 'party_affiliation': Value(dtype='string', id=None),
 'barely_true_counts': Value(dtype='float32', id=None),
 'false_counts': Value(dtype='float32', id=None),
 'half_true_counts': Value(dtype='float32', id=None),
 'mostly_true_counts': Value(dtype='float32', id=None),
 'pants_on_fire_counts': Value(dtype='float32', id=None),
 'context': Value(dtype='string', id=None)}

In [4]:
# Convert each split to a pandas DataFrame (same order as before: train, test, validation).
df_train, df_test, df_val = (
    dataset[split_name].to_pandas()
    for split_name in ("train", "test", "validation")
)

In [5]:
# Features: the raw `statement` text. Target: the `label` column
# (one of six classes, pants-fire <-> true).
X_train = df_train['statement']
y_train = df_train['label']
print(X_train.shape, y_train.shape)

# Same column selection for the held-out splits.
X_val, X_test = df_val['statement'], df_test['statement']
y_val, y_test = df_val['label'], df_test['label']

(10269,) (10269,)


In [6]:
# TF-IDF over lowercased unigrams and bigrams; min_df=2 drops any term that
# occurs in fewer than two training statements.
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    min_df=2,
)

# The vocabulary is fitted on the training split only; validation and test
# are projected onto that same vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)

# One shape per line: train, validation, test.
print(X_train_vec.shape, X_val_vec.shape, X_test_vec.shape, sep="\n")

(10269, 25715)
(1284, 25715)
(1283, 25715)


In [7]:
# Vocabulary learned by the fitted vectorizer (unigrams and bigrams).
# get_feature_names_out() takes no data: its optional `input_features`
# argument exists only for API consistency, so the transformed matrix that
# was previously passed here has been dropped.
c = vectorizer.get_feature_names_out()
print(len(c))
print(c)

25715
['00' '000' '000 000' ... 'zones' 'zones in' 'zoning']


In [8]:
# Multiclass logistic regression on the TF-IDF features.
# The 'saga' solver shuffles the data stochastically, so the seed is pinned to
# keep Restart & Run All reproducible (scores can otherwise drift between runs).
log_reg = LogisticRegression(solver='saga', n_jobs=-1, random_state=42)
log_reg.fit(X_train_vec, y_train)

In [9]:
# Predict integer class ids for the validation and test matrices (in that order).
y_val_preds, y_test_preds = (
    log_reg.predict(split_vec) for split_vec in (X_val_vec, X_test_vec)
)

# One array per line: validation first, then test.
print(y_val_preds, y_test_preds, sep="\n")

[2 4 0 ... 3 4 0]
[0 0 0 ... 1 0 0]


In [10]:
# Macro-averaged F1 weights each class equally, regardless of its support.
val_macro_f1 = f1_score(y_val, y_val_preds, average="macro")
test_macro_f1 = f1_score(y_test, y_test_preds, average="macro")

print(f'macro-f1 score based on y validation predictions - {val_macro_f1}')
print(f'macro-f1 score based on y test predictions - {test_macro_f1}')

macro-f1 score based on y validation predictions - 0.2429573512909522
macro-f1 score based on y test predictions - 0.2111513637218858


In [11]:
# Class names in integer-id order, read from the dataset's ClassLabel metadata
# instead of a hand-copied list, so they cannot drift from the actual encoding:
# 'false' (0), 'half-true' (1), 'mostly-true' (2), 'true' (3), 'barely-true' (4), 'pants-fire' (5)
labels = list(dataset['train'].features['label'].names)

# Passing the label ids explicitly keeps target_names aligned with the ids even
# if some class is absent from the evaluated split or from the predictions.
label_ids = list(range(len(labels)))

print(classification_report(y_val, y_val_preds, labels=label_ids, target_names=labels))

              precision    recall  f1-score   support

       false       0.27      0.33      0.30       263
   half-true       0.26      0.38      0.31       248
 mostly-true       0.29      0.28      0.28       251
        true       0.22      0.22      0.22       169
 barely-true       0.25      0.17      0.20       237
  pants-fire       0.48      0.09      0.15       116

    accuracy                           0.26      1284
   macro avg       0.29      0.24      0.24      1284
weighted avg       0.28      0.26      0.26      1284

