In [25]:
import csv

import pandas as pd

# Column names for the 14 tab-separated fields of the headerless LIAR files
columns = [
    "id", "label", "statement", "subject", "speaker", "job_title",
    "state_info", "party", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts",
    "context"
]


def load_liar_split(path):
    """Read one headerless LIAR TSV split into a DataFrame named by `columns`.

    Parameters
    ----------
    path : str
        Location of the tab-separated split file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the 14 columns listed in `columns`.

    Notes
    -----
    Quote handling is switched off (``csv.QUOTE_NONE``). Statements contain
    literal double quotes; with the default quoting a field that *starts*
    with ``"`` is parsed as a quoted field and, if the quote is never closed,
    keeps absorbing text across line breaks, silently merging several
    records into one row.
    NOTE(review): row counts are expected to change after this fix --
    re-run the notebook and confirm them against the published split sizes.
    """
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=columns,
        quoting=csv.QUOTE_NONE,
    )


# Load train, valid, test
train_df = load_liar_split("liar_dataset/train.tsv")
valid_df = load_liar_split("liar_dataset/valid.tsv")
test_df = load_liar_split("liar_dataset/test.tsv")

In [28]:
# Split the six values of the "label" column into two groups.
# Labels that count as "fake" in the binary task:
fake_labels = [
    "pants-fire",
    "false",
    "barely-true",
]
# Labels that count as "real" in the binary task:
real_labels = [
    "half-true",
    "mostly-true",
    "true",
]


In [29]:
# Apply binary mapping: 0 = fake, 1 = real.
# The lookup is built from BOTH label lists so that a missing or unexpected
# label is reported instead of being silently counted as "real" (which is what
# a bare `0 if x in fake_labels else 1` would do).
label_to_binary = {label: 0 for label in fake_labels}
label_to_binary.update({label: 1 for label in real_labels})

for df in [train_df, valid_df, test_df]:
    # NaN (a missing label) also ends up in this set and triggers the error.
    unknown_labels = set(df["label"].unique()) - set(label_to_binary)
    if unknown_labels:
        raise ValueError(
            f"Unexpected label values: {sorted(unknown_labels, key=str)}"
        )
    df["binary_label"] = df["label"].map(label_to_binary)

# Class balance of the binary target, one report per split
for heading, split_df in [
    ("\nTrain binary label distribution:\n", train_df),
    ("\nValid binary label distribution:\n", valid_df),
    ("\nTest binary label distribution:\n", test_df),
]:
    print(heading, split_df["binary_label"].value_counts())

# Peek at a few training rows: original label next to its binary version
preview_columns = ["statement", "label", "binary_label"]
print("\nSample:\n", train_df[preview_columns].head())



Train binary label distribution:
 binary_label
1    5752
0    4488
Name: count, dtype: int64

Valid binary label distribution:
 binary_label
1    668
0    616
Name: count, dtype: int64

Test binary label distribution:
 binary_label
1    714
0    553
Name: count, dtype: int64

Sample:
                                            statement        label  \
0  Says the Annies List political group supports ...        false   
1  When did the decline of coal start? It started...    half-true   
2  Hillary Clinton agrees with John McCain "by vo...  mostly-true   
3  Health care reform legislation is likely to ma...        false   
4  The economic turnaround started at the end of ...    half-true   

   binary_label  
0             0  
1             1  
2             1  
3             0  
4             1  


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [31]:
# Features are the raw statement text; targets are the binary labels.
X_train = train_df["statement"]
y_train = train_df["binary_label"]

X_valid = valid_df["statement"]
y_valid = valid_df["binary_label"]

X_test = test_df["statement"]
y_test = test_df["binary_label"]

In [32]:
# TF-IDF over unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# The vocabulary is fitted on the training statements only; validation and
# test statements are transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_valid_tfidf, X_test_tfidf = (
    vectorizer.transform(texts) for texts in (X_valid, X_test)
)

In [33]:
# Baseline classifier: logistic regression with library defaults apart from
# a higher iteration cap.
model = LogisticRegression(max_iter=1000)

# Left as a bare last expression so the notebook displays the fitted estimator.
model.fit(X=X_train_tfidf, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [34]:
# Score the baseline model on the validation split.
y_pred_valid = model.predict(X_valid_tfidf)
valid_accuracy = accuracy_score(y_valid, y_pred_valid)
valid_report = classification_report(y_valid, y_pred_valid)

print("Validation Accuracy:", valid_accuracy)
print(valid_report)


Validation Accuracy: 0.616822429906542
              precision    recall  f1-score   support

           0       0.63      0.50      0.56       616
           1       0.61      0.73      0.66       668

    accuracy                           0.62      1284
   macro avg       0.62      0.61      0.61      1284
weighted avg       0.62      0.62      0.61      1284



In [35]:
# Score the baseline model on the test split.
y_pred_test = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_report = classification_report(y_test, y_pred_test)

print("Test Accuracy:", test_accuracy)
print(test_report)


Test Accuracy: 0.6156274664561957
              precision    recall  f1-score   support

           0       0.58      0.46      0.51       553
           1       0.64      0.74      0.68       714

    accuracy                           0.62      1267
   macro avg       0.61      0.60      0.60      1267
weighted avg       0.61      0.62      0.61      1267



In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Two-step pipeline: TF-IDF features feeding a logistic-regression classifier.
# The step names ("tfidf", "clf") are the prefixes used by the search grid.
tfidf_step = ("tfidf", TfidfVectorizer(stop_words="english"))
clf_step = ("clf", LogisticRegression(max_iter=3000, random_state=42))

pipeline = Pipeline(steps=[tfidf_step, clf_step])


In [41]:
# Search space, keyed as "<pipeline step>__<parameter>".
param_grid = dict(
    # unigrams only vs. unigrams + bigrams
    tfidf__ngram_range=[(1, 1), (1, 2)],
    # vocabulary size cap (None = no cap)
    tfidf__max_features=[5000, 10000, None],
    # inverse regularization strength (smaller value = stronger penalty)
    clf__C=[0.01, 0.1, 1, 5, 10],
    # optionally reweight classes to offset class imbalance
    clf__class_weight=[None, "balanced"],
)

In [42]:
# Exhaustive search over `param_grid` with 3-fold cross-validation.
# Candidates are ranked by weighted F1 rather than plain accuracy; all
# available cores are used and per-fit progress is logged.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1_weighted",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [43]:
# Run the search on the raw training statements; the pipeline does its own
# vectorization inside each fold.
train_texts = train_df["statement"]
train_targets = train_df["binary_label"]
grid.fit(train_texts, train_targets)

# Report the winning configuration and its mean cross-validated score
print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] END clf__C=0.01, clf__class_weight=None, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END clf__C=0.01, clf__class_weight=None, tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END clf__C=0.01, clf__class_weight=None, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=   0.1s
[CV] END clf__C=0.01, clf__class_weight=None, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__C=0.01, clf__class_weight=None, tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__C=0.01, clf__class_weight=None, tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__C=0.01, clf__class_weight=None, tfidf__max_features=None, tfidf__ngram_range=(1, 1); total time=   0.2s
[CV] END clf__C=0.01, clf__class_weight=None, tfidf__max_features=None, tfidf__ngram_range=(1, 1); total time

In [44]:
# Evaluate on test set
from sklearn.metrics import classification_report, accuracy_score

# Predict with the fitted search object on the raw test statements.
test_targets = test_df["binary_label"]
y_pred = grid.predict(test_df["statement"])

print("Test Accuracy:", accuracy_score(test_targets, y_pred))
print(classification_report(test_targets, y_pred))

Test Accuracy: 0.611681136543015
              precision    recall  f1-score   support

           0       0.56      0.54      0.55       553
           1       0.65      0.67      0.66       714

    accuracy                           0.61      1267
   macro avg       0.60      0.60      0.60      1267
weighted avg       0.61      0.61      0.61      1267

