In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib



In [6]:
from datasets import load_dataset

# dataset = load_dataset("ucsbnlp/liar")
dataset = load_dataset("ucsbnlp/liar", split="train")
dataset.save_to_disk('liar_dataset')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Saving the dataset (0/1 shards):   0%|          | 0/10269 [00:00<?, ? examples/s]

In [7]:
# Load dataset
df = pd.read_csv('./liar_dataset/train.tsv', delimiter='\t', header=None)
df.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 
              'state', 'party', 'barely_true_counts', 'false_counts', 
              'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']

print(df)

               id        label  \
0       2635.json        false   
1      10540.json    half-true   
2        324.json  mostly-true   
3       1123.json        false   
4       9028.json    half-true   
...           ...          ...   
10235   5473.json  mostly-true   
10236   3408.json  mostly-true   
10237   3959.json    half-true   
10238   2253.json        false   
10239   1155.json   pants-fire   

                                               statement  \
0      Says the Annies List political group supports ...   
1      When did the decline of coal start? It started...   
2      Hillary Clinton agrees with John McCain "by vo...   
3      Health care reform legislation is likely to ma...   
4      The economic turnaround started at the end of ...   
...                                                  ...   
10235  There are a larger number of shark attacks in ...   
10236  Democrats have now become the party of the [At...   
10237  Says an alternative to Social Security that 

In [8]:
# Select necessary columns
df = df[['label', 'statement']]

# Map labels to binary classification
df['label'] = df['label'].map({
    'true': 1, 
    'mostly-true': 1, 
    'half-true': 1, 
    'barely-true': 0, 
    'false': 0, 
    'pants-fire': 0
})
print(df)

       label                                          statement
0          0  Says the Annies List political group supports ...
1          1  When did the decline of coal start? It started...
2          1  Hillary Clinton agrees with John McCain "by vo...
3          0  Health care reform legislation is likely to ma...
4          1  The economic turnaround started at the end of ...
...      ...                                                ...
10235      1  There are a larger number of shark attacks in ...
10236      1  Democrats have now become the party of the [At...
10237      1  Says an alternative to Social Security that op...
10238      0  On lifting the U.S. Cuban embargo and allowing...
10239      0  The Department of Veterans Affairs has a manua...

[10240 rows x 2 columns]


In [9]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['statement'], df['label'], test_size=0.2, random_state=42)
print(X_train, X_test, y_train, y_test)

550     We are now, for the first time ever, energy in...
7231    Were not engaged in nation-building in Afghani...
2515    Because of President Barack Obamas failure to ...
4466    New carbon regulations will increase electric ...
211     Obamacare is the biggest tax increase in Ameri...
                              ...                        
5734    When I took office, the deficit was nearly 10 ...
5191                      On the mosque near ground zero.
5390    Louie Gohmert of Texas blamed the mass shootin...
860     The Governor did not consult members of his ow...
7270    A telecom bill could keep the State Corporatio...
Name: statement, Length: 8192, dtype: object 3842     Polling shows that nearly 74 percent of Nation...
6480         I left the city with $43 million in the bank.
4521     Says she couldn't take stimulus money because ...
4026     The United States is the only industrialized c...
10111    The Health Care and Education Reconciliation A...
                      

In [10]:
# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

In [11]:
# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

In [12]:
# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)
print(X_test_tfidf)

  (0, 320)	0.16853232521597145
  (0, 438)	0.32247722841075743
  (0, 488)	0.26297772295211475
  (0, 821)	0.2869991588431989
  (0, 1689)	0.10741925220949278
  (0, 1879)	0.22171723590846704
  (0, 2645)	0.23337345106799107
  (0, 2815)	0.20074456763133275
  (0, 2825)	0.2050480889229645
  (0, 2927)	0.07945473265228409
  (0, 3179)	0.1336593237833107
  (0, 3285)	0.33593078688508177
  (0, 3760)	0.3044049846117915
  (0, 3818)	0.32247722841075743
  (0, 3884)	0.24719780808431074
  (0, 4023)	0.2559032978761419
  (0, 4269)	0.21270321718245078
  (0, 4416)	0.11427406975532184
  (1, 118)	0.5059616635223685
  (1, 506)	0.4826503671556121
  (1, 859)	0.34974615607953935
  (1, 2071)	0.13340486995502887
  (1, 2435)	0.4268354590764318
  (1, 2689)	0.2822180759174692
  (1, 4418)	0.20987542348952018
  :	:
  (2045, 4418)	0.06577700292344864
  (2045, 4479)	0.08977455349680387
  (2045, 4916)	0.20274761682503892
  (2046, 499)	0.39415847155646533
  (2046, 679)	0.4032896825109492
  (2046, 1804)	0.35959394504831593
  (

In [13]:
# Debug: Check shapes and types
print("X_train_tfidf shape:", X_train_tfidf.shape)
print("y_train shape:", y_train.shape)
print("Data types:", type(X_train_tfidf), type(y_train))

X_train_tfidf shape: (8192, 5000)
y_train shape: (8192,)
Data types: <class 'scipy.sparse._csr.csr_matrix'> <class 'pandas.core.series.Series'>


In [14]:
# Initialize Logistic Regression
model = LogisticRegression()

In [15]:
# Train the model
try:
    model.fit(X_train_tfidf, y_train)
    print("Model trained successfully.")
except ValueError as e:
    print("Error during model fitting:", e)
    print("Ensure X_train_tfidf and y_train are compatible and have correct shapes.")
    raise

Model trained successfully.


In [16]:
# Predict on the test set
y_pred = model.predict(X_test_tfidf)

In [17]:
# Print Accuracy and Classification Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6162109375
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.47      0.52       888
           1       0.64      0.73      0.68      1160

    accuracy                           0.62      2048
   macro avg       0.61      0.60      0.60      2048
weighted avg       0.61      0.62      0.61      2048



In [18]:
# Save the model and vectorizer
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [22]:
X = vectorizer.transform(["the sky is bule"])
prediction = model.predict(X)
print(X)
print(prediction)

  (0, 2199)	0.8825229163216437
  (0, 4418)	0.4702693931855879
[0]
