In [36]:
!pip install datasets
!pip install datasets transformers scikit-learn



In [37]:
import torch
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Fixed seed so the train/validation split (and torch's RNG, which drives
# DataLoader shuffling and the new classifier-head initialisation) is the
# same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Load dataset
dataset = load_dataset('liar')

train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

print(train_df.columns)
print(train_df.head())

# Combine train and test for preprocessing so the encoder sees every label
# value that occurs in either split.
df = pd.concat([train_df, test_df], ignore_index=True)

# The encoder is fitted on the combined label column; later cells read
# `label_encoder.classes_` to size the classification head.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Split back into train and test. The row count is captured before `train_df`
# is rebound so both slices use the same boundary.
n_train = len(train_df)
train_df = df.iloc[:n_train]
test_df = df.iloc[n_train:]

# Hold out 10% of the training rows for validation, stratified by label.
# `random_state` makes the split reproducible across runs.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df['label'],
    random_state=SEED,
)


Index(['id', 'label', 'statement', 'subject', 'speaker', 'job_title',
       'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
       'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts',
       'context'],
      dtype='object')
           id  label                                          statement  \
0   2635.json      0  Says the Annies List political group supports ...   
1  10540.json      1  When did the decline of coal start? It started...   
2    324.json      2  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json      0  Health care reform legislation is likely to ma...   
4   9028.json      1  The economic turnaround started at the end of ...   

                              subject         speaker             job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-

In [38]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one statement per item on demand.

    Each item is built from a row of ``dataframe``, reading its ``'statement'``
    and ``'label'`` columns.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the source frame, the tokenizer callable and the pad/truncate length."""
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``.

        ``idx`` is positional (``iloc``), not an index label.
        """
        record = self.dataframe.iloc[idx]
        statement, target = record['statement'], record['label']

        # The tokenizer pads/truncates to exactly `max_length` and returns
        # PyTorch tensors carrying a leading batch dimension.
        tokenized = self.tokenizer(
            statement,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        # Drop the leading batch dimension so the DataLoader can stack items itself.
        item = {
            key: tokenized[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item


In [39]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every statement is padded or truncated to this many tokens.
max_length = 128

# One dataset per split, all sharing the same tokenizer and sequence length.
train_dataset, val_dataset, test_dataset = (
    NewsDataset(split_df, tokenizer, max_length)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)


In [40]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# BERT with a classification head; the head width is the number of classes
# the label encoder was fitted on.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)
model.to(device)

optimizer = AdamW(params=model.parameters(), lr=1e-5, weight_decay=0.01)

# Calculates accuracy
def compute_accuracy(preds, labels):
    """Score predictions against labels with sklearn's ``accuracy_score``.

    ``preds`` holds per-class scores; the predicted class of each row is the
    argmax over dimension 1. Both tensors are moved to the CPU before scoring.
    """
    predicted_classes = preds.argmax(dim=1)
    return accuracy_score(labels.cpu(), predicted_classes.cpu())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and report its metrics.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        ``(avg_train_loss, train_accuracy)``: the mean of the per-batch losses
        and the accuracy over all samples seen during the epoch.
    """
    model = model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        # Keep only detached CPU copies for the accuracy computation. Storing
        # the raw logits would keep every batch's autograd graph reference and
        # device tensor alive until the end of the epoch.
        all_preds.append(logits.detach().cpu())
        all_labels.append(labels.detach().cpu())
    avg_train_loss = total_loss / len(data_loader)
    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    train_accuracy = compute_accuracy(all_preds, all_labels)
    return avg_train_loss, train_accuracy

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Returns a ``(mean_loss, accuracy)`` pair: the mean of the per-batch losses
    and the accuracy over every sample in the loader.
    """
    model = model.eval()
    running_loss = 0
    logit_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            result = model(ids, attention_mask=mask, labels=targets)

            running_loss += result.loss.item()
            logit_chunks.append(result.logits)
            label_chunks.append(targets)

    # Mean over batches, not over samples.
    mean_loss = running_loss / len(data_loader)
    accuracy = compute_accuracy(
        torch.cat(logit_chunks, dim=0),
        torch.cat(label_chunks, dim=0),
    )
    return mean_loss, accuracy


In [42]:
# Training loop
# Each epoch is one optimisation pass over the training loader followed by a
# gradient-free pass over the validation loader. The per-batch work lives in
# `train_epoch` / `eval_model`, so the loop only sequences the epochs and
# reports their metrics.
epochs = 4
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    avg_train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, device)
    print(f"Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")

    # Validation
    avg_val_loss, val_accuracy = eval_model(model, val_loader, device)
    print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}")


Epoch 1/4


Training: 100%|██████████| 578/578 [03:11<00:00,  3.02it/s]


Train Loss: 1.735674844274884, Train Accuracy: 0.2288465700064921


Evaluating: 100%|██████████| 65/65 [00:07<00:00,  8.52it/s]


Validation Loss: 1.7186565710948063, Validation Accuracy: 0.22590068159688412
Epoch 2/4


Training: 100%|██████████| 578/578 [03:13<00:00,  2.99it/s]


Train Loss: 1.658349938252393, Train Accuracy: 0.29084613719974034


Evaluating: 100%|██████████| 65/65 [00:07<00:00,  8.51it/s]


Validation Loss: 1.697879145695613, Validation Accuracy: 0.24829600778967867
Epoch 3/4


Training: 100%|██████████| 578/578 [03:12<00:00,  3.00it/s]


Train Loss: 1.5377616065596214, Train Accuracy: 0.3632330664358364


Evaluating: 100%|██████████| 65/65 [00:07<00:00,  8.57it/s]


Validation Loss: 1.722458632175739, Validation Accuracy: 0.25705939629990265
Epoch 4/4


Training: 100%|██████████| 578/578 [03:12<00:00,  3.00it/s]


Train Loss: 1.3109831924463227, Train Accuracy: 0.4822549231768016


Evaluating: 100%|██████████| 65/65 [00:07<00:00,  8.65it/s]

Validation Loss: 1.85408521432143, Validation Accuracy: 0.2599805258033106





In [43]:
# Final check on the held-out test split; only the accuracy is reported.
test_loss, test_accuracy = eval_model(
    model=model,
    data_loader=test_loader,
    device=device,
)
print(f"Test Accuracy: {test_accuracy}")

Evaluating: 100%|██████████| 81/81 [00:09<00:00,  8.61it/s]

Test Accuracy: 0.2634450506625097



