In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from wordcloud import WordCloud, STOPWORDS
from collections import Counter, defaultdict
import random
import time
import datetime
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
df = pd.read_csv("/content/drive/MyDrive/ML/ML Problems/Fake News Classifier/Data/train.csv")

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
def tokenize_map(sentence,labels):
    input_ids = []
    attention_masks = []

    for text in sentence:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens = True,
                            truncation='longest_first',
                            max_length = 84,
                            pad_to_max_length = True,
                            return_attention_mask = True,
                            return_tensors = 'pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



In [5]:
labels = df['label'].values
df = df.fillna(' ')
text = df['text'].values
print(text.shape)
input_ids, attention_masks, labels = tokenize_map(text, labels)



(20800,)


In [6]:
dataset = TensorDataset(input_ids, attention_masks, labels)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.20)
print(len(train_dataset),'training samples', len(val_dataset),  'validation samples')

16640 training samples 4160 validation samples


In [7]:
batch_size = 15

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size)

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size )

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
model.to(device)
optimizer = AdamW(model.parameters(),lr = 6e-6,eps = 1e-8 )

cuda:0


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = total_steps)
def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)

In [None]:


training_stats = []
for epoch_i in range(epochs):

    print('Epoch {} / {} '.format(epoch_i + 1, epochs))
    print('Training')

    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        if step % 100 == 0:
            print("Step completed:", step)

        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)

        model.zero_grad()
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,labels=b_labels)[0]
        logits = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[1]
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print('Average training loss:', avg_train_loss)

    #validation
    print('Validation')
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            loss = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[0]

            logits = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += calc_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('Accuracy', avg_val_accuracy)
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print('Loss:', avg_val_loss)

    training_stats.append({'epoch': epoch_i + 1,'train_loss': avg_train_loss,'val_loss': avg_val_loss,'val_acc.': avg_val_accuracy})
print("DONE")

In [10]:
# NEW CODE WITH OPTIMISATION SO THAT IT CAN TRAIN PROPERLY
import torch
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

# Assuming your model, optimizer, and scheduler are already defined
epochs = 3
accumulation_steps = 4  # Gradients will accumulate for 4 steps before updating
batch_size = 4  # Reduced batch size
scaler = GradScaler()  # For mixed precision training

# Adjust your DataLoader for smaller batch size
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, pin_memory=True)

training_stats = []

for epoch_i in range(epochs):
    print(f'Epoch {epoch_i + 1}/{epochs}')
    print('Training')

    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 100 == 0:
            print(f"Step completed: {step}")

        # Load batch to device
        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)

        # Zero gradients every accumulation step
        if step % accumulation_steps == 0:
            optimizer.zero_grad()

        # Mixed precision training
        with autocast():
            # Forward pass
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]  # Loss from the output tuple

            # Normalize loss for accumulation
            loss = loss / accumulation_steps

        # Backpropagation with scaled gradients
        scaler.scale(loss).backward()

        # Gradient clipping and optimization every accumulation step
        if (step + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters
            scaler.step(optimizer)
            scaler.update()

            # Update learning rate scheduler
            scheduler.step()

        # Track total training loss
        total_train_loss += loss.item()

        # Free memory for the next batch
        del b_input_ids, b_input_mask, b_labels
        torch.cuda.empty_cache()

    # Calculate average training loss for the epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss}')

    # Validation phase
    print('Validation')
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Disable gradient computation for validation
        with torch.no_grad():
            with autocast():
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs[0]
                logits = outputs[1]

            total_eval_loss += loss.item()

            # Get predictions and calculate accuracy
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            total_eval_accuracy += calc_accuracy(logits, label_ids)

        # Free memory
        del b_input_ids, b_input_mask, b_labels
        torch.cuda.empty_cache()

    # Calculate validation metrics
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    print(f'Validation Accuracy: {avg_val_accuracy}')
    print(f'Validation Loss: {avg_val_loss}')

    # Store training stats
    training_stats.append({
        'epoch': epoch_i + 1,
        'train_loss': avg_train_loss,
        'val_loss': avg_val_loss,
        'val_acc': avg_val_accuracy
    })

print("Training Complete!")


  scaler = GradScaler()  # For mixed precision training
  with autocast():


Epoch 1/3
Training
Step completed: 0
Step completed: 100
Step completed: 200
Step completed: 300
Step completed: 400
Step completed: 500
Step completed: 600
Step completed: 700
Step completed: 800
Step completed: 900
Step completed: 1000
Step completed: 1100
Step completed: 1200
Step completed: 1300
Step completed: 1400
Step completed: 1500
Step completed: 1600
Step completed: 1700
Step completed: 1800
Step completed: 1900
Step completed: 2000
Step completed: 2100
Step completed: 2200
Step completed: 2300
Step completed: 2400
Step completed: 2500
Step completed: 2600
Step completed: 2700
Step completed: 2800
Step completed: 2900
Step completed: 3000
Step completed: 3100
Step completed: 3200
Step completed: 3300
Step completed: 3400
Step completed: 3500
Step completed: 3600
Step completed: 3700
Step completed: 3800
Step completed: 3900
Step completed: 4000
Step completed: 4100
Average training loss: 0.03727190126844037
Validation


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autoc

Validation Accuracy: 0.9802884615384615
Validation Loss: 0.08647727662554154
Epoch 2/3
Training
Step completed: 0
Step completed: 100
Step completed: 200
Step completed: 300
Step completed: 400
Step completed: 500
Step completed: 600
Step completed: 700
Step completed: 800
Step completed: 900
Step completed: 1000
Step completed: 1100
Step completed: 1200
Step completed: 1300
Step completed: 1400
Step completed: 1500
Step completed: 1600
Step completed: 1700
Step completed: 1800
Step completed: 1900
Step completed: 2000
Step completed: 2100
Step completed: 2200
Step completed: 2300
Step completed: 2400
Step completed: 2500
Step completed: 2600
Step completed: 2700
Step completed: 2800
Step completed: 2900
Step completed: 3000
Step completed: 3100
Step completed: 3200
Step completed: 3300
Step completed: 3400
Step completed: 3500
Step completed: 3600
Step completed: 3700
Step completed: 3800
Step completed: 3900
Step completed: 4000
Step completed: 4100
Average training loss: 0.011062969

  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autoc

Validation Accuracy: 0.9850961538461539
Validation Loss: 0.0747883739379736
Epoch 3/3
Training
Step completed: 0


  with autocast():


Step completed: 100
Step completed: 200
Step completed: 300
Step completed: 400
Step completed: 500
Step completed: 600
Step completed: 700
Step completed: 800
Step completed: 900
Step completed: 1000
Step completed: 1100
Step completed: 1200
Step completed: 1300
Step completed: 1400
Step completed: 1500
Step completed: 1600
Step completed: 1700
Step completed: 1800
Step completed: 1900
Step completed: 2000
Step completed: 2100
Step completed: 2200
Step completed: 2300
Step completed: 2400
Step completed: 2500
Step completed: 2600
Step completed: 2700
Step completed: 2800
Step completed: 2900
Step completed: 3000
Step completed: 3100
Step completed: 3200
Step completed: 3300
Step completed: 3400
Step completed: 3500
Step completed: 3600
Step completed: 3700
Step completed: 3800
Step completed: 3900
Step completed: 4000
Step completed: 4100
Average training loss: 0.00384463956937767
Validation


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autoc

Validation Accuracy: 0.9853365384615385
Validation Loss: 0.08806625722119442
Training Complete!


In [11]:

from sklearn.metrics import precision_score, recall_score
def calc_f1(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat)

def calc_precision(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, pred_flat)

def calc_recall(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, pred_flat)

model.eval()
accuracy_sum = 0
total_eval_loss = 0
f1_sum = 0
precision_sum = 0
recall_sum = 0
nb_eval_steps = 0

for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
        loss = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[0]

        logits = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[1]

    total_eval_loss += loss.item()
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    accuracy_sum += calc_accuracy(logits, label_ids)
    f1_sum += calc_f1(logits, label_ids)
    precision_sum += calc_precision(logits, label_ids)
    recall_sum += calc_recall(logits, label_ids)


print('Accuracy',accuracy_sum / len(validation_dataloader))
print('Precision', precision_sum / len(validation_dataloader))
print('Recall', recall_sum / len(validation_dataloader))
print('F1', f1_sum / len(validation_dataloader))
print('Loss:', total_eval_loss / len(validation_dataloader))

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(averag

Accuracy 0.9853365384615385
Precision 0.9330128205128204
Recall 0.9330128205128204
F1 0.9225641025641019
Loss: 0.08807378946054926


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
