In [10]:
#Import relevant libraries
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm


In [11]:
# Read the text dataset and discard the columns that are not needed downstream.
columns_to_drop_text = ['token_count']
text_df = pd.read_csv('text_dataset.csv')
text_df = text_df.drop(columns=columns_to_drop_text)
text_df

Unnamed: 0,label,cleaned_text,cleaned_subject,cleaned_title
0,1,donald trump met member nato go well moment ar...,news,watch trump shove foreign leader way get front...
1,0,washington reuters rick perry presidentelect d...,politicsnews,trump energy pick perry softens stance climate...
2,1,president obama blasted republican presidentia...,politics,obama finally build border wallbut there one p...
3,1,male idaho republican five daughter made creep...,news,republican lawmaker say rape wont cause pregna...
4,1,kellyanne conway tried spin white house press ...,news,watch chuck todd swat annoying kellyanne conwa...
...,...,...,...,...
13829,0,washington reuters member u congress party fri...,politicsnews,u lawmaker back syria strike demand plan trump
13830,1,far video 530000 view make content legitimate ...,politics,ups secretly fly refugee u middle east watch g...
13831,0,dec 27 story corrects say 55000 page email ins...,politicsnews,u appeal court revives clinton email suit
13832,0,madrid reuters spain high court said tuesday g...,worldnews,spanish court grant u extradition russian hack...


In [12]:
# Tokenizer belonging to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
train_df, val_df = train_test_split(text_df, random_state=42, test_size=0.2)

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sized, indexable collection of raw text strings.
        labels: Sized, indexable collection of integer class labels, aligned
            position-by-position with ``texts``.
        tokenizer: Callable used to encode a single text. When ``None`` (the
            default) the module-level ``tokenizer`` is looked up each time an
            item is fetched, which is the original behaviour.
        max_length: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        # Fail early: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader worker, or silently drop trailing labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the text at ``idx`` and return model-ready tensors.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer output flattened to 1-D) and ``'labels'`` (a 0-d
            ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Use the injected tokenizer if one was given, else the notebook-level one.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer

        # return_tensors='pt' yields a leading batch dimension, which
        # flatten() removes so the DataLoader can stack items itself.
        encoding = encode(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap each split in a NewsDataset, passing plain Python lists of texts and labels.
train_dataset = NewsDataset(
    texts=train_df['cleaned_text'].tolist(),
    labels=train_df['label'].tolist(),
)
val_dataset = NewsDataset(
    texts=val_df['cleaned_text'].tolist(),
    labels=val_df['label'].tolist(),
)


In [4]:
# Batch both splits in groups of 16; only the training loader shuffles its samples.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
)


In [5]:
# Load the pretrained 'bert-base-uncased' weights with a 2-class classification head.
# The head's weights are not part of the checkpoint and are randomly initialised,
# so torch is seeded first to make a fresh Restart & Run All reproducible
# (the value matches the random_state used for the train/validation split).
torch.manual_seed(42)
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
# Training setup: choose the device, move the model onto it and build the optimizer.

# Use a CUDA device when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTmodel.to(device)

# torch.optim.AdamW is used in place of transformers.AdamW, which is deprecated
# upstream in favour of the PyTorch implementation. eps and weight_decay are
# pinned to the defaults transformers.AdamW used (1e-6 and 0.0) so the
# hyper-parameters stay the same as before the switch.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

def train_epoch(model, data_loader, optimizer, device=None):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module invoked as ``model(**batch)``; its output must expose ``.loss``.
        data_loader: Sized iterable of dict batches mapping argument names to tensors.
        optimizer: Optimizer whose gradients are reset and which is stepped once per batch.
        device: Device the batch tensors are moved to. When ``None`` (default)
            the device of the model's first parameter is used, falling back to
            the CPU for a parameter-less model.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches,
        or ``0.0`` when ``data_loader`` yields no batches.
    """
    if device is None:
        # Follow the model instead of relying on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()

    num_batches = len(data_loader)
    if num_batches == 0:
        # Nothing to average; avoid dividing by zero below.
        return 0.0

    total_loss = 0.0
    for batch in tqdm(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / num_batches

# Fine-tune for a single epoch and report the mean batch loss.
loss = train_epoch(model=model, data_loader=train_loader, optimizer=optimizer)
print(f"Training loss: {loss}")


100%|██████████| 692/692 [5:36:41<00:00, 29.19s/it]    

Training loss: 0.05537884194423065





In [7]:
#Now we evaluate the model on the validation/test set
def evaluate(model, data_loader, device=None):
    """Compute the mean batch loss and the accuracy of ``model`` on ``data_loader``.

    Args:
        model: Module invoked as ``model(**batch)``; its output must expose
            ``.loss`` and ``.logits``.
        data_loader: Sized iterable of dict batches that contain a ``'labels'``
            entry and that exposes the underlying ``.dataset``.
        device: Device the batch tensors are moved to. When ``None`` (default)
            the device of the model's first parameter is used, falling back to
            the CPU for a parameter-less model.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is a float (sum of
        batch losses / number of batches) and ``accuracy`` is a 0-d float64
        tensor (correct predictions / ``len(data_loader.dataset)``). Both are
        zero when there is no data.
    """
    if device is None:
        # Follow the model instead of relying on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_loss = 0.0
    # Start from a tensor (not a Python int) so .double() below is valid even
    # when the loop body never runs.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in tqdm(data_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            total_loss += outputs.loss.item()

            # Predicted class = index of the largest logit along the last axis.
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_predictions += torch.sum(predictions == batch['labels'])

    # max(..., 1) guards the empty-loader / empty-dataset case against division by zero.
    num_batches = max(len(data_loader), 1)
    num_samples = max(len(data_loader.dataset), 1)
    return total_loss / num_batches, correct_predictions.double() / num_samples

# Score the fine-tuned model on the held-out validation split.
val_loss, val_accuracy = evaluate(model=model, data_loader=val_loader)
print(f"Validation loss: {val_loss}, Accuracy: {val_accuracy}")


100%|██████████| 173/173 [17:06<00:00,  5.94s/it]

Validation loss: 0.004552477456523628, Accuracy: 0.9992771955186122





In [14]:
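# Saving and reloading the fine-tuned weights (both lines are disabled; uncomment the one you need).
# map_location=device lets a checkpoint that was saved on a GPU be restored on a CPU-only machine.
#torch.save(model.state_dict(), 'bert_news_classifier.pth')
#model.load_state_dict(torch.load('bert_news_classifier.pth', map_location=device))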


<All keys matched successfully>