In [3]:
#Importing necessary libraries
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [4]:
#Loading the dataset
data = pd.read_csv("E:\Fake News\News.csv", usecols = ['title','label'])
data.head()

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 12-13: malformed \N character escape (2386469579.py, line 2)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71576 non-null  object
 1   label   72134 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [None]:
#Handling Missing Values
data=data.dropna()
data.reset_index(drop=True, inplace=True)
data.head()

Unnamed: 0,title,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
2,"Bobby Jindal, raised Hindu, uses story of Chri...",0
3,SATAN 2: Russia unvelis an image of its terrif...,1
4,About Time! Christian Group Sues Amazon and SP...,1


In [None]:
#Preprocess the data
class NewsDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Spliting the data into training and testing sets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    data['title'].values,
    data['label'].values,
    test_size=0.2,
    random_state=42
)

In [None]:
# Initializing tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 32 

In [None]:
# Creating dataset and dataloaders
train_dataset = NewsDataset(train_sentences, train_labels, tokenizer, max_length)
test_dataset = NewsDataset(test_sentences, test_labels, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [None]:
#Model Building
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#Model Training
optimizer = AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, optimizer):
    model = model.train()
    total_loss = 0

    for data in tqdm(data_loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    loss = train_epoch(model, train_loader, optimizer)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}')


100%|██████████| 3579/3579 [2:10:34<00:00,  2.19s/it]  


Epoch 1/5, Loss: 0.1803


  5%|▍         | 169/3579 [06:10<2:04:39,  2.19s/it]


KeyboardInterrupt: 

In [None]:
#Model Inference
from sklearn.metrics import accuracy_score, classification_report
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predictions.cpu().numpy())

print("BERT Results:")
print(accuracy_score(y_true,y_pred))
print(classification_report(y_true, y_pred))

In [None]:
model.save_pretrained('E:/Fake News')
tokenizer.save_pretrained('E:/Fake News')
print("Model Saved Successfully")