In [1]:
import pandas as pd

# Read the two halves of the corpus and tag every row with its class name.
# The labels are plain strings at this point ('fake' / 'non-fake').
fake_news = pd.read_csv('Fake.csv').assign(label='fake')
non_fake_news = pd.read_csv('True.csv').assign(label='non-fake')

In [2]:
# Stack the fake and non-fake articles row-wise into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index instead of
# repeating the index values of the two inputs.
combined_df = pd.concat(objs=[fake_news, non_fake_news], axis=0, ignore_index=True)


In [3]:
# Work on a copy rather than an alias: later cells rewrite df['label'] in place,
# and with a plain `df = combined_df` that would also rewrite combined_df, so
# re-running this cell could never get back to the original string labels.
df = combined_df.copy()

In [4]:
# Encode the class as an integer: fake -> 1, everything else -> 0.
# An existing 1 is also accepted as "fake" so that re-running this cell is a
# no-op; comparing against 'fake' alone would turn an already-encoded column
# into all zeros on the second execution.
df['label'] = df['label'].map(lambda x: 1 if x in ('fake', 1) else 0)

In [5]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets (80/20), keeping the class
# proportions equal in both parts via `stratify`.
# random_state pins the shuffle so the same rows land in each split on every
# run; without it the reported metrics are not reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

In [6]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
# GPT-2 tokenizer. GPT-2 doesn't have a padding token by default, so the
# end-of-sequence token is reused for padding.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# GPT-2 with a two-class classification head; the model config is given the
# same padding id that the tokenizer pads with.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.eos_token_id

# Tokenize both splits up front: texts are cut off at 512 tokens and padded.
train_texts = train_df['text'].tolist()
test_texts = test_df['text'].tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer output with per-sample labels.

    Args:
        encodings: mapping from field name to an indexable collection holding
            one entry per sample (anything exposing ``.items()``).
        labels: indexable, sized collection with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pair each split's encodings with its integer labels.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_df['label'].tolist())
test_dataset = NewsDataset(encodings=test_encodings, labels=test_df['label'].tolist())

# Batches of 8 samples; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [8]:
from tqdm import tqdm

In [9]:
# Use PyTorch's AdamW: the implementation exported by transformers is deprecated
# in favour of this one, and importing it fails on releases that no longer ship it.
from torch.optim import AdamW

# Setup GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer.
# eps and weight_decay are spelled out to match the defaults of the transformers
# optimizer this replaces (torch's own defaults are eps=1e-8, weight_decay=0.01),
# so the switch does not change the optimisation itself.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop: one optimizer step per mini-batch.
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs} Finished")


  0%|▏                                     | 20/4490 [02:54<10:50:04,  8.73s/it][E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument

[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

## OPEN-AI GPT

In [11]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTForSequenceClassification

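# NOTE: first attempt, kept for reference and superseded by the cells below.
# Reusing eos_token as the padding token does not work here: the openai-gpt
# tokenizer has no eos_token (see the "Using eos_token, but it is not set yet."
# warning in this cell's output), so a dedicated pad token is added instead.
# # Load the tokenizer and model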
# tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
# tokenizer.pad_token = tokenizer.eos_token

# model = OpenAIGPTForSequenceClassification.from_pretrained("openai-gpt", num_labels=2)
# model.config.pad_token_id = tokenizer.eos_token_id



ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.
Using eos_token, but it is not set yet.
Some weights of OpenAIGPTForSequenceClassification were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# OpenAI GPT tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

# OpenAI GPT doesn't have a padding token by default, so register a dedicated one.
# The call stays as the cell's final expression so its return value is displayed.
pad_token = '<PAD>'
tokenizer.add_special_tokens(dict(pad_token=pad_token))


ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


1

In [13]:
# OpenAI GPT with a two-class classification head.
model = OpenAIGPTForSequenceClassification.from_pretrained("openai-gpt", num_labels=2)

# Resize the embedding matrix to the tokenizer's current vocabulary size.
# Left as the final expression so the resulting Embedding is displayed.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))


Some weights of OpenAIGPTForSequenceClassification were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(40479, 768)

In [14]:
# Give the model config the same padding id the tokenizer pads with.
# NOTE(review): per the transformers docs, the sequence-classification head uses
# pad_token_id to locate the last non-padding token of each row — confirm for
# the installed version.
model.config.pad_token_id = tokenizer.pad_token_id

In [15]:
# Tokenize both splits with the tokenizer currently bound to `tokenizer`:
# texts are cut off at 512 tokens and padded.
train_encodings = tokenizer(
    text=train_df['text'].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
)
test_encodings = tokenizer(
    text=test_df['text'].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
)

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader

class NewsDataset(Dataset):
    """Dataset that serves tokenized samples and their labels as tensors.

    NOTE: this re-declares the NewsDataset class defined earlier in the
    notebook; executing this cell rebinds the name to this definition.

    Args:
        encodings: mapping of field name -> indexable per-sample values.
        labels: indexable, sized collection of per-sample labels.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __getitem__(self, idx):
        """Build the tensor dict for sample ``idx``; the label goes under 'labels'."""
        item = dict(
            (name, torch.tensor(values[idx]))
            for name, values in self.encodings.items()
        )
        item.update(labels=torch.tensor(self.labels[idx]))
        return item

    def __len__(self):
        # Dataset size follows the number of labels.
        return len(self.labels)

# Integer labels for each split, in row order.
train_labels = train_df['label'].tolist()
test_labels = test_df['label'].tolist()

# Datasets over the current encodings, then loaders with batches of 8.
train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

# Only the training loader shuffles; the test loader keeps row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

In [18]:
# Use PyTorch's AdamW: the implementation exported by transformers is deprecated
# in favour of this one, and importing it fails on releases that no longer ship it.
from torch.optim import AdamW

# Setup GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer.
# eps and weight_decay are spelled out to match the defaults of the transformers
# optimizer this replaces (torch's own defaults are eps=1e-8, weight_decay=0.01),
# so the switch does not change the optimisation itself.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop: a single pass over the training data, one optimizer step per mini-batch.
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs} Finished")

100%|█████████████████████████████████████| 4490/4490 [9:51:17<00:00,  7.90s/it]

Epoch 1/1 Finished





In [20]:
import numpy as np

In [21]:
from sklearn.metrics import f1_score, confusion_matrix

# Evaluation loop
model.eval()

# Lists to store true labels and predictions
true_labels = []
predicted_labels = []

# Gradients are never needed here, so disable them once for the whole pass.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        predictions = torch.argmax(outputs.logits, dim=1)

        # The labels are only compared on the host, so they are taken straight
        # from the batch instead of being copied to the device and back.
        true_labels.extend(batch['labels'].numpy())
        predicted_labels.extend(predictions.cpu().numpy())

# Compute accuracy
accuracy = 100 * np.mean(np.array(true_labels) == np.array(predicted_labels))
print(f"Accuracy: {accuracy}%")

# Compute F1 score
# 'macro' computes the F1 score for each label and takes their unweighted average.
f1 = f1_score(true_labels, predicted_labels, average='macro')
print(f"F1 Score: {f1}")

# Compute confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)
# NOTE(review): near-perfect scores after a single epoch may reflect
# source-specific artifacts in the article text rather than genuine fake-news
# detection — TODO confirm on data from other sources.


Accuracy: 99.92204899777283%
F1 Score: 0.9992188854365017
Confusion Matrix:
[[4283    1]
 [   6 4690]]
