In [1]:
import pandas as pd

# Load datasets
true_df = pd.read_csv('/home/vojtech/Documents/aaa_programovani/detekce-fake-news-projekt-backend/source/testing/deepsee-ayp/archive/True.csv')
fake_df = pd.read_csv('/home/vojtech/Documents/aaa_programovani/detekce-fake-news-projekt-backend/source/testing/deepsee-ayp/archive/Fake.csv')

# Add labels
true_df['label'] = 1  # Real news
fake_df['label'] = 0  # Fake news

# Combine and shuffle
df = pd.concat([true_df, fake_df]).sample(frac=1).reset_index(drop=True)

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|@\w+|[^a-zA-Z]', ' ', text)  # Remove URLs/special chars
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vojtech/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/vojtech/nltk_data...


In [3]:
from sklearn.model_selection import train_test_split

X = df['cleaned_text'].values
y = df['label'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=128)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [5]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(1000).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val
)).batch(16)

2025-03-31 10:11:23.170963: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743408683.209740    5458 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743408683.220090    5458 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743408683.245320    5458 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743408683.245347    5458 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743408683.245352    5458 computation_placer.cc:177] computation placer alr

In [12]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import numpy as np

# Load PyTorch model instead of TensorFlow
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    id2label={0: "fake", 1: "real"}
)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to encode text and make predictions
def predict(texts):
    # Tokenize
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    
    # Get predictions
    with torch.no_grad():
        outputs = model(**encoded)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predictions = torch.argmax(probabilities, dim=-1)
    
    # Convert to numpy for easier handling
    probs_numpy = probabilities.numpy()
    preds_numpy = predictions.numpy()
    
    results = []
    for i, text in enumerate(texts):
        results.append({
            'text': text,
            'prediction': model.config.id2label[preds_numpy[i]],
            'fake_probability': float(probs_numpy[i, 0]),
            'real_probability': float(probs_numpy[i, 1])
        })
    
    return results

# For training, you'll need to create PyTorch datasets and use PyTorch's training loop

  _torch_pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Evaluate on validation set
results = model.evaluate(val_dataset)
print(f'Validation Accuracy: {results[1]:.2f}')

# Predict
sample_text = "Scientists confirm that the Earth is flat."
inputs = tokenizer(sample_text, return_tensors='tf', truncation=True, padding=True)
outputs = model(inputs)
prediction = tf.argmax(outputs.logits, axis=1).numpy()[0]
print("Fake" if prediction == 0 else "Real")

AttributeError: 'BertForSequenceClassification' object has no attribute 'evaluate'

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Create PyTorch Dataset
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create PyTorch datasets from your tokenized data
train_dataset = NewsDataset(train_encodings, y_train)
val_dataset = NewsDataset(val_encodings, y_val)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Set device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Set optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
def train_model(model, train_loader, val_loader, epochs=3):
    best_accuracy = 0
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        
        for batch in tqdm(train_loader, desc=f"Training epoch {epoch+1}"):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            train_loss += loss.item()
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        avg_train_loss = train_loss / len(train_loader)
        
        # Evaluation phase
        val_accuracy, val_report = evaluate_model(model, val_loader)
        
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"  Training loss: {avg_train_loss:.4f}")
        print(f"  Validation accuracy: {val_accuracy:.4f}")
        print(f"  Validation report:\n{val_report}")
        
        # Save best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_bert_model.pt')
            print(f"  New best model saved with accuracy: {best_accuracy:.4f}")
    
    return model

# Evaluation function
def evaluate_model(model, dataloader):
    model.eval()
    predictions = []
    actual_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            labels = batch.pop('labels')
            outputs = model(**batch)
            
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            
            predictions.extend(preds)
            actual_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(actual_labels, predictions)
    report = classification_report(actual_labels, predictions, target_names=['fake', 'real'])
    
    return accuracy, report

# Train the model
model = train_model(model, train_loader, val_loader, epochs=3)

# Save the final model
torch.save({
    'model_state_dict': model.state_dict(),
    'tokenizer': tokenizer
}, 'fake_news_bert_model.pt')

Training epoch 1:  47%|████▋     | 1049/2245 [3:27:07<2:17:48,  6.91s/it]  