In [1]:

# Install necessary libraries
# !pip install transformers pandas scikit-learn torch
# or conda defined environment conda install conda-forge::sentencepiece

In [2]:
# torch.cuda.empty_cache()
# torch.mps.empty_cache()

# Environment setting

In [3]:
# Import libraries
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import RobertaTokenizer, RobertaModel
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm

from torch.utils.tensorboard import SummaryWriter
import os
from datetime import datetime

In [4]:
# If Metal Performance Shader (mps) is not available tell me:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

# If mps is available directly put it on the device.
else:
    device = torch.device("mps")
,
# If cuda (nvidia gpu) is not available tell me:
if not torch.cuda.is_available():
    print("Cuda not available because the current PyTorch install was not "
              "built with Cuda enabled.")
   

# If cuda is available directly put it on the device.
else:
    device = torch.device("cuda")
,

# If neither cuda and mps are available, set device to "cpu"
if not torch.backends.mps.is_available():
    if not torch.cuda.is_available():
        print("Neither Cuda nor MPS are available")
        device = torch.device("cpu")

,
# Is mps available?

mps_avail = torch.backends.mps.is_available() 
print(f"Is Metal Performance Shader (mps) available? {mps_avail}")

,

# Is mps available?

cuda_avail = torch.cuda.is_available() 
print(f"Is Cuda available? {cuda_avail}")


Cuda not available because the current PyTorch install was not built with Cuda enabled.
Is Metal Performance Shader (mps) available? True
Is Cuda available? False


In [5]:
print("gpu device: ", device)

gpu device:  mps


In [6]:
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

In [7]:
# Setting seed:

def set_seed_fun(seed_number: int):
    """
    We could also use pytorch_lightning package
    try:
        import pytorch_lightning as pl
    except ModuleNotFoundError: # Google Colab does not have PyTorch Lightning installed by default. Hence, we do it here if necessary
        !pip install --quiet pytorch-lightning>=1.5
        import pytorch_lightning as pl
    
    pl.seed_everything(42)
    """
    np.random.seed(seed_number)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed_number)
    if torch.cuda.is_available():
        torch.manual_seed(seed_number)
        torch.cuda.manual_seed(seed_number)
        torch.cuda.manual_seed_all(seed_number)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchark = False

set_seed_fun(42)


gpu device:  mps


# Function definitions

## Embedding function

In [8]:
# Function to generate embeddings with RoBERTa
def generate_embeddings(texts, tokenizer, model):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length').to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.append(embedding)
    return np.vstack(embeddings)

## Class functions

Definition of functions to:
- Transform a data set into the good format
- create a simple neural network architecture
- create a funtion to transform seconds into a list of (hours, minutes, seconds)

In [9]:
# Define a Dataset class for PyTorch
class CovidNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, device):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [10]:
# Define a simple neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.dropout1 = nn.Dropout(p=0.6)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 256)
        self.dropout2 = nn.Dropout(p=0.6)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(256, 2)

    def forward(self, x):
        x = self.fc1(x)
        x = self.dropout1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.dropout2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x


In [11]:
def sec2hms(ss):
	(hh, ss)=divmod(ss, 3600)
	(mm, ss)=divmod(ss, 60)
	return (hh, mm, ss)

## Training function

Definition of the training function.

In [27]:
input_size = 768
model = SimpleNN(input_size).to(device)
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.01)


gpu device:  mps


0it [00:00, ?it/s]

In [14]:
# function to train the model
def train_model(model, train_dataloader, val_dataloader, epochs, device, writer):

    num_training_steps = 0
    progress_bar = tqdm(range(num_training_steps))
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay = 0.01)
    
    # patience parameters initialization
    best_val_loss = float('inf')
    patience = 2
    patience_counter = 0

    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            progress_bar.update(1)

        avg_train_loss = total_loss / len(train_dataloader)
        writer.add_scalar('Loss/train', avg_train_loss, epoch)

        model.eval()
        val_preds = []
        val_labels = []
        val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                val_preds.extend(preds)
                val_labels.extend(labels.cpu().numpy())

        val_accuracy = accuracy_score(val_labels, val_preds)
        avg_val_loss = val_loss / len(val_dataloader)

        writer.add_scalar('Loss/validation', avg_val_loss, epoch)
        writer.add_scalar('Accuracy/validation', val_accuracy, epoch)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, "
              f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping")
            break
    
    return model

# Training part

## Loading of the dataset 

In [15]:
# Load data from Google Drive
# data_path = '/content/drive/My Drive/final_combined_dataset.csv'
# df = pd.read_csv(data_path)
df = pd.read_csv("final_combined_dataset.csv")

In [16]:
## Just in case of bug: to check where we are in the directories

#import os

#cwd = os.getcwd()  # Get the current working directory (cwd)
#files = os.listdir(cwd)  # Get all the files in that directory
#print("Files in %r: %s" % (cwd, files))

## Model initialization

In [17]:
# Initialize XLNet for embeddings
# Load model directly

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
#####################################
# benchmark beginning for embedding #
#####################################
time_start_embed = time.perf_counter()

In [19]:
# Prepare data
X = df['Text'].values.astype(str)
y = (df['Label'] == 'fake').astype(int).values


## Splitting

In [20]:
# Split the data into training, validation, and test sets
texts_train, texts_temp, labels_train, labels_temp = train_test_split(X, y, test_size=0.3, random_state=42)
texts_val, texts_test, labels_val, labels_test = train_test_split(texts_temp, labels_temp, test_size=0.5, random_state=42)



#    train_texts, val_texts, train_labels, val_labels = train_test_split(
 #       data['text'], data['label'], test_size=0.2, random_state=42
  #  )


## Embedding

In [21]:
# Generate embeddings
max_length = 128
train_dataset = CovidNewsDataset(texts_train, labels_train, tokenizer, max_length, device = device)
val_dataset = CovidNewsDataset(texts_val, labels_val, tokenizer, max_length, device = device)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle = False)

In [22]:
#####################################
# benchmark ending for embedding    #
#####################################
time_end_embed = time.perf_counter()

## Training

In [23]:
#####################################
# benchmark beginning for modeling  #
#####################################
time_start_model = time.perf_counter()

In [24]:
# Define the embedding size
input_size = 768  # Embedding size generated by RoBERTa


In [25]:
# running tensorboard
log_dir = os.path.join("runs", datetime.now().strftime("%Y%m%d-%H%M%S"))
writer = SummaryWriter(log_dir)
# Train the model
epochs = 3
model = train_model(model, train_dataloader, val_dataloader, epochs, device= device, writer)
   
# closing tensorboard    
writer.close()

    

 

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [None]:
#####################################
# benchmark ending for modeling     #
#####################################
time_end_model = time.perf_counter()

## Evaluating

In [None]:
# Evaluate the model on the test set

model.eval()
val_preds = []
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        val_preds.extend(preds)

print("Classification Report:")
print(classification_report(labels_val, val_preds, target_names=['Real News', 'Fake News']))

print(f"TensorBoard logs saved to: {log_dir}")
print("To view the TensorBoard, run: tensorboard --logdir=runs")

In [None]:
# Evaluate performance
acc = accuracy_score(labels_test, val_preds)
class_report = classification_report(labels_test, val_preds, target_names=["real", "fake"])
conf_matrix = confusion_matrix(labels_test, val_preds)

print(f"Accuracy: {acc}")
print(f"Classification Report:{class_report}")
print(f"Confusion Matrix:{conf_matrix}")


In [None]:
# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["real", "fake"], yticklabels=["real", "fake"])
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()

## Benchmark results

In [None]:
#####################################
#          Benchmark results        #
#####################################
# calculating the performances
embedding_duration = time_end_embed - time_start_embed
modeling_duration = time_end_model - time_start_model

# formating
embedding_duration_hms = sec2hms(embedding_duration)
modeling_duration_hms = sec2hms(modeling_duration)

# printing the embedding and modeling performances

print(f'Embedding duration : {embedding_duration_hms[0]:.0f}:{embedding_duration_hms[1]:.0f}:{embedding_duration_hms[2]:.3f}')
print(f'Modeling duration : {modeling_duration_hms[0]:.0f}:{modeling_duration_hms[1]:.0f}:{modeling_duration_hms[2]:.3f}')

In [None]:
## we should write a function with an if else condition for when a drive is in and where there's not
# Save the trained model to Google Drive
# final_model_path = '/content/drive/My Drive/fake_news_model_roberta3.pth'
# torch.save(model.state_dict(), final_model_path)
# print(f"Model saved to: {final_model_path}")
torch.save(model.state_dict(), "FakeNews-roberta-base"+ " "+ "model.tar")
