# Environment setting

In [1]:
# Environment setting for Google Colab
#!pip install transformers sentence-transformers tqdm

import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
from tqdm import tqdm


In [2]:
# NOTE(review): the recorded output of this cell shows the build failing inside
# flash_attn's setup.py at `CUDAExtension(...)` in an arm64 conda environment.
# Presumably flash_attn needs a CUDA toolchain to build — TODO confirm, and
# guard or drop this install on machines without CUDA.
!pip install flash_attn


Collecting flash_attn
  Using cached flash_attn-2.6.3.tar.gz (2.6 MB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[20 lines of output][0m
  [31m   [0m fatal : ni ceci ni aucun de ses répertoires parents n'est un dépôt git : .git
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/private/var/folders/67/cf7pdc1j39gg5qhcm6y5nxrc0000gn/T/pip-install-lyp2j9wh/flash-attn_294fd455b1274419a1cfe8b96efd6d63/setup.py", line 179, in <module>
  [31m   [0m     CUDAExtension(
  [31m   [0m   File "/Applications/anaconda3/envs/DL-torch-arm64/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1076, in CUDAExtension
  [31m   [0m     library_dirs +

In [3]:
# Display the device PyTorch would pick: CUDA first, then MPS, then CPU.
# The value is only shown as the cell output; nothing is assigned here.
torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

device(type='mps')

In [4]:
# Setting seed:

def set_seed_fun(seed_number: int):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus the MPS / CUDA generators when those backends
    are available). When CUDA is available, cuDNN is additionally switched to
    deterministic mode with auto-tuning (``benchmark``) disabled.

    Args:
        seed_number: Seed value applied to every generator.

    Returns:
        None.

    Note:
        ``pytorch_lightning.seed_everything(seed_number)`` is a possible
        alternative if PyTorch Lightning is installed.
    """
    import random  # local import so the cell does not depend on the import cell

    random.seed(seed_number)
    np.random.seed(seed_number)

    # Seed torch unconditionally: previously this only happened when CUDA was
    # available, leaving CPU and MPS runs unseeded.
    torch.manual_seed(seed_number)

    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed_number)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_number)
        torch.cuda.manual_seed_all(seed_number)
        torch.backends.cudnn.deterministic = True
        # The attribute is `benchmark`; the former spelling `benchark` only
        # created an unused attribute and left auto-tuning enabled.
        torch.backends.cudnn.benchmark = False

set_seed_fun(42)


In [5]:
# Set up the device (cuda, mps, or cpu)
# MPS is now part of the fallback chain, matching this comment and the other
# device-selection cells in the notebook; previously this cell silently fell
# back to CPU whenever CUDA was unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")


In [6]:
# Load the Stella embedding model and its tokenizer by model id.
# trust_remote_code=True lets transformers run the modeling code shipped with
# the model; the recorded ImportError of this cell comes from that modeling file.
model_name = "dunzhang/stella_en_1.5B_v5"

model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model.to(device)  # move the weights to the selected device
model.eval()      # inference mode: disables dropout and similar layers

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


ImportError: This modeling file requires the following packages that were not found in your environment: flash_attn. Run `pip install flash_attn`

In [None]:
# Import dataset from github
# Raw URL of the CSV file
# The data-preparation cell reads the 'Text' and 'Label' columns of this frame.
url = 'https://raw.githubusercontent.com/Anerol18/Fake_News_Detector_NLP_DeepLearning_Project/main/final_combined_dataset.csv'
df = pd.read_csv(url)

In [None]:
# Prepare data
# X: the 'Text' column coerced to str (a missing value becomes the literal string 'nan').
# y: 1 where 'Label' is exactly 'fake', 0 for every other label value.
X = df['Text'].values.astype(str)
y = (df['Label'] == 'fake').astype(int).values


In [None]:
# Split the data into training, validation, and test sets
# First split: 70% train / 30% held out. Second split: the held-out 30% is cut
# in half, giving roughly 15% validation and 15% test. Both splits use
# random_state=42; no stratification is requested.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


# Embedding

In [None]:
# Modified function without dimension reduction
def generate_stella_embeddings(texts, tokenizer, model, batch_size=32, device=None):
    """
    Embed texts by mean-pooling the model output over non-padding tokens.

    For each batch the texts are tokenized (padded to the longest item,
    truncated to 512 tokens), passed through ``model``, and the first element
    of the model output is averaged over the positions where the attention
    mask is set. Each pooled vector is then row-normalized with ``normalize``.

    Args:
        texts: Sequence of texts (list, tuple or NumPy array); items are
            converted to ``str`` before tokenization.
        tokenizer: Callable returning a mapping with an ``"attention_mask"``
            entry and a ``.to(device)`` method.
        model: Module called as ``model(**inputs)``; element ``[0]`` of its
            output is pooled.
        batch_size: Number of texts per forward pass (must be >= 1).
        device: Device the tokenized inputs are moved to. Defaults to the
            device of the model's first parameter, so inputs always end up
            where the model is, whatever a notebook-level ``device`` holds.

    Returns:
        2-D NumPy array with one row per input text. For empty input an array
        with zero rows is returned (its width is ``model.config.hidden_size``
        when that attribute exists, otherwise 0).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        # A NumPy slice is not accepted as a batch of strings; hand over a plain list.
        batch_texts = [str(text) for text in texts[i:i + batch_size]]

        with torch.no_grad():
            inputs = tokenizer(batch_texts, padding="longest", truncation=True, max_length=512, return_tensors="pt").to(device)
            attention_mask = inputs["attention_mask"]
            outputs = model(**inputs)[0]
            # Zero out padding positions so they do not contribute to the sum.
            last_hidden = outputs.masked_fill(~attention_mask[..., None].bool(), 0.0)
            # clamp guards against a division by zero for an all-padding row.
            token_counts = attention_mask.sum(dim=1)[..., None].clamp(min=1)
            embeddings_batch = last_hidden.sum(dim=1) / token_counts
            # .float() keeps the NumPy conversion valid for reduced-precision outputs.
            embeddings_batch = normalize(embeddings_batch.float().cpu().numpy())

        embeddings.append(embeddings_batch)

    if not embeddings:
        # np.vstack([]) raises; return an explicit empty result instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(embeddings)

In [None]:
#####################################
# benchmark beginning for embedding #
#####################################
# Start mark for the embedding timer. Every cell executed between this mark and
# time_end_embed (including the list-conversion cell) is part of the measured interval.
time_start_embed = time.perf_counter()

In [None]:
# Initialize the Vertex AI TextEmbeddingModel
# embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-004")

In [None]:
# Ensure data is in the correct format
# Each split that is still a NumPy array is replaced by a plain Python list;
# anything else is left untouched.
if isinstance(X_train, np.ndarray):
    X_train = X_train.tolist()

if isinstance(X_val, np.ndarray):
    X_val = X_val.tolist()

if isinstance(X_test, np.ndarray):
    X_test = X_test.tolist()


In [None]:
# Generate embeddings for the train, validation, and test sets
# One call per split, in order, each with the function's default batch size.
# Kept as three separate statements so that a failure in a later split leaves
# the earlier results bound in the kernel.
X_train_embeddings = generate_stella_embeddings(X_train, tokenizer, model)
X_val_embeddings = generate_stella_embeddings(X_val, tokenizer, model)
X_test_embeddings = generate_stella_embeddings(X_test, tokenizer, model)


In [None]:
#####################################
# benchmark ending for embedding    #
#####################################
# End mark for the embedding timer; the benchmark cell subtracts time_start_embed from it.
time_end_embed = time.perf_counter()

# Training part

## Class functions

Definition of helpers to:
- Wrap a data set (features and labels) in a PyTorch `Dataset`
- Create a simple neural network architecture
- Convert a duration in seconds into an (hours, minutes, seconds) tuple

In [None]:
# Define a Dataset class for PyTorch
class NewsDataset(Dataset):
    """
    Map-style dataset pairing feature vectors with integer class labels.

    Features and labels are converted to tensors once, at construction time,
    instead of building a new tensor on every ``__getitem__`` call. Labels are
    read positionally, so a pandas Series with a non-default index works too.

    Args:
        X: Array-like of feature rows (one row per sample); stored as float32.
        y: Array-like of class labels (one per sample); stored as int64.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # as_tensor avoids a copy when dtype already matches, so the tensors may
        # share memory with the arrays that were passed in.
        features = torch.as_tensor(X if isinstance(X, torch.Tensor) else np.asarray(X), dtype=torch.float32)
        labels = torch.as_tensor(y if isinstance(y, torch.Tensor) else np.asarray(y), dtype=torch.long)

        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, got {features.shape[0]} and {labels.shape[0]}"
            )

        self.X = features
        self.y = labels

    def __len__(self):
        """Return the number of samples."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]



In [None]:
# input_size = 1536 / 768 / 384 / 192

In [None]:
# Define a simple neural network
class SimpleNN(nn.Module):
    """
    Fully connected classifier: input_size -> 3072 -> 3072 -> 768 -> 2.

    Every hidden linear layer is followed by Dropout(p=0.6) and then ReLU.
    The last linear layer returns the two raw class scores (no softmax).

    Args:
        input_size: Number of features in each input vector.
    """

    def __init__(self, input_size):
        super().__init__()
        wide, narrow, n_classes, drop_p = 3072, 768, 2, 0.6

        # First hidden stage
        self.fc0 = nn.Linear(input_size, wide)
        self.dropout0 = nn.Dropout(p=drop_p)
        self.relu0 = nn.ReLU()

        # Second hidden stage
        self.fc01 = nn.Linear(wide, wide)
        self.dropout01 = nn.Dropout(p=drop_p)
        self.relu01 = nn.ReLU()

        # Third hidden stage
        self.fc1 = nn.Linear(wide, narrow)
        self.dropout1 = nn.Dropout(p=drop_p)
        self.relu1 = nn.ReLU()

        # Output layer (extra 768/384-wide stages were tried earlier and are disabled)
        self.fc3 = nn.Linear(narrow, n_classes)

    def forward(self, x):
        """Run the three hidden stages and return the output scores."""
        hidden = self.relu0(self.dropout0(self.fc0(x)))
        hidden = self.relu01(self.dropout01(self.fc01(hidden)))
        hidden = self.relu1(self.dropout1(self.fc1(hidden)))
        return self.fc3(hidden)


In [None]:
def sec2hms(ss):
    """
    Split a duration in seconds into hours, minutes and remaining seconds.

    Args:
        ss: Duration in seconds (int or float).

    Returns:
        Tuple ``(hours, minutes, seconds)`` obtained with two ``divmod`` steps.
    """
    hours, remainder = divmod(ss, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds

## Training function

Definition of the training function.

In [None]:
# Function to train the model
def train_model(X_train, y_train, X_val, y_val, input_size,
                num_epochs=40, patience=2, batch_size=64, lr=2e-5,
                weight_decay=0.01, device=None, restore_best=True):
    """
    Train a ``SimpleNN`` classifier with AdamW, cross-entropy and early stopping.

    After every epoch the model is evaluated on the validation set. Training
    stops once the validation loss has failed to improve ``patience`` times
    since the last improvement. By default the weights of the epoch with the
    lowest validation loss are restored before returning; previously the
    weights of the last (already degraded) epoch were returned.

    Args:
        X_train, y_train: Training features and labels, passed to ``NewsDataset``.
        X_val, y_val: Validation features and labels, passed to ``NewsDataset``.
        input_size: Number of input features expected by ``SimpleNN``.
        num_epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.
        batch_size: Mini-batch size for both data loaders.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        device: Device to train on. When None, the notebook-level ``device``
            variable is used if it exists, otherwise CUDA, MPS or CPU is
            picked in that order.
        restore_best: If True, load the best validation-loss weights into the
            model before returning.

    Returns:
        Tuple ``(model, train_losses, val_losses, train_accuracies,
        val_accuracies)``; each list holds one value per completed epoch.

    Raises:
        ValueError: If the training or the validation set is empty.
    """
    import copy  # local import: only needed to snapshot the best weights

    if device is None:
        # Keep the original behavior of following the notebook-level `device`.
        device = globals().get("device")
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
    device = torch.device(device)

    train_dataset = NewsDataset(X_train, y_train)
    val_dataset = NewsDataset(X_val, y_val)
    if len(train_dataset) == 0 or len(val_dataset) == 0:
        # Averages below divide by the number of batches / samples.
        raise ValueError("training and validation sets must both be non-empty")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = SimpleNN(input_size).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_loss = float('inf')
    best_state = None
    patience_counter = 0

    # Per-epoch history returned to the caller
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

        # Loss is averaged over batches, accuracy over samples.
        avg_train_loss = running_loss / len(train_loader)
        train_accuracy = correct / total

        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)

        # ---- validation pass ----
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct / total

        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}')

        # ---- early stopping bookkeeping ----
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            if restore_best:
                # Snapshot so later, worse epochs cannot overwrite the best weights.
                best_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping")
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, train_losses, val_losses, train_accuracies, val_accuracies

## Model initialization

## Training

In [None]:
#####################################
# benchmark beginning for modeling  #
#####################################
# Start mark for the modeling timer. The interval up to time_end_model covers the
# input_size / device cells and the train_model call.
time_start_model = time.perf_counter()

In [None]:
# Classifier input width, taken from the embeddings that will actually be fed
# to it instead of a hard-coded 1536, so it stays correct if the embedding
# dimensionality changes.
input_size = X_train_embeddings.shape[1]

In [None]:
# Pick the training device: CUDA if present, otherwise MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# Train the model
# NOTE(review): this rebinds `model`, which until now referred to the embedding
# model loaded earlier; from here on `model` is the trained classifier. Re-running
# the embedding cells after this one would therefore pass the classifier to
# generate_stella_embeddings.
model, train_losses, val_losses, train_accuracies, val_accuracies = train_model(X_train_embeddings, y_train, X_val_embeddings, y_val, input_size)



In [None]:
#####################################
# benchmark ending for modeling     #
#####################################
# End mark for the modeling timer; taken before the evaluation cells, so test-set
# inference is not part of the measured interval.
time_end_model = time.perf_counter()

## Evaluating

In [None]:
# Evaluate the model on the test set
# Predictions are computed in mini-batches: the previous version moved the whole
# test set to the device and then ran one forward pass per sample.
model.eval()
eval_batch_size = 256
X_test_tensor = torch.as_tensor(X_test_embeddings, dtype=torch.float32)

y_pred = []
with torch.no_grad():
    for start in range(0, len(X_test_tensor), eval_batch_size):
        X_batch = X_test_tensor[start:start + eval_batch_size].to(device)
        outputs = model(X_batch)
        # Index of the larger of the two class scores = predicted label
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.cpu().tolist())


In [None]:
# Evaluate performance
# Labels were encoded as 1 for 'fake' and 0 otherwise when y was built, so the
# target_names order ["real", "fake"] corresponds to classes 0 and 1.
acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=["real", "fake"])
conf_matrix = confusion_matrix(y_test, y_pred)

# Plain-text summary; the confusion matrix is printed on the same line as its label.
print(f"Accuracy: {acc}")
print(f"Classification Report:\n{class_report}")
print(f"Confusion Matrix:{conf_matrix}")


In [None]:
# Draw the confusion matrix as an annotated heatmap (rows: true, columns: predicted).
class_names = ["real", "fake"]
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Side-by-side training history: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: loss per epoch (epochs numbered from 1)
loss_epochs = range(1, len(train_losses) + 1)
loss_ax.plot(loss_epochs, train_losses, label='Train Loss')
loss_ax.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.legend()

# Right panel: accuracy per epoch
acc_epochs = range(1, len(train_accuracies) + 1)
acc_ax.plot(acc_epochs, train_accuracies, label='Train Accuracy')
acc_ax.plot(range(1, len(val_accuracies) + 1), val_accuracies, label='Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Dump the raw per-epoch history, one labelled line per metric.
for label, history in (
    ("Train Losses:", train_losses),
    ("Validation Losses:", val_losses),
    ("Train Accuracies:", train_accuracies),
    ("Validation Accuracies:", val_accuracies),
):
    print(label, history)

## Benchmark results

In [None]:
#####################################
#          Benchmark results        #
#####################################
# Embedding phase: elapsed seconds, then split into (hours, minutes, seconds)
embedding_duration = time_end_embed - time_start_embed
embedding_duration_hms = sec2hms(embedding_duration)

# Modeling phase: same computation for the training interval
modeling_duration = time_end_model - time_start_model
modeling_duration_hms = sec2hms(modeling_duration)

# Report both durations as h:m:s, with three decimals on the seconds field
print(f'Embedding duration : {embedding_duration_hms[0]:.0f}:{embedding_duration_hms[1]:.0f}:{embedding_duration_hms[2]:.3f}')
print(f'Modeling duration : {modeling_duration_hms[0]:.0f}:{modeling_duration_hms[1]:.0f}:{modeling_duration_hms[2]:.3f}')

In [None]:
# TODO: wrap saving in a function that writes to Google Drive when a drive is
# mounted and to the local working directory otherwise.
# Save the trained model to Google Drive
# final_model_path = '/content/drive/My Drive/fake_news_model_roberta3.pth'
# torch.save(model.state_dict(), final_model_path)
# print(f"Model saved to: {final_model_path}")
# Saves only the state_dict of `model` (the classifier returned by train_model
# at this point) to the current working directory.
# NOTE(review): the '.tar' suffix is just a file name here — presumably no tar
# archive is produced by torch.save; confirm the intended extension.
torch.save(model.state_dict(), "stella_model.tar")
