# <b><span style='color:#F1A424'>|</span> Import Libraries</b><a class='anchor' id='import_libraries'></a> [↑](#top) 

***

Import all the required libraries for this notebook.

In [4]:
import matplotlib.pyplot as plt
import pandas as pd

import warnings
# import wandb


from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# ======= OPTIONS =========
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Current device is: {device}")
warnings.filterwarnings("ignore")
!mkdir output

Current device is: cuda
mkdir: cannot create directory ‘output’: File exists


In [5]:
import random
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
from collections import namedtuple
from dataclasses import dataclass, field, asdict
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf
# from huggingface_hub import HfApi

# import evaluate
import numpy as np
# from datasets import load_dataset
# from transformers import Trainer
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

# <b><span style='color:#F1A424'>|</span> Load Data</b><a class='anchor' id='load_data'></a> [↑](#top) 

***

Load data.

In [6]:
import pandas as pd
import re
import unicodedata
from tqdm import tqdm

# Load DataFrame
train_df = pd.read_parquet('/home/HardDisk/binh230_intern/Mamba-AI-generated-text-detection/data/Mix-AI-Dataset/train_essays.parquet')
valid_df = pd.read_parquet('/home/HardDisk/binh230_intern/Mamba-AI-generated-text-detection/data/Mix-AI-Dataset/valid_essays.parquet')

# Define characters to remove
char_to_remove = ['{', '£', '\x97', '¹', 'å', '\\', '\x85', '<', '\x99', 
                  'é', ']', '+', 'Ö', '\xa0', '>', '|', '\x80', '~', '©', 
                  '/', '\x93', '$', 'Ó', '²', '^', ';', '`', 'á', '*', '(', 
                  '¶', '®', '[', '\x94', '\x91', '#', '-', 'ó', ')', '}', '=']

# Define preprocessing function
def preprocess_text(text, strategy='light'):    
    if strategy == "none":
        text = text
    elif strategy == "light":
        text = text.encode("ascii", "ignore").decode('ascii')        
        text = text.strip()
        text = text.strip("\"")
        for c in char_to_remove:
            text = text.replace(c, "")
        if text and text[-1] != ".":
            text = text.split(".")
            text = ".".join(text[:-1])
            text += "."
    else:
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s.,;?!:()\'\"%-]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply preprocessing with progress bar
tqdm.pandas(desc="Processing Text")
train_df['text'] = train_df['text'].progress_apply(lambda x: preprocess_text(x, strategy='light'))
valid_df['text'] = valid_df['text'].progress_apply(lambda x: preprocess_text(x, strategy='light'))

# Display the first few rows to verify
print("Trainging DF Processing")
print(train_df.info())
print("Testing DF Processing")
print(valid_df.info())



Processing Text: 100%|██████████| 165767/165767 [00:03<00:00, 44099.68it/s]
Processing Text: 100%|██████████| 1679/1679 [00:00<00:00, 45516.30it/s]


Trainging DF Processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165767 entries, 0 to 165766
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         165767 non-null  object
 1   prompt_id  165767 non-null  int64 
 2   text       165767 non-null  object
 3   generated  165767 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 5.1+ MB
None
Testing DF Processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1679 entries, 0 to 1678
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1679 non-null   object
 1   prompt_id  1679 non-null   int64 
 2   text       1679 non-null   object
 3   generated  1679 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 52.6+ KB
None


# <b><span style='color:#F1A424'>|</span> Dataset</b><a class='anchor' id='dataset'></a> [↑](#top) 

***

    
We need to get the `max_len` from our `tokenizer`. We create a `tqdm` iterator and for each text we extract the tokenized length. Then we get the maximum value and we add 3 for the special tokens `CLS`, `SEP`, `SEP`.

- [Hugging Face Padding and Truncation](https://huggingface.co/docs/transformers/pad_truncation): check truncation to `max_length` or `True` (batch max length).

One sample from the dataset should look as following:
```python
{
	'inputs': {
		'input_ids': tensor([1, 279, 883, ..., 0, 0]),
		'token_type_ids': tensor([0, 0, 0, ..., 0, 0]),
		'attention_mask': tensor([1, 1, 1, ..., 0, 0])
	},
	'label': tensor([0.0]),
	'ids': '000e8c3c7ddb'
}
```
You can check it by running the cell below.

# <b><span style='color:#F1A424'>|</span> Model</b><a class='anchor' id='model'></a> [↑](#top) 

***

In [7]:
train_df.head()

Unnamed: 0,id,prompt_id,text,generated
0,e_ddxvqx5i,0,"In recent years, there has been a growing move...",1
1,e_hi0yzrcv,0,\nWhy not cars in our life\n\nI have ever met ...,1
2,e_uesv4xha,0,A car is considered by many a nessecity for ev...,1
3,e_2tl5ylwy,0,"H\n\nello fellow citezens , we are here to inf...",0
4,e_s6ci4vj0,0,Have you ever known how if feels not being abl...,1


In [8]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Assuming train_df is your DataFrame with a 'text' column
# Convert the 'id' column to a string to avoid ArrowTypeError
# df['id'] = df['id'].astype(str)

# Rename the 'generated' column to 'labels'
train_df.rename(columns={'generated': 'labels'}, inplace=True)
valid_df.rename(columns={'generated': 'labels'}, inplace=True)

# # Access the train and test datasets
# train_dataset, test_dataset = train_test_split(df, test_size=0.05)

# Combine the splits into a DatasetDict
dataset_dict = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(valid_df),
})

# Display the first example from each dataset
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt_id', 'text', 'labels'],
        num_rows: 165767
    })
    test: Dataset({
        features: ['id', 'prompt_id', 'text', 'labels'],
        num_rows: 1679
    })
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained("OuteAI/Lite-Oute-2-Mamba2Attn-Base")
# Add eos tokens
# tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token
def preprocess_function(examples):
    # Tokenize the text with truncation
    samples = tokenizer(examples['text'], 
                        truncation=True, 
                        padding='max_length', 
                        max_length=512,         
                        return_tensors="pt")
    
    return samples

# Apply preprocessing to the dataset
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)


Map: 100%|██████████| 165767/165767 [00:52<00:00, 3145.23 examples/s]
Map: 100%|██████████| 1679/1679 [00:00<00:00, 2251.01 examples/s]


In [10]:
# from transformers import AutoModel, AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification
# import torch
# from model_sequence_classification import CustomModelForSequenceClassification

# config = AutoConfig.from_pretrained('/home/HardDisk/binh230_intern/model_config/mambaformer_config.json', trust_remote_code=True,)
# model1 = AutoModelForCausalLM.from_config(
#     config,
#     trust_remote_code=True,
#     # Enable flash attention if supported
#     attn_implementation="flash_attention_2",
#     torch_dtype=torch.float16,
# )
# model = CustomModelForSequenceClassification(config, model1)
# model.cuda()

In [11]:
import torch
import numpy as np
from transformers import DataCollatorWithPadding, AutoConfig, AutoModelForCausalLM
from model_sequence_classification import CustomModelForSequenceClassification


# Dataset and Tokenizer Setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    "OuteAI/Lite-Oute-2-Mamba2Attn-Base",
    # To allow custom modeling files
    trust_remote_code=True,

    # If you have installed flash attention 2
    attn_implementation="flash_attention_2",
    # torch_dtype=torch.float16
)
config = AutoConfig.from_pretrained("OuteAI/Lite-Oute-2-Mamba2Attn-Base", trust_remote_code=True,)
model = CustomModelForSequenceClassification(config, model)
# model.lm_head = torch.nn.Linear(model.lm_head.in_features, 2)
model.to("cuda")


You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Mamba2ForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Mamba2Model is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_d

CustomModelForSequenceClassification(
  (backbone): Mamba2ForCausalLM(
    (backbone): Mamba2Model(
      (embeddings): Embedding(32768, 1024)
      (layers): ModuleList(
        (0-5): 6 x Mamba2Block(
          (norm): Mamba2RMSNorm()
          (mixer): Mamba2Mixer(
            (in_proj): Linear(in_features=1024, out_features=4384, bias=False)
            (conv1d): Conv1d(2304, 2304, kernel_size=(4,), stride=(1,), padding=(3,), groups=2304)
            (act): SiLU()
            (norm): Mamba2RMSNorm()
            (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
          )
        )
        (6): Mamba2Block(
          (norm): Mamba2RMSNorm()
          (mixer): Mamba2FlashAttention2(
            (rotary_emb): Mamba2RotaryEmbedding()
            (in_proj): Linear(in_features=1024, out_features=6144, bias=False)
            (conv1d): Conv1d(6144, 6144, kernel_size=(4,), stride=(1,), padding=(3,), groups=6144)
            (out_proj): Linear(in_features=2048, out_featur

In [12]:
# Set seed cho hàm random
random.seed(42)

# Tạo tập train và test
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]
#  Drop the 'prompt_id' feature from both datasets
train_dataset = train_dataset.remove_columns(["text"]).remove_columns(["id"])
test_dataset = test_dataset.remove_columns(["text"]).remove_columns(["id"])

# Tạo tập evaluation để đánh giá trong lúc train
# Do số lượng tập test lớn nên chỉ lấy mẫu 1% tập dữ liệu test để đánh giá
# total_samples = len(test_dataset)
# eval_samples = int(0.5 * total_samples)
# eval_indices = random.sample(range(total_samples), eval_samples)
# eval_dataset = test_dataset.select(eval_indices)

In [13]:
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def TestModel(test_data_loader, model, criterion):
    test_losses = []
    all_predictions = []
    all_actual_values = []
    
    with torch.no_grad():
        for batch in tqdm(test_data_loader):
            if len(batch.input_ids) == 0:
                # Safeguard against empty sequences.
                continue

            # Have shape (batch size, token count)
            token_sequences = batch.input_ids.cuda()
            attention_masks = batch.attention_mask.cuda()
            # Has shape (batch size)
            labels = batch.labels.cuda()

            with torch.cuda.amp.autocast():
                output = model(token_sequences, attention_masks)

                logits = output.logits
                last_token_indices = torch.clamp(attention_masks.sum(dim=1) - 1, min=0)
                raw_predictions = torch.gather(
                    logits, 
                    dim=1, 
                    index=last_token_indices.unsqueeze(1).unsqueeze(2).expand(-1, -1, logits.shape[2])
                ).squeeze(1)
                
                loss = criterion(raw_predictions, labels)

            test_losses.append(loss.detach().cpu())

            scaled_predictions = raw_predictions.softmax(dim=1)[:, 1]
            all_predictions.extend(scaled_predictions.cpu().numpy())
            all_actual_values.extend(labels.cpu().numpy())

    all_predictions, all_actual_values = np.array(all_predictions), np.array(all_actual_values)

    auroc = roc_auc_score(all_actual_values, all_predictions)

    return auroc, np.mean(test_losses)


In [14]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import Adafactor, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Accuracy Calculation
def compute_accuracy(predictions, labels):
    preds = torch.argmax(predictions, dim=1)
    correct = torch.sum(preds == labels)
    return correct.item() / len(labels)

# Metric Calculation
def compute_metrics(eval_pred):
    """
    Compute metrics for Hugging Face Trainer, including AUROC.
    
    Args:
        eval_pred: tuple of (predictions, labels) where predictions are logits.

    Returns:
        dictionary containing the computed metrics, including AUROC.
    """
    logits, labels = eval_pred
    preds = logits.argmax(-1)  # Get the predicted class

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

    # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    # Calculate probabilities using softmax on logits
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    # For binary classification, take the probability of the positive class (class 1)
    auroc = roc_auc_score(labels, probs[:, 1])

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auroc': auroc
    }

# Variables for the experiment
label_smoothing = 0.03
output_subdir = '3090_1'
max_learning_rates = [5e-6]
accumulation_steps = 4  # Number of steps to accumulate gradients

# Run experiment
for max_learning_rate in max_learning_rates:
    print(f'lr = {max_learning_rate}, label_smoothing = {label_smoothing}, output_subdir = {output_subdir}')
    
    # Dataloader Setup
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_data_loader = DataLoader(
        train_dataset, 
        batch_size=32,  # Adjust as necessary
        num_workers=4, 
        shuffle=True, 
        pin_memory=True, 
        collate_fn=data_collator
    )
    test_data_loader = DataLoader(
        test_dataset, 
        batch_size=8,  # Adjust as necessary
        num_workers=4, 
        shuffle=False, 
        pin_memory=True, 
        collate_fn=data_collator
    )

    # Optimizer, Criterion, and Scaler Setup
    optimizer = AdamW(
        model.parameters(),
        lr=max_learning_rate,  # Define your learning_rate
        weight_decay=0.1
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    total_step_count = len(train_data_loader)
    lr_schedule = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=max_learning_rate,
        total_steps=total_step_count // accumulation_steps,
        pct_start=0.1,
        anneal_strategy='linear',
        cycle_momentum=False
    )

    best_auroc = -99999999
    train_losses = []
    model.train()

    # Tracking the number of rows processed
    total_rows_processed = 0
    row_threshold = 1500

    print_steps = 500  # Log training accuracy/loss every 500 steps

    for batch_index, train_batch in enumerate(tqdm(train_data_loader)):
        if len(train_batch.input_ids) == 0:
            continue

        # Send data to GPU(s)
        token_sequences = train_batch.input_ids.to("cuda")
        attention_masks = train_batch.attention_mask.to("cuda")
        labels = train_batch.labels.to("cuda")

        # Zero gradients before the new forward pass
        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=True):
            output = model(token_sequences, attention_masks)
            raw_predictions = output.logits

            loss = criterion(raw_predictions, labels)

        # Training accuracy
        accuracy = compute_accuracy(raw_predictions, labels)

        # Scale loss and perform backward pass
        scaler.scale(loss).backward()

        # Update gradients and optimizer every 'accumulation_steps'
        if (batch_index + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            lr_schedule.step()
            optimizer.zero_grad()  # Reset gradients after optimizer step

        train_losses.append(loss.detach().cpu())

        # Log training accuracy and loss every 500 steps
        if (batch_index + 1) % print_steps == 0:
            avg_train_loss = sum(train_losses) / len(train_losses)
            print(f"Step {batch_index+1}/{total_step_count}: Avg Train Loss = {avg_train_loss:.4f}, Train Accuracy = {accuracy*100:.2f}%")
            train_losses = []  # Reset train loss tracking for the next 500 steps

        # Increment the number of rows processed
        total_rows_processed += len(train_batch.input_ids)

        # Evaluate the model every 'row_threshold'
        if total_rows_processed >= row_threshold:
            model.eval()
            val_logits, val_labels = [], []

            with torch.no_grad():
                for test_batch in test_data_loader:
                    test_token_sequences = test_batch.input_ids.to("cuda")
                    test_attention_masks = test_batch.attention_mask.to("cuda")
                    test_labels = test_batch.labels.to("cuda")
                    test_output = model(test_token_sequences, test_attention_masks)
                    val_logits.append(test_output.logits.cpu().numpy())
                    val_labels.append(test_labels.cpu().numpy())

            # Concatenate all logits and labels for metric calculation
            val_logits = np.concatenate(val_logits)
            val_labels = np.concatenate(val_labels)
            metrics = compute_metrics((val_logits, val_labels))
            
            print(f'Validation Metrics: {metrics}')

            total_rows_processed = 0  # Reset after each evaluation 
            # Save model and reset
            torch.save(model.state_dict(), f'/kaggle/working/Models/MambaFormer2-Step-{batch_index+1}-Loss-{int(avg_train_loss*1000)}.pth')
    # Save model and reset
    torch.save(model.state_dict(), f'/kaggle/working/Models/MambaFormer2-Step-{batch_index+1}-Loss-{int(avg_train_loss*1000)}.pth')
    print(f'Best AUROC: {best_auroc}')


lr = 5e-06, label_smoothing = 0.03, output_subdir = 3090_1


  0%|          | 0/5181 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

RuntimeError: FlashAttention only support fp16 and bf16 data type

: 

In [11]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.metrics import confusion_matrix

def binarize(x, threshold):
    if x > threshold:
        x = 1
    else:
        x = 0
    return x

# Assuming df is your pandas DataFrame
oof_df["binary"] = oof_df["preds"].apply(lambda x: binarize(x, 0.5))
true_labels = oof_df["generated"].values
predicted_labels = oof_df["binary"].values

# Get the unique classes from both true and predicted labels
classes = np.unique(np.concatenate((true_labels, predicted_labels)))

# Compute the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=classes)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")

ModuleNotFoundError: No module named 'seaborn'