# <b><span style='color:#F1A424'>|</span> Import Libraries</b><a class='anchor' id='import_libraries'></a> [↑](#top) 

***

Import all the required libraries for this notebook.

In [2]:
import matplotlib.pyplot as plt
import pandas as pd

import warnings
# import wandb


from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# ======= OPTIONS =========
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Current device is: {device}")
warnings.filterwarnings("ignore")
!mkdir output

Current device is: cuda
mkdir: cannot create directory ‘output’: File exists


In [3]:
import random
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
from collections import namedtuple
from dataclasses import dataclass, field, asdict
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf
# from huggingface_hub import HfApi

# import evaluate
import numpy as np
# from datasets import load_dataset
# from transformers import Trainer
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

import wandb
from huggingface_hub import login

login(token="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")
notes = "Train Mamba With 400k row dataset"

# <b><span style='color:#F1A424'>|</span> Load Data</b><a class='anchor' id='load_data'></a> [↑](#top) 

***

Load data.

In [4]:
import pandas as pd
import re
import unicodedata
from tqdm import tqdm

# Load DataFrame
train_df = pd.read_parquet('./data/train_essays.parquet')
valid_df = pd.read_parquet('./data/valid_essays.parquet')

# Define characters to remove
char_to_remove = ['{', '£', '\x97', '¹', 'å', '\\', '\x85', '<', '\x99', 
                  'é', ']', '+', 'Ö', '\xa0', '>', '|', '\x80', '~', '©', 
                  '/', '\x93', '$', 'Ó', '²', '^', ';', '`', 'á', '*', '(', 
                  '¶', '®', '[', '\x94', '\x91', '#', '-', 'ó', ')', '}', '=']

# Define preprocessing function
def preprocess_text(text, strategy='light'):    
    if strategy == "none":
        text = text
    elif strategy == "light":
        text = text.encode("ascii", "ignore").decode('ascii')        
        text = text.strip()
        text = text.strip("\"")
        for c in char_to_remove:
            text = text.replace(c, "")
        if text and text[-1] != ".":
            text = text.split(".")
            text = ".".join(text[:-1])
            text += "."
    else:
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s.,;?!:()\'\"%-]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply preprocessing with progress bar
tqdm.pandas(desc="Processing Text")
train_df['text'] = train_df['text'].progress_apply(lambda x: preprocess_text(x, strategy='light'))
valid_df['text'] = valid_df['text'].progress_apply(lambda x: preprocess_text(x, strategy='light'))

# Display the first few rows to verify
print("Trainging DF Processing")
print(train_df.info())
print("Testing DF Processing")
print(valid_df.info())



Processing Text: 100%|██████████| 165767/165767 [00:05<00:00, 28178.74it/s]
Processing Text: 100%|██████████| 1679/1679 [00:00<00:00, 30375.81it/s]


Trainging DF Processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165767 entries, 0 to 165766
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         165767 non-null  object
 1   prompt_id  165767 non-null  int64 
 2   text       165767 non-null  object
 3   generated  165767 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 5.1+ MB
None
Testing DF Processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1679 entries, 0 to 1678
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1679 non-null   object
 1   prompt_id  1679 non-null   int64 
 2   text       1679 non-null   object
 3   generated  1679 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 52.6+ KB
None


# <b><span style='color:#F1A424'>|</span> Dataset</b><a class='anchor' id='dataset'></a> [↑](#top) 

***

    
We need to get the `max_len` from our `tokenizer`. We create a `tqdm` iterator and for each text we extract the tokenized length. Then we get the maximum value and we add 3 for the special tokens `CLS`, `SEP`, `SEP`.

- [Hugging Face Padding and Truncation](https://huggingface.co/docs/transformers/pad_truncation): check truncation to `max_length` or `True` (batch max length).

One sample from the dataset should look as following:
```python
{
	'inputs': {
		'input_ids': tensor([1, 279, 883, ..., 0, 0]),
		'token_type_ids': tensor([0, 0, 0, ..., 0, 0]),
		'attention_mask': tensor([1, 1, 1, ..., 0, 0])
	},
	'label': tensor([0.0]),
	'ids': '000e8c3c7ddb'
}
```
You can check it by running the cell below.

import wandb
# Định nghĩa tên project để log thông tin quá trình huấn luyện trên wandb
os.environ["WANDB_PROJECT"] = "mamba_LLM_detect_binary_classification"
os.environ["WANDB_API_KEY "] = "e7432690ce6d9bfdee410567f89d7e38844ed584"


wandb.login()
# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="mamba_LLM_detect_binary_classification",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 6e-5,
    "architecture": "Mamba-130m-with-Linear-Head",
    "dataset": "Test",
    "epochs": 1,
    "lr_scheduler_type": "cosine"
    }
)

# <b><span style='color:#F1A424'>|</span> Model</b><a class='anchor' id='model'></a> [↑](#top) 

***

In [5]:
train_df.head()

Unnamed: 0,id,prompt_id,text,generated
0,e_ddxvqx5i,0,"In recent years, there has been a growing move...",1
1,e_hi0yzrcv,0,\nWhy not cars in our life\n\nI have ever met ...,1
2,e_uesv4xha,0,A car is considered by many a nessecity for ev...,1
3,e_2tl5ylwy,0,"H\n\nello fellow citezens , we are here to inf...",0
4,e_s6ci4vj0,0,Have you ever known how if feels not being abl...,1


In [6]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Assuming train_df is your DataFrame with a 'text' column
# Convert the 'id' column to a string to avoid ArrowTypeError
# df['id'] = df['id'].astype(str)

# Rename the 'generated' column to 'labels'
train_df.rename(columns={'generated': 'labels'}, inplace=True)
valid_df.rename(columns={'generated': 'labels'}, inplace=True)

# # Access the train and test datasets
# train_dataset, test_dataset = train_test_split(df, test_size=0.05)

# Combine the splits into a DatasetDict
dataset_dict = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(valid_df),
})

# Display the first example from each dataset
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt_id', 'text', 'labels'],
        num_rows: 165767
    })
    test: Dataset({
        features: ['id', 'prompt_id', 'text', 'labels'],
        num_rows: 1679
    })
})

In [7]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
# Add eos tokens
# tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token
def preprocess_function(examples):
    # Tokenize the text with truncation
    samples = tokenizer(examples['text'], 
                        truncation=True, 
                        padding='max_length', 
                        max_length=512,         
                        return_tensors="pt")
    
    return samples

# Apply preprocessing to the dataset
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 165767/165767 [01:32<00:00, 1792.16 examples/s]
Map: 100%|██████████| 1679/1679 [00:00<00:00, 1976.39 examples/s]


In [8]:
import torch
import numpy as np
from transformers import DataCollatorWithPadding

# Dataset and Tokenizer Setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import Zamba2Config, Zamba2ForCausalLM, Zamba2ForSequenceClassification,Zamba2Model

model_config = Zamba2Config.from_pretrained("/home/HardDisk/binh230_intern/zamba_test/config.json")
# model_config.num_labels = 10
model = Zamba2ForSequenceClassification(model_config)
model.to("cuda")


Zamba2ForSequenceClassification(
  (model): Zamba2Model(
    (embed_tokens): Embedding(50277, 2048, padding_idx=0)
    (blocks): ModuleList(
      (0): Zamba2AttentionDecoderLayer(
        (self_attn): Zamba2SdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (linear_q_lora_A_list): ParameterList(
              (0): Object of type: Linear
              (1): Object of type: Linear
            (0): Linear(in_features=4096, out_features=128, bias=False)
            (1): Linear(in_features=4096, out_features=128, bias=False)
          )
          (linear_q_lora_B_list): ParameterList(
              (0): Object of type: Linear
              (1): Object of type: Linear
            (0): Linear(in_features=128, out_featur

In [9]:
# Set seed cho hàm random
random.seed(42)

# Tạo tập train và test
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]
#  Drop the 'prompt_id' feature from both datasets
train_dataset = train_dataset.remove_columns(["text"]).remove_columns(["id"])
test_dataset = test_dataset.remove_columns(["text"]).remove_columns(["id"])

# Tạo tập evaluation để đánh giá trong lúc train
# Do số lượng tập test lớn nên chỉ lấy mẫu 1% tập dữ liệu test để đánh giá
# total_samples = len(test_dataset)
# eval_samples = int(0.5 * total_samples)
# eval_indices = random.sample(range(total_samples), eval_samples)
# eval_dataset = test_dataset.select(eval_indices)

In [10]:
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score

def TestModel(test_data_loader, model, criterion):
    test_losses = []
    all_predictions = []
    all_actual_values = []
    
    with torch.no_grad():
        model.eval()  # Set model to evaluation mode
        for batch in tqdm(test_data_loader):
            if len(batch.input_ids) == 0:
                # Safeguard against empty sequences.
                continue

            # Move tensors to GPU if available
            token_sequences = batch.input_ids.cuda()
            attention_masks = batch.attention_mask.cuda()
            labels = batch.labels.cuda()
            
            # Forward pass
            output = model(input_ids=token_sequences, attention_mask=attention_masks, labels=labels)
            logits = output.logits
            loss = output.loss

            test_losses.append(loss.item())  # Convert loss to scalar before appending

            # Apply softmax to logits to get predicted probabilities
            probabilities = logits.softmax(dim=1)[:, 1]
            all_predictions.extend(probabilities.cpu().numpy())
            all_actual_values.extend(labels.cpu().numpy())

    all_predictions, all_actual_values = np.array(all_predictions), np.array(all_actual_values)

    # Compute AUROC
    auroc = roc_auc_score(all_actual_values, all_predictions)
    
    # Binarize predictions and compute accuracy
    binary_predictions = (all_predictions > 0.6).astype(int)
    accuracy = accuracy_score(all_actual_values, binary_predictions)
    
    return accuracy, auroc, np.mean(test_losses)


In [11]:
train_dataset

Dataset({
    features: ['prompt_id', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 165767
})

In [15]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AdamW, DataCollatorWithPadding
import wandb  # Add wandb

# Set random seed for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set a specific seed value
set_seed(100)

# Accuracy Calculation
def compute_accuracy(predictions, labels):
    preds = torch.argmax(predictions, dim=1)
    correct = torch.sum(preds == labels)
    return correct.item() / len(labels)

# Initialize wandb run
wandb.init(project="Detect AI Generated Text", 
           name="First run of Zamba",
           config={
               "learning_rate": 4e-5,
               "label_smoothing": 0.03,
               "batch_size": 8,
               "num_epochs": 1,
               "optimizer": "AdamW",
               "model": model.config.model_type,
               "model_params": sum(p.numel() for p in model.parameters() if p.requires_grad)
           })

config = wandb.config  # Access the configuration

# Variables for the experiment
label_smoothing = config.label_smoothing
output_subdir = '3090_1'
max_learning_rate = config.learning_rate

# Run experiment
for max_learning_rate in [max_learning_rate]:
    print(f'lr = {max_learning_rate}, label_smoothing = {label_smoothing}, output_subdir = {output_subdir}')
    
    # Dataloader Setup
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_data_loader = DataLoader(
        train_dataset, 
        batch_size=config.batch_size,  
        num_workers=4, 
        shuffle=True, 
        pin_memory=True, 
        collate_fn=data_collator
    )
    test_data_loader = DataLoader(
        test_dataset, 
        batch_size=config.batch_size,  
        num_workers=4, 
        shuffle=False, 
        pin_memory=True, 
        collate_fn=data_collator
    )

    # Optimizer and Criterion Setup
    optimizer = AdamW(
        model.parameters(),
        lr=max_learning_rate,
        weight_decay=0.1
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)

    total_step_count = len(train_data_loader)
    lr_schedule = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer,
        T_max=total_step_count,  
        eta_min=1e-8  
    )

    best_auroc = -99999999
    train_losses = []
    model.train()

    total_rows_processed = 0
    row_threshold = 50000
    print_steps = 500  

    for batch_index, train_batch in enumerate(tqdm(train_data_loader)):
        if len(train_batch.input_ids) == 0:
            continue

        token_sequences = train_batch.input_ids.to("cuda")
        attention_masks = train_batch.attention_mask.to("cuda")
        labels = train_batch.labels.to("cuda")

        optimizer.zero_grad()

        # Forward pass
        output = model(input_ids=token_sequences, attention_mask=attention_masks, labels=labels)
        logits = output.logits
        loss = output.loss

        # Training accuracy
        accuracy = compute_accuracy(logits, labels)

        loss.backward()  
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)  
        optimizer.step()  
        lr_schedule.step()  

        train_losses.append(loss.detach().cpu())

        if (batch_index + 1) % print_steps == 0:
            avg_train_loss = sum(train_losses) / len(train_losses)
            print(f"Step {batch_index+1}/{total_step_count}: Avg Train Loss = {avg_train_loss:.4f}, Train Accuracy = {accuracy*100:.2f}%")
            wandb.log({"train_loss": avg_train_loss, "train_accuracy": accuracy * 100, "learning_rate": lr_schedule.get_last_lr()[0]})
            train_losses = []  

        total_rows_processed += len(train_batch.input_ids)

        if total_rows_processed >= row_threshold:
            model.eval()
            val_accuracy, val_auroc, test_loss = TestModel(test_data_loader, model, criterion)
            model.train()
            
            print(f'Validation Loss: {test_loss:.4f}, Validation Accuracy: {val_accuracy*100:.2f}%, AuROC :{val_auroc*100:.2f}%')
            wandb.log({"val_loss": test_loss, "val_accuracy": val_accuracy * 100, "val_auroc": val_auroc * 100})
            
            total_rows_processed = 0  

            if val_auroc > best_auroc:
                best_auroc = val_auroc
                torch.save(model.state_dict(), f'./Models/BestModel-Val_Accuracy-{val_accuracy*100}%-AuROC_Score-{val_auroc*100}-Loss-{int(test_loss*1000)}.pth')
    
    print(f'Best AUROC: {best_auroc}')
    wandb.log({"best_auroc": best_auroc})

wandb.finish()


lr = 4e-05, label_smoothing = 0.03, output_subdir = 3090_1


  0%|          | 0/20721 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

KeyboardInterrupt: 

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import Adafactor

# Set random seed for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set a specific seed value
set_seed(100)

# Accuracy Calculation
def compute_accuracy(predictions, labels):
    preds = torch.argmax(predictions, dim=1)
    correct = torch.sum(preds == labels)
    return correct.item() / len(labels)

# Variables for the experiment
label_smoothing = 0.03
output_subdir = '3090_1'
max_learning_rates = [2e-5]

# Run experiment
for max_learning_rate in max_learning_rates:
    print(f'lr = {max_learning_rate}, label_smoothing = {label_smoothing}, output_subdir = {output_subdir}')
    
    # Dataloader Setup
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_data_loader = DataLoader(
        train_dataset, 
        batch_size=2,  # Increased batch size since it will be split across GPUs
        num_workers=4, 
        shuffle=True, 
        pin_memory=True, 
        collate_fn=data_collator
    )
    test_data_loader = DataLoader(
        test_dataset, 
        batch_size=2,  # Increased batch size
        num_workers=4, 
        shuffle=False, 
        pin_memory=True, 
        collate_fn=data_collator
    )

    # Optimizer, Criterion, and Scaler Setup
    optimizer = AdamW(
        model.parameters(),
        lr=max_learning_rate,
        weight_decay=0.1
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    total_step_count = len(train_data_loader)
    # Cosine Annealing Scheduler
    lr_schedule = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer,
        T_max=total_step_count,  # The number of steps to reach the minimum learning rate
        eta_min=1e-7  # Minimum learning rate (optional, can be adjusted)
    )

    best_auroc = -99999999
    train_losses = []
    model.train()

    # Tracking the number of rows processed
    total_rows_processed = 0
    row_threshold = 50000

    print_steps = 500  # Log training accuracy/loss every 500 steps

    for batch_index, train_batch in enumerate(tqdm(train_data_loader)):
        if len(train_batch.input_ids) == 0:
            continue

        # Send data to GPU(s)
        token_sequences = train_batch.input_ids.to("cuda")
        attention_masks = train_batch.attention_mask.to("cuda")
        labels = train_batch.labels.to("cuda")

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=True):
            # output = model(token_sequences, attention_masks)
            output = model(input_ids=token_sequences, attention_mask=attention_masks, labels=labels)
            logits = output.logits
            loss = output.loss

        # Training accuracy
        accuracy = compute_accuracy(logits, labels)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        scaler.step(optimizer)
        scaler.update()
        lr_schedule.step()

        train_losses.append(loss.detach().cpu())

        # Log training accuracy and loss every 500 steps
        if (batch_index + 1) % print_steps == 0:
            avg_train_loss = sum(train_losses) / len(train_losses)
            print(f"Step {batch_index+1}/{total_step_count}: Avg Train Loss = {avg_train_loss:.4f}, Train Accuracy = {accuracy*100:.2f}%")
            train_losses = []  # Reset train loss tracking for the next 500 steps

        # Increment the number of rows processed
        total_rows_processed += len(train_batch.input_ids)

        # Evaluate the model every 50,000 rows
        if total_rows_processed >= row_threshold:
            model.eval()
            val_accuracy, val_auroc, test_loss = TestModel(test_data_loader, model, criterion)
            model.train()
            
            print(f'Validation Loss: {test_loss:.4f}, Validation Accuracy: {val_accuracy*100:.2f}%, AuROC :{val_auroc*100:.2f}%')
            
            total_rows_processed = 0  # Reset after each evaluation

            # Save model and reset
            torch.save(model.state_dict(), f'./Models/MambaFormer2-Val_Accuracy-{val_accuracy*100}%-AuROC_Score-{val_auroc*100}-Loss-{int(test_loss*1000)}.pth')
    print(f'Best AUROC: {best_auroc}')


lr = 2e-05, label_smoothing = 0.03, output_subdir = 3090_1


  0%|          | 0/82884 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16'

In [15]:
output.logits.view(-1, 2), labels.view(-1)

(tensor([[ 1.1261e-01,  8.0811e-02],
         [ 3.6621e-04, -7.1289e-01],
         [ 6.4697e-02, -9.7314e-01],
         [-7.3584e-01, -1.0107e+00]], device='cuda:0', dtype=torch.float16,
        grad_fn=<ViewBackward0>),
 tensor([0, 1, 1, 1], device='cuda:0'))

In [16]:
output.logits, labels

(tensor([[ 1.1261e-01,  8.0811e-02],
         [ 3.6621e-04, -7.1289e-01],
         [ 6.4697e-02, -9.7314e-01],
         [-7.3584e-01, -1.0107e+00]], device='cuda:0', dtype=torch.float16,
        grad_fn=<IndexBackward0>),
 tensor([0, 1, 1, 1], device='cuda:0'))

In [14]:
compute_accuracy(output.logits.view(-1, 2), labels.view(-1))

0.25

In [13]:
model.eval()
val_accuracy, test_loss = TestModel(test_data_loader, model, criterion)
model.train()

print(f'Validation Loss: {test_loss:.4f}, Validation Accuracy: {val_accuracy*100:.2f}%')

  0%|          | 0/105 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

ValueError: too many values to unpack (expected 2)

In [None]:
auroc_scores_by_dataset, test_loss

In [None]:
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

model.eval()
auroc_scores_by_dataset, test_loss = TestModel(test_data_loader, model, criterion)
model.train()

# average_auroc = np.average(auroc_scores_by_dataset, weights=[1, 1])
# if (average_auroc > best_auroc) or (max(auroc_scores_by_dataset) > 0.993):
#     best_auroc = average_auroc
#     if output_subdir is not None:
#         torch.save(model.state_dict(), f'Models/Mamba/{output_subdir}/S{step_number}_CTX1024.pth')

# train_losses = []

### <b><span style='color:#F1A424'>Confusion Matrix</span></b>


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.metrics import confusion_matrix

def binarize(x, threshold):
    if x > threshold:
        x = 1
    else:
        x = 0
    return x

# Assuming df is your pandas DataFrame
oof_df["binary"] = oof_df["preds"].apply(lambda x: binarize(x, 0.5))
true_labels = oof_df["generated"].values
predicted_labels = oof_df["binary"].values

# Get the unique classes from both true and predicted labels
classes = np.unique(np.concatenate((true_labels, predicted_labels)))

# Compute the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=classes)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")