# <b><span style='color:#F1A424'>|</span> Import Libraries</b><a class='anchor' id='import_libraries'></a> [↑](#top) 

***

Import all the required libraries for this notebook.

In [1]:
!pip install causal-conv1d>=1.4.0
!pip install mamba-ssm

Collecting mamba-ssm
  Downloading mamba_ssm-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m869.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting einops (from mamba-ssm)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting triton (from mamba-ssm)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: mamba-ssm
  Building wheel for mamba-ssm (setup.p

In [2]:
import ast
import copy
import gc
import itertools
import joblib
import json
import math
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import scipy as sp
import string
import sys
import time
import warnings
# import wandb


from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# ======= OPTIONS =========
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Current device is: {device}")
warnings.filterwarnings("ignore")
!mkdir output

Current device is: cuda


In [3]:
import os
import random
import json
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
from collections import namedtuple
from dataclasses import dataclass, field, asdict
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf
# from huggingface_hub import HfApi

# import evaluate
import numpy as np
from datasets import load_dataset
from transformers import Trainer
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, TrainingArguments
import re

import wandb
from huggingface_hub import login

login(token="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")
notes = "Train Mamba With 400k row dataset"

# <b><span style='color:#F1A424'>|</span> Load Data</b><a class='anchor' id='load_data'></a> [↑](#top) 

***

Load data.

In [4]:
import pandas as pd
import re
import unicodedata
from tqdm import tqdm

# Load DataFrame
train_df = pd.read_parquet('/kaggle/input/ai-mix-v26/train_essays.parquet')
valid_df = pd.read_parquet('/kaggle/input/ai-mix-v26/valid_essays.parquet')

# Define characters to remove
char_to_remove = ['{', '£', '\x97', '¹', 'å', '\\', '\x85', '<', '\x99', 
                  'é', ']', '+', 'Ö', '\xa0', '>', '|', '\x80', '~', '©', 
                  '/', '\x93', '$', 'Ó', '²', '^', ';', '`', 'á', '*', '(', 
                  '¶', '®', '[', '\x94', '\x91', '#', '-', 'ó', ')', '}', '=']

# Define preprocessing function
def preprocess_text(text, strategy='light'):    
    if strategy == "none":
        text = text
    elif strategy == "light":
        text = text.encode("ascii", "ignore").decode('ascii')        
        text = text.strip()
        text = text.strip("\"")
        for c in char_to_remove:
            text = text.replace(c, "")
        if text and text[-1] != ".":
            text = text.split(".")
            text = ".".join(text[:-1])
            text += "."
    else:
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s.,;?!:()\'\"%-]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply preprocessing with progress bar
tqdm.pandas(desc="Processing Text")
train_df['text'] = train_df['text'].progress_apply(lambda x: preprocess_text(x, strategy='light'))
valid_df['text'] = valid_df['text'].progress_apply(lambda x: preprocess_text(x, strategy='light'))

# Display the first few rows to verify
print("Trainging DF Processing")
print(train_df.info())
print("Testing DF Processing")
print(valid_df.info())



Processing Text: 100%|██████████| 165767/165767 [00:04<00:00, 33807.60it/s]
Processing Text: 100%|██████████| 1679/1679 [00:00<00:00, 33144.46it/s]

Trainging DF Processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165767 entries, 0 to 165766
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         165767 non-null  object
 1   prompt_id  165767 non-null  int64 
 2   text       165767 non-null  object
 3   generated  165767 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 5.1+ MB
None
Testing DF Processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1679 entries, 0 to 1678
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1679 non-null   object
 1   prompt_id  1679 non-null   int64 
 2   text       1679 non-null   object
 3   generated  1679 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 52.6+ KB
None





# <b><span style='color:#F1A424'>|</span> Dataset</b><a class='anchor' id='dataset'></a> [↑](#top) 

***

    
We need to get the `max_len` from our `tokenizer`. We create a `tqdm` iterator and for each text we extract the tokenized length. Then we get the maximum value and we add 3 for the special tokens `CLS`, `SEP`, `SEP`.

- [Hugging Face Padding and Truncation](https://huggingface.co/docs/transformers/pad_truncation): check truncation to `max_length` or `True` (batch max length).

One sample from the dataset should look as following:
```python
{
	'inputs': {
		'input_ids': tensor([1, 279, 883, ..., 0, 0]),
		'token_type_ids': tensor([0, 0, 0, ..., 0, 0]),
		'attention_mask': tensor([1, 1, 1, ..., 0, 0])
	},
	'label': tensor([0.0]),
	'ids': '000e8c3c7ddb'
}
```
You can check it by running the cell below.

import wandb
# Định nghĩa tên project để log thông tin quá trình huấn luyện trên wandb
os.environ["WANDB_PROJECT"] = "mamba_LLM_detect_binary_classification"
os.environ["WANDB_API_KEY "] = "e7432690ce6d9bfdee410567f89d7e38844ed584"


wandb.login()
# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="mamba_LLM_detect_binary_classification",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 6e-5,
    "architecture": "Mamba-130m-with-Linear-Head",
    "dataset": "Test",
    "epochs": 1,
    "lr_scheduler_type": "cosine"
    }
)

# <b><span style='color:#F1A424'>|</span> Model</b><a class='anchor' id='model'></a> [↑](#top) 

***

In [5]:
train_df.head()

Unnamed: 0,id,prompt_id,text,generated
0,e_ddxvqx5i,0,"In recent years, there has been a growing move...",1
1,e_hi0yzrcv,0,\nWhy not cars in our life\n\nI have ever met ...,1
2,e_uesv4xha,0,A car is considered by many a nessecity for ev...,1
3,e_2tl5ylwy,0,"H\n\nello fellow citezens , we are here to inf...",0
4,e_s6ci4vj0,0,Have you ever known how if feels not being abl...,1


In [6]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Assuming train_df is your DataFrame with a 'text' column
# Convert the 'id' column to a string to avoid ArrowTypeError
# df['id'] = df['id'].astype(str)

# Rename the 'generated' column to 'labels'
train_df.rename(columns={'generated': 'labels'}, inplace=True)
valid_df.rename(columns={'generated': 'labels'}, inplace=True)

# # Access the train and test datasets
# train_dataset, test_dataset = train_test_split(df, test_size=0.05)

# Combine the splits into a DatasetDict
dataset_dict = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(valid_df),
})

# Display the first example from each dataset
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt_id', 'text', 'labels'],
        num_rows: 165767
    })
    test: Dataset({
        features: ['id', 'prompt_id', 'text', 'labels'],
        num_rows: 1679
    })
})

In [7]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
# Add eos tokens
# tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token
def preprocess_function(examples):
    # Tokenize the text with truncation
    samples = tokenizer(examples['text'], 
                        truncation=True, 
                        padding='max_length', 
                        max_length=512,         
                        return_tensors="pt")
    
    return samples

# Apply preprocessing to the dataset
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Map:   0%|          | 0/165767 [00:00<?, ? examples/s]

Map:   0%|          | 0/1679 [00:00<?, ? examples/s]

In [8]:
# Set seed cho hàm random
random.seed(42)

# Tạo tập train và test
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]
#  Drop the 'prompt_id' feature from both datasets
train_dataset = train_dataset.remove_columns(["text"]).remove_columns(["id"])
test_dataset = test_dataset.remove_columns(["text"]).remove_columns(["id"])

# Tạo tập evaluation để đánh giá trong lúc train
# Do số lượng tập test lớn nên chỉ lấy mẫu 1% tập dữ liệu test để đánh giá
# total_samples = len(test_dataset)
# eval_samples = int(0.5 * total_samples)
# eval_indices = random.sample(range(total_samples), eval_samples)
# eval_dataset = test_dataset.select(eval_indices)

In [9]:
import torch
import numpy as np
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Dataset and Tokenizer Setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model
FOUNDATION_MODEL_NAME = "state-spaces/mamba-790m"
model = MambaLMHeadModel.from_pretrained(FOUNDATION_MODEL_NAME)
model.lm_head = torch.nn.Linear(model.config.d_model, 2)
model = nn.DataParallel(model)
model.to("cuda")


config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.17G [00:00<?, ?B/s]

DataParallel(
  (module): MambaLMHeadModel(
    (backbone): MixerModel(
      (embedding): Embedding(50280, 1536)
      (layers): ModuleList(
        (0-47): 48 x Block(
          (norm): RMSNorm()
          (mixer): Mamba(
            (in_proj): Linear(in_features=1536, out_features=6144, bias=False)
            (conv1d): Conv1d(3072, 3072, kernel_size=(4,), stride=(1,), padding=(3,), groups=3072)
            (act): SiLU()
            (x_proj): Linear(in_features=3072, out_features=128, bias=False)
            (dt_proj): Linear(in_features=96, out_features=3072, bias=True)
            (out_proj): Linear(in_features=3072, out_features=1536, bias=False)
          )
        )
      )
      (norm_f): RMSNorm()
    )
    (lm_head): Linear(in_features=1536, out_features=2, bias=True)
  )
)

In [10]:
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def TestModel(test_data_loader, model, criterion):
    test_losses = []
    all_predictions = []
    all_actual_values = []
    
    with torch.no_grad():
        for batch in tqdm(test_data_loader):
            if len(batch.input_ids) == 0:
                # Safeguard against empty sequences.
                continue

            # Have shape (batch size, token count)
            token_sequences = batch.input_ids.cuda()
            attention_masks = batch.attention_mask.cuda()
            # Has shape (batch size)
            labels = batch.labels.cuda()

            with torch.cuda.amp.autocast():
                output = model(token_sequences, attention_masks)

                logits = output.logits
                last_token_indices = torch.clamp(attention_masks.sum(dim=1) - 1, min=0)
                raw_predictions = torch.gather(
                    logits, 
                    dim=1, 
                    index=last_token_indices.unsqueeze(1).unsqueeze(2).expand(-1, -1, logits.shape[2])
                ).squeeze(1)
                
                loss = criterion(raw_predictions, labels)

            test_losses.append(loss.detach().cpu())

            scaled_predictions = raw_predictions.softmax(dim=1)[:, 1]
            all_predictions.extend(scaled_predictions.cpu().numpy())
            all_actual_values.extend(labels.cpu().numpy())

    all_predictions, all_actual_values = np.array(all_predictions), np.array(all_actual_values)

    auroc = roc_auc_score(all_actual_values, all_predictions)

    return auroc, np.mean(test_losses)


In [11]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import Adafactor

# Accuracy Calculation
def compute_accuracy(predictions, labels):
    preds = torch.argmax(predictions, dim=1)
    correct = torch.sum(preds == labels)
    return correct.item() / len(labels)

# Variables for the experiment
label_smoothing = 0.03
output_subdir = '3090_1'
max_learning_rates = [5e-6]

# Run experiment
for max_learning_rate in max_learning_rates:
    print(f'lr = {max_learning_rate}, label_smoothing = {label_smoothing}, output_subdir = {output_subdir}')
    
    # Dataloader Setup
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_data_loader = DataLoader(
        train_dataset, 
        batch_size=8,  # Increased batch size since it will be split across GPUs
        num_workers=4, 
        shuffle=True, 
        pin_memory=True, 
        collate_fn=data_collator
    )
    test_data_loader = DataLoader(
        test_dataset, 
        batch_size=8,  # Increased batch size
        num_workers=4, 
        shuffle=False, 
        pin_memory=True, 
        collate_fn=data_collator
    )

    # Optimizer, Criterion, and Scaler Setup
    optimizer = Adafactor(
        model.parameters(),
        lr=max_learning_rate,
        scale_parameter=True,
        relative_step=False  # Fixed learning rate
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    total_step_count = len(train_data_loader)
    lr_schedule = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=max_learning_rate,
        total_steps=total_step_count,
        pct_start=0.1,
        anneal_strategy='linear',
        cycle_momentum=False
    )

    best_auroc = -99999999
    train_losses = []
    model.train()

    # Tracking the number of rows processed
    total_rows_processed = 0
    row_threshold = 50000

    print_steps = 500  # Log training accuracy/loss every 500 steps

    for batch_index, train_batch in enumerate(tqdm(train_data_loader)):
        if len(train_batch.input_ids) == 0:
            continue

        # Send data to GPU(s)
        token_sequences = train_batch.input_ids.to("cuda")
        attention_masks = train_batch.attention_mask.to("cuda")
        labels = train_batch.labels.to("cuda")

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=True):
            output = model(token_sequences, attention_masks)
            logits = output.logits
            last_token_indices = torch.clamp(attention_masks.sum(dim=1) - 1, min=0)
            raw_predictions = torch.gather(
                logits, dim=1, index=last_token_indices.unsqueeze(1).unsqueeze(2).expand(-1, -1, logits.shape[2])
            ).squeeze(1)

            loss = criterion(raw_predictions, labels)

        # Training accuracy
        accuracy = compute_accuracy(raw_predictions, labels)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        lr_schedule.step()

        train_losses.append(loss.detach().cpu())

        # Log training accuracy and loss every 500 steps
        if (batch_index + 1) % print_steps == 0:
            avg_train_loss = sum(train_losses) / len(train_losses)
            print(f"Step {batch_index+1}/{total_step_count}: Avg Train Loss = {avg_train_loss:.4f}, Train Accuracy = {accuracy*100:.2f}%")
            train_losses = []  # Reset train loss tracking for the next 500 steps

        # Increment the number of rows processed
        total_rows_processed += len(train_batch.input_ids)

        # Evaluate the model every 50,000 rows
        if total_rows_processed >= row_threshold:
            model.eval()
            val_accuracy, test_loss = TestModel(test_data_loader, model, criterion)
            model.train()
            
            print(f'Validation Loss: {test_loss:.4f}, Validation Accuracy: {val_accuracy*100:.2f}%')
            
            total_rows_processed = 0  # Reset after each evaluation

    # Save model and reset
    torch.save(model.state_dict(), f'/kaggle/working/Models/Mamba-780m-Step-{batch_index+1}-Loss-{int(test_loss*1000)}.pth')
    print(f'Best AUROC: {best_auroc}')


lr = 5e-06, label_smoothing = 0.03, output_subdir = 3090_1


  0%|          | 0/20721 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

Step 500/20721: Avg Train Loss = 1.0302, Train Accuracy = 62.50%


  5%|▍         | 1000/20721 [30:20<9:50:06,  1.80s/it]

Step 1000/20721: Avg Train Loss = 0.8935, Train Accuracy = 50.00%


  7%|▋         | 1500/20721 [45:19<9:36:18,  1.80s/it]

Step 1500/20721: Avg Train Loss = 0.7625, Train Accuracy = 87.50%


 10%|▉         | 2000/20721 [1:00:18<9:21:24,  1.80s/it]

Step 2000/20721: Avg Train Loss = 0.6464, Train Accuracy = 75.00%


 12%|█▏        | 2500/20721 [1:15:16<9:05:11,  1.80s/it]

Step 2500/20721: Avg Train Loss = 0.6076, Train Accuracy = 75.00%


 14%|█▍        | 3000/20721 [1:30:16<8:50:21,  1.80s/it]

Step 3000/20721: Avg Train Loss = 0.5415, Train Accuracy = 87.50%


 17%|█▋        | 3500/20721 [1:45:15<8:35:26,  1.80s/it]

Step 3500/20721: Avg Train Loss = 0.5221, Train Accuracy = 75.00%


 19%|█▉        | 4000/20721 [2:00:14<8:21:01,  1.80s/it]

Step 4000/20721: Avg Train Loss = 0.5045, Train Accuracy = 87.50%


 22%|██▏       | 4500/20721 [2:15:13<8:05:43,  1.80s/it]

Step 4500/20721: Avg Train Loss = 0.4931, Train Accuracy = 75.00%


 24%|██▍       | 5000/20721 [2:30:11<7:51:03,  1.80s/it]

Step 5000/20721: Avg Train Loss = 0.4797, Train Accuracy = 87.50%


 27%|██▋       | 5500/20721 [2:45:10<7:35:45,  1.80s/it]

Step 5500/20721: Avg Train Loss = 0.4705, Train Accuracy = 75.00%


 29%|██▉       | 6000/20721 [3:00:08<7:22:06,  1.80s/it]

Step 6000/20721: Avg Train Loss = 0.4773, Train Accuracy = 75.00%


 30%|███       | 6249/20721 [3:07:36<7:12:58,  1.80s/it]
  0%|          | 0/210 [00:00<?, ?it/s][Ahuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got

Validation Loss: 0.4388, Validation Accuracy: 80.31%


 31%|███▏      | 6500/20721 [3:17:01<7:04:23,  1.79s/it]  

Step 6500/20721: Avg Train Loss = 0.4655, Train Accuracy = 50.00%


 34%|███▍      | 7000/20721 [3:31:59<6:48:59,  1.79s/it]

Step 7000/20721: Avg Train Loss = 0.4525, Train Accuracy = 100.00%


 36%|███▌      | 7500/20721 [3:46:58<6:38:14,  1.81s/it]

Step 7500/20721: Avg Train Loss = 0.4424, Train Accuracy = 75.00%


 39%|███▊      | 8000/20721 [4:01:56<6:22:00,  1.80s/it]

Step 8000/20721: Avg Train Loss = 0.4315, Train Accuracy = 100.00%


 41%|████      | 8500/20721 [4:16:54<6:06:48,  1.80s/it]

Step 8500/20721: Avg Train Loss = 0.4339, Train Accuracy = 87.50%


 43%|████▎     | 9000/20721 [4:31:52<5:51:36,  1.80s/it]

Step 9000/20721: Avg Train Loss = 0.4256, Train Accuracy = 100.00%


 46%|████▌     | 9500/20721 [4:46:49<5:35:26,  1.79s/it]

Step 9500/20721: Avg Train Loss = 0.4145, Train Accuracy = 100.00%


 48%|████▊     | 10000/20721 [5:01:47<5:20:35,  1.79s/it]

Step 10000/20721: Avg Train Loss = 0.4130, Train Accuracy = 62.50%


 51%|█████     | 10500/20721 [5:16:45<5:05:26,  1.79s/it]

Step 10500/20721: Avg Train Loss = 0.4186, Train Accuracy = 75.00%


 53%|█████▎    | 11000/20721 [5:31:43<4:50:26,  1.79s/it]

Step 11000/20721: Avg Train Loss = 0.3907, Train Accuracy = 75.00%


 55%|█████▌    | 11500/20721 [5:46:41<4:36:08,  1.80s/it]

Step 11500/20721: Avg Train Loss = 0.4148, Train Accuracy = 75.00%


 58%|█████▊    | 12000/20721 [6:01:39<4:20:31,  1.79s/it]

Step 12000/20721: Avg Train Loss = 0.3950, Train Accuracy = 100.00%


 60%|██████    | 12499/20721 [6:16:35<4:06:36,  1.80s/it]

Step 12500/20721: Avg Train Loss = 0.4105, Train Accuracy = 87.50%



  0%|          | 0/210 [00:00<?, ?it/s][Ahuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabl

Validation Loss: 0.3703, Validation Accuracy: 88.16%


 63%|██████▎   | 13000/20721 [6:33:28<3:51:20,  1.80s/it] 

Step 13000/20721: Avg Train Loss = 0.3699, Train Accuracy = 100.00%


 65%|██████▌   | 13500/20721 [6:48:26<3:36:43,  1.80s/it]

Step 13500/20721: Avg Train Loss = 0.3832, Train Accuracy = 75.00%


 68%|██████▊   | 14000/20721 [7:03:24<3:21:21,  1.80s/it]

Step 14000/20721: Avg Train Loss = 0.3798, Train Accuracy = 87.50%


 70%|██████▉   | 14500/20721 [7:18:22<3:06:12,  1.80s/it]

Step 14500/20721: Avg Train Loss = 0.3892, Train Accuracy = 100.00%


 72%|███████▏  | 15000/20721 [7:33:20<2:52:55,  1.81s/it]

Step 15000/20721: Avg Train Loss = 0.3846, Train Accuracy = 100.00%


 75%|███████▍  | 15500/20721 [7:48:19<2:36:06,  1.79s/it]

Step 15500/20721: Avg Train Loss = 0.3679, Train Accuracy = 100.00%


 77%|███████▋  | 16000/20721 [8:03:18<2:21:30,  1.80s/it]

Step 16000/20721: Avg Train Loss = 0.3795, Train Accuracy = 87.50%


 80%|███████▉  | 16500/20721 [8:18:17<2:06:24,  1.80s/it]

Step 16500/20721: Avg Train Loss = 0.3772, Train Accuracy = 75.00%


 82%|████████▏ | 17000/20721 [8:33:15<1:51:33,  1.80s/it]

Step 17000/20721: Avg Train Loss = 0.3721, Train Accuracy = 100.00%


 84%|████████▍ | 17500/20721 [8:48:13<1:36:43,  1.80s/it]

Step 17500/20721: Avg Train Loss = 0.3776, Train Accuracy = 87.50%


 87%|████████▋ | 18000/20721 [9:03:12<1:21:34,  1.80s/it]

Step 18000/20721: Avg Train Loss = 0.3692, Train Accuracy = 87.50%


 89%|████████▉ | 18500/20721 [9:18:10<1:06:07,  1.79s/it]

Step 18500/20721: Avg Train Loss = 0.3571, Train Accuracy = 100.00%


 90%|█████████ | 18749/20721 [9:25:37<59:06,  1.80s/it]  
  0%|          | 0/210 [00:00<?, ?it/s][Ahuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just go

Validation Loss: 0.3511, Validation Accuracy: 90.07%


 92%|█████████▏| 19000/20721 [9:35:02<51:30,  1.80s/it]   

Step 19000/20721: Avg Train Loss = 0.3664, Train Accuracy = 37.50%


 94%|█████████▍| 19500/20721 [9:50:01<36:37,  1.80s/it]

Step 19500/20721: Avg Train Loss = 0.3678, Train Accuracy = 100.00%


 97%|█████████▋| 20000/20721 [10:04:59<21:33,  1.79s/it]

Step 20000/20721: Avg Train Loss = 0.3767, Train Accuracy = 75.00%


 99%|█████████▉| 20500/20721 [10:19:57<06:36,  1.80s/it]

Step 20500/20721: Avg Train Loss = 0.3518, Train Accuracy = 62.50%


100%|██████████| 20721/20721 [10:26:35<00:00,  1.81s/it]


RuntimeError: Parent directory /kaggle/working/Models does not exist.

In [13]:
torch.save(model.state_dict(), f'/kaggle/working/Mamba-780m-Step-{batch_index+1}-Loss-{int(test_loss*1000)}.pth')


In [15]:
model.eval()
val_accuracy, test_loss = TestModel(test_data_loader, model, criterion)
model.train()

print(f'Validation Loss: {test_loss:.4f}, Validation Accuracy: {val_accuracy*100:.2f}%')

  0%|          | 0/210 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

Validation Loss: 0.3493, Validation Accuracy: 90.18%





In [None]:
auroc_scores_by_dataset, test_loss

In [None]:
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

model.eval()
auroc_scores_by_dataset, test_loss = TestModel(test_data_loader, model, criterion)
model.train()

# average_auroc = np.average(auroc_scores_by_dataset, weights=[1, 1])
# if (average_auroc > best_auroc) or (max(auroc_scores_by_dataset) > 0.993):
#     best_auroc = average_auroc
#     if output_subdir is not None:
#         torch.save(model.state_dict(), f'Models/Mamba/{output_subdir}/S{step_number}_CTX1024.pth')

# train_losses = []

### <b><span style='color:#F1A424'>Confusion Matrix</span></b>


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.metrics import confusion_matrix

def binarize(x, threshold):
    if x > threshold:
        x = 1
    else:
        x = 0
    return x

# Assuming df is your pandas DataFrame
oof_df["binary"] = oof_df["preds"].apply(lambda x: binarize(x, 0.5))
true_labels = oof_df["generated"].values
predicted_labels = oof_df["binary"].values

# Get the unique classes from both true and predicted labels
classes = np.unique(np.concatenate((true_labels, predicted_labels)))

# Compute the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=classes)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")