In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Config, AutoTokenizer
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from typing import Optional, Tuple
from einops import rearrange # For easier tensor manipulation
import math
import logging
from transformers import get_linear_schedule_with_warmup
from transformers import T5Tokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- 1. RoPE (Rotary Positional Embeddings) Implementation ---

def rotate_half(x):
    x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    # q, k: (batch_size, num_heads, seq_len, head_dim)
    # cos, sin: (seq_len, head_dim) or (1, seq_len, 1, head_dim) depending on broadcasting
    # position_ids: (batch_size, seq_len)

    # Reshape cos and sin for broadcasting
    # (seq_len, head_dim) -> (1, 1, seq_len, head_dim)
    cos = cos[position_ids].unsqueeze(1)
    sin = sin[position_ids].unsqueeze(1)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

# --- 2. ALiBi (Attention with Linear Biases) Implementation ---

def get_alibi_biases(num_heads, seq_len, dtype=torch.float32, device='cpu'):
    # This generates the ALiBi slopes
    # For a detailed explanation, see: https://ofir.io/train_with_alibi/
    # Or original paper: https://arxiv.org/abs/2108.12409

    # The slopes are usually powers of 2.
    # We choose slopes such that they are spread across the log space.
    def get_slopes(n):
        def get_base(n_heads):
            # Base for geometric series for the slopes
            return math.pow(2, math.pow(2, -(math.log2(n_heads) - 3)))

        # Get the slope for each head
        m = get_base(n)
        slopes = [m * (m ** (i / (n - 1))) for i in range(n)]
        return torch.tensor(slopes, dtype=dtype, device=device)

    slopes = get_slopes(num_heads) # (num_heads,)

    # Create the distance matrix for ALiBi
    # (seq_len, seq_len)
    # Example: [[0, 1, 2],
    #           [-1, 0, 1],
    #           [-2, -1, 0]]
    distance_matrix = torch.arange(seq_len, device=device).unsqueeze(0) - \
                      torch.arange(seq_len, device=device).unsqueeze(1) # (seq_len, seq_len)

    # Expand slopes to match the attention matrix shape
    # (num_heads, 1, 1) * (1, seq_len, seq_len) -> (num_heads, seq_len, seq_len)
    alibi_biases = slopes.unsqueeze(-1).unsqueeze(-1) * distance_matrix.unsqueeze(0)
    
    # We typically want negative biases for longer distances, so attention is penalized.
    # The ALiBi paper suggests negative slopes for Q-K.
    # Our distance_matrix is already set up to produce negative values for relative positions
    # (query index - key index) where query is before key.
    # We can negate the slopes if we want positive values for shorter distances (closer to 0)
    # and more negative for longer distances.
    # The typical formulation in ALiBi is to add a bias based on `i - j`, where `i` is query position and `j` is key position.
    # If `i < j`, `i - j` is negative. We want to penalize `i - j` further when it's more negative (longer distance to the right).
    # If `i > j`, `i - j` is positive. We want to penalize `i - j` further when it's more positive (longer distance to the left).
    # This means the slope should be negative. Let's make sure our slopes are negative.
    alibi_biases = -torch.abs(alibi_biases) # Ensure biases are negative for a decay effect.
    return alibi_biases

# --- 3. Custom GPT2Attention with RoPE and ALiBi ---

class CustomGPT2Attention(GPT2Attention):
    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        super().__init__(config, is_cross_attention, layer_idx)
        # Disable original positional embeddings if they are used by parent class,
        # though GPT2Attention itself doesn't directly use them for Q/K/V.
        # It's more about the GPT2Model's input embeddings.

        # We'll need to pass seq_len to RoPE, so `_attn` method needs modification.
        self.head_dim = self.embed_dim // self.num_heads
        self.register_buffer("cos_cached", None, persistent=False)
        self.register_buffer("sin_cached", None, persistent=False)
        self.max_position_embeddings = config.max_position_embeddings

    def _update_cos_sin_caches(self, seq_len, device, dtype):
        # Generate RoPE cos and sin waves
        if self.cos_cached is not None and seq_len <= self.cos_cached.shape[-2]:
            return

        inv_freq = 1.0 / (10000 ** (torch.arange(0, self.head_dim, 2, device=device).float() / self.head_dim))
        t = torch.arange(seq_len, device=device, dtype=torch.float32)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1) # (seq_len, head_dim)
        self.cos_cached = emb.cos().to(dtype)
        self.sin_cached = emb.sin().to(dtype)
        logger.info(f"Updated RoPE caches for seq_len={seq_len}")

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # query, key, value: (batch_size * num_heads, seq_len, head_dim)

        # Apply RoPE
        batch_size, num_heads, seq_len, head_dim = query.shape
        position_ids = torch.arange(seq_len, device=query.device).unsqueeze(0).repeat(batch_size, 1) # (batch_size, seq_len)
        
        self._update_cos_sin_caches(seq_len, query.device, query.dtype)

        query, key = apply_rotary_pos_emb(
            query, key,
            self.cos_cached[:seq_len, :].unsqueeze(0).unsqueeze(0), # (1, 1, seq_len, head_dim)
            self.sin_cached[:seq_len, :].unsqueeze(0).unsqueeze(0), # (1, 1, seq_len, head_dim)
            position_ids
        )

        # Calculate attention scores
        attn_weights = torch.matmul(query, key.transpose(-1, -2))
        attn_weights = attn_weights / (value.size(-1) ** 0.5)

        # Apply ALiBi bias
        # (batch_size, num_heads, seq_len, seq_len)
        alibi_bias = get_alibi_biases(self.num_heads, seq_len, dtype=attn_weights.dtype, device=attn_weights.device)
        attn_weights = attn_weights + alibi_bias.unsqueeze(0) # Unsqueeze batch dim for broadcasting

        # Apply attention mask
        if attention_mask is not None:
            # attention_mask: (batch_size, 1, 1, seq_len) or (batch_size, 1, seq_len, seq_len)
            # Add a large negative value to masked positions
            attn_weights = attn_weights + attention_mask

        # Apply head_mask
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Apply attention dropout
        attn_weights = self.attn_dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

# --- 4. Model Modification Helper Function ---

def replace_attention_with_custom(model):
    for i, block in enumerate(model.transformer.h):
        if isinstance(block.attn, GPT2Attention):
            logger.info(f"Replacing GPT2Attention in block {i} with CustomGPT2Attention.")
            block.attn = CustomGPT2Attention(model.config, layer_idx=i)
        else:
            logger.warning(f"Block {i} attention is not GPT2Attention, skipping replacement.")
    return model






2025-05-22 17:09:29.068478: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747913969.083039   60723 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747913969.087490   60723 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747913969.099832   60723 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747913969.099850   60723 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747913969.099851   60723 computation_placer.cc:177] computation placer alr

In [2]:
# --- 5. Dataset and Dataloader ---



class HindiTextDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, file_path, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.examples = []

        logger.info(f"Loading dataset from {file_path}")
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Tokenize the entire text
        tokenized_text = self.tokenizer(text, return_attention_mask=False, truncation=False)["input_ids"]
        
        # Split into blocks
        for i in range(0, len(tokenized_text) - block_size + 1, block_size):
            self.examples.append(tokenized_text[i : i + block_size])
        
        logger.info(f"Loaded {len(self.examples)} examples.")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # For language modeling, input and labels are the same, shifted.
        # The GPT2LMHeadModel internally handles the shifting.
        return torch.tensor(self.examples[idx], dtype=torch.long)

In [None]:
# # --- 6. Training Script ---

# def train_model(
#     tokenizer_path: str,
#     data_file_path: str,
#     output_dir: str,
#     model_name_or_path: str = "gpt2",
#     block_size: int = 512,
#     batch_size: int = 8,
#     gradient_accumulation_steps: int = 2,
#     num_epochs: int = 3,
#     learning_rate: float = 5e-5,
#     warmup_steps: int = 500,
#     max_grad_norm: float = 1.0,
#     eval_steps: int = 1000,
#     save_steps: int = 5000,
# ):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     logger.info(f"Using device: {device}")

#     # Load tokenizer
#     # tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
#     tokenizer = T5Tokenizer.from_pretrained("/home/dixit/Project/May try/my_hindi_t5_tokenizer")
#     if tokenizer.pad_token is None:
#         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#         logger.info("Added [PAD] token to tokenizer.")

#     # Load GPT-2 config and model
#     config = GPT2Config.from_pretrained(model_name_or_path)
#     # Important: Disable default positional embeddings in GPT2Model if it's explicitly adding them.
#     # For GPT2, it's typically handled by `transformer.wpe` (word position embeddings).
#     # We will let wpe exist, but our custom attention should handle positional information.
#     # The key is that `GPT2Attention` itself doesn't directly use `position_ids` for its QKV calculation,
#     # it receives already-processed `hidden_states`. Our RoPE will operate on these `hidden_states` as Q/K.

#     model = GPT2LMHeadModel.from_pretrained(model_name_or_path, config=config)
    
#     # Resize token embeddings if PAD token was added
#     if len(tokenizer) != model.config.vocab_size:
#         model.resize_token_embeddings(len(tokenizer))
#         logger.info(f"Resized model embeddings to {len(tokenizer)} vocabulary size.")

#     # Replace attention mechanisms
#     model = replace_attention_with_custom(model)
#     model.to(device)

#     # Prepare dataset and dataloader
#     dataset = HindiTextDataset(tokenizer=tokenizer, file_path=data_file_path, block_size=block_size)
#     dataloader = torch.utils.data.DataLoader(
#         dataset,
#         batch_size=batch_size,
#         shuffle=True,
#         num_workers=4, # Adjust based on your system
#         pin_memory=True if torch.cuda.is_available() else False
#     )

#     # Optimizer and Scheduler
#     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
#     num_training_steps = len(dataloader) // gradient_accumulation_steps * num_epochs
#     # scheduler = torch.optim.lr_scheduler.get_linear_schedule_with_warmup(
#     scheduler = get_linear_schedule_with_warmup(
#         optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
#     )

#     # Training loop
#     model.train()
#     global_step = 0
#     total_loss = 0.0

#     logger.info("Starting training...")
#     for epoch in range(num_epochs):
#         for step, batch in enumerate(dataloader):
#             inputs = batch.to(device)
#             labels = inputs.clone() # For standard LM, labels are shifted inputs

#             outputs = model(input_ids=inputs, labels=labels)
#             loss = outputs.loss / gradient_accumulation_steps
#             loss.backward()
#             total_loss += loss.item()

#             if (step + 1) % gradient_accumulation_steps == 0:
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#                 optimizer.step()
#                 scheduler.step()
#                 optimizer.zero_grad()
#                 global_step += 1

#                 if global_step % 100 == 0:
#                     logger.info(f"Epoch {epoch+1}, Step {step+1}/{len(dataloader)}, Global Step {global_step}, Loss: {total_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.6f}")
#                     total_loss = 0.0 # Reset total loss for logging

#                 if global_step % save_steps == 0:
#                     output_model_path = f"{output_dir}/checkpoint-{global_step}"
#                     model.save_pretrained(output_model_path)
#                     tokenizer.save_pretrained(output_model_path)
#                     logger.info(f"Model saved to {output_model_path}")

#     logger.info("Training complete!")
#     model.save_pretrained(f"{output_dir}/final_model")
#     tokenizer.save_pretrained(f"{output_dir}/final_model")
#     logger.info(f"Final model saved to {output_dir}/final_model")


In [None]:
# ... (imports and other functions remain the same, including CustomGPT2Attention, etc.) ...

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling # Import these
from datasets import load_dataset
import numpy as np
seed = 42
# The `train_model` function will be significantly simpler
def train_model_with_trainer(
    tokenizer_path: str,
    data_file_path: str,
    output_dir: str,
    custom_config: Optional[GPT2Config] = None,
    custom_training_args: Optional[TrainingArguments] = None, # New argument for TrainingArguments
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    tokenizer = T5Tokenizer.from_pretrained("/home/dixit/Project/May try/my_hindi_t5_tokenizer")
    tokenizer.model_max_length = 256
    tokenizer.pad_token = "<pad>"  
    tokenizer.padding_size = "right"
    if tokenizer.pad_token is None:
        if "[PAD]" in tokenizer.vocab:
            tokenizer.pad_token = "[PAD]"
        elif "<pad>" in tokenizer.vocab:
            tokenizer.pad_token = "<pad>"
        elif tokenizer.unk_token is not None:
            tokenizer.pad_token = tokenizer.unk_token
            logger.warning(f"Using tokenizer.unk_token ({tokenizer.unk_token}) as pad_token. Consider training with a dedicated pad_token.")
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            logger.info("Added new [PAD] token to tokenizer and set it as pad_token.")
    else:
        logger.info(f"Tokenizer already has pad_token: {tokenizer.pad_token}")


    # Load or create GPT-2 config
    if custom_config:
        config = custom_config
        config.vocab_size = len(tokenizer)
        if tokenizer.bos_token_id is not None: config.bos_token_id = tokenizer.bos_token_id
        if tokenizer.eos_token_id is not None: config.eos_token_id = tokenizer.eos_token_id
        if tokenizer.pad_token_id is not None: config.pad_token_id = tokenizer.pad_token_id
    else:
        config = GPT2Config.from_pretrained("gpt2")
        config.vocab_size = len(tokenizer)
        if tokenizer.bos_token_id is not None: config.bos_token_id = tokenizer.bos_token_id
        if tokenizer.eos_token_id is not None: config.eos_token_id = tokenizer.eos_token_id
        if tokenizer.pad_token_id is not None: config.pad_token_id = tokenizer.pad_token_id
    logger.info(f"Model config: {config}")

    model = GPT2LMHeadModel(config=config)
    if len(tokenizer) != model.config.vocab_size:
        model.resize_token_embeddings(len(tokenizer))
        logger.info(f"Resized model embeddings to {len(tokenizer)} vocabulary size.")
    else:
        logger.info(f"Model embeddings vocabulary size ({model.config.vocab_size}) matches tokenizer ({len(tokenizer)}).")

    # Replace attention mechanisms
    model = replace_attention_with_custom(model)
    model.to(device)

    # Prepare dataset
    # For Trainer, the dataset can be simpler if DataCollator handles padding/truncation
    # dataset = HindiTextDataset(tokenizer=tokenizer, file_path=data_file_path, block_size=config.n_ctx) # Use config.n_ctx for block_size
    dataset = load_dataset("text", data_files={"train": data_file_path})
    def tokenize_function(examples):
        # return {"input_ids": tokenizer.EncodeAsIds(examples["text"])}
        tokens = tokenizer(examples["text"], truncation=True)
        return {"input_ids": tokens["input_ids"]}

    print("tokenizing...")
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        batch_size=500,
        num_proc=8,  # Adjust based on CPU cores
        load_from_cache_file=True
    )
    # Data Collator for Language Modeling: handles shifting labels and padding
    # mlm=False for causal language modeling (like GPT-2)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    train_dataset_1 = tokenized_datasets["train"]

    # Shuffle and select a fixed subset for eval_dataset
    eval_dataset_1 = train_dataset_1.shuffle(seed=42).select(range(2000))

    # Define your TrainingArguments
    if custom_training_args:
        training_args = custom_training_args
        # You might want to update output_dir and logging_dir if they are relative
        # training_args.output_dir = os.path.join(output_dir, training_args.output_dir)
        # training_args.logging_dir = os.path.join(output_dir, training_args.logging_dir)
    else:
        # Define some default TrainingArguments if not provided
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=10,
            per_device_train_batch_size=6,  
            per_device_eval_batch_size=2,  
            gradient_accumulation_steps=4,  
            learning_rate=2e-4,
            optim="adafactor",
            max_grad_norm=1.0,
            save_total_limit=100000,
            logging_dir="/home/dixit/Project/May try/hindi_gpt2_rope_alibi_model_logs",
            eval_strategy ="steps", 
            save_strategy="epoch",
            logging_steps=1000,
            eval_steps=5000, 
            warmup_steps=1000,
            save_steps = 10000,
            weight_decay=0.01,
            dataloader_num_workers=2,
            remove_unused_columns=False, 
            resume_from_checkpoint=True,  
            fp16=True,  
            eval_accumulation_steps=None,
            report_to="tensorboard",
        )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_1,
        eval_dataset=eval_dataset_1,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train the model
    logger.info("Starting training with Hugging Face Trainer...")
    # trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
    trainer.train(resume_from_checkpoint=True)

    logger.info("Training complete!")
    trainer.save_model(f"{output_dir}/final_model")
    tokenizer.save_pretrained(f"{output_dir}/final_model")
    logger.info(f"Final model saved to {output_dir}/final_model")


if __name__ == "__main__":
    # ... (tokenizer and data setup, including custom_config definition, remains the same) ...
    output_dir = "./hindi_gpt2_rope_alibi_model"
    tokenizer = T5Tokenizer.from_pretrained("/home/dixit/Project/May try/my_hindi_t5_tokenizer")
    tokenizer.model_max_length = 256
    tokenizer.pad_token = "<pad>"  
    tokenizer.padding_size = "right"
    my_config= GPT2Config(
        vocab_size = tokenizer.vocab_size,
        n_positions=256,
        n_ctx=256,
        n_embd=512,
        n_layer=6,
        n_head=8,
        bos_token_id = tokenizer.bos_token_id,
        eos_token_id = tokenizer.eos_token_id,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_activation='tanh',
        use_cache=False,
        dropout= 0.1,
    )
    # Define your TrainingArguments
    my_training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=6,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        optim="adafactor",
        max_grad_norm=1.0,
        save_total_limit=None, # Set to None or a specific number. 100000 is very high.
        logging_dir="/home/dixit/Project/May try/hindi_gpt2_rope_alibi_model_logs",
        eval_strategy ="steps", # Corrected from eval_strategy
        save_strategy="epoch",
        logging_steps=1000,
        eval_steps=5000,
        warmup_steps=1000,
        save_steps = 10000,
        weight_decay=0.01,
        dataloader_num_workers=2,
        remove_unused_columns=False,
        resume_from_checkpoint=True,
        fp16=True,
        eval_accumulation_steps=None,
        report_to="tensorboard",
    )
    
    tokenizer = T5Tokenizer.from_pretrained("/home/dixit/Project/May try/my_hindi_t5_tokenizer")
    

    # 4. Run the training function with Trainer
    train_model_with_trainer(
        tokenizer_path="/home/dixit/Project/May try/my_hindi_t5_tokenizer",
        data_file_path="/home/dixit/Project/May try/hindi_corpus_fixed.txt",
        output_dir="/home/dixit/Project/May try/hindi_gpt2_rope_alibi_model", # This will be the base for output_dir in TrainingArguments
        custom_config=my_config,
        custom_training_args=my_training_args # Pass your custom training args here
    )

INFO:datasets:PyTorch version 2.7.0 available.
INFO:datasets:TensorFlow version 2.19.0 available.
INFO:__main__:Using device: cuda
INFO:__main__:Tokenizer already has pad_token: <pad>
INFO:__main__:Model config: GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 1,
  "dropout": 0.1,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 256,
  "n_embd": 512,
  "n_head": 8,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 256,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": "tanh",
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.51.3",
  "use_cache": false,
  "vocab_size": 30100
}

INFO:__main__:Model embeddings vocabulary size (30100)

tokenizing...


Map (num_proc=8):   0%|          | 0/3669059 [00:00<?, ? examples/s]

  trainer = Trainer(
INFO:__main__:Starting training with Hugging Face Trainer...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
5000,5.2636,5.216003
10000,4.8425,4.828564
15000,4.6244,4.618337
20000,4.4889,4.481425
25000,4.3843,4.376651
30000,4.3047,4.292759
35000,4.2529,4.223422
40000,4.2132,4.169271
45000,4.153,4.111844
50000,4.1006,4.061904


In [None]:
# if __name__ == "__main__":
#     # Create dummy data and tokenizer for demonstration
#     # In a real scenario, you would have your pre-trained Hindi tokenizer
#     # and a large Hindi text file.

#     # 1. Prepare your Hindi tokenizer and save it
#     # Example (replace with your actual tokenizer training):
#     # from tokenizers import ByteLevelBPETokenizer
#     # tokenizer = ByteLevelBPETokenizer()
#     # tokenizer.train(files=["./hindi_corpus.txt"], vocab_size=50257, min_frequency=2,
#     #                 special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
#     # tokenizer.save_model("./my_hindi_tokenizer")
#     # tokenizer_path = "./my_hindi_tokenizer"

#     # For demonstration, we'll use a pre-trained tokenizer and just add a dummy pad token
#     # If you have a custom tokenizer, make sure it's saved to a directory
#     # and provide that path.
#     # tokenizer_path = "google/byt5-small" # Using a placeholder for demonstration
#     # You would replace this with the path to your actual Hindi tokenizer
#     # e.g., './my_hindi_tokenizer' if you saved it there.
#     tokenizer_path = "/home/dixit/Project/May try/my_hindi_t5_tokenizer" 

#     # 2. Create a dummy Hindi text file
#     # dummy_text = """
#     # नमस्ते, मेरा नाम Gemini है। मैं गूगल द्वारा प्रशिक्षित एक बड़ा भाषा मॉडल हूँ।
#     # मैं आपकी कैसे मदद कर सकता हूँ? यह एक परीक्षण पाठ है जिसमें कुछ हिन्दी वाक्य हैं।
#     # भाषा मॉडल प्रशिक्षण के लिए यह एक महत्वपूर्ण कदम है।
#     # पोजीशनल एम्बेडिंग को संशोधित करना एक चुनौतीपूर्ण कार्य है।
#     # रोटरी पोजीशनल एम्बेडिंग और एएलआईबीआई को जोड़ना दिलचस्प है।
#     # """ * 100 # Repeat for more data

#     # with open("hindi_dummy_data.txt", "w", encoding="utf-8") as f:
#     #     f.write(dummy_text)
#     data_file_path = "hindi_dummy_data.txt"

#     # 3. Define output directory
#     output_dir = "./hindi_gpt2_rope_alibi_model"
#     import os
#     os.makedirs(output_dir, exist_ok=True)

#     # 4. Run the training function
#     train_model(
#         tokenizer_path=tokenizer_path,
#         data_file_path=data_file_path,
#         output_dir=output_dir,
#         model_name_or_path="gpt2", # Start with base GPT-2 weights
#         block_size=256, # Smaller block size for demonstration
#         batch_size=2,   # Smaller batch size for demonstration
#         gradient_accumulation_steps=None,
#         num_epochs=10,   # Shorter training for demonstration
#         learning_rate=2e-4,
#         warmup_steps=1000,
#         eval_steps=5000,
#         save_steps=10000
#     )

#     # After training, you can load your model and generate text:
#     # from transformers import pipeline
#     # model_path = f"{output_dir}/final_model"
#     # loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
#     # loaded_model = GPT2LMHeadModel.from_pretrained(model_path)
#     # # Make sure to re-replace attention for the loaded model if it was saved without the custom class definition
#     # # (though PyTorch should save the custom class definition if it's in the same file)
#     # # loaded_model = replace_attention_with_custom(loaded_model)
#     # loaded_model.to(device)
#     # loaded_model.eval()
#     #
#     # generator = pipeline('text-generation', model=loaded_model, tokenizer=loaded_tokenizer, device=0 if torch.cuda.is_available() else -1)
#     #
#     # prompt = "नमस्ते, मैं हिन्दी में"
#     # generated_text = generator(prompt, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
#     # print(generated_text)

INFO:__main__:Using device: cuda
INFO:__main__:Resized model embeddings to 30100 vocabulary size.
INFO:__main__:Replacing GPT2Attention in block 0 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 1 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 2 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 3 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 4 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 5 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 6 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 7 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 8 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 9 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 10 with CustomGPT2Attention.
INFO:__main__:Replacing GPT2Attention in block 11 with CustomGPT2Attention.


TypeError: unsupported operand type(s) for //: 'int' and 'NoneType'