In [1]:
!pip install wandb sentencepiece GPUtil datasets accelerate

[0m

In [14]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import wandb
from huggingface_hub import HfApi
from tqdm import tqdm
import logging
from transformers import AutoConfig
# Set up logging: configure the root logger at INFO level and create the
# module-level `logger` that the classes below use for progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [3]:
from huggingface_hub import login

# Log in to the Hugging Face Hub. The token is read from the HF_TOKEN
# environment variable so that no credential is stored in the notebook;
# when the variable is unset, `login` receives None.
login(token=os.environ.get("HF_TOKEN"))

# Training data: CSV grouped elsewhere by its 'Sentence #', 'Word' and 'Tag' columns.
file_path = 'b-ner-train.csv'

# Mapping from NER tag string to the integer class id used as the model label.
tag_to_id = {
    'B-geo': 0, 'O': 1, 'B-gpe': 2, 'B-per': 3, 'I-per': 4, 'B-tim': 5,
    'B-org': 6, 'I-org': 7, 'B-art': 8, 'I-art': 9, 'I-tim': 10,
    'B-eve': 11, 'I-eve': 12, 'I-geo': 13, 'I-gpe': 14, 'B-nat': 15, 'I-nat': 16
}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/.cache/huggingface/token
Login successful


In [4]:
class SentenceLengthAnalyzer:
    """Estimate a maximum sequence length from the tokenized sentences of a CSV file.

    The CSV must provide 'Sentence #' and 'Word' columns; rows sharing the same
    'Sentence #' value are treated as one sentence.
    """

    def __init__(self, file_path, tokenizer):
        """Store the CSV path and the tokenizer used to measure sentence lengths.

        Args:
            file_path: Path of the CSV file to analyze.
            tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.
        """
        self.file_path = file_path
        self.tokenizer = tokenizer

    def analyze_lengths(self):
        """Return the 99th percentile of tokenized sentence lengths (as an int).

        Lengths include special tokens.

        Raises:
            ValueError: If the file contains no sentences.
        """
        df = pd.read_csv(self.file_path)
        sentences = df.groupby('Sentence #')['Word'].apply(list).values

        if len(sentences) == 0:
            # np.percentile fails with an opaque error on empty input.
            raise ValueError(f"No sentences found in {self.file_path}")

        lengths = []
        for sentence in tqdm(sentences, desc="Analyzing sentence lengths"):
            # pandas may parse a word as a number or NaN; coerce to str so the
            # join cannot raise TypeError (NERDataset applies the same coercion).
            text = " ".join(str(word) for word in sentence)
            tokens = self.tokenizer.encode(text, add_special_tokens=True)
            lengths.append(len(tokens))

        max_len = int(np.percentile(lengths, 99))
        logger.info(f"99th percentile length: {max_len}")
        return max_len

In [5]:
class NERDataset(Dataset):
    """Token-classification dataset built from a word-per-row CSV file.

    The CSV must provide 'Sentence #', 'Word' and 'Tag' columns. Rows are grouped
    by 'Sentence #'; each item is one sentence tokenized to a fixed length with a
    label for every token position (-100 for positions that carry no word).
    """

    def __init__(self, file_path, tokenizer, max_len, tag_to_id, nrows=300):
        """Load the CSV and group its rows into sentences.

        Args:
            file_path: Path of the CSV file.
            tokenizer: Callable tokenizer whose result exposes ``word_ids()``
                (in Hugging Face Transformers this requires a "fast" tokenizer).
            max_len: Fixed sequence length used for padding and truncation.
            tag_to_id: Mapping from tag string to integer label id.
            nrows: Number of CSV rows to read. Defaults to 300 to keep the
                previous behaviour; pass ``None`` to read the whole file.
        """
        self.data = pd.read_csv(file_path, nrows=nrows)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag_to_id = tag_to_id

        # Group by sentence: one row per sentence holding parallel word/tag lists.
        self.sentences = self.data.groupby('Sentence #').agg({
            'Word': list,
            'Tag': list
        }).reset_index()

        # Fix any length mismatches in sentences
        self._fix_length_mismatches()

    def _fix_length_mismatches(self):
        """Ensure word and tag lists have matching lengths"""
        for idx, row in self.sentences.iterrows():
            words = row['Word']
            tags = row['Tag']
            if len(words) != len(tags):
                min_len = min(len(words), len(tags))
                self.sentences.at[idx, 'Word'] = words[:min_len]
                self.sentences.at[idx, 'Tag'] = tags[:min_len]

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for sentence ``idx``.

        Every sub-token of a word receives that word's tag id. Tags missing from
        ``tag_to_id`` fall back to the 'O' id. Special and padding positions keep
        the ignore index -100.
        """
        sentence = self.sentences.iloc[idx]
        # Handle non-string inputs (pandas may parse a word as a number or NaN).
        words = [str(word) for word in sentence['Word']]
        tags = sentence['Tag']

        # Tokenize the word list itself so that word_ids() indexes into `words`
        # (and therefore into `tags`). Tokenizing the space-joined text lets the
        # tokenizer choose its own word boundaries, which shifts the labels.
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # Initialize labels with ignore index
        labels = torch.full((self.max_len,), -100, dtype=torch.long)

        # Word index of each token position in the encoding actually returned.
        word_ids = encoding.word_ids(batch_index=0)

        for i, word_idx in enumerate(word_ids[:self.max_len]):
            if word_idx is None or word_idx >= len(tags):
                # Special/padding token, or no tag available: keep -100.
                continue
            tag = tags[word_idx]
            if tag in self.tag_to_id:
                labels[i] = self.tag_to_id[tag]
            else:
                # Default to 'O' for unknown or missing tags.
                labels[i] = self.tag_to_id['O']

        return {
            # squeeze(0) drops only the batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

In [15]:
class NERTrainer:
    """Fine-tune a token-classification model on a NER CSV and save the result.

    Wraps model construction, a Hugging Face ``Trainer`` run logged to Weights &
    Biases, and a custom save routine that writes contiguous CPU tensors.
    """

    def __init__(self, model_name, tokenizer, max_len, tag_to_id, train_file,
                 output_dir, wandb_key, device='cuda'):
        """Build the model and prepare the output directory.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            tokenizer: Tokenizer used by the dataset and saved with the model.
            max_len: Fixed sequence length passed to ``NERDataset``.
            tag_to_id: Mapping from tag string to integer label id.
            train_file: Path of the training CSV.
            output_dir: Directory for checkpoints and the final model.
            wandb_key: Weights & Biases API key passed to ``wandb.login``.
            device: Device the model is moved to. A CUDA device falls back to
                CPU (with a warning) when CUDA is not available.
        """
        self.model_name = model_name
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tag_to_id = tag_to_id
        self.train_file = train_file
        self.output_dir = output_dir
        self.wandb_key = wandb_key

        # Moving the model to an unavailable CUDA device would raise; degrade instead.
        if str(device).startswith('cuda') and not torch.cuda.is_available():
            logger.warning("CUDA requested but not available; falling back to CPU")
            device = 'cpu'
        self.device = device

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Get the config first so the classifier head and label maps match tag_to_id.
        config = AutoConfig.from_pretrained(
            model_name,
            num_labels=len(tag_to_id),
            id2label={v: k for k, v in tag_to_id.items()},
            label2id=tag_to_id,
        )

        # Initialize model with config
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            config=config
        )

        # Move model to device
        self.model = self.model.to(self.device)

    def _make_state_dict_contiguous(self, state_dict):
        """Make all tensors in state dict contiguous and on CPU.

        Non-tensor values are copied through unchanged. Returns a new dict.
        """
        contiguous_state_dict = {}
        for key, tensor in state_dict.items():
            if isinstance(tensor, torch.Tensor):
                # Move to CPU first if needed
                if tensor.device.type != "cpu":
                    tensor = tensor.cpu()
                # Make contiguous and clone to ensure memory ownership
                tensor = tensor.contiguous().clone()
            contiguous_state_dict[key] = tensor
        return contiguous_state_dict

    def save_model(self, output_dir):
        """Write weights (``pytorch_model.bin``), config and tokenizer to ``output_dir``."""
        os.makedirs(output_dir, exist_ok=True)

        # Get model state dict and make all tensors contiguous
        contiguous_state_dict = self._make_state_dict_contiguous(self.model.state_dict())

        # Save using PyTorch's native format
        torch.save(contiguous_state_dict, os.path.join(output_dir, "pytorch_model.bin"))

        # Save config
        self.model.config.save_pretrained(output_dir)

        # Save tokenizer
        self.tokenizer.save_pretrained(output_dir)

    def train(self, batch_size=16, num_epochs=3, learning_rate=2e-5):
        """Train the model, save it to ``self.output_dir`` and close the W&B run.

        Args:
            batch_size: Per-device training batch size.
            num_epochs: Number of training epochs.
            learning_rate: Optimizer learning rate.

        Raises:
            Exception: Any error raised during setup, training or saving is
                logged and re-raised.
        """
        # Set up wandb
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        gpu_info = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        run_name = f"Ben_NER_indic-bert_{current_time}_{gpu_info}"

        wandb.login(key=self.wandb_key)
        # The timestamped string identifies this run, so it is passed as `name`;
        # passing it as `project` would create a new W&B project per run.
        wandb.init(project="Ben_NER_indic-bert", name=run_name)

        try:
            # Prepare dataset
            train_dataset = NERDataset(self.train_file, self.tokenizer, self.max_len, self.tag_to_id)

            # Training arguments
            training_args = TrainingArguments(
                output_dir=self.output_dir,
                num_train_epochs=num_epochs,
                per_device_train_batch_size=batch_size,
                learning_rate=learning_rate,
                weight_decay=0.01,
                logging_dir='./logs',
                logging_steps=100,
                save_strategy="epoch",
                report_to="wandb",
                # Enable mixed precision only when a CUDA device is present.
                fp16=torch.cuda.is_available(),
                gradient_accumulation_steps=2,
                save_safetensors=False,  # Disable safetensors
            )

            # Initialize trainer
            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=train_dataset
            )

            # Train the model
            trainer.train()

            # Save model using custom save function
            logger.info("Saving the model...")
            self.save_model(self.output_dir)

            logger.info("Model saved successfully")

        except Exception as e:
            logger.error(f"Training error: {str(e)}")
            raise
        finally:
            # Always close the W&B run, including when setup or training fails.
            wandb.finish()

IndentationError: expected an indented block after function definition on line 2 (2369225868.py, line 4)

In [16]:
class ModelPusher:
    """Upload a saved model directory to a Hugging Face Hub model repository."""

    def __init__(self, model_path, repo_id, token):
        """Store the upload parameters and create the Hub API client.

        Args:
            model_path: Local directory whose contents are uploaded.
            repo_id: Target repository id on the Hub.
            token: Hub access token used for both repo creation and upload.
        """
        self.model_path = model_path
        self.repo_id = repo_id
        self.token = token
        self.api = HfApi()

    def push_to_hub(self):
        """Create the repository if needed and upload ``model_path`` to it.

        Raises:
            Exception: Any error from the Hub client is logged and re-raised.
        """
        try:
            # Pass the same token to both calls so repo creation does not depend
            # on whatever credentials happen to be cached on the machine.
            self.api.create_repo(
                repo_id=self.repo_id,
                repo_type="model",
                token=self.token,
                exist_ok=True
            )
            self.api.upload_folder(
                folder_path=self.model_path,
                repo_id=self.repo_id,
                repo_type="model",
                token=self.token
            )
        except Exception as e:
            logger.error(f"Failed to push model: {str(e)}")
            raise

In [17]:
# End-to-end run: measure sentence lengths, fine-tune the model, push it to the Hub.
# Credentials are read from the WANDB_API_KEY and HF_TOKEN environment variables
# so that no secret is stored in the notebook.
MODEL_NAME = "ai4bharat/indic-bert"
OUTPUT_DIR = "./model_output"

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Analyze sentence lengths
analyzer = SentenceLengthAnalyzer(file_path, tokenizer)
max_len = analyzer.analyze_lengths()

trainer = NERTrainer(
    model_name=MODEL_NAME,
    tokenizer=tokenizer,
    max_len=max_len,
    tag_to_id=tag_to_id,
    train_file=file_path,
    output_dir=OUTPUT_DIR,
    wandb_key=os.environ.get("WANDB_API_KEY")
)

# Train model
trainer.train(num_epochs=1)

# Push to Hub
pusher = ModelPusher(
    model_path=OUTPUT_DIR,
    repo_id="Debk/Ben_NER_indic-bert",
    token=os.environ.get("HF_TOKEN")
)
pusher.push_to_hub()



Analyzing sentence lengths: 100%|██████████| 17715/17715 [00:02<00:00, 8254.15it/s]
INFO:__main__:99th percentile length: 110
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/.netrc


ERROR:__main__:Training error: AlbertForTokenClassification does not support gradient checkpointing.


ValueError: AlbertForTokenClassification does not support gradient checkpointing.