In [1]:
!pip install pytorch-crf datasets seqeval tqdm wandb GPUtil matplotlib seaborn

[0m

In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizerFast, XLMRobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.auto import tqdm
from huggingface_hub import HfFolder, HfApi
import wandb
from datetime import datetime
import os
import logging
import gc

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)



In [3]:
tag_to_id = {
    'B-geo': 0, 'O': 1, 'B-gpe': 2, 'B-per': 3, 'I-per': 4, 'B-tim': 5,
    'B-org': 6, 'I-org': 7, 'B-art': 8, 'I-art': 9, 'I-tim': 10,
    'B-eve': 11, 'I-eve': 12, 'I-geo': 13, 'I-gpe': 14, 'B-nat': 15, 'I-nat': 16
}

from huggingface_hub import HfFolder
# Set your HuggingFace token
HF_TOKEN = "secret"  # Replace with your token
HfFolder.save_token(HF_TOKEN)
wandb_key='secret'  # Your W&B API key
wandb.login(key=wandb_key)

[34m[1mwandb[0m: Currently logged in as: [33mdeb[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/.netrc


True

In [4]:
class SentenceLengthAnalyzer:
    def __init__(self, file_path, tokenizer):
        self.file_path = file_path
        self.tokenizer = tokenizer

    def calculate_max_length(self):
        """Calculate the 99th percentile of token lengths in the dataset"""
        logger.info("Analyzing sequence lengths...")
        df = pd.read_csv(self.file_path)
        sentences = df.groupby('Sentence #')['Word'].apply(list).values

        lengths = []
        for sentence in tqdm(sentences, desc="Calculating sequence lengths"):
            try:
                tokens = self.tokenizer(sentence, is_split_into_words=True, truncation=False)
                lengths.append(len(tokens['input_ids']))
            except Exception as e:
                logger.warning(f"Error processing sentence: {e}")
                continue

        max_len = int(np.percentile(lengths, 99))
        logger.info(f"Sequence length statistics:")
        logger.info(f"Mean length: {np.mean(lengths):.2f}")
        logger.info(f"Median length: {np.median(lengths):.2f}")
        logger.info(f"99th percentile length: {max_len}")
        logger.info(f"Max length: {max(lengths)}")

        return max_len

In [5]:
class NERDataset(Dataset):
    def __init__(self, texts, tags, tokenizer, tag_to_id, max_len):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.tag_to_id = tag_to_id
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        try:
            words = self.texts[idx]
            tags = self.tags[idx]

            encoding = self.tokenizer(
                words,
                is_split_into_words=True,
                padding='max_length',
                truncation=True,
                max_length=self.max_len,
                return_tensors='pt'
            )

            label_ids = []
            word_ids = encoding.word_ids()

            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                else:
                    label_ids.append(self.tag_to_id[tags[word_idx]])

            encoding = {key: val.squeeze() for key, val in encoding.items()}
            encoding['labels'] = torch.tensor(label_ids)

            return encoding
        except Exception as e:
            logger.error(f"Error processing item {idx}: {e}")
            raise e

In [6]:
class XLMRobertaBiLSTM(nn.Module):
    def __init__(self, num_labels, dropout=0.1, lstm_hidden_size=256):
        super().__init__()
        self.roberta = XLMRobertaModel.from_pretrained('xlm-roberta-large')
        # Get the actual hidden size from the model config
        hidden_size = self.roberta.config.hidden_size  # This will be 1024 for xlm-roberta-large

        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(
            input_size=hidden_size,  # Using the actual hidden size (1024)
            hidden_size=lstm_hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if 2 > 1 else 0
        )
        self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get RoBERTa outputs
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]  # Shape: [batch_size, seq_len, hidden_size(1024)]

        # Apply dropout
        sequence_output = self.dropout(sequence_output)

        # Pass through BiLSTM
        lstm_output, _ = self.lstm(sequence_output)  # Shape: [batch_size, seq_len, lstm_hidden_size*2]
        lstm_output = self.dropout(lstm_output)

        # Get logits
        logits = self.classifier(lstm_output)  # Shape: [batch_size, seq_len, num_labels]

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Only consider loss on valid tokens
            active_loss = labels.view(-1) != -100
            active_logits = logits.view(-1, logits.shape[-1])
            active_labels = labels.view(-1)
            loss = loss_fct(active_logits[active_loss], active_labels[active_loss])

        return {'loss': loss, 'logits': logits} if loss is not None else {'logits': logits}

In [7]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0

class NERTrainer:
    def __init__(self, model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=10):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.num_epochs = num_epochs
        self.early_stopping = EarlyStopping(patience=3)

    def validate(self):
        """
        Validate the model on the validation set
        """
        self.model.eval()
        total_val_loss = 0
        val_pbar = tqdm(self.val_loader, desc='Validation')

        with torch.no_grad():
            for batch in val_pbar:
                try:
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['labels'].to(self.device)

                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )

                    loss = outputs['loss']
                    total_val_loss += loss.item()

                    val_pbar.set_postfix({'loss': f'{loss.item():.4f}'})

                except RuntimeError as e:
                    if "out of memory" in str(e):
                        logger.warning("CUDA OOM during validation. Clearing cache...")
                        if hasattr(torch.cuda, 'empty_cache'):
                            torch.cuda.empty_cache()
                        continue
                    raise e

        avg_val_loss = total_val_loss / len(self.val_loader)
        return avg_val_loss

    def train(self):
        best_val_loss = float('inf')
        accumulated_batches = 2  # Reduced for small dataset

        try:
            self.model.roberta.gradient_checkpointing_enable()

            for epoch in range(self.num_epochs):
                # Training
                self.model.train()
                total_train_loss = 0
                train_pbar = tqdm(self.train_loader,
                                desc=f'Epoch {epoch + 1}/{self.num_epochs} [Train]')

                self.optimizer.zero_grad()

                for batch_idx, batch in enumerate(train_pbar):
                    try:
                        input_ids = batch['input_ids'].to(self.device)
                        attention_mask = batch['attention_mask'].to(self.device)
                        labels = batch['labels'].to(self.device)

                        outputs = self.model(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels
                        )

                        loss = outputs['loss'] / accumulated_batches
                        total_train_loss += loss.item() * accumulated_batches

                        loss.backward()

                        if (batch_idx + 1) % accumulated_batches == 0:
                            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                            self.optimizer.step()
                            self.scheduler.step()
                            self.optimizer.zero_grad()

                        train_pbar.set_postfix({
                            'loss': f'{loss.item() * accumulated_batches:.4f}'
                        })

                        # Log to wandb with reduced frequency
                        if batch_idx % 50 == 0:  # Reduced frequency for small dataset
                            wandb.log({
                                "train_batch_loss": loss.item() * accumulated_batches,
                                "learning_rate": self.scheduler.get_last_lr()[0],
                            })

                    except RuntimeError as e:
                        if "out of memory" in str(e):
                            logger.warning(f"CUDA OOM in batch {batch_idx}. Skipping...")
                            torch.cuda.empty_cache()
                            gc.collect()
                            self.optimizer.zero_grad()
                            continue
                        raise e

                avg_train_loss = total_train_loss / len(self.train_loader)

                # Validation
                val_loss = self.validate()

                # Log epoch metrics
                wandb.log({
                    "train_epoch_loss": avg_train_loss,
                    "val_epoch_loss": val_loss,
                    "epoch": epoch
                })

                # Early stopping check
                self.early_stopping(val_loss)

                # Save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    self.save_checkpoint(
                        f"best_model_epoch_{epoch}_valloss_{val_loss:.4f}.pt"
                    )

                logger.info(f'Epoch {epoch + 1} Summary:')
                logger.info(f'Average training loss: {avg_train_loss:.4f}')
                logger.info(f'Average validation loss: {val_loss:.4f}')

                if self.early_stopping.early_stop:
                    logger.info("Early stopping triggered")
                    break

        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            raise e

    def save_checkpoint(self, filename):
        """Save a checkpoint of the model"""
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'epoch': self.num_epochs,
        }, filename)

In [8]:
from transformers import PreTrainedModel
from huggingface_hub import HfApi
import os
import torch
import json

class ModelPublisher:
    def __init__(self, model_id, token):
        self.model_id = model_id
        self.token = token
        self.api = HfApi()

    def push_to_hub(self, model, tokenizer):
        try:
            # Create temporary directory
            os.makedirs("temp_model", exist_ok=True)

            # Save model state dict
            torch.save(model.state_dict(), "temp_model/pytorch_model.bin")

            # Save config
            config = {
                "architectures": ["XLMRobertaBiLSTM"],
                "model_type": "xlm-roberta-bilstm",
                "num_labels": model.classifier.out_features,
                "lstm_hidden_size": model.lstm.hidden_size,
                "dropout": model.dropout.p,
                "base_model": "xlm-roberta-large",
            }

            with open("temp_model/config.json", "w") as f:
                json.dump(config, f)

            # Delete the repository if it exists
            try:
                self.api.delete_repo(self.model_id, token=self.token)
                logger.info(f"Deleted existing repository: {self.model_id}")
            except Exception as e:
                logger.info(f"Repository doesn't exist or couldn't be deleted: {str(e)}")

            # Create new repository
            self.api.create_repo(self.model_id, token=self.token)

            # Upload files
            self.api.upload_file(
                path_or_fileobj="temp_model/pytorch_model.bin",
                path_in_repo="pytorch_model.bin",
                repo_id=self.model_id,
                token=self.token
            )

            self.api.upload_file(
                path_or_fileobj="temp_model/config.json",
                path_in_repo="config.json",
                repo_id=self.model_id,
                token=self.token
            )

            # Push tokenizer files
            tokenizer.save_pretrained("temp_model")
            tokenizer_files = [f for f in os.listdir("temp_model") if f.startswith("tokenizer") or f.endswith(".json")]

            for file in tokenizer_files:
                self.api.upload_file(
                    path_or_fileobj=f"temp_model/{file}",
                    path_in_repo=file,
                    repo_id=self.model_id,
                    token=self.token
                )

            # Clean up
            import shutil
            shutil.rmtree("temp_model")

            logger.info(f"Successfully pushed model to {self.model_id}")

        except Exception as e:
            logger.error(f"Error pushing to hub: {str(e)}")
            # Clean up on error
            if os.path.exists("temp_model"):
                shutil.rmtree("temp_model")
            raise e

In [9]:
#def main():
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f"Using device: {device}")

# Set environmental variables for CUDA debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Initialize wandb
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
gpu_type = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
wandb_run_name = f"Ben_NER_xlm-roberta-large_BiLSTM_{current_time}_{gpu_type}"

wandb.init(
    project="bengali_ner",
    name=wandb_run_name,
    config={
        "model_name": "xlm-roberta-large",
        "architecture": "BiLSTM",
        "max_length": "99th percentile",
        "epochs": 10,
        "batch_size": 16,
        "learning_rate": 2e-5
    }
)

try:
    # Initialize tokenizer
    tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large')

    # Calculate max length
    analyzer = SentenceLengthAnalyzer('b-ner-train.csv', tokenizer)
    max_len = analyzer.calculate_max_length()

    # Load and prepare data
    df = pd.read_csv('b-ner-train.csv')
    sentences = df.groupby('Sentence #')['Word'].apply(list).values
    tags = df.groupby('Sentence #')['Tag'].apply(list).values

    # Split data
    train_texts, val_texts, train_tags, val_tags = train_test_split(
        sentences, tags, test_size=0.2, random_state=42
    )

    # Create datasets
    train_dataset = NERDataset(train_texts, train_tags, tokenizer, tag_to_id, max_len)
    val_dataset = NERDataset(val_texts, val_tags, tokenizer, tag_to_id, max_len)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Initialize model
    model = XLMRobertaBiLSTM(num_labels=len(tag_to_id))
    model.to(device)

    # Setup training
    optimizer = AdamW(model.parameters(), lr=2e-5)
    num_training_steps = 10 * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    # Train model
    # Setup training
    trainer = NERTrainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        num_epochs=10  # Increased epochs
    )

    trainer.train()

    # Push to Hub
    publisher = ModelPublisher(
        model_id="Debk/Ben_NER_xlm-roberta-large_BiLSTM",
        token="secret"
    )
    publisher.push_to_hub(model, tokenizer)

except Exception as e:
    logger.error(f"Error in main execution: {str(e)}")
    raise e
finally:
    wandb.finish()


INFO:__main__:Using device: cuda
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


INFO:__main__:Analyzing sequence lengths...


Calculating sequence lengths:   0%|          | 0/17715 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors
INFO:__main__:Sequence length statistics:
INFO:__main__:Mean length: 26.82
INFO:__main__:Median length: 23.00
INFO:__main__:99th percentile length: 91
INFO:__main__:Max length: 599


Epoch 1/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Validation:   0%|          | 0/222 [00:00<?, ?it/s]

INFO:__main__:Epoch 1 Summary:
INFO:__main__:Average training loss: 0.7490
INFO:__main__:Average validation loss: 0.3821


Epoch 2/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Validation:   0%|          | 0/222 [00:00<?, ?it/s]

INFO:__main__:Epoch 2 Summary:
INFO:__main__:Average training loss: 0.2883
INFO:__main__:Average validation loss: 0.2298


Epoch 3/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Validation:   0%|          | 0/222 [00:00<?, ?it/s]

INFO:__main__:Epoch 3 Summary:
INFO:__main__:Average training loss: 0.1778
INFO:__main__:Average validation loss: 0.1701


Epoch 4/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Validation:   0%|          | 0/222 [00:00<?, ?it/s]

INFO:__main__:Epoch 4 Summary:
INFO:__main__:Average training loss: 0.1249
INFO:__main__:Average validation loss: 0.1543


Epoch 5/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Validation:   0%|          | 0/222 [00:00<?, ?it/s]

INFO:__main__:Epoch 5 Summary:
INFO:__main__:Average training loss: 0.0988
INFO:__main__:Average validation loss: 0.1381


Epoch 6/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Validation:   0%|          | 0/222 [00:00<?, ?it/s]

INFO:__main__:Epoch 6 Summary:
INFO:__main__:Average training loss: 0.0795
INFO:__main__:Average validation loss: 0.1417


Epoch 7/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Validation:   0%|          | 0/222 [00:00<?, ?it/s]

INFO:__main__:Epoch 7 Summary:
INFO:__main__:Average training loss: 0.0633
INFO:__main__:Average validation loss: 0.1497


Epoch 8/10 [Train]:   0%|          | 0/886 [00:00<?, ?it/s]

Validation:   0%|          | 0/222 [00:00<?, ?it/s]

INFO:__main__:Epoch 8 Summary:
INFO:__main__:Average training loss: 0.0552
INFO:__main__:Average validation loss: 0.1505
INFO:__main__:Early stopping triggered
INFO:__main__:Deleted existing repository: Debk/Ben_NER_xlm-roberta-large_BiLSTM


pytorch_model.bin:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

INFO:__main__:Successfully pushed model to Debk/Ben_NER_xlm-roberta-large_BiLSTM


0,1
epoch,▁▂▃▄▅▆▇█
learning_rate,███▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
train_batch_loss,█▆█▆▅▃▂▃▂▃▃▃▂▃▁▃▂▂▂▂▂▂▁▂▁▁▁▁▁▁▂▁▁▁▁▁▂▁▁▁
train_epoch_loss,█▃▂▂▁▁▁▁
val_epoch_loss,█▄▂▁▁▁▁▁

0,1
epoch,7.0
learning_rate,1e-05
train_batch_loss,0.00773
train_epoch_loss,0.0552
val_epoch_loss,0.15048


In [None]:
import pandas as pd
import torch
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

class TrainingVerifier:
    def __init__(self, train_file_path, tag_to_id):
        self.train_file_path = train_file_path
        self.tag_to_id = tag_to_id
        self.id_to_tag = {v: k for k, v in tag_to_id.items()}
        
    def analyze_training_data(self):
        """Analyze training data distribution"""
        print("\n=== Training Data Analysis ===")
        
        # Load training data
        df = pd.read_csv(self.train_file_path)
        
        # Analyze tag distribution
        tag_counts = df['Tag'].value_counts()
        total_tags = len(df)
        
        print("\nTag Distribution in Training Data:")
        for tag, count in tag_counts.items():
            percentage = (count / total_tags) * 100
            print(f"{tag}: {count} ({percentage:.2f}%)")
            
        # Calculate class weights for loss function
        class_weights = {
            self.tag_to_id[tag]: (1.0 / count) * (total_tags / len(self.tag_to_id))
            for tag, count in tag_counts.items()
        }
        
        print("\nSuggested Class Weights for Loss Function:")
        for tag_id, weight in class_weights.items():
            print(f"{self.id_to_tag[tag_id]}: {weight:.4f}")
            
        return class_weights
        
    def verify_model_outputs(self, model, tokenizer, sample_sentence):
        """Verify model output distribution before training"""
        print("\n=== Model Output Verification ===")
        
        # Tokenize sample sentence
        encoding = tokenizer(
            sample_sentence,
            is_split_into_words=True,
            return_tensors='pt',
            padding=True,
            truncation=True
        )
        
        # Get model predictions
        with torch.no_grad():
            outputs = model(**encoding)
            logits = outputs['logits']
            probs = torch.nn.functional.softmax(logits, dim=-1)[0]
            
        # Analyze output distribution
        print("\nOutput Distribution for Sample Sentence:")
        for word, word_probs in zip(sample_sentence, probs):
            top_probs, top_ids = torch.topk(word_probs, 3)
            print(f"\nWord: {word}")
            for prob, tag_id in zip(top_probs, top_ids):
                print(f"{self.id_to_tag[tag_id.item()]}: {prob.item():.4f}")
                
        # Check if outputs are too concentrated
        entropy = -torch.sum(probs * torch.log(probs + 1e-10), dim=-1)
        print(f"\nAverage Output Entropy: {entropy.mean().item():.4f}")
        if entropy.mean() < 0.5:
            print("WARNING: Low entropy indicates model might be too confident!")
            
    def suggest_training_improvements(self, class_weights):
        """Suggest improvements based on analysis"""
        print("\n=== Training Improvement Suggestions ===")
        
        # Check for extreme class imbalance
        weight_values = np.array(list(class_weights.values()))
        weight_ratio = np.max(weight_values) / np.min(weight_values)
        
        if weight_ratio > 10:
            print("\n1. Severe class imbalance detected. Suggested fixes:")
            print("   - Use weighted loss function with calculated class weights")
            print("   - Consider data augmentation for minority classes")
            print("   - Implement stratified sampling in DataLoader")
            
        print("\n2. General Training Recommendations:")
        print("   - Start with a smaller learning rate (1e-5)")
        print("   - Use learning rate warmup")
        print("   - Implement gradient clipping")
        print("   - Monitor validation loss for early stopping")
        
        # Suggested loss function code
        print("\n3. Suggested Loss Function Implementation:")
        print("""
        # Convert class weights to tensor
        class_weights = torch.FloatTensor(class_weights).to(device)
        
        # Define weighted loss function
        loss_fct = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)
        
        # In forward pass
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        """)
        
def verify_training():
    # Initialize verifier
    verifier = TrainingVerifier('b-ner-train.csv', tag_to_id)
    
    # Analyze training data
    class_weights = verifier.analyze_training_data()
    
    # Create sample sentence for testing
    sample_sentence = ["বাংলাদেশে ", "পরিস্থিতি", "সঙ্কটাপন্ন"]
    
    # Initialize model
    model = XLMRobertaBiLSTM(num_labels=len(tag_to_id))
    tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large')
    
    # Verify model outputs
    verifier.verify_model_outputs(model, tokenizer, sample_sentence)
    
    # Get training suggestions
    verifier.suggest_training_improvements(class_weights)
    
if __name__ == "__main__":
    verify_training()


=== Training Data Analysis ===

Tag Distribution in Training Data:
O: 198077 (83.50%)
B-geo: 8256 (3.48%)
I-per: 6904 (2.91%)
B-per: 6881 (2.90%)
B-org: 4198 (1.77%)
B-tim: 3787 (1.60%)
I-org: 3404 (1.44%)
B-gpe: 2003 (0.84%)
I-tim: 1957 (0.83%)
I-geo: 507 (0.21%)
B-art: 336 (0.14%)
B-eve: 327 (0.14%)
I-eve: 274 (0.12%)
I-art: 246 (0.10%)
B-nat: 29 (0.01%)
I-nat: 15 (0.01%)
I-gpe: 7 (0.00%)

Suggested Class Weights for Loss Function:
O: 0.0704
B-geo: 1.6901
I-per: 2.0211
B-per: 2.0278
B-org: 3.3238
B-tim: 3.6846
I-org: 4.0991
B-gpe: 6.9663
I-tim: 7.1300
I-geo: 27.5215
B-art: 41.5280
B-eve: 42.6710
I-eve: 50.9249
I-art: 56.7212
B-nat: 481.1521
I-nat: 930.2275
I-gpe: 1993.3445





=== Model Output Verification ===
