In [None]:
import os
import sys
import copy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import argparse

# Report the working directory up front: every relative path used below
# (glob patterns, sys.path entries) is resolved against it.
print("Current directory:", os.getcwd())

# First, determine if we're running in Colab or locally
IN_COLAB = 'google.colab' in sys.modules
print(f"Running in Google Colab: {IN_COLAB}")


def _add_to_sys_path(path):
    """Append the absolute form of `path` to sys.path unless it is already listed.

    Storing absolute paths keeps the entry valid if the working directory
    changes later, and the membership check keeps the cell idempotent when it
    is re-run in the same kernel.

    Returns:
        The absolute path that is (now) present in sys.path.
    """
    abs_path = os.path.abspath(path)
    if abs_path not in sys.path:
        sys.path.append(abs_path)
    return abs_path


def _load_module_from_file(module_name, file_path):
    """Import `file_path` as a module named `module_name` and return it.

    The module is registered in sys.modules before it is executed so that
    other files loaded afterwards can resolve `import <module_name>`.
    If executing the file fails, the registration is rolled back and the
    original exception propagates.
    """
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except Exception:
        sys.modules.pop(module_name, None)
        raise
    return module


# `__file__` is not defined inside a notebook, so the working directory stands
# in for "the directory containing this file".
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
print(f"Adding {parent_dir} to path")
_add_to_sys_path(parent_dir)

# If we're in the Attention_Is_All_You_Need directory, add the current directory too
if os.path.basename(current_dir) == "Attention_Is_All_You_Need":
    _add_to_sys_path(current_dir)
    print(f"Also adding {current_dir} to path")

# Try to find the model_utils.py file anywhere below the working directory
import glob
model_utils_files = glob.glob("**/model_utils.py", recursive=True)
print("Found model_utils.py at:", model_utils_files)

# If we found model_utils.py, add its directory to the path
if model_utils_files:
    # glob returns paths relative to the working directory; make the entry
    # absolute so it does not silently depend on later chdir calls.
    model_dir = os.path.abspath(os.path.dirname(model_utils_files[0]))
    print(f"Adding {model_dir} to path")
    _add_to_sys_path(model_dir)

    # Add the Attention_Is_All_You_Need directory explicitly
    attention_dir = os.path.join(os.getcwd(), "Attention_Is_All_You_Need")
    if os.path.exists(attention_dir):
        _add_to_sys_path(attention_dir)
        print(f"Adding {attention_dir} to path")

# Try different import strategies, from the cheapest to the most invasive
try:
    # Try direct import first (if we're in the same directory)
    from model_utils import Generator, Encoder, Decoder, EncoderLayer, DecoderLayer, MultiHeadedAttention
    from model_utils import PositionwiseFeedForward, PositionalEncoding, Embeddings, subsequent_mask
    from encode_decode import EncodeDecode
    print("Direct import worked!")
except ImportError:
    try:
        # Try with full module path
        from Attention_Is_All_You_Need.model_utils import Generator, Encoder, Decoder, EncoderLayer, DecoderLayer, MultiHeadedAttention
        from Attention_Is_All_You_Need.model_utils import PositionwiseFeedForward, PositionalEncoding, Embeddings, subsequent_mask
        from Attention_Is_All_You_Need.encode_decode import EncodeDecode
        print("Import with Attention_Is_All_You_Need prefix worked!")
    except ImportError as e:
        print(f"Import error: {e}")

        # Last resort: load the modules straight from the files found by glob
        if model_utils_files:
            model_utils = _load_module_from_file("model_utils", model_utils_files[0])

            # Get the required classes and functions
            Generator = model_utils.Generator
            Encoder = model_utils.Encoder
            Decoder = model_utils.Decoder
            EncoderLayer = model_utils.EncoderLayer
            DecoderLayer = model_utils.DecoderLayer
            MultiHeadedAttention = model_utils.MultiHeadedAttention
            PositionwiseFeedForward = model_utils.PositionwiseFeedForward
            PositionalEncoding = model_utils.PositionalEncoding
            Embeddings = model_utils.Embeddings
            subsequent_mask = model_utils.subsequent_mask

            # Now try to import encode_decode
            encode_decode_files = glob.glob("**/encode_decode.py", recursive=True)
            if encode_decode_files:
                encode_decode = _load_module_from_file("encode_decode", encode_decode_files[0])
                EncodeDecode = encode_decode.EncodeDecode
                print("Dynamic import worked!")
            else:
                # EncodeDecode stays undefined here; later cells that use it will fail
                print("Could not find encode_decode.py")
        else:
            # None of the model classes are defined here; later cells will fail
            print("All import attempts failed.")

Current directory: /kaggle/working/FoNu_NLP_TG/Attention_Is_All_You_Need/FoNu_NLP_TG/FoNu_NLP_TG
Running in Google Colab: False
Adding /kaggle/working/FoNu_NLP_TG/Attention_Is_All_You_Need/FoNu_NLP_TG to path
Found model_utils.py at: ['Attention_Is_All_You_Need/model_utils.py']
Adding Attention_Is_All_You_Need to path
Adding /kaggle/working/FoNu_NLP_TG/Attention_Is_All_You_Need/FoNu_NLP_TG/FoNu_NLP_TG/Attention_Is_All_You_Need to path
Direct import worked!


In [None]:
def make_model(src_vocab_size, tgt_vocab_size, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Assemble an EncodeDecode transformer and Xavier-initialise its weights.

    Args:
        src_vocab_size: Size passed to the source-side Embeddings.
        tgt_vocab_size: Size passed to the target-side Embeddings and the Generator.
        N: Second argument to Encoder and Decoder (presumably the number of
            stacked layers -- confirm against model_utils).
        d_model: Model width handed to every sub-module.
        d_ff: Inner width handed to PositionwiseFeedForward.
        h: First argument to MultiHeadedAttention (presumably the head count).
        dropout: Dropout value handed to the feed-forward, positional-encoding
            and layer modules. It is not forwarded to MultiHeadedAttention,
            which is built from (h, d_model) only.

    Returns:
        The assembled EncodeDecode model, with every parameter of more than
        one dimension re-initialised by nn.init.xavier_uniform_.
    """
    clone = copy.deepcopy

    # Prototype sub-modules: every consumer below receives its own deep copy,
    # so no parameters are shared between layers.
    attention_proto = MultiHeadedAttention(h, d_model)
    feed_forward_proto = PositionwiseFeedForward(d_model, d_ff, dropout)
    positional_proto = PositionalEncoding(d_model, dropout)

    encoder = Encoder(
        EncoderLayer(d_model, clone(attention_proto), clone(feed_forward_proto), dropout),
        N,
    )
    decoder = Decoder(
        DecoderLayer(
            d_model,
            clone(attention_proto),
            clone(attention_proto),
            clone(feed_forward_proto),
            dropout,
        ),
        N,
    )
    src_embed = nn.Sequential(Embeddings(d_model, src_vocab_size), clone(positional_proto))
    tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab_size), clone(positional_proto))
    generator = Generator(d_model, tgt_vocab_size)

    model = EncodeDecode(encoder, decoder, src_embed, tgt_embed, generator)

    # Glorot / fan_avg initialisation for weight matrices; 1-D parameters
    # keep whatever their constructors gave them.
    for weight in filter(lambda param: param.dim() > 1, model.parameters()):
        nn.init.xavier_uniform_(weight)

    return model

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device, pad_idx=0):
    """Run one teacher-forced training pass over `dataloader`.

    Args:
        model: Module called as model(src, tgt_input, src_mask, tgt_mask).
        dataloader: Iterable of batches; each batch is a mapping with
            "source" and "target" token-id tensors. They are indexed as
            [:, :-1] / [:, 1:], so dimension 1 is the sequence axis
            (presumably (batch, seq_len) -- confirm against the dataset).
            It does not need to support len().
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as criterion(flat_output, flat_labels).
        device: Device the batch tensors are moved to.
        pad_idx: Token id treated as padding when building the masks.
            Defaults to 0, the value that used to be hard-coded.

    Returns:
        The mean of the per-batch loss values, or NaN when the dataloader
        yields no batches (previously this raised ZeroDivisionError).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0  # counted manually so iterables without __len__ also work

    for batch in dataloader:
        src = batch["source"].to(device)
        tgt = batch["target"].to(device)

        # Teacher forcing: the decoder sees tokens [0, T-1) and is scored
        # against tokens [1, T).
        tgt_input = tgt[:, :-1]
        tgt_labels = tgt[:, 1:]

        # Padding masks get a broadcast axis inserted before the sequence axis
        src_mask = (src != pad_idx).unsqueeze(-2)
        tgt_padding_mask = (tgt_input != pad_idx).unsqueeze(-2)

        # Combine the padding mask with the subsequent-position mask
        causal_mask = subsequent_mask(tgt_input.size(1)).to(device)
        tgt_mask = tgt_padding_mask & causal_mask

        # NOTE(review): assumes model.forward already yields per-token vocabulary
        # scores (i.e. applies the generator) -- confirm in EncodeDecode.
        output = model(src, tgt_input, src_mask, tgt_mask)

        # Flatten to (tokens, classes) vs. (tokens,) for the criterion
        loss = criterion(
            output.contiguous().view(-1, output.size(-1)),
            tgt_labels.contiguous().view(-1),
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # No data means the average loss is undefined; report NaN instead of crashing
        return float("nan")
    return total_loss / num_batches

In [None]:
# Import data processing modules
try:
    from data_processing.load_tokenizers import load_sentencepiece_tokenizer, load_huggingface_tokenizer, create_translation_dataset
    print("Successfully imported data processing modules")
except ImportError as e:
    print(f"Error importing data processing modules: {e}")
    print("Make sure the data_processing directory is in your path")

    # Fall back to searching below the working directory for the module file
    data_processing_files = glob.glob("**/load_tokenizers.py", recursive=True)
    if data_processing_files:
        # glob yields paths relative to the working directory; store the
        # absolute directory so the sys.path entry survives a later chdir.
        data_dir = os.path.abspath(os.path.dirname(data_processing_files[0]))
        print(f"Found load_tokenizers.py at {data_processing_files[0]}")
        print(f"Adding {data_dir} to path")
        # Skip the append when re-running the cell in the same kernel
        if data_dir not in sys.path:
            sys.path.append(data_dir)

        try:
            from load_tokenizers import load_sentencepiece_tokenizer, load_huggingface_tokenizer, create_translation_dataset
            print("Successfully imported data processing modules after path adjustment")
        except ImportError as retry_error:
            print(f"Still couldn't import: {retry_error}")
    else:
        # Say so explicitly: the loader functions stay undefined and later
        # cells that call them will fail.
        print("Could not find load_tokenizers.py under the current directory")

Successfully imported data processing modules


In [None]:
def _collate_translation_batch(batch, pad_idx=0):
    """Merge a list of dataset items into one padded batch dict.

    The "source" and "target" tensors of the items are right-padded with
    `pad_idx` to the longest sequence in the batch (batch dimension first);
    the raw "source_text" / "target_text" values are gathered into lists.
    Being a module-level function (rather than a lambda) it can be pickled
    for DataLoader worker processes.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence
    return {
        "source": pad_sequence([item["source"] for item in batch], batch_first=True, padding_value=pad_idx),
        "target": pad_sequence([item["target"] for item in batch], batch_first=True, padding_value=pad_idx),
        "source_text": [item["source_text"] for item in batch],
        "target_text": [item["target_text"] for item in batch],
    }


def _evaluate_epoch(model, dataloader, criterion, device, pad_idx=0):
    """Return the mean per-batch loss over `dataloader` without updating weights.

    Uses the same teacher-forcing shift and masking as the training loop, but
    runs in eval mode under torch.no_grad(). Returns NaN when the dataloader
    yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch["source"].to(device)
            tgt = batch["target"].to(device)

            # Decoder input is the target without its last token; labels are
            # the target without its first token.
            tgt_input = tgt[:, :-1]
            tgt_labels = tgt[:, 1:]

            src_mask = (src != pad_idx).unsqueeze(-2)
            tgt_mask = (tgt_input != pad_idx).unsqueeze(-2) & subsequent_mask(tgt_input.size(1)).to(device)

            output = model(src, tgt_input, src_mask, tgt_mask)
            loss = criterion(
                output.contiguous().view(-1, output.size(-1)),
                tgt_labels.contiguous().view(-1),
            )

            total_loss += loss.item()
            num_batches += 1

    return total_loss / num_batches if num_batches else float("nan")


def main(
    data_dir="./data/processed",
    src_lang="ewe",
    tgt_lang="english",
    tokenizer_type="sentencepiece",
    batch_size=32,
    epochs=10,
    lr=0.0001,
    d_model=512,
    d_ff=2048,
    heads=8,
    layers=6,
    dropout=0.1,
    max_len=128,
    save_dir="./models",
):
    """Train a transformer translation model and write checkpoints to disk.

    Every argument defaults to the value that used to be hard-coded, so
    calling main() with no arguments keeps the previous configuration.

    Args:
        data_dir: Directory holding the tokenizer files and the
            "{src_lang}_{tgt_lang}_train.csv" / "_val.csv" datasets.
        src_lang: Source language name; used in file names and, capitalised,
            as the source column name passed to create_translation_dataset.
        tgt_lang: Target language name; used the same way as src_lang.
        tokenizer_type: "sentencepiece" loads "{lang}_sp.model" files from
            data_dir; any other value loads "{lang}_hf_tokenizer" from
            data_dir/tokenizers via load_huggingface_tokenizer.
        batch_size: Batch size for both dataloaders.
        epochs: Number of training epochs.
        lr: Adam learning rate.
        d_model, d_ff, heads, layers, dropout: Forwarded to make_model.
        max_len: Forwarded to create_translation_dataset.
        save_dir: Directory for checkpoints; created if missing.

    Raises:
        FileNotFoundError: If the training or validation CSV does not exist.
            Checked before any tokenizer or model work is done.
    """
    # Fail fast with an actionable message if the datasets are not where we expect
    train_data_path = os.path.join(data_dir, f"{src_lang}_{tgt_lang}_train.csv")
    val_data_path = os.path.join(data_dir, f"{src_lang}_{tgt_lang}_val.csv")
    missing_paths = [path for path in (train_data_path, val_data_path) if not os.path.exists(path)]
    if missing_paths:
        raise FileNotFoundError(
            f"Dataset file(s) not found: {missing_paths}. "
            f"Current directory is {os.getcwd()}; check data_dir={data_dir!r}."
        )

    # Create save directory
    os.makedirs(save_dir, exist_ok=True)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load tokenizers
    if tokenizer_type == "sentencepiece":
        src_tokenizer_path = os.path.join(data_dir, f"{src_lang}_sp.model")
        tgt_tokenizer_path = os.path.join(data_dir, f"{tgt_lang}_sp.model")

        src_tokenizer = load_sentencepiece_tokenizer(src_tokenizer_path)
        tgt_tokenizer = load_sentencepiece_tokenizer(tgt_tokenizer_path)
    else:
        tokenizers_dir = os.path.join(data_dir, "tokenizers")
        src_tokenizer_path = os.path.join(tokenizers_dir, f"{src_lang}_hf_tokenizer")
        tgt_tokenizer_path = os.path.join(tokenizers_dir, f"{tgt_lang}_hf_tokenizer")

        src_tokenizer = load_huggingface_tokenizer(src_tokenizer_path)
        tgt_tokenizer = load_huggingface_tokenizer(tgt_tokenizer_path)

    # Create datasets; both splits share every setting except the file path
    def build_dataset(data_path):
        return create_translation_dataset(
            src_tokenizer=src_tokenizer,
            tgt_tokenizer=tgt_tokenizer,
            data_path=data_path,
            src_lang_col=src_lang.capitalize(),
            tgt_lang_col=tgt_lang.capitalize(),
            max_len=max_len,
        )

    train_dataset = build_dataset(train_data_path)
    val_dataset = build_dataset(val_data_path)

    # Create dataloaders (only the training split is shuffled)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=_collate_translation_batch,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=_collate_translation_batch,
    )

    # Get vocabulary sizes
    if tokenizer_type == "sentencepiece":
        src_vocab_size = src_tokenizer.get_piece_size()
        tgt_vocab_size = tgt_tokenizer.get_piece_size()
    else:
        src_vocab_size = len(src_tokenizer)
        tgt_vocab_size = len(tgt_tokenizer)

    # Create model
    model = make_model(
        src_vocab_size=src_vocab_size,
        tgt_vocab_size=tgt_vocab_size,
        N=layers,
        d_model=d_model,
        d_ff=d_ff,
        h=heads,
        dropout=dropout,
    )
    model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index

    # Hyper-parameters stored in every checkpoint so the model can be rebuilt
    model_args = {
        'src_lang': src_lang,
        'tgt_lang': tgt_lang,
        'd_model': d_model,
        'd_ff': d_ff,
        'heads': heads,
        'layers': layers,
        'dropout': dropout,
    }

    # Train the model, measuring validation loss after every epoch
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
        val_loss = _evaluate_epoch(model, val_dataloader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Save checkpoint
        checkpoint_path = os.path.join(save_dir, f"transformer_{src_lang}_{tgt_lang}_epoch{epoch+1}.pt")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss,
            'val_loss': val_loss,
            'src_vocab_size': src_vocab_size,
            'tgt_vocab_size': tgt_vocab_size,
            'args': dict(model_args),
        }, checkpoint_path)
        print(f"Saved checkpoint to {checkpoint_path}")

    # Save final model
    final_model_path = os.path.join(save_dir, f"transformer_{src_lang}_{tgt_lang}_final.pt")
    torch.save({
        'model_state_dict': model.state_dict(),
        'src_vocab_size': src_vocab_size,
        'tgt_vocab_size': tgt_vocab_size,
        'args': dict(model_args),
    }, final_model_path)
    print(f"Saved final model to {final_model_path}")

In [None]:
# Run the whole pipeline defined in main(): load the tokenizers and datasets,
# build the model, train it, and save the checkpoints.
main()

Using device: cpu
Loaded SentencePiece tokenizer with vocabulary size 8000
Loaded SentencePiece tokenizer with vocabulary size 8000


FileNotFoundError: [Errno 2] No such file or directory: './data/processed/ewe_english_train.csv'