In [1]:
!pip install datasets



In [2]:
#conda create -n translation_env
#conda activate translation_env
#conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
#pip install transformers datasets sentencepiece scipy

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

In [3]:
# Load the dataset
dataset = load_dataset("Tamazight-NLP/DGLAI", split="train")


In [4]:
dataset

Dataset({
    features: ['zgh', 'fr', 'ar'],
    num_rows: 1834
})

In [5]:
amazigh_to_arabic = {
    'ⴰ': 'ا',  # a
    'ⴱ': 'ب',  # b
    'ⴳ': 'ج',  # g
    'ⴷ': 'د',  # d
    'ⴹ': 'ض', # ḍ
    'ⴻ': 'ه',  # e
    'ⴼ': 'ف',  # f
    'ⴽ': 'ك',  # k
    'ⵀ': 'ه',  # h
    'ⵃ': 'ح',  # ḥ
    'ⵄ': 'ع',  # ɛ
    'ⵅ': 'خ',  # x
    'ⵇ': 'ق',  # q
    'ⵉ': 'ي',  # i
    'ⵊ': 'ج',  # j
    'ⵍ': 'ل',  # l
    'ⵎ': 'م',  # m
    'ⵏ': 'ن',  # n
    'ⵓ': 'و',  # u
    'ⵔ': 'ر',  # r
    'ⵕ': 'ر',  # ṛ
    'ⵖ': 'غ',  # ɣ
    'ⵙ': 'س',  # s
    'ⵚ': 'ص',  # ṣ
    'ⵛ': 'ش',  # c
    'ⵜ': 'ت',  # t
    'ⵟ': 'ط',  # ṭ
    'ⵡ': 'و',  # w
    'ⵢ': 'ي',  # y
    'ⵣ': 'ز',  # z
    'ⵥ': 'ز',  # ẓ
    'ⵯ': 'و',  # ʷ
}
def preprocess_text(text):
   return ''.join(amazigh_to_arabic.get(c, c) for c in text)
def preprocess_dataset(ds):
    ds['zgh_mapped'] = preprocess_text(ds['zgh'])
    return ds
dataset = dataset.map(preprocess_dataset)

In [6]:
dataset["zgh_mapped"]

['ا يمما !',
 'يغرا ادليس ا',
 'صمميضن ووسسان اد',
 'تاجرست ا',
 'مشش باكتيري',
 'افدوددر ن وبانكرياس',
 'ابدا ن يبيدان',
 'ابداد (-خف )',
 'ابداد غر ونبضو',
 'ابددي (-خف)',
 'ابككاس ن تنفروت',
 'مشش بخخوي',
 'ابلازما ن ومضورو',
 'ابننج ادغران',
 'ابننج اماتاي',
 'ابوكض ن ييض',
 'ققن يبوركسن',
 'يبوركسن ن توننونت',
 'ابريد اجدودان',
 'جر يبردان',
 'ابغلي يدوسن',
 'ابششهش يولسن',
 'ابزاض يولسن',
 'تاسوتلت ن واجاد',
 'اجاداز س توجت',
 'اجاز اجامان',
 'اجاز اكاربون',
 'اجاز يزيني',
 'اجاز ن ووزون',
 'تامسغالت ن وجاز',
 'ينهم واججاج',
 'يططوققز واججاج',
 'اججوا ن وانزا',
 'اججود يميق',
 'اججود يميل',
 'اججود يوكك',
 'يفيلن ن وجددول',
 'اجينان ن وخديل',
 'اجينان ن وول',
 'اججا ن يزدار',
 'اججا ن وفللا',
 'تيكني ن واججا',
 'وفريغ ن واججا',
 'اججدي اجدودان',
 'اججدي ن تسولفت',
 'اجلا اجدودان',
 'اجلا امجرو',
 'اجلا اوانكان',
 'باب ن تاتتويت تاجلدانت',
 'اجلمام انفتاس',
 'اجلمام اتاكتون',
 'اجلمام وجريس',
 'اجلوجل ادمسان',
 'ارجال ن يجميرن',
 'يجميرن يجامانن',
 'اجمماض اكودان',
 'اجممو بو وغيا

In [7]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

MAX_LENGTH = 128
def tokenize_data(batch):
    # Extract English and French sentences from the batch
    amazigh_sentences = batch['zgh_mapped']
    french_sentences = batch['fr']

    # Tokenize English and French sentences
    english_tokens = tokenizer(amazigh_sentences, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    french_tokens = tokenizer(french_sentences, truncation=True, padding="max_length", max_length=MAX_LENGTH)

    # Return the tokenized outputs
    return {
        'input_ids_zgh': english_tokens['input_ids'],
        'attention_mask_zgh': english_tokens['attention_mask'],
        'input_ids_fr': french_tokens['input_ids'],
        'attention_mask_fr': french_tokens['attention_mask']
    }

# Apply tokenization
dataset = dataset.map(tokenize_data, batched=True)

In [8]:

# Convert dataset to PyTorch tensors
ids_zgh = torch.tensor([item['input_ids_zgh'] for item in dataset], dtype=torch.long)
attention_mask_zgh = torch.tensor([item['attention_mask_zgh'] for item in dataset], dtype=torch.long)
ids_fr = torch.tensor([item['input_ids_fr'] for item in dataset], dtype=torch.long)
attention_mask_fr = torch.tensor([item['attention_mask_fr'] for item in dataset], dtype=torch.long)

# Split the dataset into training and validation tensors
train_size = int(0.8 * len(ids_zgh))
val_size = len(ids_zgh) - train_size

train_en, val_en = torch.split(ids_zgh, [train_size, val_size])
train_attention_mask_en, val_attention_mask_en = torch.split(attention_mask_zgh, [train_size, val_size])
train_fr, val_fr = torch.split(ids_fr, [train_size, val_size])
train_attention_mask_fr, val_attention_mask_fr = torch.split(attention_mask_fr, [train_size, val_size])

# Create TensorDatasets
train_tensor_dataset = TensorDataset(train_en,train_attention_mask_en, train_fr, train_attention_mask_fr)

val_tensor_dataset = TensorDataset(val_en, val_attention_mask_en, val_fr, val_attention_mask_fr)


In [9]:
# Create DataLoaders
train_dataloader = DataLoader(train_tensor_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_tensor_dataset, batch_size=64, shuffle=False)

In [10]:
import torch.optim as optim
import torch.nn as nn
from torch import nn
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerDecoder
from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, nhead, num_layers, dim_feedforward, max_seq_length, dropout):
        super().__init__()

        self.embedding_src = nn.Embedding(src_vocab_size, d_model)
        self.embedding_tgt = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = nn.Embedding(max_seq_length, d_model)

        encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nhead,
                                              dim_feedforward=dim_feedforward,
                                              dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers)

        decoder_layer = TransformerDecoderLayer(d_model=d_model, nhead=nhead,
                                              dim_feedforward=dim_feedforward,
                                              dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers)

        self.output_layer = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        src_positions = torch.arange(0, src.shape[1]).expand(src.shape[0], -1).to(src.device)
        tgt_positions = torch.arange(0, tgt.shape[1]).expand(tgt.shape[0], -1).to(tgt.device)

        src_emb = self.embedding_src(src) + self.pos_encoder(src_positions)
        tgt_emb = self.embedding_tgt(tgt) + self.pos_encoder(tgt_positions)

        encoder_output = self.transformer_encoder(src_emb.transpose(0, 1))
        decoder_output = self.transformer_decoder(tgt_emb.transpose(0, 1), encoder_output)

        output = self.output_layer(decoder_output.transpose(0, 1))
        return output

if torch.cuda.is_available():
    print("Device Name:", torch.cuda.get_device_name(0))
    print("CUDA Version:", torch.version.cuda)
else:
    print("GPU not available or CUDA not installed correctly.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Hyperparameters
src_vocab_size = tokenizer.vocab_size
tgt_vocab_size = tokenizer.vocab_size
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = MAX_LENGTH  # Max length used for tokenization
dropout = 0.2
learning_rate = 0.0001

# Initialize the transformer model
transformer = Transformer(src_vocab_size,
                tgt_vocab_size, d_model,
                num_heads, num_layers,
                d_ff, max_seq_length,dropout)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming padding index is 0
optimizer = optim.Adam(transformer.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

torch.cuda.empty_cache()
# Move model to device
transformer.to(device)

# Training loop
num_epochs = 1  # Adjust the number of epochs

transformer.train()  # Set the model to training mode

for epoch in range(num_epochs):

    epoch_loss = 0
    progress_bar = tqdm(iterable=None, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    for batch in train_dataloader:

        src_batch = batch[0].to(device) # source english
        src_attention_mask = batch[1].to(device) # source mask english
        tgt_batch = batch[2].to(device) # target french
        tgt_attention_mask = batch[3].to(device) # target mask french

        optimizer.zero_grad()

        # Forward pass through the transformer model
        output = transformer(src_batch, tgt_batch[:, :-1])

        loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_batch[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
        progress_bar.refresh()
    print(f"Epoch {epoch + 1}, TrainLoss: {epoch_loss / len(train_dataloader)}")

    # Evaluation of the model on the validation set after each epoch
    transformer.eval()  # Set the model to evaluation mode
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            src_batch = batch[0].to(device) # source english
            src_attention_mask = batch[1].to(device) # source mask english
            tgt_batch = batch[2].to(device) # target french
            tgt_attention_mask = batch[3].to(device) # target mask french

            optimizer.zero_grad()

             # Forward pass through the transformer model
            output = transformer(src_batch, tgt_batch[:, :-1])

            loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_batch[:, 1:].contiguous().view(-1))

            val_loss += loss.item()
            #progress_bar.set_postfix(loss=loss.item())
            #progress_bar.refresh()
        print(f"Epoch {epoch + 1}, ValLoss: {val_loss / len(val_dataloader)}")

# Save the model's state_dict
torch.save(transformer.state_dict(), "translate_model.pth")
print("Model parameters saved successfully.")

GPU not available or CUDA not installed correctly.


Epoch 1/1: 0it [00:00, ?it/s]

: 

In [None]:
def translate_sentence(model, sentence, tokenizer, device, max_len=MAX_LENGTH):
    model.eval()

    # Tokenize input sentence
    tokens = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=max_len)
    src_input = tokens["input_ids"].to(device)
    print(f"Tokenized Input (IDs): {tokens['input_ids']}")
    print(f"Tokenized Input (Text): {tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])}")

    tgt_input = torch.tensor([[tokenizer.cls_token_id]], device=device)  # Starting with CLS token
    generated_tokens = []

    with torch.no_grad():
        for i in range(max_len):
            output = model(src_input, tgt_input)
            logits = output[:, -1, :]
            print(f"Logits for the next token at Step {i}: {logits}")


            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            print(f"Probabilities for next token at Step {i}: {probabilities}")

            next_token = logits.argmax(dim=-1).item()
            generated_tokens.append(next_token)
            print(f"Step {i}, Generated token:", next_token)

            if next_token == tokenizer.sep_token_id or len(generated_tokens) >= max_len:
                break
            if len(generated_tokens) >= max_len:
                break

            print(f"Tokenized Input (Text): {tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])}")

            tgt_input = torch.cat([tgt_input, torch.tensor([[next_token]], device=device)], dim=1)
            tgt_input = torch.tensor([[tokenizer.pad_token_id]], device=device)
            print(f"Tokenized Input (Text): {tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])}")

    # Return the translated sentence by decoding the generated tokens
    translation = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return translation

In [None]:
torch.save(transformer.state_dict(), "translate_model.pth")
print("Model parameters saved successfully.")

NameError: name 'transformer' is not defined

In [None]:
def code_amazigh_text(text):
   amazigh_to_arabic = {
       'ⴰ': 'ا', 'ⴱ': 'ب', 'ⴳ': 'ج', 'ⴷ': 'د', 'ⴹ': 'ض', 'ⴻ': 'ه', 
       'ⴼ': 'ف', 'ⴽ': 'ك', 'ⵀ': 'ه', 'ⵃ': 'ح', 'ⵄ': 'ع', 'ⵅ': 'خ',
       'ⵇ': 'ق', 'ⵉ': 'ي', 'ⵊ': 'ج', 'ⵍ': 'ل', 'ⵎ': 'م', 'ⵏ': 'ن',
       'ⵓ': 'و', 'ⵔ': 'ر', 'ⵕ': 'ر', 'ⵖ': 'غ', 'ⵙ': 'س', 'ⵚ': 'ص',
       'ⵛ': 'ش', 'ⵜ': 'ت', 'ⵟ': 'ط', 'ⵡ': 'و', 'ⵢ': 'ي', 'ⵣ': 'ز',
       'ⵥ': 'ز', 'ⵯ': 'و'
   }
   return ''.join(amazigh_to_arabic.get(c, c) for c in text)

def decode_amazigh_text(text):
   arabic_to_amazigh = {
       'ا': 'ⴰ', 'ب': 'ⴱ', 'ج': 'ⴳ', 'د': 'ⴷ', 'ض': 'ⴹ', 'ه': 'ⴻ',
       'ف': 'ⴼ', 'ك': 'ⴽ', 'ح': 'ⵃ', 'ع': 'ⵄ', 'خ': 'ⵅ', 'ق': 'ⵇ',
       'ي': 'ⵉ', 'ل': 'ⵍ', 'م': 'ⵎ', 'ن': 'ⵏ', 'و': 'ⵓ', 'ر': 'ⵔ',
       'غ': 'ⵖ', 'س': 'ⵙ', 'ص': 'ⵚ', 'ش': 'ⵛ', 'ت': 'ⵜ', 'ط': 'ⵟ',
       'ز': 'ⵣ'
   }
   return ''.join(arabic_to_amazigh.get(c, c) for c in text)


In [None]:
# Load the saved model
loaded_model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
loaded_model.load_state_dict(torch.load("translate_model.pth"))
loaded_model.to(device)

# Example usage

source_sentence = code_amazigh_text("ⴰⵜⵉⵍⵉⴼⵉⵣⵢⵓⵏ ⵏ ⵜⵉⵍⴰⵡⵜ") # "How are you?" in Amazigh
translated = translate_sentence(loaded_model, source_sentence, tokenizer, device)
print(f"Source: {source_sentence}")
print(f"Translation: {translated}")

  loaded_model.load_state_dict(torch.load("translate_model.pth"))


Tokenized Input (IDs): tensor([[  101,   763,   793, 10700, 14495,   106,   102]])
Tokenized Input (Text): ['[CLS]', 'ا', 'ي', '##م', '##ما', '!', '[SEP]']
Logits for the next token at Step 0: tensor([[-4.4659, -5.7341, -5.3274,  ..., -5.2657, -4.0357, -4.0468]],
       device='cuda:0')
Probabilities for next token at Step 0: tensor([[2.7695e-06, 7.7923e-07, 1.1703e-06,  ..., 1.2448e-06, 4.2584e-06,
         4.2114e-06]], device='cuda:0')
Step 0, Generated token: 102
Source: ا يمما !
Translation: 


In [None]:
# Recreate the model architecture
# Hyperparameters
src_vocab_size = tokenizer.vocab_size
tgt_vocab_size = tokenizer.vocab_size
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = MAX_LENGTH  # Max length used for tokenization
dropout = 0.2
learning_rate = 0.0001

loaded_transformer = Transformer(tokenizer.vocab_size, tokenizer.vocab_size, d_model,
num_heads, num_layers, d_ff, max_seq_length,dropout)

# Load the saved parameters
loaded_transformer.load_state_dict(torch.load("transformer_model.pth"))
loaded_transformer.to(device)  # Move to the appropriate device
loaded_transformer.eval()  # Set the model to evaluation mode
print("Model parameters loaded successfully.")

In [None]:
sentence = "We are happy to invite the foreigner minister of Frane"

translation = translate_sentence(
    transformer=transformer,
    sentence=sentence,
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    device=device,
    max_len=32
)
print("Translated Sentence:", translation)

In [1]:
!pip show datasets


Name: datasets
Version: 3.2.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyyaml, requests, tqdm, xxhash
Required-by: evaluate


In [2]:
!pip show datasets

Name: datasets
Version: 3.2.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyyaml, requests, tqdm, xxhash
Required-by: evaluate


In [3]:
!pip install --upgrade datasets



In [4]:
!pip install evaluate



In [5]:
!pip install sacrebleu



In [6]:
import torch
from evaluate import load

# Load BLEU metric
metric = load("sacrebleu")

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the device
model = model.to(device)

# Evaluate on the test set
predictions = []
references = []

for example in tokenized_datasets["test"]:
    # Move input tensors to the same device as the model
    input_ids = tokenizer(
        example["src_texts"], return_tensors="pt", truncation=True, padding=True
    ).input_ids.to(device)

    # Generate predictions
    output_ids = model.generate(input_ids)

    # Decode predictions and store them
    predictions.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

    # Append references (ensure references are lists of lists)
    references.append([example["tgt_texts"]])

# Compute BLEU score
results = metric.compute(predictions=predictions, references=references)
print(f"BLEU Score: {results['score']}")

2025-01-25 02:18:15.750325: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-25 02:18:16.078742: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737771496.205709 2041201 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737771496.232672 2041201 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-25 02:18:16.403370: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

NameError: name 'model' is not defined

In [None]:
import torch
from evaluate import load

# Load BLEU metric
metric = load("sacrebleu")

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the device
model = model.to(device)

# Evaluate on the test set
predictions = []
references = []

for example in tokenized_datasets["test"]:
    # Move input tensors to the same device as the model
    input_ids = tokenizer(
        example["src_texts"], return_tensors="pt", truncation=True, padding=True
    ).input_ids.to(device)

    # Generate predictions
    output_ids = model.generate(input_ids)

    # Decode predictions and store them
    predictions.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

    # Append references (ensure references are lists of lists)
    references.append([example["tgt_texts"]])

# Compute BLEU score
results = metric.compute(predictions=predictions, references=references)
print(f"BLEU Score: {results['score']}")