In [1]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()

if "GPU" not in device_name:
    print("GPU device not found")
else:
    print(f'Found GPU at: {device_name}')

# Alternative way to check
print("GPU", "available (YESS!!!!)" if tf.config.list_physical_devices("GPU") else "not available :(")


Found GPU at: /device:GPU:0
GPU available (YESS!!!!)


In [2]:
!pip install tensorflow-addons

  pid, fd = os.forkpty()


Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
  Attempting uninstall: typeguard
    Found existing installation: typeguard 4.3.0
    Uninstalling typeguard-4.3.0:
      Successfully uninstalled typeguard-4.3.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.10.0 r

In [3]:
!pip install transformers




In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/tamil-bert")
model = AutoModelForMaskedLM.from_pretrained("l3cube-pune/tamil-bert")

# Sample Tamil sentence with a [MASK] token to predict
sentence = "தமிழ்நாட்டின் தலைநகரம் [MASK] ஆகும்."

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt")

# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

# Get the predicted token for [MASK]
masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
predicted_token_id = torch.argmax(predictions[0, masked_index, :], dim=-1)

# Decode the predicted token
predicted_token = tokenizer.decode(predicted_token_id)
print(f"Predicted word for [MASK]: {predicted_token}")


tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Predicted word for [MASK]: சென்னை


# **PART 2**

In [5]:
!!pip install datasets



In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
from sklearn.metrics import classification_report
import torch

# Load dataset
ds = load_dataset("unimelb-nlp/wikiann", "ta")

# Initialize tokenizer and model for Tamil BERT
model_name = "l3cube-pune/tamil-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)


README.md:   0%|          | 0.00/158k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/92.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/15000 [00:00<?, ? examples/s]



In [11]:
# Preprocessing the dataset to align NER tags with tokens
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)  # Ignore these tokens in the loss
        elif word_id != previous_word_id:
            new_labels.append(labels[word_id])  # Only label first token of the word
        else:
            # Continue with I- tags for inner tokens, otherwise ignore
            new_labels.append(labels[word_id] if label_list[labels[word_id]].startswith("I-") else -100)
        previous_word_id = word_id
    return new_labels

def preprocess_data(batch):
    tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)
    all_word_ids = [tokenized_inputs.word_ids(i) for i in range(len(batch["tokens"]))]

    # Align labels with word ids for each example in the batch
    all_aligned_labels = [
        align_labels_with_tokens(labels, word_ids) for labels, word_ids in zip(batch["ner_tags"], all_word_ids)
    ]

    # Add aligned labels to tokenized inputs
    tokenized_inputs["labels"] = all_aligned_labels
    return tokenized_inputs


In [12]:
# Apply preprocessing to dataset
tokenized_ds = ds.map(preprocess_data, batched=True)

# Set format for PyTorch
tokenized_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [14]:
!pip install seqeval

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=08c7777034b1bf1fdd852bc806af2b40cfbf66c2b7f34a346a8c452dafc579b8
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [16]:
from seqeval.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
import numpy as np

# Define the compute_metrics function
def compute_metrics(pred):
    # Get the predicted token classes and the true labels
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)  # Convert logits to class predictions
    
    # Remove ignored index (usually -100) from predictions and labels
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [[label_list[p] for p, l in zip(prediction, label) if l != -100]
                        for prediction, label in zip(predictions, labels)]
    
    # Calculate scores using seqeval
    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)
    accuracy = accuracy_score(true_labels, true_predictions)
    
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [18]:
import logging
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from torch.optim import AdamW
import pandas as pd

logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)  # Suppress weight initialization warning

# Load Tamil BERT model for token classification
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list), hidden_dropout_prob=0.3)
data_collator = DataCollatorForTokenClassification(tokenizer)

# Layer-wise learning rate decay function without pooler layer
def get_optimizer(model):
    lr = 1e-5
    layerwise_decay = 0.9

    optimizer_grouped_parameters = []
    for i, layer in enumerate(model.bert.encoder.layer):
        layer_lr = lr * (layerwise_decay ** (len(model.bert.encoder.layer) - i - 1))
        optimizer_grouped_parameters += [{"params": layer.parameters(), "lr": layer_lr}]
    optimizer_grouped_parameters += [{"params": model.classifier.parameters(), "lr": lr}]

    return AdamW(optimizer_grouped_parameters, lr=lr, weight_decay=0.01)

# Define the hyperparameter configurations to test
hyperparameter_configs = [
    {"model_name": "l3cube-pune/tamil-bert", "learning_rate": 1e-5, "batch_size": 16, "dropout_rate": 0.3},
    {"model_name": "l3cube-pune/tamil-bert", "learning_rate": 5e-5, "batch_size": 16, "dropout_rate": 0.4},
    {"model_name": "l3cube-pune/tamil-bert", "learning_rate": 1e-5, "batch_size": 32, "dropout_rate": 0.3},
    # Add more configurations as needed
]

# Initialize a DataFrame to store results
results_df = pd.DataFrame(columns=["Model", "Learning Rate", "Batch Size", "Dropout Rate", "Eval Loss", "Eval F1"])

for config in hyperparameter_configs:
    # Initialize model with dropout
    model = AutoModelForTokenClassification.from_pretrained(config["model_name"], num_labels=len(label_list), hidden_dropout_prob=config["dropout_rate"])
    
    # Define optimizer with layer-wise learning rate decay if needed
    optimizer = get_optimizer(model)
    
    # Update training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=config["learning_rate"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_dir="./logs",
        report_to="none"
    )
    
    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        optimizers=(optimizer, None),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        compute_metrics=compute_metrics
    )
    
    # Train and evaluate the model
    trainer.train()
    eval_metrics = trainer.evaluate()
    
    # Append results to DataFrame
    new_row = pd.DataFrame([{
        "Model": config["model_name"],
        "Learning Rate": config["learning_rate"],
        "Batch Size": config["batch_size"],
        "Dropout Rate": config["dropout_rate"],
        "Eval Loss": eval_metrics["eval_loss"],
        "Eval F1": eval_metrics["eval_f1"]
    }])

    results_df = pd.concat([results_df, new_row], ignore_index=True)

# Display results sorted by F1-score
results_df = results_df.sort_values(by="Eval F1", ascending=False)
print(results_df)



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.8037,1.521608,0.756854,0.521083,0.568248,0.481148
2,1.473,1.28648,0.890837,0.643092,0.57912,0.722951
3,1.2927,1.214009,0.897753,0.669646,0.613652,0.736885


  results_df = pd.concat([results_df, new_row], ignore_index=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.8025,1.618985,0.555075,0.0,0.0,0.0
2,1.589,1.45986,0.726723,0.417285,0.446312,0.391803
3,1.4252,1.396355,0.726599,0.445238,0.431538,0.459836


  _warn_prf(average, modifier, msg_start, len(result))




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.698812,0.555075,0.0,0.0,0.0
2,1.793500,1.545036,0.761299,0.50567,0.518519,0.493443
3,1.626600,1.499604,0.769326,0.530579,0.535,0.52623


  _warn_prf(average, modifier, msg_start, len(result))


                    Model  Learning Rate Batch Size  Dropout Rate  Eval Loss  \
0  l3cube-pune/tamil-bert        0.00001         16           0.3   1.214009   
2  l3cube-pune/tamil-bert        0.00001         32           0.3   1.499604   
1  l3cube-pune/tamil-bert        0.00005         16           0.4   1.396355   

    Eval F1  
0  0.669646  
2  0.530579  
1  0.445238  


In [23]:
def test_model(sentences):
    # Set the model to evaluation mode
    model.eval()
    
    for sentence in sentences:
        # Tokenize the input sentence
        tokenized_input = tokenizer(sentence, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
        
        # Move input tensors to the appropriate device (CPU or GPU)
        input_ids = tokenized_input["input_ids"].to(model.device)  # Ensure input is on the correct device
        attention_mask = tokenized_input["attention_mask"].to(model.device)

        # Make predictions using the model
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        
        # Get predictions and convert logits to labels
        predictions = np.argmax(outputs.logits.cpu().numpy(), axis=2)  # Move logits to CPU if necessary

        # Align tokens with predictions
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())  # Ensure tokens are on CPU
        predicted_labels = [label_list[p] for p in predictions[0]]

        # Print the results
        print(f"\nTesting sentence: {sentence}")
        for token, label in zip(tokens, predicted_labels):
            if token not in tokenizer.special_tokens_map.values() and token.strip():  # Skip special tokens and empty tokens
                print(f"Word: '{token}', Predicted Label: {label}")

# Example sentences for testing
test_sentences = [
    "என் பெயர் வின்ஸ்டன் சர்ச்சில் மற்றும் நான் கோயம்புத்தூரில் வருகிறேன்.",
    "இது ஒரு புதிய தொழில்நுட்ப மேம்பாடு.",
    "தமிழ் மொழி தமிழ்நாட்டின் நிலையான மொழியாகும்.",
    "இத்தாவரம் சீனா, இந்தியா, ஜப்பான், பப்பாசியா, மைக்குரோனீசியா போன்ற நாடுகளில் காணப்படுகிறது.",
    "சைஃப் அலி கான், தீபிகா படுகோண், ஜான் ஆபிரகாம் (நடிகர்), ஜாக்குலின் பெர்னாண்டஸ், அனில் கபூர், அமீஷா பட்டேல், மற்றும் ரஜினிகாந்த் ஆகியோர்."
]

# Test the model
test_model(test_sentences)



Testing sentence: என் பெயர் வின்ஸ்டன் சர்ச்சில் மற்றும் நான் கோயம்புத்தூரில் வருகிறேன்.
Word: 'என்', Predicted Label: O
Word: 'பெயர்', Predicted Label: O
Word: 'வின்', Predicted Label: I-PER
Word: '##ஸ்டன்', Predicted Label: I-PER
Word: 'சர்ச்', Predicted Label: I-PER
Word: '##சில்', Predicted Label: I-PER
Word: 'மற்றும்', Predicted Label: O
Word: 'நான்', Predicted Label: O
Word: 'கோ', Predicted Label: B-LOC
Word: '##யம்', Predicted Label: B-LOC
Word: '##புத்', Predicted Label: I-ORG
Word: '##தூரில்', Predicted Label: I-ORG
Word: 'வருகிறேன்', Predicted Label: O
Word: '.', Predicted Label: O

Testing sentence: இது ஒரு புதிய தொழில்நுட்ப மேம்பாடு.
Word: 'இது', Predicted Label: O
Word: 'ஒரு', Predicted Label: O
Word: 'புதிய', Predicted Label: O
Word: 'தொழில்நுட்ப', Predicted Label: O
Word: 'மேம்பாடு', Predicted Label: O
Word: '.', Predicted Label: O

Testing sentence: தமிழ் மொழி தமிழ்நாட்டின் நிலையான மொழியாகும்.
Word: 'தமிழ்', Predicted Label: I-ORG
Word: 'மொழி', Predicted Label: I-ORG
Wo