In [1]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from evaluate import load as load_metric
from scipy.stats import differential_entropy

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_from_disk

# Path where the dataset was saved
dataset_path = 'final_dataset'  # Replace with the actual path if different

# Load the dataset
dataset = load_from_disk(dataset_path)

# Example: Print details of the loaded dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 867972
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 185994
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 185995
    })
})


In [4]:
def lowercase_tokens(example):
    # Convert all tokens to lowercase and strip punctuation
    example['tokens'] = [token.strip(',[]').lower() for token in example['tokens']]
    return example

# Apply to all subsets (train, validation, test)
dataset = dataset.map(lowercase_tokens, batched=False)

# Now the dataset['train'], dataset['validation'], and dataset['test'] will have cleaned lowercase tokens


In [6]:
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, Trainer, TrainingArguments

task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "xlm-roberta-base"
# arm_model = 'ai-forever/mGPT-armenian'
batch_size = 16

In [7]:
# dataset["train"].features[f"ner_tags"]

In [8]:
# label_list = dataset["train"].features[f"{task}_tags"].feature.names
label_list = ['l', 'u', 'uu']

In [9]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_checkpoint)



In [10]:
label_all_tokens = True

In [11]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    
    for i, label in enumerate(examples[f"{task}_tags"]):
        # print(label)
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [12]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/185994 [00:00<?, ? examples/s]

In [13]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

In [14]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=3)  # Update `label_names`

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [16]:
total_params = sum(p.numel() for p in model.parameters())
print(f'Total number of parameters: {total_params}')

Total number of parameters: 277455363


In [17]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    eval_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [19]:
metric = load_metric("seqeval")

In [20]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0724,0.064859,0.912854,0.918492,0.915665,0.978428
2,0.0593,0.060763,0.929158,0.925038,0.927094,0.980237
3,0.0483,0.059455,0.921628,0.932429,0.926997,0.981205
4,0.041,0.06265,0.918281,0.936071,0.927091,0.981732
5,0.0345,0.066567,0.929264,0.93623,0.932734,0.981855


No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=271245, training_loss=0.05374796981866372, metrics={'train_runtime': 51258.5404, 'train_samples_per_second': 84.666, 'train_steps_per_second': 5.292, 'total_flos': 1.8127161969483776e+17, 'train_loss': 0.05374796981866372, 'epoch': 5.0})

In [27]:
trainer.evaluate()

{'eval_loss': 0.0003890861407853663,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_accuracy': 0.9999527005720978,
 'eval_runtime': 30.3931,
 'eval_samples_per_second': 641.298,
 'eval_steps_per_second': 40.108,
 'epoch': 3.0}

In [23]:
output_dir = "./milion_ner_model_3_epoch"  # Directory to save the model
trainer.save_model(output_dir)  # Save model checkpoint
tokenizer.save_pretrained(output_dir)  # Save the tokenizer

('./milion_ner_model_3_epoch/tokenizer_config.json',
 './milion_ner_model_3_epoch/special_tokens_map.json',
 './milion_ner_model_3_epoch/sentencepiece.bpe.model',
 './milion_ner_model_3_epoch/added_tokens.json',
 './milion_ner_model_3_epoch/tokenizer.json')

In [3]:
output_dir = "/home/vahan/Documents/NER/milion_ner_model_3_epoch"

In [4]:
phrases_with_i = [
    "ի վեր", "ի վերջո", "ի նպաստ", "ի հեճուկս", "ի դեպ"
                                                "ի նշան", "ի պատիվ", "ի դեմ", "ի պաշտպանություն",
    "ի պահպանություն", "ի միջի", "ի հիշատակ", "ի ցույց", "ի գործ"
]


In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [6]:
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForTokenClassification.from_pretrained(output_dir)


In [15]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import re
from text_converter import ArmenianTextToNumberConverter

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForTokenClassification.from_pretrained(output_dir)

# Function to replace 'եւ' with 'և'
def replace_and(sentence):
    return sentence.replace('եւ', 'և')

# Function to ensure the sentence ends with a colon
def ensure_colon(sentence):
    return sentence if sentence.endswith('։') else sentence + '։'

# Function to make the letter after a colon uppercase
def uppercase_after_colon(sentence):
    # Use regular expression to find a colon followed by a space and a letter, and uppercase that letter
    return re.sub(r'(։\s*)(\w)', lambda match: match.group(1) + match.group(2).upper(), sentence)

# Function to correct sentence based on token classification model predictions
def correct_sentence(input_sentence, tokenizer, model):
    # Tokenize input sentence
    tokenized_input = tokenizer(
        input_sentence.split(),
        truncation=True,
        return_tensors="pt",
        is_split_into_words=True,
        padding=True,
        max_length=128
    ).to(model.device)

    # Get model predictions
    with torch.no_grad():
        output = model(**tokenized_input)

    # Extract predicted token IDs and convert back to tokens
    predicted_ids = output.logits.argmax(dim=2)[0]
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'][0])

    # Define label names and special tokens
    label_names = ['O', '1', '2']
    special_tokens = set(tokenizer.all_special_tokens)

    corrected_sentence = []
    current_word = ""

    # Process each token and its predicted label
    for token, predicted_id in zip(tokens, predicted_ids):
        label = label_names[predicted_id]

        if token in special_tokens:
            continue

        # Handle word continuation or start
        if token.startswith("▁"):
            if current_word:
                corrected_sentence.append(current_word)
            current_word = token[1:]
        else:
            current_word += token

        # Apply corrections based on label
        if label == '2':
            current_word = current_word.upper()
        elif label != 'O':
            current_word = current_word.capitalize()

    if current_word:
        corrected_sentence.append(current_word)

    # Join corrected words to form the final sentence
    final_sentence = " ".join(corrected_sentence)

    # Uppercase the letter after a colon
    final_sentence = uppercase_after_colon(final_sentence)

    return final_sentence


# Define allowed punctuations
allowed_punctuations = {',', '։', '՝', '՞', '-', '.'}


# Function to add spaces around punctuations
def add_space_between_punctuation(sentence, punctuations):
    # Regular expression for allowed punctuations
    pattern = f"([{''.join(re.escape(p) for p in punctuations)}])"

    # Add space around hyphen
    corrected_sentence = re.sub(r"(\w)-(\w)", r"\1 - \2", sentence)

    # Add space before and after other punctuations
    corrected_sentence = re.sub(rf"(\S)({pattern})", r"\1 \2", corrected_sentence)
    corrected_sentence = re.sub(rf"({pattern})(\S)", r"\1 \2", corrected_sentence)

    return corrected_sentence


# Function to merge spaces around punctuations back to original form
def merge_same_punctuation(sentence, punctuations):
    # Regular expression for allowed punctuations
    pattern = f"([{''.join(re.escape(p) for p in punctuations)}])"

    # Merge spaces around hyphen
    merged_sentence = re.sub(r"\s-\s", "-", sentence)

    # Merge spaces around other punctuations
    merged_sentence = re.sub(rf"\s({pattern})", r"\1", merged_sentence)

    return merged_sentence


def clean_armenian_strings(strings):
    # Armenian Unicode range: '\u0531-\u0587' covers Armenian capital and small letters
    armenian_pattern = re.compile(r'[^\u0531-\u0587]')

    cleaned_strings = []
    for s in strings:
        cleaned_string = armenian_pattern.sub('', s)  # Remove all non-Armenian characters
        cleaned_strings.append(cleaned_string)

    return cleaned_strings


def add_hyphen(sentence):
    # Avoid changing the phrases in the valid list
    for phrase in phrases_with_i:
        if phrase in sentence:
            sentence = sentence.replace(phrase, phrase.replace(" ", "_"))  # Temporarily replace valid phrases

    # Add hyphen before standalone "ի" when it's not part of a valid phrase
    updated_sentence = re.sub(r'\b(\w+)\s(ի)\b', r'\1-\2', sentence)

    # Restore the valid phrases back to their original form
    for phrase in phrases_with_i:
        sentence_with_valid_phrases = phrase.replace(" ", "_")
        updated_sentence = updated_sentence.replace(sentence_with_valid_phrases, phrase)

    return updated_sentence

# Example usage
input_sentence = "բարև ձեզ խնդրում եմ փոխանցել քսանվեց հազար դրամը վահան եղոյանին։"

# Replace 'եւ' with 'և' in the input sentence
input_sentence = replace_and(input_sentence)

# Ensure the input sentence ends with a colon
input_sentence = ensure_colon(input_sentence)

corrected_sentence = add_space_between_punctuation(add_hyphen(input_sentence), allowed_punctuations)
after_model = correct_sentence(corrected_sentence, tokenizer, model)
merged_sentence = merge_same_punctuation(replace_and(after_model), allowed_punctuations)


converter = ArmenianTextToNumberConverter()



# Output results
print(f"Original Sentence: {input_sentence}")
print(f"Corrected Sentence (Spaces Added): {corrected_sentence}")
print(f"Model-Corrected Sentence: {after_model}")
print(f"Final Merged Sentence: {merged_sentence}")


Original Sentence: բարև ձեզ խնդրում եմ փոխանցել քսանվեց հազար դրամը վահան եղոյանին։
Corrected Sentence (Spaces Added): բարև ձեզ խնդրում եմ փոխանցել քսանվեց հազար դրամը վահան եղոյանին ։
Model-Corrected Sentence: Բարեւ Ձեզ խնդրում եմ փոխանցել քսանվեց հազար դրամը Վահան Եղոյանին ։
Final Merged Sentence: Բարև Ձեզ խնդրում եմ փոխանցել քսանվեց հազար դրամը Վահան Եղոյանին։
