In [1]:
# import transformers
from datasets import load_dataset
# from transformers import AutoModelForQuestionAnswering, BertModel, BertConfig, BertTokenizer, pipeline, AutoTokenizer
from transformers import AutoModelForQuestionAnswering, BertConfig, BertTokenizer, pipeline, AutoTokenizer
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk
import math



dataset = load_dataset("squad")
train = dataset['train']
validation = dataset['validation']

model_checkpoint = "atharvamundada99/bert-large-question-answering-finetuned-legal"
pretrained_model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Found cached dataset parquet (C:/Users/DELL/.cache/huggingface/datasets/parquet/plain_text-57edf78d6033ac9a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

# Building BERT

In [2]:
def QA(model, tokenizer, question, context):
    # Process the inputs
    inputs = tokenizer(question, context, return_tensors='pt')

    # Pass the inputs through the model and get the start and end scores
    start_scores, end_scores = model(**inputs)

    # Get the start and end positions
    start_position = torch.argmax(start_scores)
    end_position = torch.argmax(end_scores)

    # Get the answer
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_position:end_position+1]))

    return answer

def QAs(model, tokenizer, questions, contexts):
    answers = []
    for question, context in zip(questions, contexts):
        answer = QA(model, tokenizer, question, context)
        answers.append(answer)
    return answers

def Evaluation(model, tokenizer, validation):
    correct = 0
    EM = 0
    total = 0
    errors = []
    for record in tqdm(validation):
        try:
            total += 1
            if (total % 500 == 0):
                print(f"\nAccuracy: {100*correct/total}")
                print(f"Correct: {correct}, out of {total}")
                print(f"EM: {100*EM/total}")
                print(f"EM Correct: {EM}, out of {total}\n")

            predicted_answer = QA(model, tokenizer, record['question'], record['context'])
            if predicted_answer.lower() in record['answers']['text'][0].lower() or record['answers']['text'][0].lower() in predicted_answer.lower():
                correct += 1
            if predicted_answer.lower() == record['answers']['text'][0].lower():
                EM += 1
        except Exception as e:
            errors.append(total)
            print(f"Error at {total}: {e} ")
            continue
    return correct, EM, total

In [3]:
# Moved
# class BertConfig:
#     def __init__(self, vocab_size=30522, hidden_size=1024, num_hidden_layers=24, intermediate_size=4096, num_attention_heads=16, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, **kwargs):
#         self.vocab_size = vocab_size
#         self.hidden_size = hidden_size
#         self.num_hidden_layers = num_hidden_layers
#         self.intermediate_size = intermediate_size
#         self.num_attention_heads = num_attention_heads
#         self.attention_probs_dropout_prob = attention_probs_dropout_prob
#         self.max_position_embeddings = max_position_embeddings
#         self.type_vocab_size = type_vocab_size
#         for key, value in kwargs.items():
#             setattr(self, key, value)

#     @classmethod
#     def from_dict(cls, json_object):
#         return cls(**json_object)

#     def to_dict(self):
#         return self.__dict__

In [4]:
class BertEmbeddings(nn.Module):
    def __init__(self, vocab_size=30522, hidden_size=1024, pad_token_id=0, max_position_embeddings=512, type_vocab_size=2):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

        # Make position_ids a nn.Parameter
        self.position_ids = nn.Parameter(torch.arange(max_position_embeddings).unsqueeze(0), requires_grad=False)

        # LayerNorm and dropout Module
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None):
        if position_ids is None:
            position_ids = self.position_ids[:, :input_ids.size(1)]  # use pre-computed position_ids

        position_embeddings = self.position_embeddings(position_ids)

        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        word_embeddings = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        if position_embeddings.size(1) < word_embeddings.size(1):       # to handle size mismatch by padding
            padding = torch.zeros((position_embeddings.size(0), word_embeddings.size(1) - position_embeddings.size(1), position_embeddings.size(2)), device=position_embeddings.device)
            position_embeddings = torch.cat([position_embeddings, padding], dim=1)

        embeddings = word_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [5]:
class BertSelfAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, dropout_prob):
        super(BertSelfAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer

class BertSelfOutput(nn.Module):
    def __init__(self, hidden_size=1024, dropout_prob=0.1):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, hidden_states, input_tensor):
        # Implement the forward pass
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

In [6]:
class BertAttention(nn.Module):
    def __init__(self, hidden_size=1024, num_attention_heads=16, attention_probs_dropout_prob=0.1):
        super(BertAttention, self).__init__()

        self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
        self.output = BertSelfOutput(hidden_size, attention_probs_dropout_prob)

    def forward(self, input_tensor, attention_mask):
        # Implement the forward pass
        self_output = self.self(input_tensor, attention_mask)
        if isinstance(self_output, tuple):
            self_output = self_output[0]
        attention_output = self.output(self_output, input_tensor)
        return attention_output

In [7]:
class GELUActivation(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return F.gelu(x)

In [8]:
class BertIntermediate(nn.Module):
    def __init__(self, hidden_size=1024, intermediate_size=4096):
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        self.intermediate_act_fn = GELUActivation()

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = F.gelu(hidden_states)
        return hidden_states

In [9]:
class BertOutput(nn.Module):
    def __init__(self, intermediate_size=4096, hidden_size=1024, dropout_prob=0.1):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, hidden_states, input_tensor):
        # Implement the forward pass
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

In [10]:
class BertLayer(nn.Module):
    def __init__(self, hidden_size=1024, intermediate_size=4096, num_attention_heads=16, attention_probs_dropout_prob=0.1):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
        self.intermediate = BertIntermediate(hidden_size, intermediate_size)
        self.output = BertOutput(intermediate_size, hidden_size, attention_probs_dropout_prob)


    def forward(self, hidden_states, attention_mask):
        # Implement the forward pass
        attention_output = self.attention(hidden_states, attention_mask)
        if isinstance(attention_output, tuple):
                attention_output = attention_output[0]
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

class BertEncoder(nn.Module):
    def __init__(self, num_hidden_layers=24, hidden_size=1024, intermediate_size=4096, num_attention_heads=16, attention_probs_dropout_prob=0.1):
        super(BertEncoder, self).__init__()
        self.layer = nn.ModuleList([BertLayer(hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob) for _ in range(num_hidden_layers)])
        

    def forward(self, hidden_states, attention_mask):
        # Implement the forward pass
        for layer in self.layer:
            # check type of hidden_states
            if isinstance(hidden_states, tuple):
                hidden_states = hidden_states[0]
            hidden_states = layer(hidden_states, attention_mask)
        return hidden_states

In [11]:
# No longer needed
# import torch.nn as nn

# class BertPooler(nn.Module):
#     def __init__(self, hidden_size=1024):
#         super(BertPooler, self).__init__()
#         self.dense = nn.Linear(hidden_size, hidden_size)
#         self.activation = nn.Tanh()

#     def forward(self, hidden_states):
#         # We "pool" the model by simply taking the hidden state corresponding to the first token.
#         first_token_tensor = hidden_states[:, 0]
#         pooled_output = self.dense(first_token_tensor)
#         pooled_output = self.activation(pooled_output)
#         return pooled_output

In [12]:
class BertModel(nn.Module):
    def __init__(self, vocab_size=30522, hidden_size=1024, num_hidden_layers=24, intermediate_size=4096, num_attention_heads=16, attention_probs_dropout_prob=0.1, pad_token_id = 0, max_position_embeddings=512, type_vocab_size=2):
        super(BertModel, self).__init__()
        self.embeddings = BertEmbeddings(vocab_size, hidden_size, pad_token_id, max_position_embeddings, type_vocab_size)
        self.encoder = BertEncoder(num_hidden_layers, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob)
        

        # self.pooler = BertPooler(hidden_size)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # Implement the forward pass
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self.embeddings(input_ids, token_type_ids)
        encoder_output = self.encoder(embedding_output, extended_attention_mask)
        # pooled_output = self.pooler(encoder_output)

        return encoder_output
        # return pooled_output  # or return pooled_output lw hnst3ml el pooler

# BERT

In [13]:
class CustomBertForQuestionAnswering(nn.Module):
    def __init__(self, config):
        super(CustomBertForQuestionAnswering, self).__init__()
        self.config = config
        self.bert = BertModel(vocab_size=config.vocab_size, hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, intermediate_size=config.intermediate_size, num_attention_heads=config.num_attention_heads, attention_probs_dropout_prob=config.attention_probs_dropout_prob, pad_token_id=config.pad_token_id ,max_position_embeddings=config.max_position_embeddings, type_vocab_size=config.type_vocab_size)
        
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        sequence_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        if isinstance(sequence_output, tuple):
            sequence_output = sequence_output[0]
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        return start_logits, end_logits

# Instantiate the model with the provided configuration
config = BertConfig.from_dict({
    "_name_or_path": "ourModel",
    "architectures": [
        "BertForQuestionAnswering"
    ],
    "attention_probs_dropout_prob": 0.1,
    "gradient_checkpointing": False,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 1024,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "bert",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "pad_token_id": 0,
    "position_embedding_type": "absolute",
    "transformers_version": "4.17.0",
    "type_vocab_size": 2,
    "use_cache": True,
    "vocab_size": 30522
})

model = CustomBertForQuestionAnswering(config)

In [14]:
# Get state dictionary of pre-trained model
pretrained_dict = pretrained_model.state_dict()

# Get state dictionary of custom model
model_dict = model.state_dict()

print(len(pretrained_dict))
print(len(model_dict))


# Check the keys that are not in the model_dict
for k, v in pretrained_dict.items():
    if k not in model_dict:
        print(k, ":", v.shape)

# Filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    

print(len(pretrained_dict))
print(len(model_dict))

# Overwrite entries in the existing state dict
model_dict.update(pretrained_dict)

# Load the new state dict
model.load_state_dict(model_dict)

392
392
392
392


<All keys matched successfully>

In [15]:
print(model.training)
model.eval()
print(model.training)

True
False


In [16]:
print(pretrained_model.training)
pretrained_model.eval()
print(pretrained_model.training)

False
False


In [17]:
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

The model 'CustomBertForQuestionAnswering' is not supported for question-answering. Supported models are ['YosoForQuestionAnswering', 'NystromformerForQuestionAnswering', 'QDQBertForQuestionAnswering', 'FNetForQuestionAnswering', 'GPTJForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'RemBertForQuestionAnswering', 'CanineForQuestionAnswering', 'RoFormerForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BigBirdForQuestionAnswering', 'ConvBertForQuestionAnswering', 'LEDForQuestionAnswering', 'DistilBertForQuestionAnswering', 'AlbertForQuestionAnswering', 'CamembertForQuestionAnswering', 'BartForQuestionAnswering', 'MBartForQuestionAnswering', 'LongformerForQuestionAnswering', 'XLMRobertaXLForQuestionAnswering', 'XLMRobertaForQuestionAnswering', 'RobertaForQuestionAnswering', 'SqueezeBertForQuestionAnswering', 'BertForQuestionAnswering', 'XLNetForQuestionAnsweringSimple', 'FlaubertForQuestionAnsweringSimple', 'MegatronBertForQuestionAnswering', 'MobileBertForQuestionAnsw

In [18]:
torch.save(model.state_dict(), 'CustomModel.pth')

# Evaluation

In [19]:
total = 0
correct = 0
EM = 0
BLEU = 0
bblleeuu = 0
errors = []
for record in tqdm(validation):
        # try:
                total += 1
                if (total % 1000 == 0):
                        print(f"Correct:\t\t {correct}, out of {total}: {100*correct/total}%")
                        print(f"EM:\t\t\t {EM}, out of {total}: {100*EM/total}%")
                        print(f"BLEU:\t\t\t {BLEU}, out of {total}: {100*BLEU/total}%")
                        print(f"BLEU Score:\t\t {bblleeuu}, out of {total}: {100*bblleeuu/total}%")
                result = question_answerer(question=record['question'], context=record['context'], truncation=True, padding=True, return_tensors='pt')
                # result = QA(model, tokenizer,record['question'], record['context'])
                n = min(len(result['answer'].split()), 4)
                # n = min(len(result.split()), 4)
                if n == 0:
                        BLEUscore = 0
                else:
                        weights = [1.0/n]*n
                        BLEUscore = nltk.translate.bleu_score.sentence_bleu([record['answers']['text'][0].lower()], result['answer'].lower(), weights=weights)
                        # BLEUscore = nltk.translate.bleu_score.sentence_bleu([record['answers']['text'][0].lower()], result.lower(), weights=weights)
                if result['answer'] != '' and (result['answer'].lower() in record['answers']['text'][0].lower() or record['answers']['text'][0].lower() in result['answer'].lower()):
                # if result != '' and (result.lower() in record['answers']['text'][0].lower() or record['answers']['text'][0].lower() in result.lower()):
                        correct += 1
                if record['answers']['text'][0].lower() == result['answer'].lower():
                # if record['answers']['text'][0].lower() == result.lower():
                        EM += 1
                if BLEUscore > 0.5:
                        BLEU += 1
                bblleeuu += BLEUscore
                 
        # except Exception as e:
        #         errors.append(total)
        #         print(f"Error at {total}: {e}")
        #         continue
print(f"Correct: {correct}, out of {total}: {100*correct/total}%")
print(f"EM: {EM}, out of {total}: {100*EM/total}%")
print(f"BLEU: {BLEU}, out of {total}: {100*BLEU/total}%")
print(f"BLEU Score: {bblleeuu}, out of {total}: {100*bblleeuu/total}%")



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  9%|▉         | 999/10570 [09:01<1:17:51,  2.05it/s]

Correct:		 942, out of 1000: 94.2%
EM:			 762, out of 1000: 76.2%
BLEU:			 878, out of 1000: 87.8%
BLEU Score:		 872.7723680467068, out of 1000: 87.27723680467068%


 10%|▉         | 1005/10570 [09:05<1:26:34,  1.84it/s]


KeyboardInterrupt: 

In [None]:
# dh bl pipeline el ndeefa (el model kan fl eval mode), el tany aw7ash men dyh
# 1%|          | 114/10570 [00:32<27:50,  6.26it/s] c:\Users\DELL\anaconda3\Lib\site-packages\nltk\translate\bleu_score.py:552: UserWarning: 
# The hypothesis contains 0 counts of 3-gram overlaps.
# Therefore the BLEU score evaluates to 0, independently of
# how many N-gram overlaps of lower order it contains.
# Consider using lower n-gram order or use SmoothingFunction()
#   warnings.warn(_msg)
# c:\Users\DELL\anaconda3\Lib\site-packages\nltk\translate\bleu_score.py:552: UserWarning: 
# The hypothesis contains 0 counts of 4-gram overlaps.
# Therefore the BLEU score evaluates to 0, independently of
# how many N-gram overlaps of lower order it contains.
# Consider using lower n-gram order or use SmoothingFunction()
#   warnings.warn(_msg)
#   4%|▍         | 439/10570 [04:40<2:04:55,  1.35it/s]c:\Users\DELL\anaconda3\Lib\site-packages\nltk\translate\bleu_score.py:552: UserWarning: 
# The hypothesis contains 0 counts of 2-gram overlaps.
# Therefore the BLEU score evaluates to 0, independently of
# how many N-gram overlaps of lower order it contains.
# Consider using lower n-gram order or use SmoothingFunction()
#   warnings.warn(_msg)
#   9%|▉         | 999/10570 [08:31<56:16,  2.83it/s]  
# Correct:			 942, out of 1000: 94.2%
# EM:				 762, out of 1000: 76.2%
# BLEU:				 878, out of 1000: 87.8%
# BLEU Score:		 872.7723680467068, out of 1000: 87.27723680467068
#  19%|█▉        | 1999/10570 [13:49<51:12,  2.79it/s]  
# Correct:			 1869, out of 2000: 93.45%
# EM:				 1432, out of 2000: 71.6%
# BLEU:				 1703, out of 2000: 85.15%
# BLEU Score:		 1697.4421047529827, out of 2000: 84.87210523764912
#  28%|██▊       | 2999/10570 [19:44<43:38,  2.89it/s]  
# Correct:			 2799, out of 3000: 93.3%
# EM:				 2083, out of 3000: 69.43333333333334%
# BLEU:				 2507, out of 3000: 83.56666666666666%
# BLEU Score:		 2508.141037469119, out of 3000: 83.60470124897064
#  38%|███▊      | 3999/10570 [26:03<46:35,  2.35it/s]  
# Correct:			 3728, out of 4000: 93.2%
# EM:				 2750, out of 4000: 68.75%
# BLEU:				 3308, out of 4000: 82.7%
# BLEU Score:		 3317.115421181209, out of 4000: 82.92788552953022
#  47%|████▋     | 4999/10570 [41:24<2:08:46,  1.39s/it]
# Correct:			 4651, out of 5000: 93.02%
# EM:				 3317, out of 5000: 66.34%
# BLEU:				 4056, out of 5000: 81.12%
# BLEU Score:		 4067.8638478020785, out of 5000: 81.35727695604156
#  57%|█████▋    | 5999/10570 [53:28<53:14,  1.43it/s]  
# Correct:			 5601, out of 6000: 93.35%
# EM:				 4042, out of 6000: 67.36666666666666%
# BLEU:				 4891, out of 6000: 81.51666666666667%
# BLEU Score:		 4909.089732703673, out of 6000: 81.81816221172788
#  66%|██████▌   | 6999/10570 [1:07:14<27:44,  2.15it/s]  
# Correct:			 6494, out of 7000: 92.77142857142857%
# EM:				 4693, out of 7000: 67.04285714285714%
# BLEU:				 5667, out of 7000: 80.95714285714286%
# BLEU Score:		 5691.8794216420865, out of 7000: 81.31256316631551
#  76%|███████▌  | 7999/10570 [1:16:37<27:38,  1.55it/s]  
# Correct:			 7429, out of 8000: 92.8625%
# EM:				 5390, out of 8000: 67.375%
# BLEU:				 6491, out of 8000: 81.1375%
# BLEU Score:		 6516.97302106118, out of 8000: 81.46216276326474
#  85%|████████▌ | 8999/10570 [1:28:00<11:31,  2.27it/s]  
# Correct:			 8362, out of 9000: 92.91111111111111%
# EM:				 5986, out of 9000: 66.5111111111111%
# BLEU:				 7270, out of 9000: 80.77777777777777%
# BLEU Score:		 7292.617317963844, out of 9000: 81.02908131070937
#  95%|█████████▍| 9999/10570 [1:43:21<07:44,  1.23it/s]
# Correct:			 9279, out of 10000: 92.79%
# EM:				 6578, out of 10000: 65.78%
# BLEU:				 8042, out of 10000: 80.42%
# BLEU Score:		 8068.880321520323, out of 10000: 80.68880321520324
# 100%|██████████| 10570/10570 [1:52:25<00:00,  1.57it/s]
# Correct: 9789, out of 10570: 92.61116367076632%
# EM: 6901, out of 10570: 65.28855250709556%
# BLEU: 8477, out of 10570: 80.19867549668874%
# BLEU Score: 8501.452133048304, out of 10570: 80.43001071947307


# ===================================================================================================================
# hena kan fl training mode (aw7ash mn el eval)
#   9%|▉         | 999/10570 [11:38<1:59:37,  1.33it/s] 
# Correct:		 932, out of 1000: 93.2%
# EM:			 745, out of 1000: 74.5%
# BLEU:			 864, out of 1000: 86.4%
# BLEU Score:		 861.1770426881361, out of 1000: 86.11770426881361%
#  19%|█▉        | 1999/10570 [20:07<1:51:32,  1.28it/s]
# Correct:		 1855, out of 2000: 92.75%
# EM:			 1411, out of 2000: 70.55%
# BLEU:			 1674, out of 2000: 83.7%
# BLEU Score:		 1676.7278192732056, out of 2000: 83.83639096366028%
#  28%|██▊       | 2999/10570 [32:43<1:33:01,  1.36it/s]
# Correct:		 2780, out of 3000: 92.66666666666667%
# EM:			 2039, out of 3000: 67.96666666666667%
# BLEU:			 2465, out of 3000: 82.16666666666667%
# BLEU Score:		 2472.351418400786, out of 3000: 82.41171394669286%
#  38%|███▊      | 3999/10570 [46:19<1:40:16,  1.09it/s]
# Correct:		 3702, out of 4000: 92.55%
# EM:			 2694, out of 4000: 67.35%
# BLEU:			 3255, out of 4000: 81.375%
# BLEU Score:		 3270.4277132334437, out of 4000: 81.76069283083609%
#  47%|████▋     | 4999/10570 [1:05:18<2:03:05,  1.33s/it]
# Correct:		 4615, out of 5000: 92.3%
# EM:			 3260, out of 5000: 65.2%
# BLEU:			 3998, out of 5000: 79.96%
# BLEU Score:		 4018.995368299551, out of 5000: 80.37990736599103%
#  57%|█████▋    | 5999/10570 [1:16:29<46:29,  1.64it/s]  
# Correct:		 5566, out of 6000: 92.76666666666667%
# EM:			 3975, out of 6000: 66.25%
# BLEU:			 4824, out of 6000: 80.4%
# BLEU Score:		 4855.943115278409, out of 6000: 80.93238525464015%
#  66%|██████▌   | 6999/10570 [1:25:45<22:18,  2.67it/s]  
# Correct:		 6462, out of 7000: 92.31428571428572%
# EM:			 4616, out of 7000: 65.94285714285714%
# BLEU:			 5590, out of 7000: 79.85714285714286%
# BLEU Score:		 5629.781707379695, out of 7000: 80.42545296256706%
#  76%|███████▌  | 7999/10570 [1:33:31<22:30,  1.90it/s]  
# Correct:		 7388, out of 8000: 92.35%
# EM:			 5300, out of 8000: 66.25%
# BLEU:			 6398, out of 8000: 79.975%
# BLEU Score:		 6441.607009922551, out of 8000: 80.52008762403189%
#  85%|████████▌ | 8999/10570 [1:46:01<20:18,  1.29it/s]  
# Correct:		 8313, out of 9000: 92.36666666666666%
# EM:			 5881, out of 9000: 65.34444444444445%
# BLEU:			 7162, out of 9000: 79.57777777777778%
# BLEU Score:		 7205.7352295038, out of 9000: 80.06372477226445%
#  95%|█████████▍| 9999/10570 [1:54:27<07:04,  1.34it/s]
# Correct:		 9231, out of 10000: 92.31%
# EM:			 6479, out of 10000: 64.79%
# BLEU:			 7938, out of 10000: 79.38%
# BLEU Score:		 7989.309027525096, out of 10000: 79.89309027525096%
# 100%|██████████| 10570/10570 [2:02:19<00:00,  1.44it/s]
# Correct: 9737, out of 10570: 92.11920529801324%
# EM: 6801, out of 10570: 64.34247871333964%
# BLEU: 8369, out of 10570: 79.17691579943235%
# BLEU Score: 8420.278207712268, out of 10570: 79.66204548450585%