# Import Libraries

In [2]:
import datasets
import transformers
import pandas as pd
import torch
from torch.utils.data.dataset import Dataset
from pathlib import Path



#Tokenizer from scratch on vocabulary of corpus
from tokenizers import ByteLevelBPETokenizer

# Decoder
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM # RobertaLM for learning
from transformers import RobertaTokenizerFast # After training tokenizern we will wrap it so it can be used by Roberta model


#Training
# When using previous version of the library you need the following two lines
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


# Parameters for Training

In [3]:
# Each constant is defined exactly once; the earlier version assigned SEED,
# MAX_LEN and SUMMARY_LEN twice with contradicting comments.

# Batch sizes handed to TrainingArguments (per device)
TRAIN_BATCH_SIZE = 20
VALID_BATCH_SIZE = 5

# Optimisation settings
TRAIN_EPOCHS = 2        # number of passes over the training captions
VAL_EPOCHS = 1          # NOTE(review): not referenced by any cell visible in this notebook
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01

# Reproducibility
SEED = 42

# Sequence lengths, in tokens
MAX_LEN = 128           # truncation length used when tokenizing captions
SUMMARY_LEN = 20        # Maximum length of caption generated by the model

# Preparing the Dataset

In [4]:
import os
import json

# NOTE(review): both locations below are machine-specific absolute paths; change
# them before running this notebook anywhere else.
os.chdir(r'D:\Downloads\ML\Image_Captioning_using_Hugging_Face-main\Image_Captioning_using_Hugging_Face-main')
images_path = 'D:\\Downloads\\ML\\flickr8k\\Images'

# data.json is read as a mapping whose keys are '/'-separated image paths.
with open('data.json', 'r') as openfile:
    json_object = json.load(openfile)

images_caption_dict = dict(json_object)
images = list(images_caption_dict.keys())

# Re-root every '...jpg' entry under images_path and drop all other keys.
# os.path.join supplies the separator between the folder and the file name;
# plain string concatenation produced '...\\Images<file>.jpg'.
images_caption_dict = {
    os.path.join(images_path, image_path.split('/')[-1]): images_caption_dict[image_path]
    for image_path in images
    if image_path.endswith('jpg')
}

In [5]:
# Flatten the {image: [captions]} mapping into one (image, caption) pair per caption,
# removing the '<s> ' / '  <e>' markers and surrounding whitespace from each caption.
image_caption_pairs = [
    (image, capt.replace('<s> ','').replace('  <e>','').strip())
    for image, caption in images_caption_dict.items()
    for capt in caption
]

images = [image for image, text in image_caption_pairs]
captions = [text for image, text in image_caption_pairs]

# Build the frame in one go: column 'images' holds the path, 'captions' the cleaned text
df = pd.DataFrame({'images': images, 'captions': captions})

# ROBERTA
### Training the Decoder Model for Language Understanding and Building the Vocabulary

### Tokenizer
#### Converting the captions into .txt files for training the tokenizer

In [7]:
# Make sure the output folder exists; exist_ok=True keeps this cell re-runnable
# (os.mkdir raised FileExistsError on the second run).
os.makedirs("./text_split", exist_ok=True)
# Store values in a dataframe column (Series object) to files, one file per record
def column_to_files(column, prefix, txt_files_dir = "./text_split"):
    """Write every value of ``column`` to its own UTF-8 encoded text file.

    Files are named ``<id>.txt`` inside ``txt_files_dir``. Ids start at ``prefix``
    and grow by one per row, including rows that could not be written, so ids
    never collide across successive calls that chain the returned value.

    Args:
        column: object exposing ``to_list()`` (e.g. a pandas Series) yielding the texts.
        prefix: integer id used for the first file.
        txt_files_dir: folder that receives the files; created when missing.

    Returns:
        The first unused id, i.e. ``prefix`` plus the number of rows.
    """
    os.makedirs(txt_files_dir, exist_ok=True)

    rows = column.to_list()
    for i, row in enumerate(rows, start=prefix):
        # Create the filename using the running ID
        file_name = os.path.join(txt_files_dir, str(i)+'.txt')
        try:
            # Encode before opening so a bad row (e.g. empty/NaN) leaves no empty file behind
            payload = row.encode('utf-8')
            # The context manager closes the handle even when the write fails
            with open(file_name, 'wb') as f:
                f.write(payload)
        except Exception as e:  # best effort: report the row and carry on
            print(row, e)
    # Return the next free ID
    return prefix + len(rows)

data = df["captions"]
# Removing the end of line character \n.
# Series.replace only swaps cells that are exactly "\n"; .str.replace works on
# substrings, which is what is needed to strip line breaks inside a caption.
data = data.str.replace("\n", " ", regex=False)
# Set the ID to 0
prefix=0
# Create a file for every description value
prefix = column_to_files(data, prefix)
# Print the last ID
print(prefix)

#### Training tokenizer

In [8]:
%%time 
# Every caption file written under text_split/, as plain strings
paths = list(map(str, Path(".").glob("text_split/*.txt")))

# Special tokens handed to the trainer, in this order
special_tokens = ["<s>", "<pad>", "<e>", "<unk>", "<mask>"]

# Byte-level BPE tokenizer configured with lowercase=True
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# Learn the vocabulary from the caption files
tokenizer.train(
    files=paths,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=special_tokens,
)

CPU times: total: 7.28 s
Wall time: 4min 52s


#### Save Tokenizer

In [10]:

# exist_ok=True lets this cell be re-run; os.mkdir raised FileExistsError
# once the folder was already there.
os.makedirs('Byte_tokenizer', exist_ok=True)
# Write the trained tokenizer files into the folder (the returned file list is displayed)
tokenizer.save_model('Byte_tokenizer')

['Byte_tokenizer\\vocab.json', 'Byte_tokenizer\\merges.txt']

## Decoder
#### Initialization & Training

In [11]:
# Seed the Python / NumPy / PyTorch RNGs before the model is built so that the
# random weight initialisation is repeatable after a kernel restart.
transformers.set_seed(SEED)

# Architecture of the from-scratch model; vocab_size mirrors the value used
# when the BPE tokenizer was trained.
config = RobertaConfig(
    vocab_size=10000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Randomly initialised masked-language model (no pretrained weights are loaded)
model = RobertaForMaskedLM(config=config)

print('Num parameters: ',model.num_parameters())

# Create the tokenizer from a trained one.
# model_max_length is the documented keyword; max_len is its legacy alias.
# NOTE(review): the tokenizer was trained with '<e>' rather than '</s>', and the later
# save_pretrained call writes an added_tokens.json - confirm that
# len(tokenizer) does not exceed config.vocab_size.
tokenizer = RobertaTokenizerFast.from_pretrained('Byte_tokenizer', model_max_length=MAX_LEN)

Num parameters:  51206416


In [12]:
class CustomDataset(Dataset):
    """Dataset of caption token ids, tokenized once when the object is built.

    Args:
        df: object whose ``.values`` iterates over the caption strings
            (the notebook passes a slice of ``df['captions']``).
        tokenizer: object providing ``encode_plus``; its result must expose ``input_ids``.
    """

    def __init__(self, df, tokenizer):
        # Tokenize every caption eagerly, truncating at MAX_LEN tokens, and keep only the ids.
        # NOTE(review): only input_ids are stored, so batches carry no attention mask -
        # this looks like the cause of the attention_mask warning printed during training.
        self.examples = [
            tokenizer.encode_plus(text, max_length = MAX_LEN, truncation=True, padding=True).input_ids
            for text in df.values
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # Unpadded id tensor; padding is left to the batch-level collator.
        token_ids = self.examples[i]
        return torch.tensor(token_ids)

In [13]:
# Positional split of the caption column: the first TRAIN_ROWS rows are used for
# training, everything after them for evaluation.
TRAIN_ROWS = 38000
caption_column = df['captions']
train_dataset = CustomDataset(caption_column[:TRAIN_ROWS], tokenizer)
eval_dataset = CustomDataset(caption_column[TRAIN_ROWS:], tokenizer)

#### Batching Data

In [14]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling collator built on the caption tokenizer;
# mlm_probability is the fraction of tokens selected for masking in each batch.
MLM_PROBABILITY = 0.15
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=MLM_PROBABILITY)

## Training the Decoder

In [15]:
# Folder that receives checkpoints and, later, the final saved model
model_folder = "RobertaMLM"

# Define the training arguments; every tunable value comes from the parameters cell.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir=model_folder,
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',   # evaluate on eval_dataset after every epoch
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    save_steps=8192,
    save_total_limit=1,
    seed=SEED,                       # wire the notebook-level seed in explicitly
)

# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [16]:
# Train the model
# Launches training with the arguments, datasets and collator given to `trainer`;
# the call stays the last expression so its return value is displayed by the cell.
trainer.train()

  0%|          | 0/3800 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


: 

#### Check Perplexity score of the model

In [16]:
import math

# Evaluate on the evaluation dataset and turn the loss into perplexity: exp(eval_loss)
eval_results = trainer.evaluate()
eval_loss = eval_results['eval_loss']
perplexity = math.exp(eval_loss)

print(f"Perplexity: {perplexity:.2f}")

100%|██████████| 491/491 [00:05<00:00, 92.07it/s] 

Perplexity: 30.73





### Saving tokenizer & Model to use in Encoder Decoder architecture

In [17]:
# Persist the RobertaTokenizerFast wrapper into the same 'Byte_tokenizer' folder that
# already holds the raw BPE files; the returned file paths are displayed by the cell.
tokenizer.save_pretrained('Byte_tokenizer')

('Byte_tokenizer\\tokenizer_config.json',
 'Byte_tokenizer\\special_tokens_map.json',
 'Byte_tokenizer\\vocab.json',
 'Byte_tokenizer\\merges.txt',
 'Byte_tokenizer\\added_tokens.json',
 'Byte_tokenizer\\tokenizer.json')

In [18]:
# Write the trained model to `model_folder` ("RobertaMLM"), the folder the
# fill-mask pipeline below loads from.
trainer.save_model(model_folder)

# Evaluating Decoder(ROBERTA)

In [19]:
from transformers import pipeline

# Fill-mask pipeline built from the model and tokenizer folders saved on disk
fill_mask = pipeline("fill-mask", model= r'RobertaMLM', tokenizer= 'Byte_tokenizer')

In [20]:
# Sanity check: ask the trained model for candidates for the masked word in a
# caption-like sentence; the scored candidates are displayed by the cell.
fill_mask("a girl going into a <mask> building")

[{'score': 0.09009407460689545,
  'token': 491,
  'token_str': ' large',
  'sequence': 'a girl going into a large building'},
 {'score': 0.05363229662179947,
  'token': 377,
  'token_str': ' red',
  'sequence': 'a girl going into a red building'},
 {'score': 0.044735923409461975,
  'token': 618,
  'token_str': ' dirt',
  'sequence': 'a girl going into a dirt building'},
 {'score': 0.03572113811969757,
  'token': 475,
  'token_str': ' small',
  'sequence': 'a girl going into a small building'},
 {'score': 0.03269856795668602,
  'token': 402,
  'token_str': ' blue',
  'sequence': 'a girl going into a blue building'}]

## This RoBERTa model will be used as the decoder in our image captioning model and will be connected to the ViT encoder model using cross-attention heads.