## Installing dependencies and loading the dataset

In [None]:
%%capture
!pip install datasets

In [None]:
%%capture
!pip install sentence_transformers

In [None]:
%%capture
# Load the COCO captions dataset by its Hub identifier. %%capture hides the
# output this cell would otherwise print.
from datasets import load_dataset

# Identifier reused by later cells when reporting on the dataset.
dataset_id = "embedding-data/coco_captions"
dataset = load_dataset(dataset_id)

In [None]:
# Summarise the training split: number of rows, the Python types of one example
# and of its 'set' field, and the content of the first example's 'set' field.
print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['set'])} as value.")
print(f"- Examples look like this: {dataset['train'][0]['set']}")

- The embedding-data/coco_captions dataset has 82783 examples.
- Each example is a <class 'dict'> with a <class 'list'> as value.
- Examples look like this: ['A clock that blends in with the wall hangs in a bathroom. ', 'A very clean and well decorated empty bathroom', 'A bathroom with a border of butterflies and blue paint on the walls above it.', 'An angled view of a beautifully decorated bathroom.', 'A blue and white bathroom with butterfly themed wall tiles.']


In [None]:
#Convert the examples into InputExamples. It might take around 10 seconds in Google Colab.
from sentence_transformers import InputExample

train_examples = []   # one InputExample per image, built from captions 0-2
test_examples = []    # plain caption strings held out from training
train_data = dataset['train']['set']
n_examples = dataset['train'].num_rows

for captions in train_data:
  # The first three captions of the set form the training example.
  train_examples.append(InputExample(texts=[captions[0], captions[1], captions[2]]))

  # Hold out whatever captions follow the first three, at most three of them
  # (indices 3-5). The previous chain of length checks contained a branch that
  # could never be true and skipped every set with exactly five captions, so
  # most held-out captions were silently dropped.
  test_examples.extend(captions[3:6])

# NOTE: the test-set size printed here will be larger than the count recorded
# by earlier runs of this notebook, which used the faulty length checks.
print("Size of Train Set:",len(train_examples))
print("Size of Test Set:",len(test_examples))

Size of Train Set: 82783
Size of Test Set: 639


In [None]:
# Report the container type, the number of training examples and the type of one element.
print(f"We have a {type(train_examples)} of length {len(train_examples)} containing {type(train_examples[0])}'s.")

We have a <class 'list'> of length 82783 containing <class 'sentence_transformers.readers.InputExample.InputExample'>'s.


# Training the Model

In [None]:
#Reference: https://huggingface.co/blog/how-to-train-sentence-transformers
#Reference: https://huggingface.co/docs/transformers/training#train-a-tensorflow-model-with-keras
from sentence_transformers import SentenceTransformer, models

# Module 1: transformer backbone loaded from the 'bert-base-cased' checkpoint.
word_embedding_model = models.Transformer('bert-base-cased')

# Module 2: pooling layer, sized to the backbone's word-embedding dimension.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)

# Chain both modules, in this order, into a single SentenceTransformer.
trained_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Wrap the training examples in a PyTorch DataLoader so that they are shuffled
# and served in batches of 16.
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_examples,
    batch_size=16,
    shuffle=True,
)

In [None]:
# Loss functions for training a Sentence Transformers model
from sentence_transformers import losses
# NOTE(review): TripletLoss is documented as taking (anchor, positive, negative)
# texts, but each training example in this notebook holds three captions of the
# SAME image, so the "negative" is actually a similar sentence. Confirm this is
# intended; a loss designed for positive-only pairs may be a better fit, which
# would also require changing how the examples are built.
train_loss = losses.TripletLoss(model=trained_model)

In [None]:
# #---------------------------TRAINING---------------------------------------
# Number of passes over the training dataloader.
num_epochs = 10
# len(train_dataloader) is the number of batches per epoch, so the product with
# num_epochs is the total number of training steps.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) # 10% of the total training steps (not of the data)

In [None]:
# Fine-tune the model on the single (dataloader, loss) objective.
# Training takes around 45 minutes with a Google Colab Pro account
trained_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
)

# Saving the model to HF hub

In [None]:
# Write the trained SentenceTransformer to the local folder saved_model/my_model
# (this is the library's own save format, not a TensorFlow SavedModel).
# The parent directory is created first; -p makes that a no-op if it already exists.
!mkdir -p saved_model
trained_model.save('saved_model/my_model') 

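Log in with your own Hugging Face access token (create one with write permission at https://huggingface.co/settings/tokens). Never paste a token into the notebook itself: the token that was previously written here has been exposed and must be revoked.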

In [None]:
# Authenticate with the Hugging Face Hub through the command-line client.
# Supply the access token when asked; do not store it in the notebook.
!huggingface-cli login

In [None]:
from huggingface_hub import HfApi

# Upload the locally saved model folder to the model repository on the Hub.
# The changes are submitted as a pull request instead of being committed directly.
api = HfApi()
api.upload_folder(
    repo_id="Charul1223/bert-base-cased_coco_caption",
    folder_path="saved_model/my_model",
    repo_type="model",
    create_pr=True,  # boolean flag; previously passed as the integer 1
)