<a href="https://colab.research.google.com/github/Aditya6122/HashtagifyMe/blob/main/HashtagGenerator_HashtagifyMe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers --quiet
!pip install datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m89.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import io
import urllib
import PIL.Image
import torch
import datasets
import numpy as np
import requests
from datasets import load_dataset
from transformers import BlipForConditionalGeneration, AutoProcessor
from datasets.utils.file_utils import get_datasets_user_agent
from torch.utils.data import Dataset
from tqdm.notebook import tqdm_notebook
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import shutil

In [None]:
# Configuration of the source corpus and how much of it is used.
dataset_name = "conceptual_captions"
labeled_config = "labeled"
subset_size = 15000

# Load the training split and keep only its first `subset_size` rows.
whole_dataset = load_dataset(dataset_name, labeled_config, split='train')
dataset = whole_dataset.select(range(subset_size)).train_test_split(
    test_size=0.10, shuffle=True, seed=42
)
# Carve an evaluation split (12%) out of the remaining training rows.
dataset['train'] = dataset['train'].train_test_split(
    test_size=0.12, shuffle=True, seed=42
)

# Flatten the nested splits into a plain dict keyed by purpose.
data = {
    'train': dataset['train']['train'],
    'eval': dataset['train']['test'],
    'test': dataset['test'],
}
for split in data.values():
    split.set_format("torch")



In [None]:
# User-agent string provided by the `datasets` library; it is sent as the
# "user-agent" header of every image request made by fetch_single_image.
USER_AGENT = get_datasets_user_agent()
def fetch_single_image(image_url, timeout=10, retries=0):
    """Download an image and open it with PIL, returning ``None`` on failure.

    Args:
        image_url: URL of the image to download.
        timeout: Timeout in seconds passed to ``urlopen`` for each attempt.
        retries: Number of additional attempts made after the first failure.

    Returns:
        The opened ``PIL.Image.Image``, or ``None`` when every attempt failed
        (or when ``retries`` is negative, in which case no attempt is made).
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly so this function does not depend on another package having
    # imported it first.
    import urllib.request

    # Bound up front so the final return is valid even if the loop never runs.
    image = None
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as response:
                image = PIL.Image.open(io.BytesIO(response.read()))
            break
        except Exception:
            # Deliberate best effort: any failure while requesting or opening
            # the image is reported as "no image" so the caller can skip it.
            image = None
    return image

In [None]:
class ConceptualCaptionCustom(Dataset):
    """Map-style dataset that turns rows with an image URL and labels into
    processor encodings.

    Each row must provide ``image_url`` and ``labels``. The image is downloaded
    on access and the labels are joined into one ``" , "``-separated target
    string. Samples that cannot be built are returned as ``None`` so the
    collate function can drop them.
    """

    def __init__(self, dataset, processor):
        """Store the underlying rows and the processor used to encode them.

        Args:
            dataset: Indexable, sized collection of rows with ``image_url``
                and ``labels`` entries.
            processor: Callable accepting ``images``, ``text`` and
                ``return_tensors`` and returning a mapping of tensors.
        """
        self.dataset = dataset
        self.processor = processor

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx``, or ``None`` if it is unusable.

        A sample is unusable when it has no labels, when its image cannot be
        fetched, or when the processor fails on it.
        """
        item = self.dataset[idx]
        labels = item['labels']

        # A row without labels has no target text. Check before downloading
        # so no request is made for a sample that would be discarded anyway.
        if len(labels) == 0:
            return None

        img = fetch_single_image(item['image_url'])
        if img is None:
            return None

        # Same text as interleaving "," between labels and joining on spaces.
        target = ' , '.join(labels)
        try:
            encoding = self.processor(images=img, text=target, return_tensors="pt")
            # Drop only the leading batch dimension added by return_tensors="pt";
            # a bare squeeze() would also collapse any other size-1 dimension.
            return {k: v.squeeze(0) for k, v in encoding.items()}
        except Exception:
            # Best effort: a sample the processor rejects is skipped. Unlike a
            # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
            return None

    def __len__(self):
        """Return the number of rows in the underlying dataset."""
        return len(self.dataset)

In [None]:
# Load the processor of the "Salesforce/blip-image-captioning-base" checkpoint;
# the dataset wrapper calls it with both an image and the target text.
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

In [None]:
# Wrap each split so that indexing downloads the image and returns the
# processor encoding (or None when the sample cannot be built).
train_dataset = ConceptualCaptionCustom(data['train'], processor)
val_dataset = ConceptualCaptionCustom(data['eval'], processor)
test_dataset = ConceptualCaptionCustom(data['test'],processor)

In [None]:
def collate_fn(batch, dataset, batch_size):
    """Collate samples into a full batch, replacing unusable (``None``) ones.

    Args:
        batch: Samples produced by the dataset; entries may be ``None``.
        dataset: Indexable, sized dataset that replacements are drawn from
            at random.
        batch_size: Number of samples the returned batch must contain.

    Returns:
        A dict with ``input_ids`` (one row per sample, zero-padded on the
        right to the longest sequence) and ``pixel_values`` (stacked along a
        new leading dimension).

    Raises:
        RuntimeError: If too few usable replacement samples could be drawn.
    """
    samples = [sample for sample in batch if sample is not None]

    # Count against the usable samples, not len(batch): None entries occupy a
    # slot in `batch` but contribute nothing, so they must be refilled too.
    missing = batch_size - len(samples)

    # Bound the refill loop so a dataset whose samples all fail (e.g. no
    # network) raises instead of hanging the data loader forever.
    attempts_left = 50 * batch_size
    while missing > 0:
        if attempts_left == 0:
            raise RuntimeError(
                "collate_fn could not draw enough usable replacement samples"
            )
        attempts_left -= 1
        candidate = dataset[np.random.randint(0, len(dataset))]
        if candidate is not None:
            samples.append(candidate)
            missing -= 1

    validated_batch = {}
    validated_batch['input_ids'] = pad_sequence(
        [sample['input_ids'] for sample in samples], batch_first=True
    )
    validated_batch['pixel_values'] = torch.stack(
        [sample['pixel_values'] for sample in samples]
    )

    return validated_batch

In [None]:
from functools import partial

batch_size = 32

# Each loader refills dropped samples from its OWN split, so validation and
# test batches are never topped up with training samples. functools.partial is
# used instead of a lambda so the collate function stays picklable for worker
# processes.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=partial(collate_fn, dataset=train_dataset, batch_size=batch_size),
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=2,
    pin_memory=True,
    collate_fn=partial(collate_fn, dataset=val_dataset, batch_size=batch_size),
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=2,
    pin_memory=True,
    collate_fn=partial(collate_fn, dataset=test_dataset, batch_size=batch_size),
)



In [None]:
# Start from the pretrained "Salesforce/blip-image-captioning-base" weights.
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only runtime as well; the model is moved to `device` right after.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/new/hashtag_generator.pth', map_location="cpu")
)
model.to(device)

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [None]:
# Make every parameter trainable, then freeze the vision tower again.
model.requires_grad_(True)
model.vision_model.requires_grad_(False)

# The two parts of the text decoder are optimised as separate groups.
bert_params = model.text_decoder.bert.parameters()
cls_params = model.text_decoder.cls.parameters()

param_groups = [
    {'params': bert_params, 'lr': 1e-8},  # explicit learning rate
    {'params': cls_params},               # falls back to the default lr below
]
optimizer = torch.optim.AdamW(param_groups, lr=1e-7)


In [None]:
model.train()
num_epochs = 1
num_batches = len(train_dataloader)
# Report roughly four times per epoch. Kept as an integer (rounded up, at
# least 1) so the section boundary test is exact for any number of batches.
subsection = max(1, (num_batches + 3) // 4)

for epoch in range(num_epochs):
    print(f"Epoch [{epoch + 1}/{num_epochs}]")
    print("-"*100)
    running_loss = 0.0
    batches_in_section = 0
    progress_bar = tqdm_notebook(total=min(subsection, num_batches))
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device)

        # The input ids are passed as labels too, as in the original loop.
        outputs = model(input_ids=input_ids,
                        pixel_values=pixel_values,
                        labels=input_ids)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # .item() stores a plain float; accumulating the tensor itself would
        # keep graph-attached tensors alive for the whole section.
        running_loss += loss.item()
        batches_in_section += 1
        progress_bar.update(1)

        is_last_batch = idx + 1 == num_batches
        if batches_in_section == subsection or is_last_batch:
            progress_bar.close()
            # Average over the batches actually seen in this section.
            print(f"Epoch [{epoch + 1}/{num_epochs}], Batch [{idx + 1}/{num_batches}], Loss: {running_loss/batches_in_section}")
            running_loss = 0.0
            batches_in_section = 0
            if not is_last_batch:
                remaining = num_batches - (idx + 1)
                progress_bar = tqdm_notebook(total=min(subsection, remaining))

    # Covers the case where the loader yielded no batches and the bar above
    # was therefore never closed; closing an already-closed bar is harmless.
    progress_bar.close()

Epoch [1/1]
----------------------------------------------------------------------------------------------------


  0%|          | 0/93.0 [00:00<?, ?it/s]

Epoch [1/1], Batch [94/372], Loss: 1.9163135290145874


  0%|          | 0/93.0 [00:00<?, ?it/s]

Epoch [1/1], Batch [187/372], Loss: 1.8911949396133423


  0%|          | 0/93.0 [00:00<?, ?it/s]

Epoch [1/1], Batch [280/372], Loss: 1.8916420936584473


  0%|          | 0/93.0 [00:00<?, ?it/s]

Epoch [1/1], Batch [372/372], Loss: 1.8806589841842651


In [None]:
# Mount Google Drive at /content/drive. The checkpoint paths passed to
# torch.load in this notebook live under this mount point, so this cell has
# to be executed before those cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# torch.save(model.state_dict(), '/content/hashtag_generator.pth')

In [None]:
# source_path = '/content/hashtag_generator.pth'
# destination_path = '/content/drive/MyDrive/new/hashtag_generator.pth'
# shutil.copyfile(source_path, destination_path)

In [None]:
# Re-create the processor and the pretrained base model; the next cell
# overwrites the model weights with the fine-tuned checkpoint.
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

Downloading (…)rocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only runtime as well; the model is moved to `device` right after.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/new/hashtag_generator.pth', map_location="cpu")
)
model.to(device)

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [None]:
url = 'https://stat5.bollywoodhungama.in/wp-content/uploads/2021/06/Satyaprem-Ki-Katha-27-346x260.jpg'
# Download the full body with a timeout and fail loudly on an HTTP error,
# instead of handing PIL the raw socket stream of an unchecked response.
response = requests.get(url, timeout=10)
response.raise_for_status()
# Convert to RGB so grayscale, palette or RGBA files are handled as well
# (this mirrors the usage example published with the BLIP checkpoint).
raw_image = PIL.Image.open(io.BytesIO(response.content)).convert("RGB")
inputs = processor(raw_image, return_tensors="pt").to(device)
model.eval()
with torch.no_grad():
    # Sampling is enabled, so each of the five runs can yield different tags.
    for _ in range(5):
        out = model.generate(**inputs, max_length=20, do_sample=True)
        caption = processor.decode(out[0], skip_special_tokens=True)
        print(caption)

romance, movie, poster, love
love, romance, kiss, movie
poster, love, sunlight, happy, interaction, photography, interaction, tree, gesture, night
love, romance, poster, fashion, romance novel, event
love, romance, movie, poster, interaction, light, romance novel, photography, happy


In [None]:
# Draw 20 sampled sequences for the same image in a single generate call.
with torch.no_grad():
    multiple_output = model.generate(
        **inputs, max_length=20, do_sample=True, num_return_sequences=20
    )
# Decoding only maps token ids back to text, so it sits outside no_grad.
# The former trailing `caption` expression inside the `with` block was dropped:
# IPython only displays a cell's last top-level expression, so it did nothing.
caption = processor.batch_decode(multiple_output, skip_special_tokens=True)

In [None]:
# Display the list of decoded captions as the cell result.
caption

['poster, movie, dance, flesh, album cover, dance, love, photography, photo capt',
 'movie, poster, love, happy, romance, book cover',
 'love, engagement, movie, poster, romance, happy, event, photography, kiss, happy',
 'love, romance, poster, romance movie, album cover, bride, formal wear, happy,',
 'romance, dance, dance like hot dancing, love, poster, event, heat, photography,',
 'love, movie, poster, romance, interaction, lovebird, photography, photo shoot, romance',
 'romance, movie, poster, human, action film, photography, photo shoot, romance with text',
 'romance, poster, movie, love, forehead, romance comedy, happy, interaction, photography,',
 'love, romance, poster, font, movie, photography, heat dance, star, romance novel',
 'romance, movie, poster, love, action film, photography, romance, album cover, art',
 'light, romance, poster, photography, gesture, dress, love, happy, romance novel,',
 'performance, dance, event, dance music, musical, music artist, musical theatre, 