<a href="https://colab.research.google.com/github/FeryET/DeepLearning_CA7/blob/master/DL_CA7_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Colab-only setup: mount Google Drive so files stored there are reachable from this runtime.
import os
from google.colab import drive
drive.mount('/content/drive')
# Point the Kaggle CLI at a config directory on Drive
# (presumably the folder holding the kaggle.json API token -- verify).
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/kaggle"
# Download the Flickr8k archive, then extract it quietly (-q), overwriting existing files (-o).
!kaggle datasets download -d adityajn105/flickr8k
!unzip -qo "/content/flickr8k.zip"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
flickr8k.zip: Skipping, found more recently modified local copy (use --force to force download)


In [15]:
import numpy as np
from PIL import Image
import re
import string
from glob import glob
import pandas as pd
import string 
import itertools
import collections

import torch
import torchvision
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Defining Constants

In [None]:
# Special tokens: START/END are wrapped around every caption by preprocess_text;
# PAD is added to the vocabulary and used as the padding value for captions.
START_TOKEN = "<SOS>"
END_TOKEN = "<EOS>"
PAD_TOKEN = "<PAD>"

# Maximum number of tokens kept per caption; the dataset truncates longer captions.
MAX_LENGTH = 150


# Preprocessing Pipelines

### Captions

In [12]:


def preprocess_text(text):
  """Normalize a raw caption and split it into a list of tokens.

  The caption is lower-cased, stripped of ASCII punctuation, whitespace-normalized
  and wrapped in ``START_TOKEN`` / ``END_TOKEN``.

  Args:
    text: raw caption string.

  Returns:
    List of string tokens, beginning with ``START_TOKEN`` and ending with
    ``END_TOKEN``. An empty caption yields just those two tokens.
  """
  prep = text.lower()
  # str.translate needs a translation table; passing the bare punctuation
  # string (as before) removes nothing. Build a deletion table instead.
  prep = prep.translate(str.maketrans("", "", string.punctuation))
  # Raw string avoids the invalid "\s" escape in a normal string literal.
  prep = re.sub(r"\s+", " ", prep)
  # The special tokens are added after punctuation removal so their angle
  # brackets survive.
  prep = f"{START_TOKEN} {prep.strip()} {END_TOKEN}"
  return prep.split()


class VocabTransform:
  """Callable that converts a sequence of tokens into their vocabulary indices."""

  def __init__(self, vocab):
    # Token -> index lookup; anything supporting ``vocab[token]`` works.
    self.vocab = vocab

  def __call__(self, tokenized):
    """Return ``self.vocab[token]`` for every token, in the original order.

    A token missing from the vocabulary propagates the lookup error unchanged.
    """
    indices = []
    for token in tokenized:
      indices.append(self.vocab[token])
    return indices

### Images

In [13]:
# Image preprocessing pipeline applied to a PIL image before it is fed to the network.
# This is copied from https://pytorch.org/hub/pytorch_vision_resnet/
preprocess = transforms.Compose([
    # Resize, then take a 224x224 crop from the centre.
    transforms.Resize(256),
    transforms.CenterCrop(224),
    # Convert the PIL image to a tensor.
    transforms.ToTensor(),
    # Per-channel normalization with the mean/std values given on the page referenced above.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Defining the Dataset

In [None]:
class FilckrDataset(Dataset):
  """Image-captioning dataset that yields one image together with its captions.

  Captions are read from a CSV with ``image`` and ``caption`` columns, tokenized
  with ``preprocess_text`` and sorted by image name. Item ``i`` pairs the
  ``i``-th file name (sorted directory listing) with rows
  ``[i * step, (i + 1) * step)`` of the sorted caption table.

  NOTE: this positional pairing assumes every file in ``images_path`` has exactly
  ``step`` caption rows in the CSV and vice versa -- verify for other datasets.

  Args:
    images_path: directory containing the image files.
    csv_path: path of the captions CSV.
    image_transforms: callable applied to the opened PIL image.
    caption_transforms: callable applied to the list of (truncated) tokenized
      captions belonging to one image.
  """

  # Number of consecutive caption rows read for each image.
  CAPTIONS_PER_IMAGE = 5

  def __init__(self, images_path, csv_path, image_transforms, caption_transforms):
    self.image_transforms = image_transforms
    self.caption_transforms = caption_transforms
    self.images_path = images_path
    self.images_fnames = sorted(os.listdir(images_path))
    self.csv_path = csv_path
    self.df = pd.read_csv(csv_path)
    # Set once here (not on every __getitem__ call) so it is always defined.
    self.step = self.CAPTIONS_PER_IMAGE
    self._preprocess_captions()

  def _preprocess_captions(self):
    """Tokenize the captions, sort rows by image name and build ``self.vocab``."""
    self.df["cleaned"] = self.df["caption"].apply(preprocess_text)
    # Stable sort keeps the original caption order within each image; the index
    # is reset so positions and labels agree.
    self.df = self.df.sort_values(by="image", kind="mergesort").reset_index(drop=True)
    # Each "cleaned" cell is a list of tokens, so the lists must be flattened;
    # chain(series) would yield the (unhashable) lists themselves.
    vocab = set(itertools.chain.from_iterable(self.df["cleaned"]))
    vocab.add(PAD_TOKEN)
    # Sorted for a deterministic token -> index mapping.
    self.vocab = {token: idx for idx, token in enumerate(sorted(vocab))}

  @property
  def vocab_len(self):
    """Number of distinct tokens in the vocabulary (including ``PAD_TOKEN``)."""
    return len(self.vocab)

  def __len__(self):
    """Number of items, i.e. number of image files found in ``images_path``."""
    return len(self.images_fnames)

  def __getitem__(self, idx):
    """Return ``{"image": ..., "captions": ...}`` for the ``idx``-th image.

    Raises:
      IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
    """
    num_images = len(self.images_fnames)
    if idx < 0:
      idx += num_images
    if not 0 <= idx < num_images:
      raise IndexError(f"index {idx} is out of range for {num_images} images")

    # The file list has one entry per image, so it is indexed by idx itself;
    # only the caption table is addressed in strides of `step`.
    fname = self.images_fnames[idx]
    fpath = os.path.join(self.images_path, fname)
    image = Image.open(fpath)

    first_row = idx * self.step
    captions = self.df.iloc[first_row:first_row + self.step]["cleaned"]
    # Truncating
    captions = [c[:MAX_LENGTH] for c in captions]

    image = self.image_transforms(image)
    captions = self.caption_transforms(captions)

    return {
        "image": image, "captions": captions
    }


# Correctly spelled alias; the original (misspelled) class name is kept for existing callers.
FlickrDataset = FilckrDataset

In [None]:
# Batch-level transforms: each takes the batch dict (keys "image" / "captions") and returns it.

class RepeatImages:
  """Batch transform that repeats each image ``num_repeat`` times consecutively."""

  def __init__(self, num_repeat):
    # Number of back-to-back copies produced for every entry along dim 0.
    self.num_repeat = num_repeat

  def __call__(self, data):
    """Overwrite ``data["image"]`` with its repeated version and return ``data``.

    The dict is modified in place; entries along dimension 0 are duplicated
    with ``torch.repeat_interleave`` (a, a, b, b, ... for ``num_repeat == 2``).
    """
    data["image"] = torch.repeat_interleave(data["image"],
                                            self.num_repeat,
                                            dim=0)
    return data

class PadCaptions:
  """Batch transform that pads variable-length captions to a common length.

  Args:
    vocab: token -> index mapping; it must contain ``PAD_TOKEN``, whose index
      is used as the padding value.
  """

  def __init__(self, vocab):
    self.pad_idx = vocab[PAD_TOKEN]

  def __call__(self, data):
    """Replace ``data["captions"]`` with one padded tensor and return ``data``.

    ``data["captions"]`` is a sequence of 1-D index sequences (tensors or plain
    lists of ints). The result has shape ``(num_captions, longest_length)``,
    with shorter captions right-padded with ``self.pad_idx``.
    """
    # pad_sequence requires tensors; as_tensor leaves existing tensors untouched.
    sequences = [torch.as_tensor(caption) for caption in data["captions"]]
    # The padded tensor must be stored back: pad_sequence does not work in place.
    data["captions"] = torch.nn.utils.rnn.pad_sequence(sequences,
                                                       batch_first=True,
                                                       padding_value=self.pad_idx)
    return data

In [None]:
class FlickrRNN(nn.Module):
  """Image-captioning model: a ResNet-18 encoder feeding an LSTM decoder.

  Args:
    len_vocab: number of tokens in the vocabulary (size of the output layer).
    embedding_dim: size of the token embeddings and of the image feature vector.
    hidden_size: LSTM hidden state size.
    padding_idx: vocabulary index of the padding token.
    bidirectional: whether the LSTM is bidirectional.
    dropout: dropout value passed to ``nn.LSTM``.
    freeze: if True, the ResNet backbone parameters do not receive gradients.
  """
  # Influenced by https://github.com/siddsrivastava/Image-captioning/blob/master/model.py

  class EncoderCNN(nn.Module):
    """ResNet-18 backbone followed by a linear projection to ``embedding_dim``."""

    def __init__(self, embedding_dim, do_freeze):
      super().__init__()
      # Loading resnet
      resnet = torchvision.models.resnet18(pretrained=True)
      fc_in_features = resnet.fc.in_features
      # Keep everything except the final fully connected layer.
      modules = list(resnet.children())[:-1]
      self.resnet = nn.Sequential(*modules)
      for param in self.resnet.parameters():
        # do not freeze if do_freeze = False
        param.requires_grad = not do_freeze
      self.fc = nn.Linear(fc_in_features, embedding_dim)

    def forward(self, x):
      """Map a batch of images to feature vectors of shape (batch, embedding_dim)."""
      x = self.resnet(x)
      # The truncated backbone ends with pooling, so the trailing spatial
      # dimensions must be flattened before the linear layer.
      x = torch.flatten(x, start_dim=1)
      x = self.fc(x)
      return x

  class DecoderRNN(nn.Module):
    """Embedding + LSTM + linear layer producing per-step vocabulary scores."""

    def __init__(self,
                 len_vocab,
                 embedding_dim,
                 hidden_size,
                 padding_idx,
                 bidirectional=False,
                 dropout=0):
      super().__init__()
      # Creating embeddings
      # NOTE: sparse=True produces sparse gradients, which only some optimizers
      # accept (see the nn.Embedding documentation).
      self.embed = nn.Embedding(num_embeddings=len_vocab,
                                embedding_dim=embedding_dim,
                                padding_idx=padding_idx,
                                scale_grad_by_freq=True,
                                sparse=True)
      self.lstm = nn.LSTM(input_size=embedding_dim,
                          hidden_size=hidden_size,
                          batch_first=True,
                          dropout=dropout,
                          bidirectional=bidirectional)
      # A bidirectional LSTM concatenates both directions in its output.
      num_directions = 2 if bidirectional else 1
      self.fc = nn.Linear(hidden_size * num_directions, len_vocab)

    def forward(self, caption_seq, features):
      """Score every vocabulary token at every step.

      Args:
        caption_seq: (batch, seq_len) tensor of token indices.
        features: (batch, embedding_dim) image features, fed to the LSTM as
          the first time step.

      Returns:
        Tensor of shape (batch, seq_len, len_vocab).
      """
      # Drop the last token: with the image features prepended, the input
      # keeps the same length as the caption.
      caption_seq = caption_seq[:, :-1]
      embeddings = self.embed(caption_seq)
      total_input = torch.cat((features.unsqueeze(1), embeddings), 1)
      lstm_out, self.hidden = self.lstm(total_input)
      outputs = self.fc(lstm_out)
      return outputs

  def __init__(self,
               len_vocab,
               embedding_dim,
               hidden_size,
               padding_idx,
               bidirectional=False,
               dropout=0,
               freeze=True):
    super().__init__()
    self.encoder = FlickrRNN.EncoderCNN(embedding_dim, do_freeze=freeze)
    self.decoder = FlickrRNN.DecoderRNN(len_vocab=len_vocab,
                                        embedding_dim=embedding_dim,
                                        hidden_size=hidden_size,
                                        padding_idx=padding_idx,
                                        bidirectional=bidirectional,
                                        dropout=dropout)

  def forward(self, images, captions):
    """Encode ``images`` and return the decoder's scores for ``captions``."""
    features = self.encoder(images)
    return self.decoder(captions, features)
    




In [20]:
# Sanity check: output size of the stock ResNet-18 classification layer.
# The model is built here so the cell does not depend on a `resnet` variable
# left over from an earlier kernel session; pretrained weights are not needed
# to inspect the layer size.
resnet = torchvision.models.resnet18()
print(resnet.fc.out_features)

1000
