# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint

## Problem Statement

Automatic Image Captioning

## Objectives
Build an image captioning model to generate captions of an image using CNN
Dataset Link: Flickr8k_dataset
Dataset description: A collection of sentence-based image description

● Dataset consists of 8k images in JPEG format with different shapes and sizes.

● Images are paired with five different captions which provide clear descriptions of the salient entities and events.

● The images were chosen from six different Flickr groups and included a variety of scenes and situations.

## Dataset

Flickr8k_datase - https://github.com/goodwillyoga/Flickr8k_dataset

## Basic Pytorch packages

**torchvision:**  This package is used to load and prepare the dataset. Using this package we can perform/apply transformations on the input data.

**transforms:**  This package is  used to perform **preprocessing on images** and operations sequentially.

**nn:**  This package provides an easy and modular way to build and train simple or complex neural networks.

**optim:** This package is used for  implementing various optimization algorithms

In [31]:
# Import Libraries
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.autograd import Variable

from PIL import Image

import pandas as pd
import numpy as np
import sys, time, os, warnings

from mpl_toolkits.mplot3d import Axes3D

from pickle import dump
from collections import OrderedDict
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn import metrics

from keras import models
from keras.applications import VGG16
#from keras.preprocessing.image import load_img, img_to_array
from keras.utils import load_img, img_to_array
from keras.applications.vgg16 import preprocess_input

from IPython import get_ipython
ipython = get_ipython()

In [4]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
ipython.magic("sx unzip -qq /content/drive/MyDrive/Dataset/Flickr8k_Dataset.zip")


[]

In [6]:
ipython.magic("sx unzip -qq /content/drive/MyDrive/Dataset/captions.txt.zip")

[]

In [19]:
# Load data
images_dir = os.listdir("/content/Flicker8k_Dataset")

images_path = './Flicker8k_Dataset/'
captions_path = './Flicker8k_Dataset/Flickr_TextData/Flickr8k.token.txt'
train_path = './Flicker8k_Dataset/Flickr_TextData/Flickr_8k.trainImages.txt'
val_path = './Flicker8k_Dataset/Flickr_TextData/Flickr_8k.devImages.txt'
test_path = './Flicker8k_Dataset/Flickr_TextData/Flickr_8k.testImages.txt'

# captions = open(captions_path, 'r').read().split("\n")
# x_train = open(train_path, 'r').read().split("\n")
# x_val = open(val_path, 'r').read().split("\n")
# x_test = open(test_path, 'r').read().split("\n")

### Defining Transformations

In [13]:
# Define transformations for the images

from numpy import expand_dims
image_size = (128,128)

# YOUR CODE HERE for defining Transformation for an image
transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.Grayscale(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

In [33]:
# YOUR CODE HERE for the DataLoader

from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder


train_data = ImageFolder('Flicker8k_Dataset/', transform=transform)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)



FileNotFoundError: ignored

## **Stage 2:** Load and Finetune a pre-trained model

Load a pretrained model and finetune the appropriate layers


Initialize the device to the available runtime type

In [21]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda")
print(device)

cuda


## load_vgg16() and run_vgg16()

In [34]:
def load_vgg16():
    """
    Load the vgg16 model
    """
    modelvgg = VGG16(include_top=True,weights=None)
    modelvgg.load_weights("/content/drive/MyDrive/Dataset/vgg16_weights_tf_dim_ordering_tf_kernels.h5")
    # Exclude the last classification layer
    modelvgg.layers.pop()
    modelvgg = models.Model(inputs=modelvgg.inputs, outputs=modelvgg.layers[-1].output)
    modelvgg.summary()
    return modelvgg

def run_vgg16(dir_Flickr_jpg):
    """
    Generate the image features (4096 elements) from the VGG16 model without the last classification layer
    """
    modelvgg = load_vgg16()
    jpgs = os.listdir(dir_Flickr_jpg)
    images = OrderedDict()
    npix = 224
    target_size = (npix,npix,3)
    data = np.zeros((len(jpgs),npix,npix,3))
    for i,name in enumerate(jpgs):
        # load an image from file
        filename = dir_Flickr_jpg + '/' + name
        image = load_img(filename, target_size=target_size)
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        nimage = preprocess_input(image)

        y_pred = modelvgg.predict(nimage.reshape( (1,) + nimage.shape[:3]))
        images[name] = y_pred.flatten()

    dump(images, open('./images.pkl', 'wb'))


In [None]:
def main():
    dir_Flickr_jpg = "./Flicker8k_Dataset/"
    run_vgg16(dir_Flickr_jpg)

    # images = pd.read_pickle('../data/images.pkl', compression='infer')
    # df_txt0 = pd.read_csv('../data/token0.txt', sep='\t')
    # fnames, dcaptions, dimages = link_text_image(df_txt0, images)

    # pca = PCA(n_components=3)
    # X_pca_3d = pca.fit_transform(dimages)
    # plot_elbow(X_pca_3d)
    # plot_pca_3d(X_pca_3d[:1000])

    # picked_pic = OrderedDict()
    # picked_pic["purple"] = [517, 644, 867, 225, 11, 128]
    # picked_pic["blue"] = [401,718,591,348,686, 47]
    # plot_pca_image(picked_pic, dir_Flickr_jpg)

if __name__ == '__main__':
    main()