In [2]:
from train import train_captioning_model
from model_architecture import CNNtoRNN
from gradcam import GradCAM
import os
from model import ConvNet
from PIL import Image
import torch
import numpy as np
import torch.nn.functional as F
from utils import visualize_cam, Normalize, beam_search, greedy_search, list_to_string, pre_process
from torchvision.utils import make_grid, save_image
from datetime import datetime
from get_loader import get_loader
import torchvision.transforms as transforms
import torch.nn as nn
import torchvision.models as models

In [3]:
# Image pipeline handed to the data loader: resize to 356x356, take a random
# 299x299 crop, convert to a tensor, then normalise every channel with
# mean 0.5 and std 0.5.
resize_step = transforms.Resize((356, 356))
crop_step = transforms.RandomCrop((299, 299))
tensor_step = transforms.ToTensor()
normalize_step = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
preprocessing = transforms.Compose([resize_step, crop_step, tensor_step, normalize_step])

# Build the Flickr8k loader; `dataset` is reused by the caption-search cells below.
loader_options = dict(
    root_folder="flickr8k/Images",
    annotation_file="flickr8k/captions.txt",
    transform=preprocessing,
    num_workers=1,
)
train_loader, dataset = get_loader(**loader_options)

In [4]:
# Hyper-parameters for the captioning run.
embed_size = 512
num_epochs = 20

# Start from AlexNet with pretrained weights and replace its whole classifier
# head with a single linear projection down to `embed_size`.
# 9216 = 256 channels * 6 * 6, i.e. the flattened output of the
# AdaptiveAvgPool2d((6, 6)) stage shown in the printed architecture.
visual_model = models.alexnet(pretrained=True)
projection_head = nn.Linear(9216, embed_size)
visual_model.classifier = nn.Sequential(projection_head)
print(visual_model)

# Train the captioning model on top of this visual encoder.
alexnet_model = train_captioning_model(visual_model, num_epochs)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Linear(in_features=9216, out_features=512, bias=True)
  )
)
cuda
5168


RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR

In [None]:
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
%matplotlib inline

images = ["newton.jpg","boy.jpg","car.jpg","girl_water.jpg","dog.jpg","couple.jpg","car_person.jpg","girl_car.jpg","cat.jpg"]
images = ["girl_bag.jpg"]
for img in images:
    plt.figure()
    image_path = "images\\" + img
    image = Image.open(image_path)
    processed_image = pre_process(image)
    print("Current File : " + img)
    greedy_caption = greedy_search(alexnet_model, dataset, processed_image)
    beam_search_caption = beam_search(alexnet_model, dataset, processed_image, 3)
    if greedy_caption != None and beam_search_caption != None:
        greedy_caption = List_to_string(greedy_caption)
        beam_search_caption = List_to_string(beam_search_caption)
        plt.imshow(image)
        title = "GREEDY CAPTION: " + greedy_caption + "\n" + "BEAM_SEARCH CAPTION: " + beam_search_caption
        print(title)
        plt.figtext(0.5, 0.01, title, wrap=True, horizontalalignment='center', fontsize=6)
        plt.savefig("output_"+img,dpi=300)

In [None]:
# Load the car photo and convert it into a 1 x C x H x W float tensor in [0, 1]
# on the GPU (assumes the file decodes to an H x W x C array, e.g. RGB).
car_image = Image.open(os.path.join('images', 'car.jpg'))
car_torch_img = torch.from_numpy(np.asarray(car_image)).permute(2, 0, 1).unsqueeze(0).float().div(255).cuda()
# F.upsample is deprecated; F.interpolate is the documented equivalent.
car_torch_img = F.interpolate(car_torch_img, size=(224, 224), mode='bilinear', align_corners=False)
# NOTE(review): `normalizer` is not defined in any visible cell (only the
# `Normalize` class is imported from utils) - it must be created before this
# line on a fresh kernel; confirm the intended mean/std.
car_norm_torch_img = normalizer(car_torch_img)
# Display the original PIL image as the cell output.
car_image

In [None]:
# NOTE(review): `model` and `airplane_torch_img` are not defined in any visible
# cell; on a fresh kernel they must be created first (presumably the trained
# captioning model and a 1 x 3 x 224 x 224 image tensor - confirm).

# Make every parameter trainable so gradients are available for Grad-CAM.
for name, param in model.named_parameters():
    print(name)
    param.requires_grad = True

# Move the CNN backbone to the GPU once instead of on every loop iteration
# (assumes it is an nn.Module, whose .cuda() returns the module itself).
cnn_arch = model.CNN.CNNArchitecture.cuda()

airplane_images = []
seen_layer_names = set()
for layer, _ in cnn_arch.named_parameters():
    print(layer)
    # Parameter names look like "<block>.<index>.<weight|bias>"; Grad-CAM is
    # addressed by "<block>_<index>".
    layer_name_list = layer.split('.')
    layer_name = layer_name_list[0]+'_'+layer_name_list[1]
    print(layer_name)
    # The weight and the bias of one layer map to the same layer_name; compute
    # and append the CAM only once per layer instead of once per parameter.
    if layer_name in seen_layer_names:
        continue
    seen_layer_names.add(layer_name)
    model_dict = dict(type='alexnet',arch=cnn_arch, layer_name=layer_name,input_size=(224,224))
    model_GradCAM = GradCAM(model_dict, True)
    mask , logit = model_GradCAM(airplane_torch_img)
    mask = mask.cpu()
    heatmap_t, result_t = visualize_cam(mask, airplane_torch_img)
    # One row per layer: input image, heatmap, heatmap overlaid on the image.
    layer_Image = torch.stack([airplane_torch_img.squeeze().cpu(), heatmap_t, result_t], 0)
    airplane_images.append(layer_Image)
# Tile all rows into a single image grid, three images per row.
airplane_images = make_grid(torch.cat(airplane_images, 0), nrow=3)

In [None]:
# Timestamp prefix (day_month_year_hour_minute_second_) so repeated runs do not
# overwrite earlier outputs.
now = datetime.now()
dt_string = now.strftime("%d_%m_%Y_%H_%M_%S_")
print("date and time =", dt_string)

# Compose the destination path inside the output folder.
output_dir = 'outputs'
output_name = f"{dt_string}car_person.jpg"
output_path = os.path.join(output_dir, output_name)
# print(images)

# Create the folder if needed, write the Grad-CAM grid, then reopen the saved
# file so it is displayed as the cell output.
os.makedirs(output_dir, exist_ok=True)
save_image(airplane_images, output_path)
Image.open(output_path)