In [1]:
import pandas as pd
from train import train_captioning_model
from model_architecture import CNNtoRNN
from gradcam import GradCAM
import os
from model import ConvNet
from PIL import Image
import torch
import numpy as np
import torch.nn.functional as F
from utils import visualize_cam, Normalize, print_examples
from torchvision.utils import make_grid, save_image
from datetime import datetime
from get_loader import get_loader
import torchvision.transforms as transforms
import torch.nn as nn
import torchvision.models as models

In [2]:
preprocessing = transforms.Compose([
            transforms.Resize((356,356)),
            transforms.RandomCrop((299,299)),
            transforms.ToTensor(),
            transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
        ])

train_loader, val_loader, dataset = get_loader(
        root_folder="flickr8k/Images",
        annotation_file="flickr8k/captions.txt",
        transform = preprocessing,
        num_workers=1,
    )

404
40051
404


In [9]:
def saliency_and_caption(file_names, model,filename):
    salient_image_addresses = []
    predicted_captions = []
    for file_name in file_names:
        image_path = "images\\" + file_name
        image = Image.open(image_path)

        transform = transforms.Compose([
            transforms.Resize((356, 356)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        test_img = transform(image.convert("RGB")).unsqueeze(0) #
        predicted_caption = model.caption_image(test_img.cuda(), dataset.vocab)
        predicted_captions.append(predicted_caption)
    
        for name, param in model.CNN.CNNArchitecture.named_parameters():
            param.requires_grad = True
    
        normalizer = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        torch_img = torch.from_numpy(np.asarray(image)).permute(2, 0, 1).unsqueeze(0).float().div(255).cuda()
        torch_img = F.upsample(torch_img, size=(224, 224), mode='bilinear', align_corners=False)
        norm_torch_img = normalizer(torch_img)

        images = []

        for layer,_ in model.CNN.CNNArchitecture.named_parameters():
            if 'bias' not in layer and 'classifier' not in layer:
                layer_name_list = layer.split('.')
                layer_name = layer_name_list[0]+'_'+layer_name_list[1]
                cam_dict = dict()
                model_dict = dict(type='alexnet',arch=model.CNN.CNNArchitecture.cuda(), layer_name=layer_name,input_size=(224,224))
                model_GradCAM = GradCAM(model_dict, True)
                mask , logit = model_GradCAM(torch_img)
                mask = mask.cpu()
                heatmap_t, result_t = visualize_cam(mask,torch_img)
                layer_Image = torch.stack([torch_img.squeeze().cpu(), heatmap_t, result_t], 0)
                images.append(layer_Image)
        images = make_grid(torch.cat(images, 0), nrow=3)
        now = datetime.now()
        dt_string = now.strftime("%d_%m_%Y_%H_%M_%S_")

        output_dir = 'outputs'
        os.makedirs(output_dir, exist_ok=True)
        output_name = dt_string + file_name
        output_path = os.path.join(output_dir,output_name)
        save_image(images, output_path)

        salient_image_addresses.append(output_path)
    name_dict = {
            'captions': predicted_captions,
            'images': salient_image_addresses
          }

    df = pd.DataFrame(name_dict)
    df.to_csv(filename)

In [4]:
test_images = ["girl_car.jpg","girl_water.jpg","car.jpg","dog.jpg","man_fish.jpg"]

FULL NETWORK TRAINING (5 conv layers)

In [5]:
embed_size = 512 #
num_epochs = 100
visual_model = models.alexnet(pretrained=True) #pretrained

In [6]:
visual_model.classifier = nn.Sequential(nn.Linear(9216,embed_size))

for name,param in visual_model.named_parameters():
    if 'classifier' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [7]:
alexnet_full_model = train_captioning_model(embed_size,visual_model,num_epochs, load_model=False, save_model=False)

404
40051
404


                                                                                                                       

Epoch : 0 Loss : 3.0358011722564697 Step : 313 Train Accuracy : 61.737179098738125


                                                                                                                       

Average loss on Validation Set : 2.6955652236938477 Average Valid Accuracy : 0.9251691997051239


In [8]:
filename="full_model_result.csv"
saliency_and_caption(test_images,alexnet_full_model,filename)



4 LAYERS

In [31]:
visual_model.classifier = nn.Sequential(nn.Linear(9216,embed_size))
visual_model.features = nn.Sequential(*[visual_model.features[i] for i in range(10)])

for name,param in visual_model.named_parameters():
    if 'classifier' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [32]:
alexnet_model_4 = train_captioning_model(embed_size,visual_model,num_epochs, load_model=False, save_model=False)

404
40051
404


                                                                                                                       

Epoch : 0 Loss : 2.939314126968384 Step : 313 Train Accuracy : 62.52790745749371


                                                                                                                       

Average loss on Validation Set : 3.0162580013275146 Average Valid Accuracy : 0.9823007434606552


                                                                                                                       

Epoch : 1 Loss : 2.711622714996338 Step : 626 Train Accuracy : 69.66637614369392
Average loss on Validation Set : 3.0162580013275146 Average Valid Accuracy : 0.0


                                                                                                                       

Epoch : 2 Loss : 2.689408779144287 Step : 939 Train Accuracy : 72.21162861585617


                                                                                                                       

Average loss on Validation Set : 2.6272077560424805 Average Valid Accuracy : 1.0954109132289886


                                                                                                                       

Epoch : 3 Loss : 2.553788185119629 Step : 1252 Train Accuracy : 74.04748938977718
Average loss on Validation Set : 2.6272077560424805 Average Valid Accuracy : 0.0


                                                                                                                       

Epoch : 4 Loss : 2.382962942123413 Step : 1565 Train Accuracy : 76.12488314509392


                                                                                                                       

Average loss on Validation Set : 2.2935900688171387 Average Valid Accuracy : 1.1930032968521118




In [33]:
filename="4_layers_result.csv"
saliency_and_caption(test_images,alexnet_model_4,filename)
# predicted_captions_4, layered_saliency_images_4 = saliency_and_caption(test_images,alexnet_model_4)

3 CONV LAYERS

In [38]:
visual_model.classifier = nn.Sequential(nn.Linear(13824,embed_size))
visual_model.features = nn.Sequential(*[visual_model.features[i] for i in range(8)])

for name,param in visual_model.named_parameters():
    if 'classifier' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [39]:
alexnet_model_3 = train_captioning_model(embed_size,visual_model,num_epochs, load_model=False, save_model=False)

404
40051
404


                                                                                                                       

Epoch : 0 Loss : 2.991016387939453 Step : 313 Train Accuracy : 61.95760487020016


                                                                                                                       

Average loss on Validation Set : 2.9707908630371094 Average Valid Accuracy : 0.8943258076906204


                                                                                                                       

Epoch : 1 Loss : 2.781543254852295 Step : 626 Train Accuracy : 69.2215428352356
Average loss on Validation Set : 2.9707908630371094 Average Valid Accuracy : 0.0


                                                                                                                       

Epoch : 2 Loss : 2.6379730701446533 Step : 939 Train Accuracy : 71.62228222191334


                                                                                                                       

Average loss on Validation Set : 2.6216704845428467 Average Valid Accuracy : 1.0958247482776642


                                                                                                                       

Epoch : 3 Loss : 2.523810863494873 Step : 1252 Train Accuracy : 74.18602454662323
Average loss on Validation Set : 2.6216704845428467 Average Valid Accuracy : 0.0


                                                                                                                       

Epoch : 4 Loss : 2.4719078540802 Step : 1565 Train Accuracy : 75.81526413559914


                                                                                                                       

Average loss on Validation Set : 2.2977919578552246 Average Valid Accuracy : 1.0781202614307404




In [41]:
filename="3_layers_result.csv"
saliency_and_caption(test_images,alexnet_model_3,filename)
# predicted_captions_3, layered_saliency_images_3 = saliency_and_caption(test_images,alexnet_model_3)

2 LAYERS

In [70]:
visual_model.classifier = nn.Sequential(nn.Linear(6912,embed_size))
visual_model.features = nn.Sequential(*[visual_model.features[i] for i in range(6)])

for name,param in visual_model.named_parameters():
    if 'classifier' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [71]:
alexnet_model_2 = train_captioning_model(embed_size,visual_model,num_epochs, load_model=False, save_model=False)

404
40051
404


                                                                                                                       

Epoch : 0 Loss : 3.091157913208008 Step : 313 Train Accuracy : 61.924809485673904


                                                                                                                       

Average loss on Validation Set : 3.073967218399048 Average Valid Accuracy : 0.8495033532381058


                                                                                                                       

Epoch : 1 Loss : 2.7200350761413574 Step : 626 Train Accuracy : 69.27134282886982
Average loss on Validation Set : 3.073967218399048 Average Valid Accuracy : 0.0


                                                                                                                       

Epoch : 2 Loss : 2.7014474868774414 Step : 939 Train Accuracy : 71.72322028875351


                                                                                                                       

Average loss on Validation Set : 2.72670841217041 Average Valid Accuracy : 0.8615242838859558


                                                                                                                       

Epoch : 3 Loss : 2.462919235229492 Step : 1252 Train Accuracy : 73.4545606225729
Average loss on Validation Set : 2.72670841217041 Average Valid Accuracy : 0.0


                                                                                                                       

Epoch : 4 Loss : 2.3691978454589844 Step : 1565 Train Accuracy : 74.84238375723362


                                                                                                                       

Average loss on Validation Set : 2.6706299781799316 Average Valid Accuracy : 0.954964205622673




In [72]:
filename="2_layers_result.csv"
saliency_and_caption(test_images,alexnet_model_2,filename)
# predicted_captions_2, layered_saliency_images_2 = saliency_and_caption(test_images,alexnet_model_2)

1 LAYER

In [61]:
visual_model.classifier = nn.Sequential(nn.Linear(2304,embed_size))
visual_model.features = nn.Sequential(*[visual_model.features[i] for i in range(3)])

for name,param in visual_model.named_parameters():
    if 'classifier' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

alexnet_model_1 = train_captioning_model(embed_size,visual_model,num_epochs, load_model=False, save_model=False)

404
40051
404


                                                                                                                       

Epoch : 0 Loss : 3.1294779777526855 Step : 313 Train Accuracy : 61.829709273763


                                                                                                                       

Average loss on Validation Set : 3.1412336826324463 Average Valid Accuracy : 0.8848900347948074


                                                                                                                       

Epoch : 1 Loss : 2.8299026489257812 Step : 626 Train Accuracy : 69.11363899707794
Average loss on Validation Set : 3.1412336826324463 Average Valid Accuracy : 0.0


                                                                                                                       

Epoch : 2 Loss : 2.6615285873413086 Step : 939 Train Accuracy : 71.21358519792557


                                                                                                                       

Average loss on Validation Set : 2.723855972290039 Average Valid Accuracy : 0.930815264582634


                                                                                                                       

Epoch : 3 Loss : 2.69767165184021 Step : 1252 Train Accuracy : 72.78073942661285
Average loss on Validation Set : 2.723855972290039 Average Valid Accuracy : 0.0


                                                                                                                       

Epoch : 4 Loss : 2.5773074626922607 Step : 1565 Train Accuracy : 73.76449097692966


                                                                                                                       

Average loss on Validation Set : 2.868346929550171 Average Valid Accuracy : 0.9269476234912872


In [9]:
import torchvision.models as models
from torchsummary import summary
resnet = models.resnet152(pretrained=True)
for param in resnet.parameters():
    param.requires_grad_(False)
print(resnet.fc.in_features)
print(summary(resnet.fc))
# self.resnet = nn.Sequential(*modules)
# self.embed = nn.Linear(resnet.fc.in_features, embed_size)
# self.init_weights()
# predicted_captions_1, layered_saliency_images_1 = saliency_and_caption(test_images,alexnet_model_1)

2048
Layer (type:depth-idx)                   Param #
└─Linear: 0-1                            (2,049,000)
Total params: 2,049,000
Trainable params: 0
Non-trainable params: 2,049,000
Layer (type:depth-idx)                   Param #
└─Linear: 0-1                            (2,049,000)
Total params: 2,049,000
Trainable params: 0
Non-trainable params: 2,049,000
