In [1]:
import argparse
import sys 
import pandas as pd
import numpy as np

sys.path.append('/home/dchesakov/fairseq-image-captioning')

import data
import json
import os
import tqdm

from sacremoses import MosesTokenizer


def load_annotations(coco_dir):
    """Load the MS-COCO 2014 caption annotations of the train and val sets.

    Reads ``annotations/captions_train2014.json`` and
    ``annotations/captions_val2014.json`` below ``coco_dir`` and concatenates
    their ``'annotations'`` lists, training entries first.

    Args:
        coco_dir: Directory that contains the ``annotations`` sub-directory.

    Returns:
        One list holding the annotation entries of both files.

    Raises:
        FileNotFoundError: If one of the two annotation files is missing.
        KeyError: If a file has no top-level ``'annotations'`` key.
    """
    annotations = []

    for subset in ('train2014', 'val2014'):
        path = os.path.join(coco_dir, 'annotations', f'captions_{subset}.json')
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(path, encoding='utf-8') as f:
            annotations.extend(json.load(f)['annotations'])

    return annotations


def select_captions(annotations, image_ids):
    """Pick the captions that belong to the given images.

    Args:
        annotations: Iterable of dicts with at least the keys ``'image_id'``
            and ``'caption'``.
        image_ids: Iterable of image IDs whose captions should be kept.

    Returns:
        A pair ``(captions, caption_image_ids)`` of equally long lists in
        annotation order. Newline characters are removed from each caption.
    """

    # Membership tests against a set are constant time.
    wanted = frozenset(image_ids)

    selected = [
        (entry['caption'].replace('\n', ''), entry['image_id'])
        for entry in annotations
        if entry['image_id'] in wanted
    ]

    captions = [text for text, _ in selected]
    caption_image_ids = [owner for _, owner in selected]

    return captions, caption_image_ids


def tokenize_captions(captions, lang='en'):
    """Run every caption through a Moses tokenizer.

    Args:
        captions: Iterable of caption strings (consumed once).
        lang: Language code handed to ``MosesTokenizer``.

    Returns:
        List with one tokenized string per input caption, in input order.
    """

    moses = MosesTokenizer(lang=lang)

    tokenized = []
    for text in captions:
        tokenized.append(moses.tokenize(text, return_str=True))
    return tokenized


def write_captions(captions, filename, lowercase=True):
    """Write captions to a text file, one caption per line.

    Args:
        captions: Iterable of caption strings.
        filename: Path of the file to create or overwrite.
        lowercase: If true (default), each caption is lower-cased before it
            is written.
    """
    # Pin the encoding so non-ASCII captions are written the same way on
    # every machine instead of depending on the locale default.
    with open(filename, 'w', encoding='utf-8') as f:
        for caption in captions:
            if lowercase:
                caption = caption.lower()
            f.write(caption + '\n')


def write_image_ids(image_ids, filename):
    """Write image IDs to a text file, one ID per line.

    Args:
        image_ids: Iterable of IDs; each is formatted with ``str.format``
            semantics via an f-string.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w') as out:
        out.writelines(f'{image_id}\n' for image_id in image_ids)


def main(args):
    """Create the tokenized caption file and the image-ID file for one split.

    Reads ``args.ms_coco_dir``, ``args.split`` and ``args.output_dir``.
    Two files are written into ``args.output_dir``:
    ``<split>-captions.tok.en`` (lower-cased, tokenized captions) and
    ``<split>-ids.txt`` (the image ID of each caption, line-aligned).
    """
    os.makedirs(args.output_dir, exist_ok=True)

    # Annotations of both the MS-COCO training and validation sets.
    all_annotations = load_annotations(args.ms_coco_dir)

    # Image IDs of the requested split.
    # NOTE(review): the return type of data.read_split_image_ids is not
    # visible here; it only needs to be iterable.
    split_image_ids = data.read_split_image_ids(args.split)

    # Captions of those images plus, per caption, the ID of its image.
    raw_captions, ids_per_caption = select_captions(all_annotations, split_image_ids)

    print('Tokenize captions ...')
    tokenized = tokenize_captions(tqdm.tqdm(raw_captions))

    captions_path = os.path.join(args.output_dir, f'{args.split}-captions.tok.en')
    ids_path = os.path.join(args.output_dir, f'{args.split}-ids.txt')

    write_captions(tokenized, captions_path)
    print(f'Wrote tokenized captions to {captions_path}.')

    write_image_ids(ids_per_caption, ids_path)
    print(f'Wrote caption image IDs to {ids_path}.')

In [2]:
# Load the caption annotations (train2014 followed by val2014) into memory.
# NOTE(review): absolute, user-specific path -- consider a configurable base directory.
annotations = load_annotations('/home/dchesakov/fairseq-image-captioning/ms-coco')

[{'image_id': 318556,
  'id': 48,
  'caption': 'A very clean and well decorated empty bathroom'},
 {'image_id': 116100,
  'id': 67,
  'caption': 'A panoramic view of a kitchen and all of its appliances.'},
 {'image_id': 318556,
  'id': 126,
  'caption': 'A blue and white bathroom with butterfly themed wall tiles.'},
 {'image_id': 116100,
  'id': 148,
  'caption': 'A panoramic photo of a kitchen and dining room'},
 {'image_id': 379340,
  'id': 173,
  'caption': 'A graffiti-ed stop sign across the street from a red car '},
 {'image_id': 379340,
  'id': 188,
  'caption': 'A vandalized stop sign and a red beetle on the road'},
 {'image_id': 318556,
  'id': 219,
  'caption': 'A bathroom with a border of butterflies and blue paint on the walls above it.'},
 {'image_id': 318556,
  'id': 255,
  'caption': 'An angled view of a beautifully decorated bathroom.'},
 {'image_id': 134754,
  'id': 272,
  'caption': 'The two people are walking down the beach.'},
 {'image_id': 538480,
  'id': 288,
  'ca

In [16]:
def split_file(split,
               splits_dir='/home/dchesakov/fairseq-image-captioning/splits',
               prefix='karpathy'):
    """Return the path of the image-list file for a dataset split.

    Args:
        split: Split name inserted into the file name (e.g. ``'train'``).
        splits_dir: Directory that holds the split files. Defaults to the
            location that used to be hard-coded.
        prefix: File-name prefix that identifies the split scheme. Defaults
            to ``'karpathy'``.

    Returns:
        ``<splits_dir>/<prefix>_<split>_images.txt``.
    """
    return os.path.join(splits_dir, f'{prefix}_{split}_images.txt')


def read_split_image_ids_and_paths(split):
    """Read image IDs and image paths from the list file of a split.

    The file located by ``split_file(split)`` is parsed as a header-less,
    space-separated table.

    Returns:
        ``(image_ids, image_paths)`` as NumPy arrays, taken from the second
        and the first column of the table respectively.
    """
    table = pd.read_csv(split_file(split), sep=' ', header=None)
    ids_column = table.iloc[:, 1]
    paths_column = table.iloc[:, 0]
    return np.array(ids_column), np.array(paths_column)

In [18]:
# Only the image IDs (first element of the returned pair) of the 'train' split are needed here.
image_ids = read_split_image_ids_and_paths('train')[0]

In [19]:
# Keep the captions whose image belongs to the split, together with the image ID of each caption.
captions, caption_image_ids = select_captions(annotations, image_ids)

In [22]:
# Tokenize all selected captions; tqdm only adds a progress bar around the iteration.
# NOTE(review): this rebinds `captions` to its own tokenized form, so re-running
# the cell would tokenize already-tokenized text -- a separate name would make it idempotent.
captions = tokenize_captions(tqdm.tqdm(captions))

100%|██████████| 566747/566747 [01:12<00:00, 7793.13it/s] 


In [23]:
# Inspect the tokenized captions.
# NOTE(review): this displays the whole list (566747 entries per the progress bar
# of the tokenization cell); a slice such as captions[:10] would keep the output small.
captions

['A very clean and well decorated empty bathroom',
 'A panoramic view of a kitchen and all of its appliances .',
 'A blue and white bathroom with butterfly themed wall tiles .',
 'A panoramic photo of a kitchen and dining room',
 'A graffiti-ed stop sign across the street from a red car',
 'A vandalized stop sign and a red beetle on the road',
 'A bathroom with a border of butterflies and blue paint on the walls above it .',
 'An angled view of a beautifully decorated bathroom .',
 'The two people are walking down the beach .',
 'A sink and a toilet inside a small bathroom .',
 'An empty kitchen with white and black appliances .',
 'A white square kitchen with tile floor that needs repairs',
 'The vanity contains two sinks with a towel for each .',
 'Several metal balls sit in the sand near a group of people .',
 'Two people carrying surf boards on a beach .',
 'A kitchen with brown cabinets , tile backsplash , and grey counters .',
 'A surfer , a woman , and a child walk on the beach 

In [6]:
import argparse
import os
import torch
import tqdm
import numpy as np
import pandas as pd

import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"

from torchvision import transforms

from data import ImageDataset
from model.inception import inception_v3_base

In [7]:
def split_file(split,
               splits_dir='/home/dchesakov/fairseq-image-captioning/splits',
               prefix='mysplit'):
    """Return the path of the image-list file for a dataset split.

    Args:
        split: Split name inserted into the file name (e.g. ``'train'``).
        splits_dir: Directory that holds the split files. Defaults to the
            location that used to be hard-coded.
        prefix: File-name prefix that identifies the split scheme. Defaults
            to ``'mysplit'``.

    Returns:
        ``<splits_dir>/<prefix>_<split>_images.txt``.
    """
    return os.path.join(splits_dir, f'{prefix}_{split}_images.txt')


def read_split_image_ids_and_paths(split):
    """Read image IDs and image paths from the list file of a split.

    The file located by ``split_file(split)`` is parsed as a header-less,
    space-separated table.

    Returns:
        ``(image_ids, image_paths)`` as NumPy arrays, taken from the second
        and the first column of the table respectively.
    """
    table = pd.read_csv(split_file(split), sep=' ', header=None)
    ids_column = table.iloc[:, 1]
    paths_column = table.iloc[:, 0]
    return np.array(ids_column), np.array(paths_column)

In [None]:
def main(args):
    """Extract grid features for every image of a split and save them as .npy files.

    Reads ``args.split``, ``args.device`` (must expose ``.type``),
    ``args.batch_size`` and ``args.num_workers``. For each image one array of
    shape ``(64, 2048)`` is stored in the ``<split>-features-grid-cxr``
    directory, in a file named after the image ID.
    """
    image_ids, image_paths = read_split_image_ids_and_paths(args.split)
    image_paths = list(image_paths)

    features_dir = os.path.join('/home/dchesakov/fairseq-image-captioning/output_cxr', f'{args.split}-features-grid-cxr')
    os.makedirs(features_dir, exist_ok=True)

    inception = inception_v3_base(pretrained=True)
    inception.eval()
    inception.to(args.device)

    # Resize to 299x299, convert to a tensor, then normalize per channel.
    preprocessing = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    loader = torch.utils.data.DataLoader(
        ImageDataset(image_ids, image_paths, transform=preprocessing),
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.device.type == 'cuda',
        shuffle=False,
    )

    with torch.no_grad():
        for batch_images, batch_ids in tqdm.tqdm(loader):
            grid = inception(batch_images.to(args.device))
            # Move dim 1 to the end, then flatten to 64 positions x 2048 values
            # per image (assumes the model returns (N, 2048, 8, 8) -- TODO confirm).
            features = grid.permute(0, 2, 3, 1).view(-1, 64, 2048)
            for feature, image_id in zip(features, batch_ids):
                array = feature.cpu().numpy()
                target = os.path.join(features_dir, str(image_id.item()))
                np.save(target, array)

In [12]:
# IDs and paths of the images in the 'train' split; paths are turned into a plain list.
image_ids, image_paths = read_split_image_ids_and_paths('train')
image_paths = list(image_paths)

# NOTE: no feature output directory is created in this interactive run.

# Feature extractor in evaluation mode, placed on the GPU.
inception = inception_v3_base(pretrained=True)
inception.eval()
inception.to('cuda')

# Resize to 299x299, convert to a tensor, then normalize per channel.
transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [13]:
# Wrap the split's image IDs and paths in the project's ImageDataset, applying `transform` to the images.
# NOTE(review): what a dataset item looks like is defined in data.ImageDataset, not visible here.
dataset = ImageDataset(image_ids, image_paths, transform=transform)

In [16]:
# Batches of 8 in dataset order (shuffle=False), loaded by 2 worker processes;
# pin_memory=True asks the loader to place batches in pinned host memory.
loader = torch.utils.data.DataLoader(dataset,
                                     batch_size=8,
                                     num_workers=2,
                                     pin_memory=True,
                                     shuffle=False)

In [18]:
# Iterate over the whole loader once without using the batches
# (presumably a smoke test / timing of data loading -- no model is run here).
for imgs, ids in tqdm.tqdm(loader):
    pass

100%|██████████| 836/836 [00:44<00:00, 18.69it/s]
