In [9]:
import PIL
import os
import numpy as np
import h5py
import json
import torch
from scipy.misc import imread, imresize
from tqdm import tqdm, tqdm_notebook
from collections import Counter
from random import seed, choice, sample
import pandas as pd
from sklearn.model_selection import train_test_split
from models import Encoder, DecoderWithAttention
from pathlib import Path

In [15]:
# Working directory of the notebook; sibling data folders are resolved relative to its parent.
curr_path = Path.cwd()

'/home/dchesakov/skoltech_image_cap/NLMCXR_data/all_reports_tags_df.pkl'

In [16]:
# Load the preprocessed report table from the NLMCXR_data folder that sits next to the
# notebook's working directory (`curr_path`; the previously used name `path` is never defined).
all_reports_df = pd.read_pickle(os.path.join(str(curr_path.parent), 'NLMCXR_data/all_reports_tags_df.pkl'))

In [17]:
# (number of rows, number of columns) of the report table
all_reports_df.shape

(7430, 10)

In [18]:
'''
Table with the preprocessed data: `images` holds the image file name, and
`preprocessed_full_report` holds the tokenized report, normalized to the required form.
'''
all_reports_df.head(2)

Unnamed: 0,images,FINDINGS,IMPRESSION,tags_major,tags_automatic,report,full_report,processed_captions,processed_findings,preprocessed_full_report
0,CXR2357_IM-0921-1001,Cardiomediastinal silhouette and pulmonary vas...,No acute cardiopulmonary findings.,[normal],[],./reports/ecgen-radiology/2357.xml,No acute cardiopulmonary findings. Cardiomedia...,"[no acute cardiopulmonary findings, cardiomedi...",[cardiomediastinal silhouette and pulmonary va...,no acute cardiopulmonary findings . cardiomedi...
1,CXR2357_IM-0921-2001,Cardiomediastinal silhouette and pulmonary vas...,No acute cardiopulmonary findings.,[normal],[],./reports/ecgen-radiology/2357.xml,No acute cardiopulmonary findings. Cardiomedia...,"[no acute cardiopulmonary findings, cardiomedi...",[cardiomediastinal silhouette and pulmonary va...,no acute cardiopulmonary findings . cardiomedi...


In [21]:
# Data folder next to the notebook's working directory (`curr_path`; the previously used
# name `path` is never defined at notebook level).
NLMCXR_path = os.path.join(str(curr_path.parent), 'NLMCXR_data')
NLMCXR_path

'/home/dchesakov/skoltech_image_cap/NLMCXR_data'

In [24]:
# Function that creates the dataset files in the required format

def create_input_files(captions_per_image=1, min_word_freq=6, output_folder=NLMCXR_path,
                       max_len=100, version='10'):
    """
    Creates input files for training, validation, and test data.

    Reads ``all_reports_tags_df.pkl``, ``images/<name>.png`` and ``embeds/glove.6B.300d.txt``
    from ``output_folder`` and writes into the same folder:

    * ``embeds/embeddings_glove_v{version}.pt`` -- embedding matrix aligned with the word map,
    * ``WORDMAP_*.json`` -- word -> index map,
    * per split (TRAIN / VAL / TEST): ``*_IMAGES_*.hdf5``, ``*_CAPTIONS_*.json``, ``*_CAPLENS_*.json``.

    Existing output files with the same name are overwritten.

    :param captions_per_image: number of captions to sample per image
    :param min_word_freq: only words occurring MORE often than this are kept; all other words
        (and words without a GloVe vector) are encoded as <unk>
    :param output_folder: folder with the input data; output files are saved here as well
    :param max_len: captions are truncated to at most this many tokens
    :param version: which version to use (needed to create the name of file)
    :raises ValueError: if no vocabulary word has a vector in the GloVe file
    """

    all_reports_df = pd.read_pickle(f'{output_folder}/all_reports_tags_df.pkl')

    # The rows of the table are already shuffled, so the split is purely positional:
    # the last 300 rows are test, the 300 rows before them validation, the rest train.
    split_frames = [('TRAIN', all_reports_df[:-600]),
                    ('VAL', all_reports_df[-600:-300]),
                    ('TEST', all_reports_df[-300:])]

    # Image paths and captions for each split
    split_data = []
    for split, frame in split_frames:
        image_paths = [f'{output_folder}/images/' + img_name + '.png' for img_name in frame['images']]
        # One caption per image: the whitespace-tokenized report, truncated to max_len tokens
        image_captions = [[cap.split()[:max_len]] for cap in frame['preprocessed_full_report']]
        # Sanity check
        assert len(image_paths) == len(image_captions)
        split_data.append((image_paths, image_captions, split))

    # Word frequencies are counted over the reports of all three splits
    word_freq = Counter()
    for report in all_reports_df['preprocessed_full_report']:
        word_freq.update(report.split())

    # Candidate vocabulary, in first-seen order (Counter preserves insertion order)
    words = [w for w in word_freq if word_freq[w] > min_word_freq]

    def load_embeddings(emb_path, vocab):
        """
        Load the vectors of the ``vocab`` words from a GloVe-style text file.

        :param emb_path: text file with one ``word v1 v2 ...`` entry per line
        :param vocab: ordered iterable of candidate words
        :return: (embedding tensor, word -> index map); row ``i`` of the tensor belongs to the
            word with index ``i``. Words without a vector are left out of the map.
        """
        wanted = set(vocab)
        clf_embeddings = {}
        # Explicit encoding so the result does not depend on the locale; `with` closes the file
        with open(emb_path, encoding='utf-8') as emb_file:
            for line in emb_file:
                parts = line.strip('\n').split()
                # Skip blank lines and words we do not need before doing any float parsing
                if not parts or parts[0] not in wanted:
                    continue
                clf_embeddings[parts[0]] = [float(e) for e in parts[1:]]

        if not clf_embeddings:
            raise ValueError(f'None of the vocabulary words has an embedding in {emb_path}')

        # Number the words in vocabulary order (not via a set, whose iteration order changes
        # between interpreter runs) so that the saved word map is reproducible.
        known_words = [w for w in vocab if w in clf_embeddings]
        word2idx = {w: idx for idx, w in enumerate(known_words, start=1)}
        max_val = len(known_words)

        # Special tokens: padding is index 0, the others follow the regular words
        word2idx['<unk>'] = max_val + 1
        word2idx['<end>'] = max_val + 2
        word2idx['<start>'] = max_val + 3
        word2idx['<pad>'] = 0

        # Special tokens get all-zero vectors of the same dimensionality
        emb_dim = len(next(iter(clf_embeddings.values())))
        for token in ('<unk>', '<end>', '<start>', '<pad>'):
            clf_embeddings[token] = [0.0] * emb_dim

        rows = [None] * len(word2idx)
        for w, idx in word2idx.items():
            rows[idx] = clf_embeddings[w]
        return torch.Tensor(rows), word2idx

    # Build the word map and the matching embedding matrix
    embeddings, word_map = load_embeddings(f'{output_folder}/embeds/glove.6B.300d.txt', words)
    # Saving the embeddings is only needed if they are going to be used later
    torch.save(embeddings, f"{output_folder}/embeds/embeddings_glove_v{version}.pt")

    # Create a base/root name for all output files
    base_filename = f'NLMCXR_v{version}' + '_' + str(captions_per_image) + '_cap_per_img_' + str(min_word_freq) + '_min_word_freq'

    # Save word map to a JSON
    with open(os.path.join(output_folder, 'WORDMAP_' + base_filename + '.json'), 'w') as j:
        json.dump(word_map, j)

    # Sample captions for each image, save images to HDF5 file, and captions and their lengths to JSON files
    seed(123)

    for impaths, imcaps, split in split_data:

        # 'w' (not 'a'): with an existing file, create_dataset('images', ...) would fail because
        # the dataset name is already taken, which made the function impossible to re-run.
        with h5py.File(os.path.join(output_folder, split + '_IMAGES_' + base_filename + '.hdf5'), 'w') as h:
            # Make a note of the number of captions we are sampling per image
            h.attrs['captions_per_image'] = captions_per_image

            # Create dataset inside HDF5 file to store images
            images = h.create_dataset('images', (len(impaths), 3, 256, 256), dtype='uint8')

            print("\nReading %s images and captions, storing to file...\n" % split)

            enc_captions = []
            caplens = []

            for i, path in enumerate(tqdm_notebook(impaths)):
                image_caps = imcaps[i]

                # Sample captions: pad with random repeats if there are too few, subsample otherwise
                if len(image_caps) < captions_per_image:
                    captions = image_caps + [choice(image_caps)
                                             for _ in range(captions_per_image - len(image_caps))]
                else:
                    captions = sample(image_caps, k=captions_per_image)

                # Sanity check
                assert len(captions) == captions_per_image

                # Read images (scipy.misc.imread / imresize are deprecated since SciPy 1.0.0)
                img = imread(path)
                if len(img.shape) == 2:
                    # Grayscale image: replicate the single channel three times
                    img = img[:, :, np.newaxis]
                    img = np.concatenate([img, img, img], axis=2)
                img = imresize(img, (256, 256))
                img = img.transpose(2, 0, 1)
                assert img.shape == (3, 256, 256)
                assert np.max(img) <= 255

                # Save image to HDF5 file
                images[i] = img

                for c in captions:
                    # Encode captions: <start> tokens <end>, padded to a fixed length of max_len + 2
                    enc_c = ([word_map['<start>']]
                             + [word_map.get(word, word_map['<unk>']) for word in c]
                             + [word_map['<end>']]
                             + [word_map['<pad>']] * (max_len - len(c)))

                    enc_captions.append(enc_c)
                    # Caption length counts the <start> and <end> tokens but not the padding
                    caplens.append(len(c) + 2)

            # Sanity check
            assert images.shape[0] * captions_per_image == len(enc_captions) == len(caplens)

            # Save encoded captions and their lengths to JSON files
            with open(os.path.join(output_folder, split + '_CAPTIONS_' + base_filename + '.json'), 'w') as j:
                json.dump(enc_captions, j)

            with open(os.path.join(output_folder, split + '_CAPLENS_' + base_filename + '.json'), 'w') as j:
                json.dump(caplens, j)

In [25]:
# Build the GloVe-based dataset files with the default settings
# (captions_per_image=1, min_word_freq=6, max_len=100, version='10').
create_input_files()


Reading TRAIN images and captions, storing to file...



HBox(children=(IntProgress(value=0, max=6830), HTML(value='')))

`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.



Reading VAL images and captions, storing to file...



HBox(children=(IntProgress(value=0, max=300), HTML(value='')))


Reading TEST images and captions, storing to file...



HBox(children=(IntProgress(value=0, max=300), HTML(value='')))

## Create Data Set with GPT-2 tokenizer

In [2]:
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, get_constant_schedule, get_cosine_schedule_with_warmup,
                                  BertConfig, BertForMaskedLM, BertTokenizer,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)

In [3]:
from transformers import GPT2Tokenizer, GPT2Model

import os
import numpy as np

# Restrict this process to a single GPU: devices are numbered in PCI bus order and only
# device "2" is made visible. NOTE(review): the GPU index is machine-specific -- confirm before reuse.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

import torch

In [4]:
# Load the already fine-tuned GPT-2 tokenizer and model

In [5]:
# Tokenizer files (vocab.json, merges.txt, ...) are read from this local directory.
# NOTE(review): absolute, machine-specific path.
tokenizer = GPT2Tokenizer.from_pretrained('/home/dchesakov/transformers/output3')

I0515 14:36:41.382153 140352421476160 tokenization_utils.py:895] Model name '/home/dchesakov/transformers/output3' not found in model shortcut name list (gpt2, gpt2-medium, gpt2-large, gpt2-xl, distilgpt2). Assuming '/home/dchesakov/transformers/output3' is a path, a model identifier, or url to a directory containing tokenizer files.
I0515 14:36:41.383839 140352421476160 tokenization_utils.py:924] Didn't find file /home/dchesakov/transformers/output3/added_tokens.json. We won't load it.
I0515 14:36:41.384978 140352421476160 tokenization_utils.py:977] loading file /home/dchesakov/transformers/output3/vocab.json
I0515 14:36:41.385739 140352421476160 tokenization_utils.py:977] loading file /home/dchesakov/transformers/output3/merges.txt
I0515 14:36:41.386518 140352421476160 tokenization_utils.py:977] loading file None
I0515 14:36:41.387429 140352421476160 tokenization_utils.py:977] loading file /home/dchesakov/transformers/output3/special_tokens_map.json
I0515 14:36:41.388145 140352421476

In [6]:
# Load the language model from the same directory as the tokenizer; the tokenizer's EOS id is
# reused as the padding id (both are 50256 in the logged config).
gpt_model = GPT2LMHeadModel.from_pretrained('/home/dchesakov/transformers/output3', pad_token_id=tokenizer.eos_token_id)

I0515 14:36:41.844866 140352421476160 configuration_utils.py:284] loading configuration file /home/dchesakov/transformers/output3/config.json
I0515 14:36:41.846754 140352421476160 configuration_utils.py:322] Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "vocab_size": 50257
}

I0515 14:36:41.848603 140352421476160 modeling_utils.py:610] loading weights file /home/dchesakov/transformers/output3/pytorch_model.bin


In [7]:
# Shape of the model's token-embedding matrix
gpt_model.transformer.wte.weight.shape

torch.Size([50257, 768])

In [8]:
# Number of entries in the tokenizer vocabulary (should match the first embedding dimension)
len(tokenizer.get_vocab())

50257

In [9]:
# Main difference: we use the tokenizer of the GPT-2 model and
# do not build embeddings; we only add the required special tokens to the word map,
# because pretrained embeddings such as GloVe do not work here.
# The embeddings of the fine-tuned GPT-2 can be used instead (gpt_model.transformer.wte.weight.shape)

def create_input_files_gpt(captions_per_image=1, min_word_freq=6, output_folder='/home/dchesakov/NLMCXR_data',
                       max_len=100, version='gpt_cross'):
    """
    Creates input files for training, validation, and test data, tokenized with the GPT-2 tokenizer.

    Uses the notebook-level ``tokenizer`` to split the reports into tokens. Reads
    ``all_reports_tags_df.pkl`` and ``images/<name>.png`` from ``output_folder`` and writes into
    the same folder:

    * ``WORDMAP_*.json`` -- token -> index map (no embedding matrix is produced),
    * per split (TRAIN / VAL / TEST): ``*_IMAGES_*.hdf5``, ``*_CAPTIONS_*.json``, ``*_CAPLENS_*.json``.

    Existing output files with the same name are overwritten.

    :param captions_per_image: number of captions to sample per image
    :param min_word_freq: only tokens occurring MORE often than this are kept; all other tokens
        are encoded as <unk>
    :param output_folder: folder with the input data; output files are saved here as well
    :param max_len: captions are truncated to at most this many tokens
    :param version: which version to use (needed to create the name of file)
    """

    # Inputs are resolved relative to output_folder (previously hard-coded to the default folder,
    # so passing another folder silently mixed two locations).
    all_reports_df = pd.read_pickle(os.path.join(output_folder, 'all_reports_tags_df.pkl'))

    # Tokenize every report exactly once; the tokens feed both the captions and the frequency count
    tokenized_reports = [tokenizer.tokenize(report) for report in all_reports_df['preprocessed_full_report']]

    # Read image paths and captions for each image (one caption per image, truncated to max_len)
    image_paths = [os.path.join(output_folder, 'images', img_name + '.png')
                   for img_name in all_reports_df['images']]
    image_captions = [[tokens[:max_len]] for tokens in tokenized_reports]

    # Sanity check
    assert len(image_paths) == len(image_captions)

    # Positional split: the last 300 rows are test, the 300 rows before them validation, the rest train
    split_data = [(image_paths[:-600], image_captions[:-600], 'TRAIN'),
                  (image_paths[-600:-300], image_captions[-600:-300], 'VAL'),
                  (image_paths[-300:], image_captions[-300:], 'TEST')]

    # Token frequencies are counted over the reports of all three splits
    word_freq = Counter()
    for tokens in tokenized_reports:
        word_freq.update(tokens)

    # Regular tokens get indices 1..N in first-seen order; padding is 0, the other special
    # tokens follow the regular ones. len(words) equals the largest regular index and, unlike
    # max(), also works when no token passes the frequency threshold.
    words = [w for w in word_freq if word_freq[w] > min_word_freq]
    word_map = {word: idx for idx, word in enumerate(words, start=1)}
    max_val = len(words)
    word_map['<unk>'] = max_val + 1
    word_map['<end>'] = max_val + 2
    word_map['<start>'] = max_val + 3
    word_map['<pad>'] = 0

    # Create a base/root name for all output files
    base_filename = f'NLMCXR_v{version}' + '_' + str(captions_per_image) + '_cap_per_img_' + str(min_word_freq) + '_min_word_freq'

    # Save word map to a JSON
    with open(os.path.join(output_folder, 'WORDMAP_' + base_filename + '.json'), 'w') as j:
        json.dump(word_map, j)

    # Sample captions for each image, save images to HDF5 file, and captions and their lengths to JSON files
    seed(123)

    for impaths, imcaps, split in split_data:

        # 'w' (not 'a'): with an existing file, create_dataset('images', ...) would fail because
        # the dataset name is already taken, which made the function impossible to re-run.
        with h5py.File(os.path.join(output_folder, split + '_IMAGES_' + base_filename + '.hdf5'), 'w') as h:
            # Make a note of the number of captions we are sampling per image
            h.attrs['captions_per_image'] = captions_per_image

            # Create dataset inside HDF5 file to store images
            images = h.create_dataset('images', (len(impaths), 3, 256, 256), dtype='uint8')

            print("\nReading %s images and captions, storing to file...\n" % split)

            enc_captions = []
            caplens = []

            for i, path in enumerate(tqdm_notebook(impaths)):
                image_caps = imcaps[i]

                # Sample captions: pad with random repeats if there are too few, subsample otherwise
                if len(image_caps) < captions_per_image:
                    captions = image_caps + [choice(image_caps)
                                             for _ in range(captions_per_image - len(image_caps))]
                else:
                    captions = sample(image_caps, k=captions_per_image)

                # Sanity check
                assert len(captions) == captions_per_image

                # Read images (scipy.misc.imread / imresize are deprecated since SciPy 1.0.0)
                img = imread(path)
                if len(img.shape) == 2:
                    # Grayscale image: replicate the single channel three times
                    img = img[:, :, np.newaxis]
                    img = np.concatenate([img, img, img], axis=2)
                img = imresize(img, (256, 256))
                img = img.transpose(2, 0, 1)
                assert img.shape == (3, 256, 256)
                assert np.max(img) <= 255

                # Save image to HDF5 file
                images[i] = img

                for c in captions:
                    # Encode captions: <start> tokens <end>, padded to a fixed length of max_len + 2
                    enc_c = ([word_map['<start>']]
                             + [word_map.get(word, word_map['<unk>']) for word in c]
                             + [word_map['<end>']]
                             + [word_map['<pad>']] * (max_len - len(c)))

                    enc_captions.append(enc_c)
                    # Caption length counts the <start> and <end> tokens but not the padding
                    caplens.append(len(c) + 2)

            # Sanity check
            assert images.shape[0] * captions_per_image == len(enc_captions) == len(caplens)

            # Save encoded captions and their lengths to JSON files
            with open(os.path.join(output_folder, split + '_CAPTIONS_' + base_filename + '.json'), 'w') as j:
                json.dump(enc_captions, j)

            with open(os.path.join(output_folder, split + '_CAPLENS_' + base_filename + '.json'), 'w') as j:
                json.dump(caplens, j)

In [10]:
# `version` only ends up in the output file names, so this run writes a separate set of
# files tagged 'DELETEGPT'.
create_input_files_gpt(version='DELETEGPT')


Reading TRAIN images and captions, storing to file...



HBox(children=(IntProgress(value=0, max=6830), HTML(value='')))

`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.




Reading VAL images and captions, storing to file...



HBox(children=(IntProgress(value=0, max=300), HTML(value='')))



Reading TEST images and captions, storing to file...



HBox(children=(IntProgress(value=0, max=300), HTML(value='')))




In [12]:
# Token string -> token id mapping of the loaded tokenizer
vocab_gpt = tokenizer.get_vocab()

In [13]:
# Inverse mapping: token id -> token string
rev_vocab_gpt = {token_id: token for token, token_id in vocab_gpt.items()}

In [14]:
### Let's look at which tokens we actually use

In [15]:
# NOTE(review): this displays the whole token -> id mapping (50257 entries according to
# len(tokenizer.get_vocab()) above); consider showing only a slice.
vocab_gpt

{'!': 0,
 '"': 1,
 '#': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '<': 27,
 '=': 28,
 '>': 29,
 '?': 30,
 '@': 31,
 'A': 32,
 'B': 33,
 'C': 34,
 'D': 35,
 'E': 36,
 'F': 37,
 'G': 38,
 'H': 39,
 'I': 40,
 'J': 41,
 'K': 42,
 'L': 43,
 'M': 44,
 'N': 45,
 'O': 46,
 'P': 47,
 'Q': 48,
 'R': 49,
 'S': 50,
 'T': 51,
 'U': 52,
 'V': 53,
 'W': 54,
 'X': 55,
 'Y': 56,
 'Z': 57,
 '[': 58,
 '\\': 59,
 ']': 60,
 '^': 61,
 '_': 62,
 '`': 63,
 'a': 64,
 'b': 65,
 'c': 66,
 'd': 67,
 'e': 68,
 'f': 69,
 'g': 70,
 'h': 71,
 'i': 72,
 'j': 73,
 'k': 74,
 'l': 75,
 'm': 76,
 'n': 77,
 'o': 78,
 'p': 79,
 'q': 80,
 'r': 81,
 's': 82,
 't': 83,
 'u': 84,
 'v': 85,
 'w': 86,
 'x': 87,
 'y': 88,
 'z': 89,
 '{': 90,
 '|': 91,
 '}': 92,
 '~': 93,
 '¡': 94,
 '¢': 95,
 '£': 96,
 '¤': 97,
 '¥': 98,
 '¦': 99,
 '§': 100