In [None]:
import os
import numpy as np
! pip install -q -U trax
! pip install -q tensorflow
import trax

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import re
import numpy as onp
import os
import time
import json
from glob import glob
from PIL import Image
import pickle
from zipfile import ZipFile
import shutil

In [None]:
# Mount Google Drive at /content/drive so the project folder can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Absolute path of the project folder inside the mounted Drive.
PATH = os.path.abspath('./drive/My Drive/Final Project/')

In [None]:
# Copy the resized-image archive from the project folder into the working directory.
shutil.copyfile(PATH+'/resizedFinals.zip', './resizedFinals.zip')
# Extract all the contents of the zip file into the current directory.
with ZipFile('./resizedFinals.zip', 'r') as zipObj:
    zipObj.extractall()
# Remove the macOS metadata folder left behind by unzipping. An archive that was
# not built on macOS has no such folder, so a missing directory must not abort
# the cell.
shutil.rmtree('./__MACOSX', ignore_errors=True)

In [None]:
# Bring the cached-feature archive from the project folder into the working
# directory, then unpack all of it in place.
shutil.copyfile(PATH + '/cachedImages.zip', './cachedImages.zip')
with ZipFile('./cachedImages.zip', mode='r') as cached_archive:
    cached_archive.extractall()


In [None]:
#run this
# From here on PATH points at the extracted dataset folder.
PATH = os.path.abspath('./resizedFinals')
caption_file = os.path.abspath(PATH+'/textFiles/comment_list.json')

with open(caption_file, 'r') as f:
    comments = json.load(f)

# Longest caption kept, counted in space-separated pieces (markers included).
MAX_SIZE = 64

# Keep only the comments whose marked-up text fits within MAX_SIZE pieces.
kept_comments = [
    entry for entry in comments
    if len(('<start> ' + entry['body'] + ' <end>').split(' ')) <= MAX_SIZE
]

# Parallel lists: caption text wrapped in start/end markers, and the image path.
all_captions = ['<start> ' + entry['body'] + ' <end>' for entry in kept_comments]
all_img_name_vector = [PATH + '/resized/' + entry['id'] + '.jpg'
                       for entry in kept_comments]

# Shuffle captions and image names together with a fixed random state.
train_captions, img_name_vector = shuffle(
    all_captions, all_img_name_vector, random_state=1)

# Keep the first 30000 pairs of the shuffled set.
num_examples = 30000
train_captions = train_captions[:num_examples]
img_name_vector = img_name_vector[:num_examples]

In [None]:
# Number of captions kept for training vs. number that passed the length filter.
len(train_captions), len(all_captions)

In [None]:
def load_image(image_path):
    """Read a JPEG and prepare it for InceptionV3.

    The file is decoded to 3 channels, resized to 299x299 and passed through
    the InceptionV3 `preprocess_input` function.

    Returns:
      A pair `(preprocessed_image, image_path)`; the path is passed through
      unchanged.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (299, 299))
    prepared = tf.keras.applications.inception_v3.preprocess_input(resized)
    return prepared, image_path

In [None]:
# InceptionV3 with ImageNet weights and without its classification head.
image_model = tf.keras.applications.InceptionV3(
    include_top=False,
    weights='imagenet',
)
new_input = image_model.input
# Output of the last remaining layer is used as the image representation.
hidden_layer = image_model.layers[-1].output

# Model mapping an input image batch to that last-layer output.
image_features_extract_model = tf.keras.Model(inputs=new_input, outputs=hidden_layer)

In [None]:
!pip install -q tqdm
from tqdm import tqdm

In [None]:
# Number of distinct image paths among the selected training examples.
len(sorted(set(img_name_vector)))

In [None]:
# Folder (sibling of 'resized') that receives one feature file per image.
# The archive cell and the training-path rewrite both read from 'cached', so the
# features are written there too; they used to go to an unrelated 'carp' folder
# that no other cell consumes.
feature_dir_name = 'cached'
os.makedirs(os.path.abspath(PATH + '/' + feature_dir_name + '/'), exist_ok=True)

# Get unique images
encode_train = sorted(set(img_name_vector))
# Feel free to change batch_size according to your system configuration
image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
image_dataset = image_dataset.map(
  load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

# tqdm shows one progress tick per batch instead of printing shapes for each batch.
for img, path in tqdm(image_dataset):
  batch_features = image_features_extract_model(img)
  # Merge the two middle axes: (batch, a, b, channels) -> (batch, a*b, channels).
  batch_features = tf.reshape(batch_features,
                              (batch_features.shape[0], -1, batch_features.shape[3]))

  for bf, p in zip(batch_features, path):
    # Swap the image's parent folder name for the feature folder name.
    path_parts = p.numpy().decode("utf-8").split('/')
    path_parts[-2] = feature_dir_name
    path_of_feature = '/'.join(path_parts)
    # `onp` is plain NumPy; `np` is rebound to jax.numpy further down the notebook.
    # save() appends '.npy' to the file name, giving '<image>.jpg.npy'.
    onp.save(path_of_feature, bf.numpy())

In [None]:
# Zip the cached-feature folder into ./cachedImages.zip (takes about 10 minutes).
shutil.make_archive(
    base_name='./cachedImages',
    format='zip',
    base_dir='./resizedFinals/cached',
)

In [None]:
# Copy the feature archive back into the project folder on Drive.
shutil.copyfile(
    src='./cachedImages.zip',
    dst='./drive/My Drive/Final Project/cachedImages.zip',
)

In [None]:
def calc_max_length(tensor):
    """Return the length of the longest item in `tensor`.

    `tensor` is any iterable of sized items; an empty iterable raises
    ValueError.
    """
    return max(map(len, tensor))

# Choose the top 5000 words from the vocabulary
top_k = 5000
# The backslash is written as '\\' so the filter string contains exactly one
# backslash without relying on the invalid escape sequence '\]'.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)

# Reserve index 0 for padding.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Create the tokenized vectors (done once, after the vocabulary is final).
train_seqs = tokenizer.texts_to_sequences(train_captions)

# Pad each vector on the right up to MAX_SIZE entries.
# NOTE(review): the caption filter counts space-separated pieces, while the
# tokenizer also splits on the filter characters, so a sequence may exceed
# MAX_SIZE; pad_sequences then truncates with its default setting. Confirm this
# is acceptable.
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post', maxlen=MAX_SIZE)


In [None]:
# Number of padded caption vectors.
print(len(cap_vector))

In [None]:
# Rewrite every image path in place so that its parent folder is 'cached'.
for idx in range(len(img_name_vector)):
  parts = img_name_vector[idx].split('/')
  parts[-2] = 'cached'
  img_name_vector[idx] = '/'.join(parts)

# 80/20 train/validation split with a fixed random state.
img_name_train, img_name_val, cap_train, cap_val = train_test_split(
    img_name_vector,
    cap_vector,
    test_size=0.2,
    random_state=0,
)

In [None]:
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

In [None]:
# Feel free to change these parameters according to your system's configuration

# Batch size and shuffle buffer size used by the tf.data pipeline.
BATCH_SIZE = 64
BUFFER_SIZE = 1000
# Passed as d_model when the captioning model is built.
embedding_dim = 256
# One more than the tokenizer's word limit.
vocab_size = top_k + 1
# Number of full batches of BATCH_SIZE in the training split.
num_steps = len(img_name_train) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64

In [None]:
def map_func(img_name, cap):
  """Load the feature array saved next to an image name.

  Args:
    img_name: bytes; UTF-8 encoded path without the trailing '.npy'.
    cap: caption value, returned unchanged.

  Returns:
    A pair `(loaded_array, cap)`.
  """
  feature_path = img_name.decode('utf-8') + '.npy'
  return np.load(feature_path), cap

In [None]:
def _load_features_and_caption(item1, item2):
  """Run map_func through tf.numpy_function, yielding float32 features and int32 caption."""
  return tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32])

# Pair each feature path with its caption, load the arrays in parallel,
# then shuffle, batch and prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
    .map(_load_features_and_caption,
         num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [None]:
from trax.supervised import inputs
from trax import math
from trax import layers as tl
from trax.models import reformer as rf
from trax.shapes import ShapeDtype
from trax.shapes import signature
import jax
import jax.numpy as np

In [None]:
def map_func(img_name):
  """Load the feature array stored at `img_name + '.npy'`."""
  img_tensor = np.load(img_name+'.npy')
  return img_tensor

def my_input(n_devices, img_name_list, cap_list):
  """Yield `(features, captions)` batches of `n_devices` examples, in order.

  Args:
    n_devices: int; number of examples per yielded batch.
    img_name_list: sequence of feature paths (without the '.npy' suffix).
    cap_list: sliceable collection of captions aligned with `img_name_list`.

  Yields:
    A pair `(np.array of the loaded feature arrays, cap_list slice)`. Batches
    start at index 0; the final batch is shorter when the length is not a
    multiple of `n_devices`.
  """
  # Advance AFTER yielding: stepping before the yield skipped the first batch
  # and ended with an empty slice past the end of the data.
  for spot in range(0, len(img_name_list), n_devices):
    batch_names = img_name_list[spot:spot+n_devices]
    yield (np.array([map_func(name) for name in batch_names]),
          cap_list[spot:spot+n_devices])

# Generator over the training split with n_devices=1.
inputTest = my_input(1, img_name_train, cap_train)

In [None]:
# Wrap my_input for trax: the lambda receives n_devices and returns a generator
# over the training image names and captions.
roast_inputs = trax.supervised.Inputs(lambda n_devices: my_input(n_devices, img_name_train, cap_train))

In [None]:
import trax.models.transformer as tr

In [None]:
def complicated2(input_vocab_size,
                output_vocab_size=None,
                d_model=512,
                d_ff=2048,
                n_encoder_layers=6,
                n_decoder_layers=6,
                n_heads=8,
                dropout=0.1,
                max_len=2048,
                mode='train',
                ff_activation=tl.Relu):
  """Returns a Transformer-style decoder conditioned on encoder-side vectors.

  Unlike the standard Transformer, the encoder side is not embedded tokens run
  through encoder blocks: the first input is passed through a single
  `Dense(d_model)` + `Relu` and then attended to by the decoder blocks.

  This model expects an input pair: (encoder-side input, decoder-side tokens).

  Args:
    input_vocab_size: int: vocab size used for the decoder-token embedding and
      the output layer when `output_vocab_size` is None.
    output_vocab_size: int (optional): vocab size of the target. If None,
      `input_vocab_size` is used.
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_encoder_layers: int: accepted for signature compatibility; no encoder
      blocks are built, so it is not used.
    n_decoder_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train', 'eval' or 'predict'; in 'predict' mode the encoder is
      wrapped in `tl.Cache`.
    ff_activation: the non-linearity in feed-forward layer

  Returns:
    A layer mapping the input pair to log-softmax activations over the output
    vocab, followed by the decoder tokens.
  """
  def PositionalEncoder(vocab_size):  # tokens --> vectors
    return [
        tl.Embedding(d_model, vocab_size),
        tl.Dropout(rate=dropout, mode=mode),
        tl.PositionalEncoding(max_len=max_len),
    ]

  if output_vocab_size is None:
    output_vocab_size = input_vocab_size
  # Only the decoder side embeds tokens, so a single positional encoder is built.
  out_encoder = PositionalEncoder(output_vocab_size)

  # The encoder side is just a projection of the incoming vectors to d_model.
  encoder = tl.Serial(
      tl.Dense(d_model),
      tl.Relu()
  )
  if mode == 'predict':
    encoder = tl.Cache(encoder)

  encoder_decoder_blocks = [
      tr._EncoderDecoderBlock(
          d_model, d_ff, n_heads, dropout, i, mode, ff_activation)
      for i in range(n_decoder_layers)]

  # Assemble and return the model.
  return tl.Serial(
      # Input: encoder_side_input (in_e), decoder_side_tokens (tok_d)
      # Copy decoder tokens for use in loss.
      tl.Select([1, 0, 1]),               # tok_d in_e  tok_d

      # Encode.
      # NOTE(review): the padding mask is computed from the decoder tokens and
      # later used as the mask for the encoder side; confirm this is intended.
      tl.Branch([], tl.PaddingMask()),    # tok_d masks in_e  tok_d
      tl.Select([2, 1, 0]),               # in_e  masks tok_d tok_d
      encoder,                            # vec_e masks tok_d tok_d

      # Decode.
      tl.Select([2, 1, 0]),               # tok_d masks vec_e tok_d
      tl.ShiftRight(),                    # tok_d masks vec_e tok_d
      out_encoder,                        # vec_d masks vec_e tok_d
      tl.Branch(
          [], tl.EncoderDecoderMask()),   # vec_d masks vec_e tok_d
      encoder_decoder_blocks,             # vec_d masks vec_e tok_d
      tl.LayerNorm(),                     # vec_d masks vec_e tok_d

      # Map to output vocab.
      tl.Select([0], n_in=3),             # vec_d tok_d
      tl.Dense(output_vocab_size),        # vec_d tok_d
      tl.LogSoftmax(),                    # vec_d tok_d
  )
# Build the model in train mode with default sizes; the bare name displays it.
test = complicated2(input_vocab_size=vocab_size, mode='train')
test

In [None]:
# Draw two batches of three examples each from a fresh generator.
inputTest = my_input(3, img_name_train, cap_train)
coolGuy1, coolGuy2 = next(inputTest), next(inputTest)
guyLength = -1
testGuy = tuple(coolGuy1[:2])

# Initialise the model from the first batch's signature, then display that signature.
first_batch_signature = signature(coolGuy1)
test.init(first_batch_signature)
signature(coolGuy1)

In [None]:
# Forward pass on the first batch; display the signature of the outputs.
runIt = test(coolGuy1)
signature(runIt)

In [None]:
def LotsOfEffort(mode):
  """Build the captioning model for the given `mode`.

  Uses the notebook-level `vocab_size`, `embedding_dim` (as d_model) and
  `MAX_SIZE` (as max_len), with a feed-forward depth of 1024.
  """
  return complicated2(
      input_vocab_size=vocab_size,
      d_model=embedding_dim,
      d_ff=1024,
      max_len=MAX_SIZE,
      mode=mode,
  )


In [None]:
# Training output folder on Drive; created if missing. exist_ok avoids the
# separate existence check and is safe to re-run.
output_dir = os.path.abspath('./drive/My Drive/Final Project/train_dir/')
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Wrap my_input for trax: the lambda receives n_devices and returns a generator
# over the training image names and captions. The bare name displays the object.
roast_inputs = trax.supervised.Inputs(lambda n_devices: my_input(n_devices, img_name_train, cap_train))
roast_inputs

In [None]:
# Train tiny model with Trainer.
# The Trainer is handed the model factory (called with a mode), the loss, the
# optimizer and learning-rate schedule, the input streams and the output folder.
output_dir = os.path.abspath('./drive/My Drive/Final Project/train_dir/')
# NOTE(review): the rm below targets ./train_dir relative to the working
# directory, while output_dir points into the Drive project folder, so a
# model.pkl already in output_dir is not removed by this line. Confirm which
# behaviour is intended.
!rm -f ./train_dir/model.pkl  # Remove old model.
trainer = trax.supervised.Trainer(
    model=LotsOfEffort,
    loss_fn=trax.layers.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adafactor,  # Change optimizer params here.
    lr_schedule=trax.lr.MultifactorSchedule,  # Change lr schedule here.
    inputs=roast_inputs,
    output_dir=output_dir)

In [None]:
# Run n_epochs epochs; each call to train_epoch is given train_steps training
# steps (one pass of num_steps batches) and eval_steps evaluation steps.
n_epochs  = 20
train_steps = num_steps
eval_steps = 2
for _ in range(n_epochs):
  trainer.train_epoch(train_steps, eval_steps)

In [None]:
# Build the model in 'predict' mode and load the trained weights into it.
output_dir = os.path.abspath('./drive/My Drive/Final Project/train_dir/')
predict_model = LotsOfEffort(mode='predict')

# One example at a time: a (1, 64, 2048) float32 feature block and a single int32 token.
feature_spec = ShapeDtype((1, 64, 2048), dtype=np.float32)
token_spec = ShapeDtype((1, 1), dtype=np.int32)
predict_signature = (feature_spec, token_spec)

predict_model.init(predict_signature)
checkpoint_path = os.path.join(output_dir, "model.pkl")
predict_model.init_from_file(checkpoint_path, weights_only=True)

In [None]:
def make_roast(img_path, model, max_len=10):
  """Generate a token-id sequence for one image by greedy decoding.

  Args:
    img_path: path of the JPEG to caption.
    model: callable taking `(features, token)` and returning a pair whose first
      element holds the scores for the next token and whose second element is
      recorded as the emitted token id.
    max_len: int; maximum number of decoding steps after the start token.
      Defaults to 10, the previously hard-coded value.

  Returns:
    List of ints: the value recorded for the start step followed by one value
    per decoding step. Decoding stops early once the `<end>` token id is
    recorded; that id is included in the result.
  """
  pic, _ = load_image(img_path)
  pic = tf.expand_dims(pic, 0)
  features = image_features_extract_model.predict(pic)

  # Merge the two middle axes so the features form one sequence of vectors.
  features = tf.reshape(features,
                        (features.shape[0], -1, features.shape[3]))
  features = features.numpy()

  end = tokenizer.word_index['<end>']
  start = np.array(tokenizer.word_index['<start>']).reshape((1, 1))

  out = model((features, start))
  result = [int(out[1])]
  for _ in range(max_len):
    # Greedy choice: highest-scoring token becomes the next input.
    newIn = np.argmax(out[0][0,:], axis=-1).reshape((1,1))
    out = model((features, newIn))
    result.append(int(out[1]))
    # Stop once the end marker has been produced instead of decoding past it.
    if result[-1] == end:
      break
  return result

# NOTE(review): `path` is not passed to make_roast below; the call uses a
# hard-coded image path instead. Confirm which image is meant.
path = img_name_train[30]
#print(path)
# Token ids generated for the hard-coded image with the inference model.
pred = make_roast('/content/resizedFinals/resized/69g375.jpg', predict_model)