# Import modules and load data

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local CLIP_model / CLIP_data_load files) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2  

In [2]:
from itertools import combinations
import json
import pandas as pd
import tensorflow as tf
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
import wandb
from transformers import DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import CLIP_model
import CLIP_data_load

2023-08-18 10:17:37.533863: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# gpus = tf.config.list_physical_devices('GPU')
# if gpus:
#   try:
#     # Currently, memory growth needs to be the same across GPUs
#     for gpu in gpus:
#       tf.config.experimental.set_memory_growth(gpu, True)
#     logical_gpus = tf.config.list_logical_devices('GPU')
#     print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
#   except RuntimeError as e:
#     # Memory growth must be set before GPUs have been initialized
#     print(e)

In [4]:
# --- Hyperparameters ---------------------------------------------------------
# Split fractions: `test_percentage` is applied to the whole dataset and
# `validation_percentage` to what remains after the test split.
# NOTE(review): `train_percentage` is not referenced by the visible cells.
train_percentage = 0.8
test_percentage = 0.2
validation_percentage = 0.15

# Latent sizes handed to CLIP_model.get_clip_model.
latent_dim_imgs = 1024
latent_dim_text = 768
latent_dim_common = 512

batch_size = 8
SEED = 116
img_shape = (128, 128, 3)

# Text side: checkpoint name shared by the tokenizer and the text encoder.
bert_model_name = 'distilbert-base-uncased' #bert-base-multilingual-uncased
max_len = 200
# Plain int, not a 1-tuple (the former spelling `(200)` was also just the int
# 200). Derived from `max_len` so the two values cannot drift apart.
text_input_shape = max_len
model_name = 'our_loss'

# --- Paths -------------------------------------------------------------------
all_images_path = 'dataset/resized_train/'
project_location = '.'
all_captions_path = f'{project_location}/dataset/caption_prediction_train.csv'
all_concept_ids_path = f'{project_location}/dataset/concept_detection_train.csv'
all_concepts_path = f'{project_location}/dataset/concepts.csv'

zip_dataset_location = f'{project_location}/dataset/resized_train.zip'
dataset_extract_location = "/"
model_loc = f'{project_location}/weights/{model_name}.h5'

In [5]:
# Start the Weights & Biases run and record the hyperparameters defined in the
# configuration cell.
run = wandb.init(
    project="clip-flick",
    entity='calonca',
    name="test_new_loss",
    config={
        'batch_size': batch_size,
        'latent_dim_common': latent_dim_common,
        'latent_dim_text': latent_dim_text,
        'latent_dim_imgs': latent_dim_imgs,
        'img_shape': img_shape,
        'text_input_shape': text_input_shape,
        'model_name': model_name,
        # Also record the settings that determine the data split and the text
        # pipeline, so a logged run can be reproduced from its config alone.
        'seed': SEED,
        'max_len': max_len,
        'bert_model_name': bert_model_name,
        'test_percentage': test_percentage,
        'validation_percentage': validation_percentage,
    },
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malexlaconca[0m ([33mcalonca[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666830571666651, max=1.0)…

In [6]:
# Merge all dataframes together into ID, caption, cuis, concepts
captionsDF = pd.read_csv(all_captions_path, sep='\t')
concept_id_df = pd.read_csv(all_concept_ids_path, sep='\t')
concept_df = pd.read_csv(all_concepts_path, sep=',')

# Lookup table: value of the 'concept' column -> list of the remaining column
# values of that row.
concepts = concept_df.set_index('concept').T.to_dict('list')

# Turn the ';'-separated id string into a list, then resolve every id through
# the lookup table (first entry of each list) into a set.
concept_id_df['cuis'] = concept_id_df['cuis'].map(lambda joined: joined.split(';'))
concept_id_df['concepts'] = concept_id_df['cuis'].map(
    lambda id_list: {concepts[one_id][0] for one_id in id_list}
)

# Join on ID, then append the '.jpg' extension to every ID.
captionsDF = captionsDF.merge(concept_id_df, on="ID")
captionsDF['ID'] = captionsDF['ID'].map(lambda image_id: image_id + '.jpg')

In [7]:
# Hold out the test split first, then carve the validation split out of the
# remaining rows. Both splits shuffle with the same fixed seed.
train_val, test = train_test_split(
    captionsDF,
    test_size=test_percentage,
    shuffle=True,
    random_state=SEED,
)
train, val = train_test_split(
    train_val,
    test_size=validation_percentage,
    shuffle=True,
    random_state=SEED,
)

In [8]:
# Use only a subset of the dataset: the first `batch_size` rows of each split,
# i.e. a single batch.
train, val, test = (split[:batch_size] for split in (train, val, test))

In [9]:
# Tokenizer loaded from the `bert_model_name` checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)

# Same call for the train and validation splits.
trainList = CLIP_data_load.paths_captions_emb_list(
    train,
    all_images_path,
    tokenizer=tokenizer,
    max_len=max_len,
)
valList = CLIP_data_load.paths_captions_emb_list(
    val,
    all_images_path,
    tokenizer=tokenizer,
    max_len=max_len,
)

100%|██████████| 8/8 [00:00<00:00, 3383.87it/s]
100%|██████████| 8/8 [00:00<00:00, 2893.62it/s]


In [10]:
# Batch generators for the train and validation lists; `out_shape` is the
# (height, width) part of `img_shape`.
train_gen = CLIP_data_load.ClipBaseGenerator(
    data=trainList,
    batch_size=batch_size,
    out_shape=img_shape[:2],
    shuffle=True,
    seed=SEED,
)
val_gen = CLIP_data_load.ClipBaseGenerator(
    data=valList,
    batch_size=batch_size,
    out_shape=img_shape[:2],
    shuffle=True,
    seed=SEED,
)

## **Train CLIP**

In [11]:
from wandb.keras import WandbCallback

# Stop once the training loss has not improved for 500 epochs and restore the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=500,
    restore_best_weights=True,
)

# Weights & Biases logging callback; the model itself is not saved
# (save_model=False).
wandb_callback = WandbCallback(
    monitor='loss',
    log_batch_frequency=10,
    save_model=False,
    validation_steps=5,
)

In [12]:
# Image encoder: EfficientNetV2-S with ImageNet weights, without the
# classification head, with average pooling.
fen_model = tf.keras.applications.EfficientNetV2S(
    input_shape=img_shape,
    include_top=False,
    pooling='avg',
    weights="imagenet",
)

# Text encoder: pretrained DistilBERT from the `bert_model_name` checkpoint.
bert_model = TFDistilBertModel.from_pretrained(bert_model_name)

# Assemble the CLIP model from both encoders.
model: CLIP_model.CLIP_base = CLIP_model.get_clip_model(
    # encoders
    image_encoder=fen_model,
    text_encoder=bert_model,
    train_bert=True,
    # input shapes
    image_input_shape=img_shape,
    text_input_shape=text_input_shape,
    # latent sizes
    latent_dim_imgs=latent_dim_imgs,
    latent_dim_text=latent_dim_text,
    latent_dim_common=latent_dim_common,
    # optimisation
    loss=CLIP_model.loose_loss,
    learning_rate=1e-5,
)  # type: ignore

2023-08-18 10:17:50.825918: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-18 10:17:50.826691: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-18 10:17:50.826855: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [13]:
# new loss
history = model.fit(
    x=train_gen,
    validation_data=val_gen,
    epochs=1000,
    callbacks=[early_stopping, wandb_callback],
    workers=4,
)

Epoch 1/1000


2023-08-18 10:17:58.580310: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-08-18 10:17:58.713402: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:417] Loaded runtime CuDNN library: 8.5.0 but source was compiled with: 8.6.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
2023-08-18 10:17:58.713948: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at conv_ops.cc:1068 : UNIMPLEMENTED: DNN library is not found.


UnimplementedError: Exception encountered when calling layer 'stem_conv' (type Conv2D).

{{function_node __wrapped__Conv2D_device_/job:localhost/replica:0/task:0/device:GPU:0}} DNN library is not found. [Op:Conv2D]

Call arguments received by layer 'stem_conv' (type Conv2D):
  • inputs=tf.Tensor(shape=(8, 128, 128, 3), dtype=float32)

In [None]:
# NOTE(review): unpinned install executed mid-notebook, after tensorflow has
# already been imported in the imports cell; a kernel restart may be needed
# before any change takes effect. Consider pinning the version.
%pip install tensorflow

/bin/bash: /home/ale/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple
Note: you may need to restart the kernel to use updated packages.


In [None]:
# old loss
history = model.fit(
    x=train_gen,
    validation_data=val_gen,
    epochs=1000,
    callbacks=[early_stopping, wandb_callback],
    workers=4,
)

Epoch 1/1000


2023-08-17 18:53:48.303638: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




2023-08-17 18:54:34.005916: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 2.3 seconds.), retrying request


Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 2.3 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 4.4 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 8.5 seconds.), retrying request


Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 2.4 seconds.), retrying request


Epoch 46/1000
Epoch 47/1000
Epoch 48/1000


KeyboardInterrupt: 

In [None]:
import gc
# Force a full garbage-collection pass; the cell displays the integer that
# gc.collect() returns.
gc.collect()

12390

In [None]:
# NOTE(review): this rebinds `test`, which the split cells earlier in the
# notebook assign to the test-split DataFrame. After this cell runs, `test` is
# the int 1 and the split is lost until those cells are re-run.
test = 1

In [None]:
# !mkdir -p {project_location}/weights
# model.save(f"{project_location}/weights/{model_name}.h5")

In [None]:
# wandb.save(f"{project_location}/weights/{model_name}.h5", base_path='weights/')

In [None]:
# NOTE(review): `artifact_dir` is only assigned by the artifact-download cell
# further down (artifact.download()), so on a top-to-bottom run this cell
# raises NameError.
artifact_dir

NameError: name 'artifact_dir' is not defined

In [None]:
import wandb
import shutil
import os

# Download the dataset artifact from Weights & Biases and move the archive it
# contains to dataset/resized_train.zip.
run = wandb.init()
artifact = run.use_artifact('calonca/clip-flick/dataset:v0', type='dataset')
artifact_dir = artifact.download()

# shutil.move does not create missing parent directories, so make sure the
# destination folder exists before moving the archive into it.
os.makedirs('dataset', exist_ok=True)
shutil.move(os.path.join(artifact_dir, 'resized_train.zip'), os.path.join('dataset', 'resized_train.zip'))


VBox(children=(Label(value='0.003 MB of 0.007 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.403642…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666811663332434, max=1.0)…

[34m[1mwandb[0m: Downloading large artifact dataset:v0, 243.32MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.6


'dataset/resized_train.zip'

In [None]:
# Upload the dataset archive as a W&B artifact of type 'dataset'.
# The artifact is named 'dataset' rather than after `model_name`: it holds the
# training images, not model weights, and the download cell fetches it as
# 'calonca/clip-flick/dataset:v0'.
artifact = wandb.Artifact('dataset', type='dataset')
# `zip_dataset_location` is the same path as
# f"{project_location}/dataset/resized_train.zip", defined once in the
# configuration cell.
artifact.add_file(zip_dataset_location)
run.log_artifact(artifact)
# Close the run so the upload is finalised.
run.finish()