# Import modules and configure the run

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2  

In [2]:
from pathlib import Path

import pandas as pd
import tensorflow as tf
import wandb
from transformers import DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split
from wandb.keras import WandbCallback

import CLIP_model
import CLIP_data_load


2023-09-03 23:09:36.221296: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [3]:
# ---------------------------------------------------------------------------
# Run identity
# ---------------------------------------------------------------------------
model_name = 'test_clip_base'
SEED = 116

# ---------------------------------------------------------------------------
# Dataset split
# ---------------------------------------------------------------------------
# NOTE(review): train_percentage is not referenced by any cell visible in this
# notebook; only the test and validation fractions are passed to the split.
train_percentage = 0.8
test_percentage = 0.2
validation_percentage = 0.15

# ---------------------------------------------------------------------------
# Input shapes
# ---------------------------------------------------------------------------
img_shape = (128, 128, 3)
captions_input_shape = (128,)
concepts_input_shape = (80,)
batch_size = 80

# ---------------------------------------------------------------------------
# Encoders and projection sizes
# ---------------------------------------------------------------------------
bert_model_name = 'distilbert-base-uncased'  # alternative: bert-base-multilingual-uncased
latent_dim_imgs = 1024
latent_dim_text = 768
latent_dim_common = 512
projector_dropout_rate = 0.1

# ---------------------------------------------------------------------------
# Learning rates (one per model component)
# ---------------------------------------------------------------------------
lr_image_encoder = 1e-3
lr_text_encoder = 1e-5
lr_projector = 1e-4

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
project_location = '.'

# NOTE(review): unlike the other dataset paths, the image folder is not
# prefixed with project_location - confirm whether that is deliberate.
all_images_path = 'dataset/resized_train/'
all_captions_path = f'{project_location}/dataset/caption_prediction_train.csv'
all_concept_ids_path = f'{project_location}/dataset/concept_detection_train.csv'
all_concepts_path = f'{project_location}/dataset/concepts.csv'

zip_dataset_location = f'{project_location}/dataset/resized_train.zip'
# NOTE(review): the extraction target is the filesystem root and is not used
# by any cell visible here - confirm before relying on it.
dataset_extract_location = "/"

# File the trained model is written to.
model_loc = f'{project_location}/weights/{model_name}.h5'

In [4]:
# Start the Weights & Biases run for this experiment and record every
# hyperparameter needed to reproduce it. The returned `run` handle is used
# later to upload the model artifact and to close the run.
run = wandb.init(
    project="clip",
    entity="calonca",
    name=model_name,
    config={
        "batch_size": batch_size,
        "latent_dim_common": latent_dim_common,
        "latent_dim_text": latent_dim_text,
        "latent_dim_imgs": latent_dim_imgs,
        "img_shape": img_shape,
        "captions_input_shape": captions_input_shape,
        "concepts_input_shape": concepts_input_shape,
        "model_name": model_name,
        "bert_model_name": bert_model_name,
        "lr_image_encoder": lr_image_encoder,
        "lr_text_encoder": lr_text_encoder,
        "lr_projector": lr_projector,
        "projector_dropout_rate": projector_dropout_rate,
        # The seed and split fractions also determine the result, so track
        # them alongside the model hyperparameters.
        "seed": SEED,
        "test_percentage": test_percentage,
        "validation_percentage": validation_percentage,
    },
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malexlaconca[0m ([33mcalonca[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Data preparation

In [5]:
# Read the raw tables: captions and concept ids are tab-separated, the
# concept vocabulary is comma-separated.
captionsDF = pd.read_csv(all_captions_path, sep='\t')
concept_id_df = pd.read_csv(all_concept_ids_path, sep='\t')
concept_df = pd.read_csv(all_concepts_path, sep=',')

# Merge all dataframes together into a single one (ID, caption, cuis, concepts).
captions_concepts_df = CLIP_data_load.preprocess_captions_concepts(
    captionsDF,
    concept_id_df,
    concept_df,
)

[nltk_data] Downloading package stopwords to /home/ale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Split without shuffling, so the rows keep their existing order.
# First hold out `test_percentage` of all rows as the test set ...
train_val, test = train_test_split(
    captions_concepts_df,
    test_size=test_percentage,
    shuffle=False,
)
# ... then hold out `validation_percentage` of the *remaining* rows (not of
# the full dataset) as the validation set.
train, val = train_test_split(
    train_val,
    test_size=validation_percentage,
    shuffle=False,
)

In [8]:
# Peek at one preprocessed training caption (row at position 99).
train['caption'].iloc[99]

'angiographic situation embolization left uterine artery'

In [9]:
# Tokenizer loaded from the same checkpoint name as the text encoder.
tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)

# Keyword arguments shared by the train and validation list builders; the
# maximum lengths come from the configured input shapes.
emb_list_kwargs = dict(
    tokenizer=tokenizer,
    max_len_captions=captions_input_shape[0],
    max_len_concepts=concepts_input_shape[0],
    remove_images_threshold=0,
)

trainList = CLIP_data_load.paths_captions_concepts_emb_list(
    train, all_images_path, **emb_list_kwargs
)
valList = CLIP_data_load.paths_captions_concepts_emb_list(
    val, all_images_path, **emb_list_kwargs
)

100%|██████████| 136/136 [00:00<00:00, 1516.50it/s]
100%|██████████| 24/24 [00:00<00:00, 1682.15it/s]


In [20]:
# Settings common to both generators; only the underlying data differs.
# NOTE(review): shuffle=True is applied to the validation generator as well -
# confirm this is intended.
generator_kwargs = dict(
    channels_first=False,
    preprocessing_function=None,
    batch_size=batch_size,
    out_shape=img_shape[0:2],
    shuffle=True,
    seed=SEED,
    model_version='base',
)

train_gen = CLIP_data_load.FusionGenerator(data=trainList, **generator_kwargs)
val_gen = CLIP_data_load.FusionGenerator(data=valList, **generator_kwargs)

### Model creation

In [11]:
# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so that randomly initialised weights (and other stochastic ops such
# as dropout) are reproducible across runs. Until now SEED was only handed to
# the data generators.
tf.keras.utils.set_random_seed(SEED)

# Image encoder: ImageNet-pretrained EfficientNetV2-S without its classifier
# head, globally average-pooled.
fen_model = tf.keras.applications.EfficientNetV2S(
    include_top=False,
    weights="imagenet",
    pooling='avg',
    input_shape=img_shape,
)

# Text encoder: pretrained DistilBERT from the configured checkpoint.
bert_model = TFDistilBertModel.from_pretrained(bert_model_name)

# Assemble the CLIP model from both encoders using the configured latent
# sizes, per-component learning rates and projector dropout.
model = CLIP_model.get_clip_model(
    image_input_shape=img_shape,
    text_input_shape=captions_input_shape,
    text_encoder=bert_model,
    image_encoder=fen_model,
    latent_dim_imgs=latent_dim_imgs,
    latent_dim_text=latent_dim_text,
    latent_dim_common=latent_dim_common,
    train_bert=True,
    lr_image_encoder=lr_image_encoder,
    lr_text_encoder=lr_text_encoder,
    lr_projector=lr_projector,
    projector_dropout_rate=projector_dropout_rate,
)

2023-09-03 23:10:04.256376: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-03 23:10:04.273231: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-03 23:10:04.273409: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [12]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "clip_base"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 128, 128, 3)]        0         []                            
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_3[0][0]',             
 stilBertModel)              den_state=(None, 128, 768)   0          'input_4[0][0]']     

### Model training

In [13]:
# Stop training once val_loss has gone 5 epochs without improving, and
# restore the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [14]:
# Weights & Biases logging callback: logs batch metrics every 10 batches and
# does not upload model checkpoints.
# NOTE(review): this callback monitors the training 'loss', whereas early
# stopping monitors 'val_loss' - confirm the difference is intended.
wandb_callback = WandbCallback(
    monitor='loss',
    save_model=False,
    log_batch_frequency=10,
    validation_steps=len(val_gen),
)


In [15]:
# Train for at most 100 epochs; early stopping may end the run sooner.
training_callbacks = [early_stopping, wandb_callback]
history = model.fit(
    x=train_gen,
    validation_data=val_gen,
    epochs=100,
    callbacks=training_callbacks,
    workers=4,
)

Epoch 1/100


2023-09-03 23:10:12.575013: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-09-03 23:10:15.360039: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0xd1bcfd40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-09-03 23:10:15.360065: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3080 Laptop GPU, Compute Capability 8.6
2023-09-03 23:10:15.405166: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-09-03 23:10:15.638793: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
 1/17 [>.............................] - ETA: 27s - loss: 1.9075

KeyboardInterrupt: 

### Model save

In [18]:
# Create the weights directory in pure Python (portable, no shell call) and
# save the model to the path defined once in the configuration cell
# (`model_loc`), rather than rebuilding the same path here.
Path(model_loc).parent.mkdir(parents=True, exist_ok=True)
model.save(model_loc)

/bin/bash: /home/ale/miniconda3/envs/tf2/lib/libtinfo.so.6: no version information available (required by /bin/bash)


  saving_api.save_model(


In [19]:
# Upload the saved weights file to Weights & Biases as a model artifact, then
# close the run. The file path comes from `model_loc` (configuration cell) so
# it cannot drift from the location the model was saved to.
artifact = wandb.Artifact(model_name, type='model')
artifact.add_file(model_loc)
run.log_artifact(artifact)
run.finish()



0,1
epoch,▁█
loss,▆▇█▆▅▅▁
val_loss,▁█

0,1
best_epoch,1.0
best_loss,2.03213
epoch,1.0
loss,1.90749
val_loss,2.14912
