Load the data, split it into train, validation and test sets (each copied into its own folder), then adapt a tokenizer on the train data and save the result.

Adapt a tokenizer on the train data and save its token list in a file (one token per line). This file will be used to tokenize text during CLIP model training and inference.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import sys

# Project folder on the mounted Google Drive.
my_local_drive = '/content/gdrive/MyDrive/ML2_projet'

# Make Python modules stored in the project folder importable.
# Guarded so that re-running this cell does not append the same entry again.
if my_local_drive not in sys.path:
  sys.path.append(my_local_drive)

In [None]:
# Work from the project folder so that the relative paths used later (e.g. "./flickr_long_subset") resolve there.
%cd $my_local_drive

/content/gdrive/MyDrive/ML2_projet


In [None]:
import os
import pandas as pd
import re
import numpy as np
import random
import zipfile
import requests
import io
import math
from pathlib import Path
import shutil
import datetime

from dataclasses import dataclass

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.utils import register_keras_serializable
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from tensorflow.keras.metrics import Mean
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import load_model

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# To make the best use of the GPU
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
def copy_dateset(df, base_dir, target_dir, verbose=True):
  """Copy every image listed in ``df``, and its caption file, from ``base_dir`` to ``target_dir``.

  Args:
    df: DataFrame with an "image_path" column holding image paths relative to
      ``base_dir`` (e.g. "images/ball/ball_695.jpg").
    base_dir: Root directory of the source dataset.
    target_dir: Root directory of the copy; the same relative layout is
      reproduced under it. Missing sub-directories are created.
    verbose: When True (default) print one line per copied file.

  Raises:
    FileNotFoundError: If an image or its caption file is missing in ``base_dir``.
  """
  # Only the "image_path" column is needed, so iterate over it directly
  # instead of materialising every row with iterrows().
  for image_rel_path in df["image_path"]:
    base_image_path = os.path.join(base_dir, image_rel_path)
    copy_image_path = os.path.join(target_dir, image_rel_path)

    # Captions are not split into class sub-folders like the images: they live
    # flat in "captions/" under the image's base name with a ".txt" extension.
    # splitext swaps the extension whatever it is (.jpg, .jpeg, .png, ...) and
    # never touches a ".jpg" that appears in the middle of the name.
    image_name = os.path.basename(image_rel_path)
    text_id = os.path.splitext(image_name)[0] + ".txt"
    base_text_path = os.path.join(base_dir, "captions", text_id)
    copy_text_path = os.path.join(target_dir, "captions", text_id)

    for source_path, destination_path in (
        (base_image_path, copy_image_path),
        (base_text_path, copy_text_path),
    ):
      # Make sure the destination folder exists so the copy cannot fail on it.
      os.makedirs(os.path.dirname(destination_path), exist_ok=True)
      if verbose:
        print(f"copying file : {source_path}, {destination_path}")
      shutil.copy(source_path, destination_path)

In [None]:
# Dataset root directory (swap in "./flickr_subset2" to use the small dataset).
dataset_dir = "./flickr_long_subset"

# Inside the dataset root: the image folder, the caption folder and the file
# storing the tokenizer vocabulary (one token per line).
image_dir, captions_dir, vocab_path = (
    os.path.join(dataset_dir, leaf) for leaf in ("images", "captions", "vocab.txt")
)

# Target directory where the models are saved; created if it does not exist.
model_dir = "./models_forclip"
os.makedirs(model_dir, exist_ok=True)

# Useful variables.
# Careful: keep the classes in alphabetical order, as required by the generator.
class_names = ['ball', 'bike', 'dog', 'water']

# Class-name -> integer-code encoding, numbered in the order of class_names.
class_dict = {label: code for code, label in enumerate(class_names)}


# Per-split layout: <dataset_dir>/<split>_data/{images,captions}

# train
train_dir = os.path.join(dataset_dir, "train_data")
train_image_dir = os.path.join(train_dir, "images")
train_captions_dir = os.path.join(train_dir, "captions")

# validation
val_dir = os.path.join(dataset_dir, "val_data")
val_image_dir = os.path.join(val_dir, "images")
val_captions_dir = os.path.join(val_dir, "captions")

# test
test_dir = os.path.join(dataset_dir, "test_data")
test_image_dir = os.path.join(test_dir, "images")
test_captions_dir = os.path.join(test_dir, "captions")

dir_list = [
    train_dir,
    train_image_dir,
    train_captions_dir,
    val_dir,
    val_image_dir,
    val_captions_dir,
    test_dir,
    test_image_dir,
    test_captions_dir
]

# Create the directories. The loop variable is deliberately not named `dir`:
# at notebook top level it stays bound after the loop and would shadow the
# built-in dir() in every later cell.
for split_path in dir_list:
    os.makedirs(split_path, exist_ok=True)

# Create one sub-directory per image class inside each split's image folder.
for class_name in class_names:
    for split_image_dir in (train_image_dir, val_image_dir, test_image_dir):
        os.makedirs(os.path.join(split_image_dir, class_name), exist_ok=True)


# Image settings: image_shape is image_size with a trailing 3 appended
# (presumably the colour channels).
image_size = (224, 224)
image_shape = (*image_size, 3)

# Text settings.
sequence_length = 32
vocab_size = 10_000
num_heads, ff_dim, num_layers = 4, 256, 2

# Shared by images and texts in the CLIP model.
embed_dim = 128

# Training setting.
batch_size = 16

In [None]:
# Read the caption table of the whole dataset and display its (rows, columns) shape.
caption_file_path = Path(dataset_dir, "captions.csv")
captions_df = pd.read_csv(caption_file_path)  # comma is read_csv's default separator
captions_df.shape


(800, 3)

In [None]:
# This is unoptimized and will take a while to run, sorry....
# But you only need to run it once !

# Re-read the caption table so this cell can be run on its own.
caption_file_path = Path(dataset_dir) / "captions.csv"
captions_df = pd.read_csv(caption_file_path, sep=",")

# Hold out 10% of the data for test, stratified on the class label.
train_val_df, test_df = train_test_split(
    captions_df,
    test_size=0.1,
    random_state=42,
    shuffle=True,
    stratify=captions_df["label"])

# Validation takes 10% of the remaining 90%, i.e. 9% of the full data rather
# than 10%; kept as is because it is good enough here.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1,
    random_state=42,
    shuffle=True,
    stratify=train_val_df["label"])

# Sanity check: the three splits are disjoint and together cover every row.
split_index = train_df.index.append([val_df.index, test_df.index])
assert split_index.is_unique, "a row ended up in more than one split"
assert len(split_index) == len(captions_df), "some rows are missing from the splits"

splits = (
    (train_df, train_dir),
    (val_df, val_dir),
    (test_df, test_dir),
)

# copy data files
for split_df, split_dir in splits:
    copy_dateset(split_df, dataset_dir, split_dir)

# create new captions csv (quoting=0 is csv.QUOTE_MINIMAL)
for split_df, split_dir in splits:
    split_df.to_csv(os.path.join(split_dir, "captions.csv"), quoting=0, index=False)


moving file : ./flickr_long_subset/images/ball/ball_695.jpg, ./flickr_long_subset/train_data/images/ball/ball_695.jpg
moving file : ./flickr_long_subset/captions/ball_695.txt, ./flickr_long_subset/train_data/captions/ball_695.txt
moving file : ./flickr_long_subset/images/dog/dog_702.jpg, ./flickr_long_subset/train_data/images/dog/dog_702.jpg
moving file : ./flickr_long_subset/captions/dog_702.txt, ./flickr_long_subset/train_data/captions/dog_702.txt
moving file : ./flickr_long_subset/images/bike/bike_229.jpg, ./flickr_long_subset/train_data/images/bike/bike_229.jpg
moving file : ./flickr_long_subset/captions/bike_229.txt, ./flickr_long_subset/train_data/captions/bike_229.txt
moving file : ./flickr_long_subset/images/water/water_150.jpg, ./flickr_long_subset/train_data/images/water/water_150.jpg
moving file : ./flickr_long_subset/captions/water_150.txt, ./flickr_long_subset/train_data/captions/water_150.txt
moving file : ./flickr_long_subset/images/ball/ball_115.jpg, ./flickr_long_subse

# Train tokenizer

In [None]:
# Word-level tokenizer: lower-cases, strips punctuation, splits on whitespace
# and outputs `sequence_length` integer ids per caption.
tokenizer = TextVectorization(
    max_tokens=vocab_size,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    # NOTE(review): Keras documents pad_to_max_tokens for the "multi_hot",
    # "count" and "tf_idf" modes only - presumably it has no effect with
    # output_mode="int"; confirm before relying on it.
    pad_to_max_tokens=True,
    output_sequence_length=sequence_length,
    output_mode="int"  # save 0 for pad tokens
)

# Learn the vocabulary from the training captions.
tokenizer.adapt(train_df['caption'])
vocab = tokenizer.get_vocabulary()

# Show a summary instead of dumping the whole list (up to vocab_size entries).
print(f"vocabulary size: {len(vocab)}")
print(vocab[:20])

# save vocab, one token per line.
# The first two entries are skipped: per the Keras documentation, in "int" mode
# they are the reserved padding ('') and out-of-vocabulary ('[UNK]') tokens,
# which must not be part of a vocabulary passed back to TextVectorization.
# The encoding is explicit so tokens with non-ASCII characters are written
# identically on every platform.
with open(vocab_path, 'w', encoding='utf-8') as f:
    f.writelines(token + '\n' for token in vocab[2:])




