# Prepare data and embeddings

We used this notebook to prepare our dataset: we clip it to the years 1950-1999, create and save the year, scene and train/test labels, and compute image embeddings with OpenCLIP.

In [None]:
!pip install open_clip_torch
import open_clip

import torch
from PIL import Image
import random
import json
import pickle
import os

import numpy as np
from sklearn.model_selection import train_test_split

Collecting open_clip_torch
  Downloading open_clip_torch-2.20.0-py3-none-any.whl (1.5 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.5/1.5 MB[0m [31m21.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from open_clip_torch)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.2 M

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = "cpu"

In [None]:
# helpers to load and save pickle and json files

def loadPKL(path):
  """Load and return the object pickled in the file at `path`.

  Prints a confirmation containing the file name (the part of `path` after the
  last '/').

  NOTE: unpickling can execute arbitrary code, so only load files you trust.
  """
  # The context manager closes the handle even when unpickling raises.
  with open(path, 'rb') as infile:
    X = pickle.load(infile)
  print('Loaded ' + path.split('/')[-1])
  return X

def savePKL(data, path):
  """Pickle `data` into the file at `path`, then print a confirmation with the file name."""
  with open(path, 'wb') as outfile:
    pickle.dump(data, outfile)
  # Only the last '/'-separated component of the path is reported.
  filename = path.rsplit('/', 1)[-1]
  print('Saved ' + filename)

def loadJSON(path):
  """Parse and return the JSON document stored in the file at `path`.

  Prints a confirmation containing the file name (the part of `path` after the
  last '/').
  """
  # The file is read in binary mode and handed to json.load as-is.
  # The context manager closes the handle even when parsing raises.
  with open(path, 'rb') as infile:
    X = json.load(infile)
  print('Loaded ' + path.split('/')[-1])
  return X

def saveJSON(data, path):
  """Serialise `data` as JSON into the file at `path`, then print a confirmation with the file name."""
  with open(path, 'w') as outfile:
    json.dump(data, outfile)
  # Only the last '/'-separated component of the path is reported.
  filename = path.rsplit('/', 1)[-1]
  print('Saved ' + filename)

# Make labels

## Load metadata

In [None]:
# Colab-specific: mount Google Drive at /content/drive so the files under
# that path can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on the mounted Drive; every file this notebook loads or saves
# is addressed by appending a relative path to it (note the trailing slash).
data_folder = '/content/drive/MyDrive/dating-images/data/'

In [None]:
# Read the [path, date] record of every image.
metadata_path = data_folder + 'all_paths_dates.json'
image_data = loadJSON(metadata_path)

# Seed the global RNG first so the in-place shuffle gives the same order on every run.
random.seed(20)
random.shuffle(image_data)

Loaded all_paths_dates.json


In [None]:
# Year: the last '-'-separated field of the date string, e.g. '19-11-1980' -> 1980.
years = [int(img_date.split('-')[-1]) for _img_path, img_date in image_data]

# Scene: the name of the folder that directly contains the image file.
scenes = [img_path.split('/')[-2] for img_path, _img_date in image_data]

## Append labels to metadata

In [None]:
# Extend every record in place with its year and scene label (in that order).
# NOTE: this mutates image_data, so run the cell only once per kernel session.
for record, record_year, record_scene in zip(image_data, years, scenes):
  record.extend((record_year, record_scene))

## Clip image data within the range 1950-1999

As the years before 1950 and from 2000 onwards contained few images, we removed all data outside the range 1950-1999.

In [None]:
# Inclusive bounds of the year range that is kept.
FIRST_YEAR, LAST_YEAR = 1950, 1999

print("Total images: " + str(len(image_data)))

# Filter on the year stored in each record (index 2) instead of popping by position
# through the separate `years` list: that list goes stale after the first run, so
# re-running the cell would drop the wrong records. A single pass also avoids the
# repeated list.pop(i) shifts. Slice assignment keeps the same list object.
image_data[:] = [item for item in image_data if FIRST_YEAR <= item[2] <= LAST_YEAR]

print(f"Images in range {FIRST_YEAR}-{LAST_YEAR}: " + str(len(image_data)))

Total images: 43039
Images in range 1950-1999: 39866


## Make test/train split

We made a stratified train/test split based on the clipped year labels.

In [None]:
# One positional index per remaining record; these are what gets split below.
n_images = len(image_data)
indices = list(np.arange(n_images))

# Transpose the records into columns and keep the third one (the year labels).
columns = list(zip(*image_data))
years = columns[2]

In [None]:
# Split the record indices 80/20, stratified on the year labels, with a fixed
# seed so the split is reproducible. The split year labels themselves are not needed.
X_train, X_test, _, _ = train_test_split(
    indices,
    years,
    test_size=0.2,
    random_state=123,
    stratify=years,
)

In [None]:
# Report the size of each split, then peek at the first ten indices of each.
for split in (X_train, X_test):
  print(len(split))

for split in (X_train, X_test):
  print(split[:10])

31892
7974
[13455, 22023, 29007, 38267, 32589, 23269, 34341, 33669, 39502, 11583]
[13198, 21682, 24509, 27532, 13036, 3068, 22977, 15677, 4527, 23582]


Assign a `'train'` or `'test'` label to each of the images. We used these to make the train and test sets for all of our experiments.

In [None]:
# Sets give constant-time membership tests; checking `i in X_train` against the
# plain lists scans tens of thousands of entries for every single image.
train_indices = set(X_train)
test_indices = set(X_test)

# Append 'train' or 'test' to every record according to the split its index fell in.
# NOTE: this mutates image_data, so run the cell only once per kernel session.
for i, item in enumerate(image_data):
  in_train = i in train_indices
  in_test = i in test_indices
  if in_train and in_test:
    print("Index " + str(i) + " is in both train and test sets! Check train/test split.") # this shouldn't happen
    break
  if in_train:
    item.append('train')
  elif in_test:
    item.append('test')

In [None]:
# Sanity check: number of labelled records and the layout of the first one.
n_labelled = len(image_data)
print(n_labelled)
first_record = image_data[0]
print(first_record)

39866
['../scene_detection/images/soccer/NL-HlmNHA_1478_20083K00_22.jpg', '19-11-1980', 1980, 'soccer', 'train']


Save a copy of the image metadata including the year, scene and train/test labels.

In [None]:
# Persist the labelled records next to the source metadata.
labelled_metadata_path = data_folder + 'image_data.json'
saveJSON(image_data, labelled_metadata_path)

# Make embeddings

The dataset can be placed in `./data/scene_detection/images/`, with each image stored in the folder named after its scene.

In our experiments we repeated this to create embeddings with the colorized images.

In [None]:
def get_image_embedding(image_path):
    """Return the OpenCLIP image embedding of the image stored at `image_path`.

    Relies on the notebook-level `model`, `preprocess` and `device`. The encoder
    output for the single image is flattened and returned as a NumPy array.
    """
    # Open through a context manager so the file handle is released as soon as the
    # image has been turned into a tensor, rather than whenever it gets collected.
    with Image.open(image_path) as image:
        # unsqueeze(0) adds the batch dimension before the tensor is moved to `device`.
        tensor = preprocess(image).unsqueeze(0).to(device)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        embedding = model.encode_image(tensor)
    return embedding.flatten().cpu().numpy()

### Load model

In [None]:
# Build the 'ViT-B-32-quickgelu' OpenCLIP model with the 'laion400m_e32' pretrained
# weights, together with its image preprocessing transform.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32-quickgelu', pretrained='laion400m_e32')
# get_image_embedding moves its input tensor to `device`, so the weights must live
# there too; otherwise the forward pass fails with a device mismatch on a GPU runtime.
model = model.to(device)
# The model is created in training mode; switch to eval mode for inference.
model.eval()
tokenizer = open_clip.get_tokenizer('ViT-B-32-quickgelu')

100%|███████████████████████████████████████| 605M/605M [00:06<00:00, 94.1MiB/s]


In [None]:
# Number of records to embed. None embeds the whole dataset; set a small integer
# (e.g. 5) only for a quick smoke test, because the saved embeddings must line up
# one-to-one with the labelled records.
EMBED_LIMIT = None

X = []

for image in image_data[:EMBED_LIMIT]:
  # Keep the part of the stored path after the last '../' and resolve it against data_folder.
  path = data_folder + image[0].split('../')[-1]
  embedding = get_image_embedding(path)
  X.append(embedding)

Save a copy of the embeddings.

In [None]:
# Persist the list of embeddings alongside the rest of the data.
embeddings_path = data_folder + 'embeddings.pkl'
savePKL(X, embeddings_path)