# Fine-tuning OpenCLIP

We used this notebook to train a logistic regression classifier on precomputed OpenCLIP image embeddings, using each image's year as its class label.

In [None]:
import torch
import json
import os
import time
import pickle

from google.colab import drive

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score

In [None]:
# Mount Google Drive at /content/drive; force_remount=True asks Colab to remount even if the drive is already mounted
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Both folders live under the same project directory on the mounted Drive.
# They are derived from a single root so the Drive path is spelled identically
# for data and models ('MyDrive', the path Colab mounts the user's drive at).
# Trailing slashes are kept because file names are appended by plain string concatenation.
project_folder = '/content/drive/MyDrive/dating-images/'
data_folder = project_folder + 'data/'
model_folder = project_folder + 'models/'

In [None]:
# Create the model output folder (and any missing parents); exist_ok=True makes re-running this cell harmless
os.makedirs(model_folder, exist_ok=True)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

## Load image embeddings and labels

In [None]:
# helpers to load and save pickle and json files

def loadPKL(path):
  """Load and return the object pickled in the file at `path`.

  Prints 'Loaded <name>', where <name> is the part of `path` after the last '/'.

  NOTE: unpickling can execute arbitrary code, so only call this on files
  you created yourself or otherwise trust.
  """
  # The context manager closes the file even if unpickling raises
  with open(path, 'rb') as infile:
    X = pickle.load(infile)
  print('Loaded ' + path.split('/')[-1])
  return X

def savePKL(data, path):
  """Pickle `data` into the file at `path` (overwriting it) and print 'Saved <name>'.

  <name> is the part of `path` after the last '/'.
  """
  with open(path, 'wb') as outfile:
    pickle.dump(data, outfile)
  file_name = path.rsplit('/', 1)[-1]
  print(f'Saved {file_name}')

def loadJSON(path):
  """Parse and return the JSON document stored in the file at `path`.

  Prints 'Loaded <name>', where <name> is the part of `path` after the last '/'.
  """
  # Opened in binary mode so the json module decodes the bytes itself;
  # the context manager closes the file even if parsing raises
  with open(path, 'rb') as infile:
    X = json.load(infile)
  print('Loaded ' + path.split('/')[-1])
  return X

def saveJSON(data, path):
  """Serialize `data` as JSON into the file at `path` (overwriting it) and print 'Saved <name>'.

  <name> is the part of `path` after the last '/'.
  """
  with open(path, 'w') as outfile:
    json.dump(data, outfile)
  file_name = path.rsplit('/', 1)[-1]
  print(f'Saved {file_name}')

In [None]:
# Load the image metadata (JSON) and the two pickled embedding sets from the data folder;
# embeddings_colorized.pkl holds the embeddings made from the colorized images
image_data = loadJSON(data_folder + 'image_data.json')
embeddings = loadPKL(data_folder + 'embeddings.pkl')
embeddings_colorized = loadPKL(data_folder + 'embeddings_colorized.pkl')

Loaded image_data.json
Loaded embeddings.pkl
Loaded embeddings_colorized.pkl


In [None]:
# image_data = [[path, data, year, scene, train/test], ... ]

# Index 2 of every metadata record is the year label.
# Read that field directly instead of transposing the whole table with zip(*...):
# no other column is materialized, and empty metadata yields an empty tuple
# rather than an IndexError. The result stays a tuple, aligned with image_data.
years = tuple(record[2] for record in image_data)
print(years[:10])

(1980, 1951, 1994, 1963, 1986, 1954, 1989, 1993, 1985, 1993)


## Make classifier

In [None]:
# Logistic regression classifier: random_state=0 fixes the seed, max_iter=5000 caps the solver iterations, verbose=0 keeps fitting quiet
clf = LogisticRegression(random_state=0, max_iter=5000, verbose=0)

## Split data

We split the data using the `'train'` and `'test'` assignments saved with the image metadata.

In [None]:
# image_data = [[path, data, year, scene, train/test], ... ]

def split_train_test_sets(image_data, list_to_split):
  """Split `list_to_split` into train and test lists using the metadata assignments.

  The last field of each record in `image_data` decides where the element of
  `list_to_split` at the same index goes: 'train' or 'test'. Records with any
  other value are reported with a printed message and left out of both lists.

  Returns a (train, test) tuple of lists, each in the original order.
  """
  train_items, test_items = [], []
  for idx, record in enumerate(image_data):
    assignment = record[-1]
    if assignment == 'train':
      train_items.append(list_to_split[idx])
    elif assignment == 'test':
      test_items.append(list_to_split[idx])
    else:
      # this shouldn't happen: every image is expected to carry an assignment
      print("train/test set not assigned for image index " + str(idx))
  return train_items, test_items

In [None]:
# Apply the same metadata-driven split to the embeddings and to the year labels,
# so X_* and years_* stay aligned index by index
X_train, X_test = split_train_test_sets(image_data, embeddings)
years_train, years_test = split_train_test_sets(image_data, years)

## Train classifier

We followed the same process to train and evaluate a classifier using the embeddings made from the colorized images.

In [None]:
# Fit the classifier on the training embeddings, with each image's year as its class label
clf.fit(X_train, years_train)

# Taken right after fitting; used later as the file name of the saved model
timestamp = time.strftime("%Y%m%d-%H%M")

# Predict a year for every test embedding
y_pred = clf.predict(X_test)

# Score the predictions against the test-set year labels
score = accuracy_score(years_test, y_pred)
print(score)

0.13719588663155255


## Save model

In [None]:
# Pickle the fitted classifier into the model folder, named after the timestamp taken when training finished
savePKL(clf, model_folder + f'{timestamp}.pkl')

Saved 20230621-1033.pkl
