In [102]:
import pandas as pd
from PIL import Image
import open_clip
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import math
import json
import csv
import torch

In [4]:
input_file_name = "/Users/elangrossman/Downloads/idigbio.json"

In [5]:

model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
tokenizer = open_clip.get_tokenizer('ViT-B-32')

In [127]:
def extract_data(file_location):
    with open (file_location, "rb") as f:
        lines = f.readlines()
    new_lines = []
    for line in lines:
        line = json.loads(line)
        
        media = line["media"]
        for med in media:
            d = {}
            if "dwc:scientificName" not in line["data"]:
                print("scientific name not found")
                d["scientificName"] = line["data"]["dwc:genus"]
            else:
                d["scientificName"] = line["data"]["dwc:scientificName"]
            if "ac:accessURI" in med["data"]:
                url = med["data"]["ac:accessURI"]
                d["media_location"] = url
                new_lines.append(d)

    return pd.DataFrame(new_lines)
    

In [84]:
def download_image_and_preprocess(image_location):
    """This method downloads an array of images and outputs thier vectors in memory"""
    content = requests.get(image_location).content
    img = Image.open(BytesIO(content))
    image = preprocess(img).unsqueeze(0)
    return model.encode_image(image)

###
def process_images(file_location):
    """This section details how to convert these batches to vectors! We use multi-threading to download many images at once. 
    We don't save any of the images and only compute them in memory. The next step will
    """

    df = extract_data(file_location)

    batch_size=10
    num_chunks = math.ceil(len(df)/batch_size)

    with open("vectors.csv", "a") as out_file:
        writer = csv.writer(out_file)

        i = 0

        for chunk in np.array_split(df, num_chunks):
            i+=1

            if i %100 ==0 :
                print(i*1000)
            media_locations = chunk["media_location"]
            with ThreadPoolExecutor(max_workers=5) as executor:
                downloaded_images = executor.map(download_image_and_preprocess, media_locations)

                ### Do something with images (probably store them somewhere)
                for i, image in enumerate(downloaded_images):
                    # line = ', '.join(map(str, image.detach().numpy().tolist()))
                    row = image.detach().numpy().tolist()[0]


                    writer.writerow([media_locations.iloc[i],] + row)

                    # out_file.write(line + '\n')
                    

    # return all_images



In [85]:
process_images(input_file_name)

  return bound(*args, **kwds)


In [78]:
with open("vectors.csv", "rb") as f:
    data = f.readlines()

In [79]:
data[0]

b'"https://images.collections.yale.edu/iiif/2/ypm:e1ce7f3b-fef8-44a3-bacf-60d65c5a478c/full/!1920,1920/0/default.jpg",-0.23820354044437408,-0.3410683274269104,-0.07838624715805054,0.36089572310447693,-0.2663402259349823,-0.28753626346588135,0.06049162149429321,0.38144567608833313,-0.07850157469511032,-0.8259036540985107,0.10293406993150711,-0.24331752955913544,0.13304539024829865,0.22863388061523438,0.22755363583564758,-0.026215866208076477,0.2618148624897003,-0.08750877529382706,0.8693447709083557,0.05802713707089424,0.003845561295747757,0.022130087018013,-0.49914756417274475,0.024088839069008827,-0.03101903200149536,-0.09103882312774658,-0.15422721207141876,0.3784756362438202,0.49847477674484253,0.06891927868127823,0.37374475598335266,-0.14246083796024323,0.13878054916858673,0.20563249289989471,0.3877609372138977,0.06640904396772385,0.22132755815982819,-0.9281030297279358,0.0445358008146286,0.050260066986083984,0.2645643949508667,0.3157598674297333,-0.2251501977443695,1.1640268564224

In [82]:
pd.read_csv("vectors.csv", header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,512
0,https://images.collections.yale.edu/iiif/2/ypm...,-0.238204,-0.341068,-0.078386,0.360896,-0.26634,-0.287536,0.060492,0.381446,-0.078502,...,0.176158,0.126622,-0.338246,-0.496352,-0.026853,-0.40324,-0.227414,0.010856,-0.236127,0.613918
1,https://images.collections.yale.edu/iiif/2/ypm...,0.264021,-0.78685,-0.291065,0.117454,-0.510014,-0.097921,0.468463,0.4976,0.087577,...,0.18101,-0.642937,-0.867378,-0.790296,-0.056268,0.259121,-0.138452,-0.258864,-0.481881,0.376177
2,https://images.collections.yale.edu/iiif/2/ypm...,-0.221329,-0.772506,0.031218,0.32949,-0.894112,-0.215537,0.365567,0.499378,-0.305687,...,0.193448,-0.899448,-0.282458,-0.786761,-0.103858,-0.314209,-0.105062,-0.161489,-0.241206,0.259111
3,https://images.collections.yale.edu/iiif/2/ypm...,0.264021,-0.78685,-0.291065,0.117454,-0.510014,-0.097921,0.468463,0.4976,0.087577,...,0.18101,-0.642937,-0.867378,-0.790296,-0.056268,0.259121,-0.138452,-0.258864,-0.481881,0.376177
4,https://images.collections.yale.edu/iiif/2/ypm...,-0.137508,0.210246,-0.212738,-0.073958,-0.359957,0.085589,0.311637,0.296473,-0.48148,...,0.242932,-0.266456,-0.613158,-0.229771,0.21191,-0.392202,-0.109289,-0.051735,-0.327489,0.053363
5,https://images.collections.yale.edu/iiif/2/ypm...,-0.517694,0.013447,-0.280132,0.832022,-0.129944,-0.166346,0.073072,0.068697,-0.173559,...,0.490669,-0.100886,-0.215337,0.124228,-0.266959,-0.085759,0.68415,-0.405431,-0.186269,0.128743
6,https://images.collections.yale.edu/iiif/2/ypm...,-0.46015,-1.107975,-0.54487,0.57398,-0.039855,-0.42496,-0.261986,0.259361,-0.087468,...,0.397919,-0.298072,-0.031638,-0.026006,-0.350282,-0.347362,0.642822,-0.278033,-0.052285,0.306975
7,https://images.collections.yale.edu/iiif/2/ypm...,-0.329458,0.006447,-0.461486,0.723967,0.039809,-0.262739,-0.111899,0.086717,-0.377734,...,0.260366,0.275365,-0.099998,0.149744,-0.121843,-0.149105,0.589882,-0.202246,-0.292846,0.174454
8,https://images.collections.yale.edu/iiif/2/ypm...,0.206068,-0.893098,-0.579525,-0.034299,-0.611491,-0.110002,0.269454,0.264315,-0.643301,...,0.528296,-0.177052,-0.648424,-0.376443,0.006753,0.011923,-0.472017,-0.037721,-0.046511,0.350326
9,https://images.collections.yale.edu/iiif/2/ypm...,0.206068,-0.893098,-0.579525,-0.034299,-0.611491,-0.110002,0.269454,0.264315,-0.643301,...,0.528296,-0.177052,-0.648424,-0.376443,0.006753,0.011923,-0.472017,-0.037721,-0.046511,0.350326


### Postgres PGVector

The next section details how to write to the psql database with an example
Make sure to run this:

If you don't have access, try using the pgvector docker image which is just postgres with the pgvector extention enabled

```sql
CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE IF NOT EXISTS embeddings (
  id SERIAL PRIMARY KEY,
  embedding vector(512),
  url text,
  created_at timestamptz DEFAULT now()
);
```

In [122]:
import psycopg2
conn = psycopg2.connect(
        user="postgres",
        password="postgres",
        host="localhost",
        port=8082,  # The port you exposed in docker-compose.yml
        database="postgres"
    )
conn.autocommit = True


In [116]:
cur = conn.cursor()

In [111]:
file_location = "https://inaturalist-open-data.s3.amazonaws.com/photos/345120366/original.jpg"
vector = download_image_and_preprocess(file_location)
vector = torch.nn.functional.normalize(vector)
vector = vector.detach().numpy().tolist()[0]
vector = json.dumps(vector)
vector

In [118]:
cur.execute(
                "INSERT INTO embeddings (url, embedding) VALUES (%s, %s)",
                (file_location, vector)
            )

In [121]:
cur.close()
conn.close()

## Putting it all together:

we will now download all of our images, but instead add a process to write to our new table:

In [134]:
def write_to_pgvector(cur, url, vector):

    return cur.execute(
                "INSERT INTO embeddings (url, embedding) VALUES (%s, %s)",
                (url, vector)
            )
    
def download_image_and_preprocess(image_location):
    """This method downloads an array of images and outputs thier vectors in memory"""
    try:
        content = requests.get(image_location).content
        img = Image.open(BytesIO(content))
        image = preprocess(img).unsqueeze(0)
        vector = model.encode_image(image)
    except:
        print("Image not found")
        vector = None

    return vector

def process_images(file_location, cur):
    """This section details how to convert these batches to vectors! We use multi-threading to download many images at once. 
    We don't save any of the images and only compute them in memory. The next step will
    """

    df = extract_data(file_location)

    batch_size=10
    num_chunks = math.ceil(len(df)/batch_size)


    i = 0

    for chunk in np.array_split(df, num_chunks):
        i+=1

        if i %100 ==0 :
            print(i*1000)
        media_locations = chunk["media_location"]
        with ThreadPoolExecutor(max_workers=5) as executor:
            downloaded_images = executor.map(download_image_and_preprocess, media_locations)

            ### Do something with images (probably store them somewhere)
            for i, image in enumerate(downloaded_images):
                # line = ', '.join(map(str, image.detach().numpy().tolist()))
                if image:

                    row = torch.nn.functional.normalize(image)
                    row = image.detach().numpy().tolist()[0]
                    vector = json.dumps(row)

                    url = media_locations.iloc[i]

                    res = write_to_pgvector(cur, url, vector)
                else:
                    print("{} not embedded".format(url))





In [135]:
cur = conn.cursor()
process_images("idigbio", cur)
cur.close()


scientific name not found
scientific name not found
scientific name not found
scientific name not found
scientific name not found
scientific name not found


  return bound(*args, **kwds)


ConnectionError: HTTPSConnectionPool(host='images.collections.yale.edu', port=443): Max retries exceeded with url: /iiif/2/ypm:543a64ca-81c9-4057-97f3-b7cd46acc5e5/full/!1920,1920/0/default.jpg (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x2fa7d2f50>: Failed to resolve 'images.collections.yale.edu' ([Errno 8] nodename nor servname provided, or not known)"))