In [None]:
#!pip install datasets
#!pip install httpx
!pip install pinecone-client

Collecting pinecone-client
  Using cached pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Using cached pinecone_client-5.0.1-py3-none-any.whl (244 kB)
Installing collected packages: pinecone-client
Successfully installed pinecone-client-5.0.1


In [None]:
import asyncio
from datasets import load_dataset
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import httpx
from io import BytesIO
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata

# Movie-poster dataset; each 'train' record is read below via its 'imdbId'
# and 'poster' (image URL) fields.
ds = load_dataset("pinecone/movie-posters")

# Initialize CLIP model and processor from a single checkpoint name so the
# preprocessing can never drift away from the weights it was built for.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Async func to fetch image from URL and get its embedding
async def get_image_embedding(poster_url):
    """Download an image and return its CLIP image embedding.

    Args:
        poster_url: URL of the image to download.

    Returns:
        The squeezed output of ``model.get_image_features`` for the image,
        converted to a NumPy array.

    Raises:
        httpx.HTTPStatusError: If the server does not answer with a success
            status.
        httpx.HTTPError: On other request failures raised by httpx.
        PIL.UnidentifiedImageError: If the downloaded bytes are not an image
            PIL can decode.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(poster_url)
    # Surface dead or forbidden links as HTTP errors rather than letting PIL
    # choke on an HTML error page.
    response.raise_for_status()

    # Posters may be palette, grayscale or CMYK files; normalise to RGB so the
    # processor always receives a 3-channel image.
    image = Image.open(BytesIO(response.content)).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
    return embedding.squeeze().numpy()

# Async funcs to process the dataset and upload embeddings to Pinecone
async def process_and_upload(max_concurrency=8):
    """Embed and upload every record of ``ds['train']``.

    Each record's ``'imdbId'`` and ``'poster'`` fields are handed to
    ``upload_embedding``. Records are processed concurrently, but never more
    than ``max_concurrency`` at a time, and a failure on one record does not
    stop the others.

    Args:
        max_concurrency: Upper bound on records being processed at once.
            Must be at least 1.

    Returns:
        None. If any record failed, a short summary is printed.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # Without a bound, one download per dataset row would be started at once.
    limiter = asyncio.Semaphore(max_concurrency)

    async def upload_one(item):
        """Process one record; return ``(imdb_id, error)`` on failure, else None."""
        imdb_id = item['imdbId']
        async with limiter:
            try:
                await upload_embedding(imdb_id, item['poster'])
            except Exception as exc:  # reported below, not swallowed
                return imdb_id, exc
        return None

    outcomes = await asyncio.gather(*(upload_one(item) for item in ds['train']))

    failures = [outcome for outcome in outcomes if outcome is not None]
    if failures:
        print(f"{len(failures)} of {len(outcomes)} records failed; first few:")
        for imdb_id, exc in failures[:10]:
            print(f"  {imdb_id}: {exc!r}")

async def upload_embedding(imdb_id, poster_url):
    """Embed one poster and upsert it into the Pinecone index.

    Uses the module-level ``index`` object, which must be defined before this
    coroutine runs.

    Args:
        imdb_id: Identifier stored as the vector id.
        poster_url: URL of the poster image to embed.

    Raises:
        Whatever ``get_image_embedding`` or ``index.upsert`` raise.
    """
    embedding = await get_image_embedding(poster_url)
    # Send plain Python floats rather than a NumPy array.
    values = embedding.tolist()
    # `upsert` is a synchronous call; running it in a worker thread keeps the
    # event loop free to service the other in-flight downloads.
    await asyncio.to_thread(index.upsert, vectors=[(imdb_id, values)])

In [None]:
# Connect to Pinecone, make sure the index exists, and open a handle to it

pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))

INDEX_NAME = "movie-posters"

# Create the index only when it is missing so this cell can be re-run
# without failing on an already-existing index.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=512, # dimensions from CLIP
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Handle that upload_embedding() calls `upsert` on.
index = pc.Index(INDEX_NAME)

In [None]:
# Run the process and upload function
asyncio.run(process_and_upload())

RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
# Peek at one record to see the fields the pipeline reads.
ds['train'][100]

{'imdbId': 'tt3017864',
 'poster': 'https://m.media-amazon.com/images/M/MV5BYTI0ZDUxM2QtZTVjNy00NjViLThlOTEtZGE1M2NhYzU1OGY1XkEyXkFqcGdeQXVyNTAyODkwOQ@@._V1_SX300.jpg'}