Populate Pinecone with movie poster embeddings and metadata

In [None]:
!pip install datasets
!pip install pinecone-client
!pip install tmdbv3api

Collecting tmdbv3api
  Downloading tmdbv3api-1.9.0-py3-none-any.whl.metadata (8.0 kB)
Downloading tmdbv3api-1.9.0-py3-none-any.whl (25 kB)
Installing collected packages: tmdbv3api
Successfully installed tmdbv3api-1.9.0


CLIP

In [None]:
from datasets import load_dataset
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
import pinecone
from tmdbv3api import TMDb, Find
from google.colab import userdata
import time

# Configuration Variables
MODEL_NAME = "openai/clip-vit-base-patch32"
PROCESSOR_NAME = "openai/clip-vit-base-patch32"
NAMESPACE = "CLIP"

# `import pinecone` only binds the module; the `Pinecone` client class used
# below has to be imported by name. Without this the cell raises NameError on
# a fresh kernel and only works if a later cell has already been executed.
from pinecone import Pinecone

# Initialize Pinecone
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters")

# Initialize TMDB API client
tmdb = TMDb()
tmdb.api_key = userdata.get("TMDB_API_KEY")

# Initialize the Find class
find = Find()

# Load the Pinecone movie poster dataset
ds = load_dataset("pinecone/movie-posters")

# Initialize CLIP model and processor
model = CLIPModel.from_pretrained(MODEL_NAME)
processor = CLIPProcessor.from_pretrained(PROCESSOR_NAME)

# Function to fetch an image from a URL and get its embedding
def get_image_embedding(poster_url, timeout=10):
    """Download a poster and return its CLIP image embedding.

    Args:
        poster_url: URL of the poster image.
        timeout: Seconds to wait for the image server before giving up.

    Returns:
        The result of ``model.get_image_features`` for the single image,
        squeezed and converted to a NumPy array.

    Raises:
        requests.RequestException: On network errors, timeouts, invalid URLs
            or a non-success HTTP status.
        OSError: If PIL cannot decode the downloaded bytes.
    """
    response = requests.get(poster_url, timeout=timeout)
    # Fail on HTTP errors here; otherwise the body of an error response would
    # be handed to PIL and surface as an opaque "cannot identify image file".
    response.raise_for_status()
    # Normalise every image mode to RGB, as the ResNet and VGG16 versions of
    # this function do.
    image = Image.open(BytesIO(response.content)).convert('RGB')
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():  # inference only, no autograd bookkeeping
        embedding = model.get_image_features(**inputs)
    return embedding.squeeze().numpy()

# Function to fetch movie metadata from TMDB API using IMDb ID
def get_movie_metadata(imdb_id):
    """Look up a movie on TMDB by its IMDb ID.

    Returns a dict with the ``title``, ``overview`` and ``release_date`` of
    the first movie result, or ``None`` when TMDB returns no movie result or
    the lookup raises (a message is printed in both cases).
    """
    # Pause 20 ms before every lookup, which caps the rate at about 50
    # requests per second so the TMDB API is not overloaded.
    time.sleep(0.02)
    try:
        # Resolve the IMDb ID through the Find endpoint.
        matches = find.find(imdb_id, 'imdb_id').movie_results
        if not matches:
            print(f"No metadata found for IMDb ID {imdb_id}")
            return None
        first_match = matches[0]
        wanted_fields = ('title', 'overview', 'release_date')
        return {field: getattr(first_match, field) for field in wanted_fields}
    except Exception as e:
        print(f"Error fetching metadata for IMDb ID {imdb_id}: {e}")
        return None

# Process the dataset and upload embeddings with metadata to Pinecone
def process_and_upload(batch_size=100):
    """Embed every poster in ``ds['train']`` and upsert it to Pinecone.

    For each row the TMDB metadata is fetched first; rows without metadata, or
    whose poster cannot be embedded, are skipped (the helpers / this function
    print a message). Vectors are sent to ``index`` under ``NAMESPACE`` in
    batches instead of one request per vector.

    Args:
        batch_size: Maximum number of vectors per ``index.upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Keyed by vector id so that a repeated imdbId inside one batch keeps only
    # the latest row, which is what sequential single upserts would leave.
    pending = {}

    def flush():
        # Send whatever is buffered as a single request.
        if pending:
            index.upsert(vectors=list(pending.values()), namespace=NAMESPACE)
            pending.clear()

    for item in ds['train']:
        imdb_id = item['imdbId']
        poster_url = item['poster']

        # Fetch movie metadata
        metadata = get_movie_metadata(imdb_id)
        if metadata is None:
            continue

        # Add poster URL from dataset to metadata
        metadata['poster_url'] = poster_url

        # Generate image embedding
        try:
            embedding = get_image_embedding(poster_url)
        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")
            continue

        # Send plain Python floats, as the ResNet and VGG16 versions do.
        values = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
        pending[imdb_id] = (imdb_id, values, metadata)
        if len(pending) >= batch_size:
            flush()

    # Upload the final, partially filled batch.
    flush()

# Run the CLIP ingest over every row of ds['train']; each row costs one TMDB
# lookup and one poster download.
process_and_upload()

No metadata found for IMDb ID tt5491994
No metadata found for IMDb ID tt3218680
No metadata found for IMDb ID tt3012698
Failed to get embedding for https://m.media-amazon.com/images/M/MV5BYzE2MjEwMTQtOTQ2Mi00ZWExLTkyMjUtNmJjMjBlYWFjZDdlXkEyXkFqcGdeQXVyMTI3ODAyMzE2._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe0b6b84e00>
No metadata found for IMDb ID tt2879552
No metadata found for IMDb ID tt2401256
No metadata found for IMDb ID tt6769208
Failed to get embedding for N/A: Invalid URL 'N/A': No scheme supplied. Perhaps you meant https://N/A?
No metadata found for IMDb ID tt4299972
No metadata found for IMDb ID tt5189670
No metadata found for IMDb ID tt7343762
No metadata found for IMDb ID tt3762944
Failed to get embedding for https://m.media-amazon.com/images/M/MV5BZWJlODhhYTEtZjg3YS00NjNmLTgwNTMtMjBmYTZhYjQzMDJkXkEyXkFqcGdeQXVyNjAwNDUxODI@._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe0b654b420>
Failed to get embedding for https://m.media-a

In [None]:
# Spot-check the TMDB lookup for a single IMDb ID.
get_movie_metadata("tt6087562")

{'title': 'The Scythian Lamb',
 'overview': "Based on a manga written by Tatsuhiko Yamagami, the story is set in a former seaport town Uobuka, where 6 former criminals were sent to live there by the government, with the intention of re-socialising them. Aside from the few who know about the project, the general townsfolk has no idea of the former convicts' identities. Tsukisue is the pleasant and efficient municipal official put in charge of the programme. As he slowly learns about their past, a body is discovered.",
 'release_date': '2018-02-03'}

In [None]:
# Initialize Pinecone and create an index
from pinecone import Pinecone
from pinecone import ServerlessSpec
from google.colab import userdata

pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))

INDEX_NAME = "movie-posters"

# Create the index only when it does not exist yet. This replaces the
# commented-out create_index call so the cell can be re-run safely and the
# notebook also works against a fresh Pinecone project.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=512,  # dimensions from CLIP
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Access the index
index = pc.Index(INDEX_NAME)


In [None]:
#index.delete(delete_all=True, namespace='')

{}

In [None]:
# Print the index statistics (dimension and per-namespace vector counts) to
# check what has been uploaded so far.
print(index.describe_index_stats())

{'dimension': 512,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10191}},
 'total_vector_count': 10191}


ResNet

In [None]:
from datasets import load_dataset
import torch
from torchvision import models, transforms
from PIL import Image
import requests
from io import BytesIO
import pinecone
from tmdbv3api import TMDb, Find
from google.colab import userdata
import time

# Configuration Variables
MODEL_NAME = "resnet50"  # Using ResNet50 model
NAMESPACE = "ResNet-50"
VECTOR_DIM = 512  # Limiting to 512 to match CLIP dimensions (Pinecone free plan has only one index)

# `import pinecone` only binds the module; import the `Pinecone` client class
# by name so this cell does not depend on another cell having imported it.
from pinecone import Pinecone

# Initialize Pinecone
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters")

# Initialize TMDB API client
tmdb = TMDb()
tmdb.api_key = userdata.get("TMDB_API_KEY")

# Initialize the Find class
find = Find()

# Load the Pinecone movie poster dataset
ds = load_dataset("pinecone/movie-posters")

# Initialize ResNet model and modify it to produce 512-dimensional embeddings
class ResNet50WithEmbedding(torch.nn.Module):
    """ResNet-50 feature extractor followed by a linear projection.

    The projection layer is never trained in this notebook, so its weights are
    whatever the initialiser produces. They are drawn from a generator seeded
    with ``seed`` so that every instantiation yields the same projection;
    otherwise embeddings computed in a later kernel session would not be
    comparable with the ones already stored in Pinecone.

    Args:
        embedding_dim: Size of the output embedding.
        seed: Seed for the projection layer's initial weights.
    """

    def __init__(self, embedding_dim, seed=0):
        super(ResNet50WithEmbedding, self).__init__()
        self.base_model = models.resnet50(pretrained=True)
        # Remove the final fully connected layer
        self.base_model = torch.nn.Sequential(*list(self.base_model.children())[:-1])
        # Add a new fully connected layer
        self.fc = torch.nn.Linear(in_features=2048, out_features=embedding_dim)
        self._seed_projection(seed)

    def _seed_projection(self, seed):
        """Re-draw the projection weights deterministically from ``seed``."""
        # A private generator keeps the global RNG state untouched.
        generator = torch.Generator().manual_seed(seed)
        # Same U(-1/sqrt(in_features), 1/sqrt(in_features)) range that
        # torch.nn.Linear documents for its default initialisation.
        bound = self.fc.in_features ** -0.5
        with torch.no_grad():
            self.fc.weight.uniform_(-bound, bound, generator=generator)
            self.fc.bias.uniform_(-bound, bound, generator=generator)

    def forward(self, x):
        """Return one ``embedding_dim``-sized vector per input image."""
        x = self.base_model(x)
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)
        return x

# Build the embedding model and switch it to evaluation mode before any
# forward pass is run.
model = ResNet50WithEmbedding(VECTOR_DIM)
model.eval()

# Define image preprocessing
# Pipeline: Resize(256) -> CenterCrop(224) -> tensor -> per-channel normalisation.
# NOTE(review): the mean/std values look like the usual ImageNet statistics
# expected by torchvision's pretrained weights — confirm.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to fetch an image from a URL and get its embedding
def get_image_embedding(poster_url, timeout=10):
    """Download a poster and return its embedding from the global ``model``.

    Args:
        poster_url: URL of the poster image.
        timeout: Seconds to wait for the image server before giving up.

    Returns:
        The model output for the single image, squeezed and converted to a
        NumPy array.

    Raises:
        requests.RequestException: On network errors, timeouts, invalid URLs
            or a non-success HTTP status.
        OSError: If PIL cannot decode the downloaded bytes.
    """
    response = requests.get(poster_url, timeout=timeout)
    # Fail on HTTP errors here; otherwise the body of an error response would
    # be handed to PIL and surface as an opaque "cannot identify image file".
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert('RGB')
    inputs = preprocess(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # inference only, no autograd bookkeeping
        embedding = model(inputs).squeeze().numpy()
    return embedding

# Function to fetch movie metadata from TMDB API using IMDb ID
def get_movie_metadata(imdb_id):
    """Look up a movie on TMDB by its IMDb ID.

    Returns a dict with the ``title``, ``overview`` and ``release_date`` of
    the first movie result, or ``None`` when TMDB returns no movie result or
    the lookup raises (a message is printed in both cases).
    """
    # Pause 20 ms before every lookup, which caps the rate at about 50
    # requests per second so the TMDB API is not overloaded.
    time.sleep(0.02)
    try:
        matches = find.find(imdb_id, 'imdb_id').movie_results
        if not matches:
            print(f"No metadata found for IMDb ID {imdb_id}")
            return None
        first_match = matches[0]
        wanted_fields = ('title', 'overview', 'release_date')
        return {field: getattr(first_match, field) for field in wanted_fields}
    except Exception as e:
        print(f"Error fetching metadata for IMDb ID {imdb_id}: {e}")
        return None

# Process the dataset and upload embeddings with metadata to Pinecone
def process_and_upload(batch_size=100):
    """Embed every poster in ``ds['train']`` and upsert it to Pinecone.

    For each row the TMDB metadata is fetched first; rows without metadata, or
    whose poster cannot be embedded, are skipped (the helpers / this function
    print a message). Vectors are sent to ``index`` under ``NAMESPACE`` in
    batches instead of one request per vector.

    Args:
        batch_size: Maximum number of vectors per ``index.upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Keyed by vector id so that a repeated imdbId inside one batch keeps only
    # the latest row, which is what sequential single upserts would leave.
    pending = {}

    def flush():
        # Send whatever is buffered as a single request.
        if pending:
            index.upsert(vectors=list(pending.values()), namespace=NAMESPACE)
            pending.clear()

    for item in ds['train']:
        imdb_id = item['imdbId']
        poster_url = item['poster']

        # Fetch movie metadata
        metadata = get_movie_metadata(imdb_id)
        if metadata is None:
            continue

        # Add poster URL from dataset to metadata
        metadata['poster_url'] = poster_url

        # Generate image embedding
        try:
            embedding = get_image_embedding(poster_url)
        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")
            continue

        # Pinecone receives plain Python floats.
        values = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
        pending[imdb_id] = (imdb_id, values, metadata)
        if len(pending) >= batch_size:
            flush()

    # Upload the final, partially filled batch.
    flush()

# Run the ResNet ingest over every row of ds['train']; each row costs one TMDB
# lookup and one poster download.
process_and_upload()

Failed to get embedding for https://m.media-amazon.com/images/M/MV5BMDkzNmRhNTMtZDI4NC00Zjg1LTgxM2QtMjYxZDQ3OWJlMDRlXkEyXkFqcGdeQXVyNTU5MjkzMTU@._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe0b7856ed0>
No metadata found for IMDb ID tt5491994
No metadata found for IMDb ID tt3218680
No metadata found for IMDb ID tt3012698
Failed to get embedding for https://m.media-amazon.com/images/M/MV5BYzE2MjEwMTQtOTQ2Mi00ZWExLTkyMjUtNmJjMjBlYWFjZDdlXkEyXkFqcGdeQXVyMTI3ODAyMzE2._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe0b72de1b0>
No metadata found for IMDb ID tt2879552
No metadata found for IMDb ID tt2401256
No metadata found for IMDb ID tt6769208
Failed to get embedding for N/A: Invalid URL 'N/A': No scheme supplied. Perhaps you meant https://N/A?
No metadata found for IMDb ID tt4299972
No metadata found for IMDb ID tt5189670
No metadata found for IMDb ID tt7343762
No metadata found for IMDb ID tt3762944
Failed to get embedding for https://m.media-a

VGG16

In [None]:
import torch
from torchvision import models, transforms
from PIL import Image
import requests
from io import BytesIO
import pinecone
from tmdbv3api import TMDb, Find
from google.colab import userdata
import time

# Configuration Variables
VECTOR_DIM = 512  # Dimension for Pinecone index, limited to 512 by CLIP, and only one index allowed on free tier Pinecone
NAMESPACE = "VGG16"

# This cell calls `Pinecone` and `load_dataset` but its import list binds
# neither name; import them here so the cell does not depend on other cells
# having been run first.
from datasets import load_dataset
from pinecone import Pinecone

# Initialize Pinecone
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters")

# Initialize TMDB API client
tmdb = TMDb()
tmdb.api_key = userdata.get("TMDB_API_KEY")

# Initialize the Find class
find = Find()

# Load the Pinecone movie poster dataset
ds = load_dataset("pinecone/movie-posters")

# Initialize VGG16 model and modify it to produce 512-dimensional embeddings
class VGG16WithEmbedding(torch.nn.Module):
    """VGG16 with a truncated classifier head followed by a linear projection.

    The projection layer is never trained in this notebook, so its weights are
    whatever the initialiser produces. They are drawn from a generator seeded
    with ``seed`` so that every instantiation yields the same projection;
    otherwise embeddings computed in a later kernel session would not be
    comparable with the ones already stored in Pinecone.

    Args:
        embedding_dim: Size of the output embedding.
        seed: Seed for the projection layer's initial weights.
    """

    def __init__(self, embedding_dim, seed=0):
        super(VGG16WithEmbedding, self).__init__()
        self.base_model = models.vgg16(pretrained=True)
        # Remove the final fully connected layers (drops the last three
        # modules of the classifier head)
        self.base_model.classifier = torch.nn.Sequential(*list(self.base_model.classifier.children())[:-3])
        # Add a new fully connected layer
        self.fc = torch.nn.Linear(in_features=4096, out_features=embedding_dim)
        self._seed_projection(seed)

    def _seed_projection(self, seed):
        """Re-draw the projection weights deterministically from ``seed``."""
        # A private generator keeps the global RNG state untouched.
        generator = torch.Generator().manual_seed(seed)
        # Same U(-1/sqrt(in_features), 1/sqrt(in_features)) range that
        # torch.nn.Linear documents for its default initialisation.
        bound = self.fc.in_features ** -0.5
        with torch.no_grad():
            self.fc.weight.uniform_(-bound, bound, generator=generator)
            self.fc.bias.uniform_(-bound, bound, generator=generator)

    def forward(self, x):
        """Return one ``embedding_dim``-sized vector per input image."""
        x = self.base_model(x)
        x = self.fc(x)
        return x

# Build the embedding model and switch it to evaluation mode before any
# forward pass is run.
model = VGG16WithEmbedding(VECTOR_DIM)
model.eval()

# Define image preprocessing
# Pipeline: Resize(256) -> CenterCrop(224) -> tensor -> per-channel normalisation.
# NOTE(review): the mean/std values look like the usual ImageNet statistics
# expected by torchvision's pretrained weights — confirm.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to fetch an image from a URL and get its embedding
def get_image_embedding(poster_url, timeout=10):
    """Download a poster and return its embedding from the global ``model``.

    Args:
        poster_url: URL of the poster image.
        timeout: Seconds to wait for the image server before giving up.

    Returns:
        The model output for the single image, squeezed and converted to a
        NumPy array.

    Raises:
        requests.RequestException: On network errors, timeouts, invalid URLs
            or a non-success HTTP status.
        OSError: If PIL cannot decode the downloaded bytes.
    """
    response = requests.get(poster_url, timeout=timeout)
    # Fail on HTTP errors here; otherwise the body of an error response would
    # be handed to PIL and surface as an opaque "cannot identify image file".
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert('RGB')
    inputs = preprocess(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # inference only, no autograd bookkeeping
        embedding = model(inputs).squeeze().numpy()
    return embedding

# Function to fetch movie metadata from TMDB API using IMDb ID
def get_movie_metadata(imdb_id):
    """Look up a movie on TMDB by its IMDb ID.

    Returns a dict with the ``title``, ``overview`` and ``release_date`` of
    the first movie result, or ``None`` when TMDB returns no movie result or
    the lookup raises (a message is printed in both cases).
    """
    # Pause 20 ms before every lookup, which caps the rate at about 50
    # requests per second so the TMDB API is not overloaded.
    time.sleep(0.02)
    try:
        matches = find.find(imdb_id, 'imdb_id').movie_results
        if not matches:
            print(f"No metadata found for IMDb ID {imdb_id}")
            return None
        first_match = matches[0]
        wanted_fields = ('title', 'overview', 'release_date')
        return {field: getattr(first_match, field) for field in wanted_fields}
    except Exception as e:
        print(f"Error fetching metadata for IMDb ID {imdb_id}: {e}")
        return None

# Process the dataset and upload embeddings with metadata to Pinecone
def process_and_upload(batch_size=100):
    """Embed every poster in ``ds['train']`` and upsert it to Pinecone.

    For each row the TMDB metadata is fetched first; rows without metadata, or
    whose poster cannot be embedded, are skipped (the helpers / this function
    print a message). Vectors are sent to ``index`` under ``NAMESPACE`` in
    batches instead of one request per vector.

    Args:
        batch_size: Maximum number of vectors per ``index.upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Keyed by vector id so that a repeated imdbId inside one batch keeps only
    # the latest row, which is what sequential single upserts would leave.
    pending = {}

    def flush():
        # Send whatever is buffered as a single request.
        if pending:
            index.upsert(vectors=list(pending.values()), namespace=NAMESPACE)
            pending.clear()

    for item in ds['train']:
        imdb_id = item['imdbId']
        poster_url = item['poster']

        # Fetch movie metadata
        metadata = get_movie_metadata(imdb_id)
        if metadata is None:
            continue

        # Add poster URL from dataset to metadata
        metadata['poster_url'] = poster_url

        # Generate image embedding
        try:
            embedding = get_image_embedding(poster_url)
        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")
            continue

        # Pinecone receives plain Python floats.
        values = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
        pending[imdb_id] = (imdb_id, values, metadata)
        if len(pending) >= batch_size:
            flush()

    # Upload the final, partially filled batch.
    flush()

# Run the VGG16 ingest over every row of ds['train']; each row costs one TMDB
# lookup and one poster download.
process_and_upload()


Failed to get embedding for https://m.media-amazon.com/images/M/MV5BMDkzNmRhNTMtZDI4NC00Zjg1LTgxM2QtMjYxZDQ3OWJlMDRlXkEyXkFqcGdeQXVyNTU5MjkzMTU@._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe075541cb0>
No metadata found for IMDb ID tt5491994
No metadata found for IMDb ID tt3218680
No metadata found for IMDb ID tt3012698
Failed to get embedding for https://m.media-amazon.com/images/M/MV5BYzE2MjEwMTQtOTQ2Mi00ZWExLTkyMjUtNmJjMjBlYWFjZDdlXkEyXkFqcGdeQXVyMTI3ODAyMzE2._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe075541c60>
No metadata found for IMDb ID tt2879552
No metadata found for IMDb ID tt2401256
No metadata found for IMDb ID tt6769208
Failed to get embedding for N/A: Invalid URL 'N/A': No scheme supplied. Perhaps you meant https://N/A?
No metadata found for IMDb ID tt4299972
No metadata found for IMDb ID tt5189670
No metadata found for IMDb ID tt7343762
No metadata found for IMDb ID tt3762944
Failed to get embedding for https://m.media-a

In [None]:
# NOTE(review): looks like a leftover scratch cell ("tedt" is presumably a
# typo for "test"); consider deleting it.
print("tedt")

tedt


Test results for CLIP Model

In [None]:
import random

# Create a dictionary for quick lookup of poster URLs by IMDb ID
id_to_poster = {item['imdbId']: item['poster'] for item in ds['train']}

# Choose a random movie from the dataset
random_movie = random.choice(ds['train'])

# Extract the IMDb ID of the random movie
specific_imdb_id = random_movie['imdbId']

# NOTE(review): get_image_embedding and model are rebound by the ResNet and
# VGG16 cells. This query against the "CLIP" namespace is only meaningful when
# the CLIP cell was the last of those cells to run.

# Look up the movie in the dataset using the specific IMDb ID
if specific_imdb_id in id_to_poster:
    poster_url = id_to_poster[specific_imdb_id]

    # Get the embedding of the specified movie
    embedding = get_image_embedding(poster_url)

    # Query Pinecone to get the top 10 closest movie posters
    result = index.query(vector=embedding.tolist(), top_k=10, include_metadata=True, namespace="CLIP")

    # Print the specified movie and the top 10 similar movies with metadata
    print(f"Specified Movie IMDb ID: {specific_imdb_id}")
    print(f"Poster URL: {poster_url}")
    print("Top 10 Similar Movies:")
    for match in result['matches']:
        similar_imdb_id = match['id']
        # Access the metadata
        metadata = match.get('metadata', {})
        title = metadata.get('title', "Title not found")
        overview = metadata.get('overview', "Overview not found")
        release_date = metadata.get('release_date', "Release date not found")
        # Prefer the URL stored with the vector and fall back to the dataset
        # lookup. A separate name keeps the query's own poster_url intact.
        similar_poster_url = metadata.get(
            'poster_url',
            id_to_poster.get(similar_imdb_id, "Poster URL not found"),
        )

        print(f"IMDb ID: {similar_imdb_id}, Score: {match['score']}")
        print(f"Title: {title}")
        print(f"Overview: {overview}")
        print(f"Release Date: {release_date}")
        print(f"Poster URL: {similar_poster_url}")
else:
    print(f"IMDb ID {specific_imdb_id} not found in the dataset.")


Specified Movie IMDb ID: tt8523678
Poster URL: https://m.media-amazon.com/images/M/MV5BZDIwMWNmNzctNjFlYS00ZTMyLTlmOWEtYThiZjJjYjcwNmE5XkEyXkFqcGdeQXVyMjk3NTUyOTc@._V1_SX300.jpg
Top 10 Similar Movies:
IMDb ID: tt8523678, Score: 1.00005567
Title: Flesh & Blood
Overview: Kimberly, a teenager suffering from agoraphobia, has not left the house since her mother's unsolved murder. On the eve of Thanksgiving, she begins to suspect that the safe harbor of home and her doting father may be a dangerous mirage.
Release Date: 2018-11-02
Poster URL: https://m.media-amazon.com/images/M/MV5BZDIwMWNmNzctNjFlYS00ZTMyLTlmOWEtYThiZjJjYjcwNmE5XkEyXkFqcGdeQXVyMjk3NTUyOTc@._V1_SX300.jpg
IMDb ID: tt4670016, Score: 0.730577171
Title: Wolves at the Door
Overview: Four friends gather at an elegant home during the Summer of Love, 1969. Unbeknownst to them, deadly visitors are waiting outside. What begins as a simple farewell party turns to a night of primal terror as the intruders stalk and torment the four, who

In [None]:
# Specify the IMDb ID of the movie you want to use
specific_imdb_id = 'tt3638686'

# Quick lookup of poster URLs by IMDb ID
id_to_poster = {item['imdbId']: item['poster'] for item in ds['train']}

# NOTE(review): get_image_embedding and model are rebound by the ResNet and
# VGG16 cells. This query against the "CLIP" namespace is only meaningful when
# the CLIP cell was the last of those cells to run.

# Look up the movie in the dataset using the specific IMDb ID
if specific_imdb_id in id_to_poster:
    poster_url = id_to_poster[specific_imdb_id]

    # Get the embedding of the specified movie
    embedding = get_image_embedding(poster_url)

    # Query Pinecone to get the top 10 closest movie posters
    result = index.query(vector=embedding.tolist(), top_k=10, include_metadata=True, namespace="CLIP")

    # Print the specified movie and the top 10 similar movies with metadata
    print(f"Specified Movie IMDb ID: {specific_imdb_id}")
    print(f"Poster URL: {poster_url}")
    print("Top 10 Similar Movies:")
    for match in result['matches']:
        similar_imdb_id = match['id']
        # Access the metadata
        metadata = match.get('metadata', {})
        title = metadata.get('title', "Title not found")
        overview = metadata.get('overview', "Overview not found")
        release_date = metadata.get('release_date', "Release date not found")
        # Prefer the URL stored with the vector and fall back to the dataset
        # lookup. A separate name keeps the query's own poster_url intact.
        similar_poster_url = metadata.get(
            'poster_url',
            id_to_poster.get(similar_imdb_id, "Poster URL not found"),
        )

        print(f"IMDb ID: {similar_imdb_id}, Score: {match['score']}")
        print(f"Title: {title}")
        print(f"Overview: {overview}")
        print(f"Release Date: {release_date}")
        print(f"Poster URL: {similar_poster_url}")
else:
    print(f"IMDb ID {specific_imdb_id} not found in the dataset.")


Specified Movie IMDb ID: tt3638686
Poster URL: https://m.media-amazon.com/images/M/MV5BNzI0MTk5ODg4MF5BMl5BanBnXkFtZTgwODQ2NjgwOTE@._V1_SX300.jpg
Top 10 Similar Movies:
IMDb ID: tt3638686, Score: 1.00113225
Title: Famous Nathan
Overview: A Coney Island-inspired, densely-layered visually dynamic documentary portrait of the life and times of the original Nathan's Famous, created in 1916 by filmmaker Lloyd Handwerker's grandparents, Nathan and Ida Handwerker. 30 years in the making, Famous Nathan interweaves decades-spanning archival footage, family photos and home movies, an eclectic soundtrack and never-before-heard audio from Nathan: his only interview, ever as well as compelling, intimate and hilarious interviews with the dedicated band of workers, not at all shy at offering opinions, memories and the occasional tall tale.
Release Date: 2015-07-17
Poster URL: https://m.media-amazon.com/images/M/MV5BNzI0MTk5ODg4MF5BMl5BanBnXkFtZTgwODQ2NjgwOTE@._V1_SX300.jpg
IMDb ID: tt4191702, Score: 0

Test TMDB API

In [None]:
import requests

# Read the TMDB key through google.colab's userdata instead of hard-coding it.
tmdb_api_key = userdata.get('TMDB_API_KEY')

# Function to get movie details from TMDB using IMDb ID
def get_movie_details(imdb_id, timeout=10):
    """Fetch the first TMDB movie record for an IMDb ID via the REST API.

    Args:
        imdb_id: IMDb identifier, e.g. ``'tt2514894'``.
        timeout: Seconds to wait for TMDB before giving up.

    Returns:
        The first entry of ``movie_results`` as a dict, or ``None`` when the
        request fails, the response is not usable, or no movie matches (a
        message is printed in each of those cases).

    Raises:
        requests.RequestException: On network errors or timeouts.
    """
    url = f"https://api.themoviedb.org/3/find/{imdb_id}"
    params = {
        'api_key': tmdb_api_key,
        'external_source': 'imdb_id'
    }
    response = requests.get(url, params=params, timeout=timeout)

    # Check the status before decoding: an error response is not guaranteed to
    # carry a JSON body, and decoding it first would raise instead of reporting.
    data = None
    if response.status_code == 200:
        try:
            data = response.json()
        except ValueError:
            data = None

    if data is None or 'movie_results' not in data:
        print(f"Error: {response.status_code}")
        return None

    movie_results = data['movie_results']
    if not movie_results:
        print("No movie found for this IMDb ID.")
        return None
    return movie_results[0]  # Return the first movie result

# Example usage:
imdb_id = 'tt2514894'  # Example IMDb ID
movie_details = get_movie_details(imdb_id)

# Raw TMDB record (or None when the lookup failed).
print(movie_details)

if movie_details:
    print("Title:", movie_details.get('title'))
    print("Overview:", movie_details.get('overview'))
    print("Release Date:", movie_details.get('release_date'))
    # NOTE(review): id_to_poster is not defined in this cell; it must already
    # exist from one of the CLIP test cells. The value printed under
    # "Poster Path" is the dataset's poster URL, not TMDB's poster_path field.
    print("Poster Path:", id_to_poster.get(imdb_id, "URL not found"))


{'backdrop_path': None, 'id': 173465, 'title': 'Medora', 'original_title': 'Medora', 'overview': "In America's basketball heartland, four resilient boys from rural Medora, Indiana, fight to end their high school team's three-year losing streak, as their dwindling town faces the threat of extinction.", 'poster_path': None, 'media_type': 'movie', 'adult': False, 'original_language': 'en', 'genre_ids': [99], 'popularity': 0.514, 'release_date': '2013-11-08', 'video': False, 'vote_average': 7.2, 'vote_count': 4}
Title: Medora
Overview: In America's basketball heartland, four resilient boys from rural Medora, Indiana, fight to end their high school team's three-year losing streak, as their dwindling town faces the threat of extinction.
Release Date: 2013-11-08
Poster Path: https://m.media-amazon.com/images/M/MV5BMTU0MzM2ODI2N15BMl5BanBnXkFtZTgwNTE0NDc2MDE@._V1_SX300.jpg
