In [3]:
!pip install datasets
!pip install pinecone-client

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [4]:
from datasets import load_dataset
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
import pinecone

ds = load_dataset("pinecone/movie-posters")

# Initialize CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define function to fetch an image from a URL and get its embedding
def get_image_embedding(poster_url):
    # Synchronously fetch the image from the URL
    response = requests.get(poster_url)
    image = Image.open(BytesIO(response.content))

    # Preprocess the image
    inputs = processor(images=image, return_tensors="pt")

    # Generate the image embedding
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)

    # Return the embedding as a numpy array
    return embedding.squeeze().numpy()

# Process the dataset and upload embeddings to Pinecone
def process_and_upload():
    for item in ds['train']:
        imdb_id = item['imdbId']
        poster_url = item['poster']
        embedding = get_image_embedding(poster_url)
        index.upsert(vectors=[(imdb_id, embedding)])

Downloading metadata:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/562k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [9]:
# Pinecone and create an index
from pinecone import Pinecone
from pinecone import ServerlessSpec
from google.colab import userdata

pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))

# pc.create_index(
#     name="movie-posters",
#     dimension=512, # dimensions from CLIP
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     )
# )

# Access the existing index
index = pc.Index("movie-posters")


In [10]:
# Run the process and upload function
process_and_upload()

KeyboardInterrupt: 

In [11]:
print(index.describe_index_stats())

{'dimension': 512,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 161}},
 'total_vector_count': 161}


In [15]:
import random

# Create a dictionary for quick lookup of poster URLs by IMDb ID
id_to_poster = {item['imdbId']: item['poster'] for item in ds['train']}

# Step 3: Choose a random movie from the dataset
random_movie = random.choice(ds['train'])

# Step 4: Extract the IMDb ID and poster URL of the random movie
imdb_id = random_movie['imdbId']
poster_url = random_movie['poster']

# Step 5: Get the embedding of the random movie (use the function get_image_embedding)
embedding = get_image_embedding(poster_url)  # Assuming you have already defined this function

# Step 6: Query Pinecone to get the top 10 closest movie posters
result = index.query(vector=embedding.tolist(), top_k=10, include_values=True)

# Step 7: Print the random movie and the top 10 similar movies
print(f"Random Movie IMDb ID: {imdb_id}")
print(f"Poster URL: {poster_url}")
print("Top 10 Similar Movies:")
for match in result['matches']:
    similar_imdb_id = match['id']
    similar_poster_url = id_to_poster.get(similar_imdb_id, "URL not found")
    print(f"IMDb ID: {similar_imdb_id}, Score: {match['score']}, Poster URL: {similar_poster_url}")

Random Movie IMDb ID: tt2973064
Poster URL: https://m.media-amazon.com/images/M/MV5BMmE3MGY3ZjEtODQ1ZC00OWQyLWFjYTAtN2NmNzE3ODQyNDJlXkEyXkFqcGdeQXVyNzA3ODY5NTU@._V1_SX300.jpg
Top 10 Similar Movies:
IMDb ID: tt2084970, Score: 0.462122947, Poster URL: https://m.media-amazon.com/images/M/MV5BOTgwMzFiMWYtZDhlNS00ODNkLWJiODAtZDVhNzgyNzJhYjQ4L2ltYWdlXkEyXkFqcGdeQXVyNzEzOTYxNTQ@._V1_SX300.jpg
IMDb ID: tt2332623, Score: 0.460748464, Poster URL: https://m.media-amazon.com/images/M/MV5BNzYyMDU3NDg3NV5BMl5BanBnXkFtZTgwODk1ODMxMTE@._V1_SX300.jpg
IMDb ID: tt2379713, Score: 0.444533527, Poster URL: https://m.media-amazon.com/images/M/MV5BOWQ1MDE1NzgtNTQ4OC00ZjliLTllZDAtN2IyOTVmMTc5YjUxXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_SX300.jpg
IMDb ID: tt3416742, Score: 0.436589092, Poster URL: https://m.media-amazon.com/images/M/MV5BYmYyYWY4NjgtNGQ2Yi00NDNiLWJlOTgtYjI1MTI0NjZkNjhhXkEyXkFqcGdeQXVyNDE5MTU2MDE@._V1_SX300.jpg
IMDb ID: tt2802144, Score: 0.42163679, Poster URL: https://m.media-amazon.com/images/M/MV5BYTM

In [18]:
import requests

tmdb_api_key = userdata.get('TMDB_API_KEY')

# Function to get movie details from TMDB using IMDb ID
def get_movie_details(imdb_id):
    url = f"https://api.themoviedb.org/3/find/{imdb_id}"
    params = {
        'api_key': tmdb_api_key,
        'external_source': 'imdb_id'
    }
    response = requests.get(url, params=params)
    data = response.json()

    if response.status_code == 200 and 'movie_results' in data:
        if len(data['movie_results']) > 0:
            return data['movie_results'][0]  # Return the first movie result
        else:
            print("No movie found for this IMDb ID.")
            return None
    else:
        print(f"Error: {response.status_code}")
        return None

# Example usage:
imdb_id = 'tt2084970'  # Example IMDb ID
movie_details = get_movie_details(imdb_id)

if movie_details:
    print("Title:", movie_details.get('title'))
    print("Overview:", movie_details.get('overview'))
    print("Release Date:", movie_details.get('release_date'))
    print("Poster Path:", id_to_poster.get(imdb_id, "URL not found"))


Title: The Imitation Game
Overview: Based on the real life story of legendary cryptanalyst Alan Turing, the film portrays the nail-biting race against time by Turing and his brilliant team of code-breakers at Britain's top-secret Government Code and Cypher School at Bletchley Park, during the darkest days of World War II.
Release Date: 2014-11-14
Poster Path: https://m.media-amazon.com/images/M/MV5BOTgwMzFiMWYtZDhlNS00ODNkLWJiODAtZDVhNzgyNzJhYjQ4L2ltYWdlXkEyXkFqcGdeQXVyNzEzOTYxNTQ@._V1_SX300.jpg
