# Version 1 - Populate Pinecone with movie poster embeddings and metadata

V1 movie poster dataset sourced from Hugging Face:
https://huggingface.co/datasets/pinecone/movie-posters

V2 will source movies using TMDB API's popular endpoint, to get a list of more modern movies (and movie posters of better image quality)
https://developer.themoviedb.org/reference/movie-popular-list

In [2]:
#!pip install datasets
!pip install pinecone-client
#!pip install tmdbv3api

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.0.3-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.0.3-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.6/117.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5

CLIP

In [None]:
from datasets import load_dataset
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
import pinecone
from tmdbv3api import TMDb, Find
from google.colab import userdata
import time
from pinecone import Pinecone
from pinecone import ServerlessSpec
from google.colab import userdata

# Configuration Variables
# MODEL_NAME = "openai/clip-vit-base-patch32"
# PROCESSOR_NAME = "openai/clip-vit-base-patch32"
# NAMESPACE = "CLIP"

MODEL_NAME = "openai/clip-vit-base-patch32"
PROCESSOR_NAME = "openai/clip-vit-base-patch32"
NAMESPACE = "CLIP"

# Initialize Pinecone
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters")

# Initialize TMDB API client
tmdb = TMDb()
tmdb.api_key = userdata.get("TMDB_API_KEY")

# Initialize the Find class
find = Find()

# Load the Pinecone movie poster dataset
ds = load_dataset("pinecone/movie-posters")

# Initialize CLIP model and processor
model = CLIPModel.from_pretrained(MODEL_NAME)
processor = CLIPProcessor.from_pretrained(PROCESSOR_NAME)

# Function to fetch an image from a URL and get its embedding
def get_image_embedding(poster_url):
    response = requests.get(poster_url)
    image = Image.open(BytesIO(response.content))
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
    return embedding.squeeze().numpy()

# Function to fetch movie metadata from TMDB API using IMDb ID
def get_movie_metadata(imdb_id):
    time.sleep(0.02)  # 50 requests per second = 1 request every 0.02 seconds so we don't overload TMDB API
    try:
        # Use the Find class to search for the movie by IMDb ID
        results = find.find(imdb_id, 'imdb_id')
        if results.movie_results:
            movie = results.movie_results[0]  # Take the first result
            return {
                'title': movie.title,
                'overview': movie.overview,
                'release_date': movie.release_date
            }
        else:
            print(f"No metadata found for IMDb ID {imdb_id}")
            return None
    except Exception as e:
        print(f"Error fetching metadata for IMDb ID {imdb_id}: {e}")
        return None

# Process the dataset and upload embeddings with metadata to Pinecone
def process_and_upload():
    for item in ds['train']:
        imdb_id = item['imdbId']
        poster_url = item['poster']

        # Fetch movie metadata
        metadata = get_movie_metadata(imdb_id)
        if metadata is None:
            continue

        # Add poster URL from dataset to metadata
        metadata['poster_url'] = poster_url

        # Generate image embedding
        try:
            embedding = get_image_embedding(poster_url)
        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")
            continue

        # Upload to Pinecone with namespace and metadata as a dictionary
        index.upsert(
            vectors=[(imdb_id, embedding, metadata)],
            namespace=NAMESPACE
        )

#process_and_upload()

Downloading metadata:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/562k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
get_movie_metadata("tt6087562")

{'title': 'The Scythian Lamb',
 'overview': "Based on a manga written by Tatsuhiko Yamagami, the story is set in a former seaport town Uobuka, where 6 former criminals were sent to live there by the government, with the intention of re-socialising them. Aside from the few who know about the project, the general townsfolk has no idea of the former convicts' identities. Tsukisue is the pleasant and efficient municipal official put in charge of the programme. As he slowly learns about their past, a body is discovered.",
 'release_date': '2018-02-03'}

In [None]:
# Pinecone and create an index
from pinecone import Pinecone
from pinecone import ServerlessSpec
from google.colab import userdata

pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))

# pc.create_index(
#     name="movie-posters",
#     dimension=512, # dimensions from CLIP
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     )
# )

# Access the existing index
index = pc.Index("movie-posters")


In [None]:
#index.delete(delete_all=True, namespace='')

{}

In [None]:
print(index.describe_index_stats())

{'dimension': 512,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10191}},
 'total_vector_count': 10191}


ResNet

In [None]:
from datasets import load_dataset
import torch
from torchvision import models, transforms
from PIL import Image
import requests
from io import BytesIO
import pinecone
from tmdbv3api import TMDb, Find
from google.colab import userdata
import time

# Configuration Variables
MODEL_NAME = "resnet50"  # Using ResNet50 model
NAMESPACE = "ResNet-50"
VECTOR_DIM = 512  # Limiting to 512 to match CLIP dimensions (Pinecone free plan has only one index)

# Initialize Pinecone
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters")

# Initialize TMDB API client
tmdb = TMDb()
tmdb.api_key = userdata.get("TMDB_API_KEY")

# Initialize the Find class
find = Find()

# Load the Pinecone movie poster dataset
ds = load_dataset("pinecone/movie-posters")

# Initialize ResNet model and modify it to produce 512-dimensional embeddings
class ResNet50WithEmbedding(torch.nn.Module):
    def __init__(self, embedding_dim):
        super(ResNet50WithEmbedding, self).__init__()
        self.base_model = models.resnet50(pretrained=True)
        # Remove the final fully connected layer
        self.base_model = torch.nn.Sequential(*list(self.base_model.children())[:-1])
        # Add a new fully connected layer
        self.fc = torch.nn.Linear(in_features=2048, out_features=embedding_dim)

    def forward(self, x):
        x = self.base_model(x)
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)
        return x

model = ResNet50WithEmbedding(VECTOR_DIM)
model.eval()

# Define image preprocessing
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to fetch an image from a URL and get its embedding
def get_image_embedding(poster_url):
    response = requests.get(poster_url)
    image = Image.open(BytesIO(response.content)).convert('RGB')
    inputs = preprocess(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        embedding = model(inputs).squeeze().numpy()
    return embedding

# Function to fetch movie metadata from TMDB API using IMDb ID
def get_movie_metadata(imdb_id):
    time.sleep(0.02)  # 50 requests per second = 1 request every 0.02 seconds so we don't overload TMDB API
    try:
        results = find.find(imdb_id, 'imdb_id')
        if results.movie_results:
            movie = results.movie_results[0]  # Take the first result
            return {
                'title': movie.title,
                'overview': movie.overview,
                'release_date': movie.release_date
            }
        else:
            print(f"No metadata found for IMDb ID {imdb_id}")
            return None
    except Exception as e:
        print(f"Error fetching metadata for IMDb ID {imdb_id}: {e}")
        return None

# Process the dataset and upload embeddings with metadata to Pinecone
def process_and_upload():
    for item in ds['train']:
        imdb_id = item['imdbId']
        poster_url = item['poster']

        # Fetch movie metadata
        metadata = get_movie_metadata(imdb_id)
        if metadata is None:
            continue

        # Add poster URL from dataset to metadata
        metadata['poster_url'] = poster_url

        # Generate image embedding
        try:
            embedding = get_image_embedding(poster_url)
        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")
            continue

        # Upload to Pinecone with the specified namespace and metadata as a dictionary
        index.upsert(
            vectors=[(imdb_id, embedding.tolist(), metadata)],
            namespace=NAMESPACE
        )

#process_and_upload()

Failed to get embedding for https://m.media-amazon.com/images/M/MV5BMDkzNmRhNTMtZDI4NC00Zjg1LTgxM2QtMjYxZDQ3OWJlMDRlXkEyXkFqcGdeQXVyNTU5MjkzMTU@._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe0b7856ed0>
No metadata found for IMDb ID tt5491994
No metadata found for IMDb ID tt3218680
No metadata found for IMDb ID tt3012698
Failed to get embedding for https://m.media-amazon.com/images/M/MV5BYzE2MjEwMTQtOTQ2Mi00ZWExLTkyMjUtNmJjMjBlYWFjZDdlXkEyXkFqcGdeQXVyMTI3ODAyMzE2._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe0b72de1b0>
No metadata found for IMDb ID tt2879552
No metadata found for IMDb ID tt2401256
No metadata found for IMDb ID tt6769208
Failed to get embedding for N/A: Invalid URL 'N/A': No scheme supplied. Perhaps you meant https://N/A?
No metadata found for IMDb ID tt4299972
No metadata found for IMDb ID tt5189670
No metadata found for IMDb ID tt7343762
No metadata found for IMDb ID tt3762944
Failed to get embedding for https://m.media-a

VGG16

In [None]:
import torch
from torchvision import models, transforms
from PIL import Image
import requests
from io import BytesIO
import pinecone
from tmdbv3api import TMDb, Find
from google.colab import userdata
import time

# Configuration Variables
VECTOR_DIM = 512  # Dimension for Pinecone index, limited to 512 by CLIP, and only one index allowed on free tier Pinecone
NAMESPACE = "VGG16"

# Initialize Pinecone
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters")

# Initialize TMDB API client
tmdb = TMDb()
tmdb.api_key = userdata.get("TMDB_API_KEY")

# Initialize the Find class
find = Find()

# Load the Pinecone movie poster dataset
ds = load_dataset("pinecone/movie-posters")

# Initialize VGG16 model and modify it to produce 512-dimensional embeddings
class VGG16WithEmbedding(torch.nn.Module):
    def __init__(self, embedding_dim):
        super(VGG16WithEmbedding, self).__init__()
        self.base_model = models.vgg16(pretrained=True)
        # Remove the final fully connected layers
        self.base_model.classifier = torch.nn.Sequential(*list(self.base_model.classifier.children())[:-3])
        # Add a new fully connected layer
        self.fc = torch.nn.Linear(in_features=4096, out_features=embedding_dim)

    def forward(self, x):
        x = self.base_model(x)
        x = self.fc(x)
        return x

model = VGG16WithEmbedding(VECTOR_DIM)
model.eval()

# Define image preprocessing
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to fetch an image from a URL and get its embedding
def get_image_embedding(poster_url):
    response = requests.get(poster_url)
    image = Image.open(BytesIO(response.content)).convert('RGB')
    inputs = preprocess(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        embedding = model(inputs).squeeze().numpy()
    return embedding

# Function to fetch movie metadata from TMDB API using IMDb ID
def get_movie_metadata(imdb_id):
    time.sleep(0.02)  # 50 requests per second = 1 request every 0.02 seconds so we don't overload TMDB API
    try:
        results = find.find(imdb_id, 'imdb_id')
        if results.movie_results:
            movie = results.movie_results[0]  # Take the first result
            return {
                'title': movie.title,
                'overview': movie.overview,
                'release_date': movie.release_date
            }
        else:
            print(f"No metadata found for IMDb ID {imdb_id}")
            return None
    except Exception as e:
        print(f"Error fetching metadata for IMDb ID {imdb_id}: {e}")
        return None

# Process the dataset and upload embeddings with metadata to Pinecone
def process_and_upload():
    for item in ds['train']:
        imdb_id = item['imdbId']
        poster_url = item['poster']

        # Fetch movie metadata
        metadata = get_movie_metadata(imdb_id)
        if metadata is None:
            continue

        # Add poster URL from dataset to metadata
        metadata['poster_url'] = poster_url

        # Generate image embedding
        try:
            embedding = get_image_embedding(poster_url)
        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")
            continue

        # Upload to Pinecone with the specified namespace and metadata as a dictionary
        index.upsert(
            vectors=[(imdb_id, embedding.tolist(), metadata)],
            namespace=NAMESPACE
        )

#process_and_upload()


Failed to get embedding for https://m.media-amazon.com/images/M/MV5BMDkzNmRhNTMtZDI4NC00Zjg1LTgxM2QtMjYxZDQ3OWJlMDRlXkEyXkFqcGdeQXVyNTU5MjkzMTU@._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe075541cb0>
No metadata found for IMDb ID tt5491994
No metadata found for IMDb ID tt3218680
No metadata found for IMDb ID tt3012698
Failed to get embedding for https://m.media-amazon.com/images/M/MV5BYzE2MjEwMTQtOTQ2Mi00ZWExLTkyMjUtNmJjMjBlYWFjZDdlXkEyXkFqcGdeQXVyMTI3ODAyMzE2._V1_SX300.jpg: cannot identify image file <_io.BytesIO object at 0x7fe075541c60>
No metadata found for IMDb ID tt2879552
No metadata found for IMDb ID tt2401256
No metadata found for IMDb ID tt6769208
Failed to get embedding for N/A: Invalid URL 'N/A': No scheme supplied. Perhaps you meant https://N/A?
No metadata found for IMDb ID tt4299972
No metadata found for IMDb ID tt5189670
No metadata found for IMDb ID tt7343762
No metadata found for IMDb ID tt3762944
Failed to get embedding for https://m.media-a

# Mapping from imdb id to movie name
##(for the user to be able to query on only movie names which exist in the dataset)

In [None]:
# pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
# index = pc.Index("movie-posters")
# namespaces =['ResNet-50', 'VGG16', 'CLIP']

# mapping = {}
# for ns in namespaces:
#   for ids in index.list(namespace=ns):
#     #print(ids)
#     res = index.fetch(ids=ids, namespace=ns)

#     vectors = res['vectors']
#     for k in vectors:
#       title = vectors[k]['metadata']['title']
#       if title not in mapping:
#         mapping[title] = k
# print(mapping)



In [None]:
# print(len(mapping))

9863


# Test embedding results

In [None]:
import random

# Create a dictionary for quick lookup of poster URLs by IMDb ID
id_to_poster = {item['imdbId']: item['poster'] for item in ds['train']}

# Choose a random movie from the dataset
random_movie = random.choice(ds['train'])

# Extract the IMDb ID and poster URL of the random movie
specific_imdb_id = random_movie['imdbId']

id_to_poster = {item['imdbId']: item['poster'] for item in ds['train']}

# Look up the movie in the dataset using the specific IMDb ID
if specific_imdb_id in id_to_poster:
    poster_url = id_to_poster[specific_imdb_id]

    # Get the embedding of the specified movie (use the function get_image_embedding)
    embedding = get_image_embedding(poster_url)  # Assuming you have already defined this function

    # Query Pinecone to get the top 10 closest movie posters
    result = index.query(vector=embedding.tolist(), top_k=10, include_metadata=True, namespace="CLIP")

    # Print the specified movie and the top 10 similar movies with metadata
    print(f"Specified Movie IMDb ID: {specific_imdb_id}")
    print(f"Poster URL: {poster_url}")
    print("Top 10 Similar Movies:")
    for match in result['matches']:
        similar_imdb_id = match['id']
        similar_poster_url = id_to_poster.get(similar_imdb_id, "URL not found")
        # Access the metadata
        metadata = match.get('metadata', {})
        title = metadata.get('title', "Title not found")
        overview = metadata.get('overview', "Overview not found")
        release_date = metadata.get('release_date', "Release date not found")
        poster_url = metadata.get('poster_url', "Poster URL not found")

        print(f"IMDb ID: {similar_imdb_id}, Score: {match['score']}")
        print(f"Title: {title}")
        print(f"Overview: {overview}")
        print(f"Release Date: {release_date}")
        print(f"Poster URL: {poster_url}")
else:
    print(f"IMDb ID {specific_imdb_id} not found in the dataset.")


Specified Movie IMDb ID: tt8523678
Poster URL: https://m.media-amazon.com/images/M/MV5BZDIwMWNmNzctNjFlYS00ZTMyLTlmOWEtYThiZjJjYjcwNmE5XkEyXkFqcGdeQXVyMjk3NTUyOTc@._V1_SX300.jpg
Top 10 Similar Movies:
IMDb ID: tt8523678, Score: 1.00005567
Title: Flesh & Blood
Overview: Kimberly, a teenager suffering from agoraphobia, has not left the house since her mother's unsolved murder. On the eve of Thanksgiving, she begins to suspect that the safe harbor of home and her doting father may be a dangerous mirage.
Release Date: 2018-11-02
Poster URL: https://m.media-amazon.com/images/M/MV5BZDIwMWNmNzctNjFlYS00ZTMyLTlmOWEtYThiZjJjYjcwNmE5XkEyXkFqcGdeQXVyMjk3NTUyOTc@._V1_SX300.jpg
IMDb ID: tt4670016, Score: 0.730577171
Title: Wolves at the Door
Overview: Four friends gather at an elegant home during the Summer of Love, 1969. Unbeknownst to them, deadly visitors are waiting outside. What begins as a simple farewell party turns to a night of primal terror as the intruders stalk and torment the four, who

In [None]:
# Specify the IMDb ID of the movie you want to use
specific_imdb_id = 'tt3638686'

id_to_poster = {item['imdbId']: item['poster'] for item in ds['train']}

# Look up the movie in the dataset using the specific IMDb ID
if specific_imdb_id in id_to_poster:
    poster_url = id_to_poster[specific_imdb_id]

    # Get the embedding of the specified movie (use the function get_image_embedding)
    embedding = get_image_embedding(poster_url)  # Assuming you have already defined this function

    # Query Pinecone to get the top 10 closest movie posters
    result = index.query(vector=embedding.tolist(), top_k=10, include_metadata=True, namespace="CLIP")

    # Print the specified movie and the top 10 similar movies with metadata
    print(f"Specified Movie IMDb ID: {specific_imdb_id}")
    print(f"Poster URL: {poster_url}")
    print("Top 10 Similar Movies:")
    for match in result['matches']:
        similar_imdb_id = match['id']
        similar_poster_url = id_to_poster.get(similar_imdb_id, "URL not found")
        # Access the metadata
        metadata = match.get('metadata', {})
        title = metadata.get('title', "Title not found")
        overview = metadata.get('overview', "Overview not found")
        release_date = metadata.get('release_date', "Release date not found")
        poster_url = metadata.get('poster_url', "Poster URL not found")

        print(f"IMDb ID: {similar_imdb_id}, Score: {match['score']}")
        print(f"Title: {title}")
        print(f"Overview: {overview}")
        print(f"Release Date: {release_date}")
        print(f"Poster URL: {poster_url}")
else:
    print(f"IMDb ID {specific_imdb_id} not found in the dataset.")


Specified Movie IMDb ID: tt3638686
Poster URL: https://m.media-amazon.com/images/M/MV5BNzI0MTk5ODg4MF5BMl5BanBnXkFtZTgwODQ2NjgwOTE@._V1_SX300.jpg
Top 10 Similar Movies:
IMDb ID: tt3638686, Score: 1.00113225
Title: Famous Nathan
Overview: A Coney Island-inspired, densely-layered visually dynamic documentary portrait of the life and times of the original Nathan's Famous, created in 1916 by filmmaker Lloyd Handwerker's grandparents, Nathan and Ida Handwerker. 30 years in the making, Famous Nathan interweaves decades-spanning archival footage, family photos and home movies, an eclectic soundtrack and never-before-heard audio from Nathan: his only interview, ever as well as compelling, intimate and hilarious interviews with the dedicated band of workers, not at all shy at offering opinions, memories and the occasional tall tale.
Release Date: 2015-07-17
Poster URL: https://m.media-amazon.com/images/M/MV5BNzI0MTk5ODg4MF5BMl5BanBnXkFtZTgwODQ2NjgwOTE@._V1_SX300.jpg
IMDb ID: tt4191702, Score: 0

Test TMDB API

In [None]:
import requests

tmdb_api_key = userdata.get('TMDB_API_KEY')

# Function to get movie details from TMDB using IMDb ID
def get_movie_details(imdb_id):
    url = f"https://api.themoviedb.org/3/find/{imdb_id}"
    params = {
        'api_key': tmdb_api_key,
        'external_source': 'imdb_id'
    }
    response = requests.get(url, params=params)
    data = response.json()

    # print("respones data")
    # print(data)

    if response.status_code == 200 and 'movie_results' in data:
        if len(data['movie_results']) > 0:
            return data['movie_results'][0]  # Return the first movie result
        else:
            print("No movie found for this IMDb ID.")
            return None
    else:
        print(f"Error: {response.status_code}")
        return None

# Example usage:
imdb_id = 'tt2514894'  # Example IMDb ID
movie_details = get_movie_details(imdb_id)

print(movie_details)

if movie_details:
    print("Title:", movie_details.get('title'))
    print("Overview:", movie_details.get('overview'))
    print("Release Date:", movie_details.get('release_date'))
    print("Poster Path:", id_to_poster.get(imdb_id, "URL not found"))


{'backdrop_path': None, 'id': 173465, 'title': 'Medora', 'original_title': 'Medora', 'overview': "In America's basketball heartland, four resilient boys from rural Medora, Indiana, fight to end their high school team's three-year losing streak, as their dwindling town faces the threat of extinction.", 'poster_path': None, 'media_type': 'movie', 'adult': False, 'original_language': 'en', 'genre_ids': [99], 'popularity': 0.514, 'release_date': '2013-11-08', 'video': False, 'vote_average': 7.2, 'vote_count': 4}
Title: Medora
Overview: In America's basketball heartland, four resilient boys from rural Medora, Indiana, fight to end their high school team's three-year losing streak, as their dwindling town faces the threat of extinction.
Release Date: 2013-11-08
Poster Path: https://m.media-amazon.com/images/M/MV5BMTU0MzM2ODI2N15BMl5BanBnXkFtZTgwNTE0NDc2MDE@._V1_SX300.jpg


# Version 2 - Populate separate indexes using "popular" TMDB endpoint

In [None]:
# Pinecone and create an index
from pinecone import Pinecone
from pinecone import ServerlessSpec
from google.colab import userdata

pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))

pc.create_index(
    name="movie-posters-v2-clip",
    dimension=768, # dimensions from CLIP
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

# pc.create_index(
#     name="movie-posters-v2-resnet-50",
#     dimension=2048,
#     metric="euclidean",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     )
# )

# pc.create_index(
#     name="movie-posters-v2-vgg16",
#     dimension=4096,
#     metric="euclidean",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     )
# )

# Access the existing index
#index = pc.Index("movie-posters")

Getting top 10,000 popular movies (limited by page 500 max for tmdb, 20 results per page)

In [None]:
import requests
from google.colab import userdata
import json
import pprint
import time

def get_popular_page(page, TMDB_AUTH):
  '''
  Returns results page as python dict
  20 movie results per page
  '''
  time.sleep(0.02)  # 50 requests per second = 1 request every 0.02 seconds so we don't overload TMDB API
  url = f"https://api.themoviedb.org/3/movie/popular?language=en-US&page={page}"
  headers = {
    "accept": "application/json",
    "Authorization": TMDB_AUTH
  }
  try:
    response = requests.get(url, headers=headers)
  except Exception as e:
    print(f"Problem getting response for page {page} {e}")
    return None
  response_dict = json.loads(response.text)
  return response_dict['results']

In [None]:
TMDB_AUTH = userdata.get("TMDB_AUTHORIZATION")

movies = []
page_max = 2501 # not inclusive
for page in range(1, page_max):
  results = None
  try:
    results = get_popular_page(page, TMDB_AUTH)
  except Exception as e:
    print(f"Broke at {page} with error {e}")
    page_max = page
    break
  if results:
    movies.extend(results)

json_path = f'/content/drive/My Drive/Colab_Notebooks/popular_tmdb_endpoint_until_page_{page_max}.json'

with open(json_path, 'w') as f:
    json.dump(movies, f)


# # Load the movies list from the JSON file
# with open(json_path, 'r') as f:
#     movies = json.load(f)

# pp = pprint.PrettyPrinter(indent=4)
# #print(len(movies))
# #pp.pprint(movies)
# print(len(movies))
# print(json.dumps(movies))

# https://image.tmdb.org/t/p/original/[poster_path]

# https://image.tmdb.org/t/p/original
# https://image.tmdb.org/t/p/original/vKVUsumbCzK5Kn3aDpKM4EizKCA.jpg
# https://image.tmdb.org/t/p/original/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg

Broke at 501 with error 'results'


In [None]:
json_path = f'/content/drive/My Drive/Colab_Notebooks/popular_tmdb_endpoint_until_page_501.json'

# Load the movies list from the JSON file
with open(json_path, 'r') as f:
    movies = json.load(f)

for i, m in enumerate(movies):
  print(m)
  if i == 5:
    break


{'adult': False, 'backdrop_path': '/yDHYTfA3R0jFYba16jBB1ef8oIt.jpg', 'genre_ids': [28, 35, 878], 'id': 533535, 'original_language': 'en', 'original_title': 'Deadpool & Wolverine', 'overview': 'A listless Wade Wilson toils away in civilian life with his days as the morally flexible mercenary, Deadpool, behind him. But when his homeworld faces an existential threat, Wade must reluctantly suit-up again with an even more reluctant Wolverine.', 'popularity': 7253.796, 'poster_path': '/8cdWjvZQUExUUTzyp4t6EDMubfO.jpg', 'release_date': '2024-07-24', 'title': 'Deadpool & Wolverine', 'video': False, 'vote_average': 7.779, 'vote_count': 2137}
{'adult': False, 'backdrop_path': '/stKGOm8UyhuLPR9sZLjs5AkmncA.jpg', 'genre_ids': [16, 10751, 12, 35], 'id': 1022789, 'original_language': 'en', 'original_title': 'Inside Out 2', 'overview': "Teenager Riley's mind headquarters is undergoing a sudden demolition to make room for something entirely unexpected: new Emotions! Joy, Sadness, Anger, Fear and Disg

CLIP Processing on each popular movie

# Version 2 - Popular movies endpoint with 3 models (CLIP, VGG16, ResNet-50)

In [3]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import json
from io import BytesIO
import torch
import pinecone
from pinecone import Pinecone
from google.colab import userdata
import time

# Load the model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
NAMESPACE = "clip-vit-large-patch14"

pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters-v2-clip")

json_path = f'/content/drive/My Drive/Colab_Notebooks/popular_tmdb_endpoint_until_page_501.json'

# Load the movies list from the JSON file
with open(json_path, 'r') as f:
    movies = json.load(f)

def get_image_embedding(poster_url):
    # Fetch the image from the URL
    try:
      poster_url = "https://image.tmdb.org/t/p/original" + poster_url
      response = requests.get(poster_url)
      image = Image.open(BytesIO(response.content)).convert('RGB')

      # Preprocess the image
      inputs = processor(images=image, return_tensors="pt")

      # Generate image embedding
      with torch.no_grad():
          outputs = model.get_image_features(**inputs)
          embedding = outputs.squeeze().numpy()

      return embedding
    except Exception as e:
      print(f"Failed to get embedding for {poster_url}: {e}")
      return None

def prepare_metadata(movie):
    return {
        "adult": movie.get("adult", False),
        "backdrop_path": movie.get("backdrop_path", "empty") if movie.get("backdrop_path") is not None else "empty",
        "genre_ids": ','.join(map(str, movie.get("genre_ids", []))),
        "id": str(movie.get("id", "")),
        "original_language": movie.get("original_language", ""),
        "original_title": movie.get("original_title", ""),
        "overview": movie.get("overview", ""),
        "popularity": movie.get("popularity", 0.0),
        "poster_path": movie.get("poster_path", ""),
        "release_date": movie.get("release_date", ""),
        "title": movie.get("title", ""),
        "video": movie.get("video", False),
        "vote_average": movie.get("vote_average", 0.0),
        "vote_count": movie.get("vote_count", 0)
    }

def process_and_upload():
    for i in range(1, 10001):
    #for i, movie in enumerate(movies):
        movie = movies[i]
        imdb_id = str(movie.get('id', '')) # Ensure ID is a string
        poster_url = movie.get('poster_path', '')
        print(f"Processing {imdb_id} - {movie.get('title')}")

        # Generate image embedding
        try:
            embedding = get_image_embedding(poster_url)
        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")
            continue

        metadata = prepare_metadata(movie)
        #print(metadata)

        try:
        # Upload to Pinecone with the specified namespace and metadata as a dictionary
          index.upsert(
              vectors=[(imdb_id, embedding.tolist(), metadata)],
              namespace=NAMESPACE
          )
        except Exception as e:
            print(f"Failed to upload to Pinecone: {e}")

process_and_upload()

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Processing 1324413 - Встреча
Processing 288158 - The Woman in Black 2: Angel of Death
Processing 1073045 - The Peacock
Processing 37765 - Ace
Processing 514999 - Murder Mystery
Processing 10862 - Bounce
Processing 1252 - Lonely Hearts
Processing 14412 - Body Heat
Processing 104201 - Noisy Requiem
Processing 81390 - A Lonely Place to Die
Processing 10219 - Snow Falling on Cedars
Processing 844417 - Marlowe
Processing 56497 - Three Men to Destroy
Processing 227359 - Force of Execution
Processing 110974 - Goliath and the Vampires
Processing 56937 - Cold Prey III
Processing 1146412 - Gabriel's Redemption: Part II
Processing 1005578 - WWE WrestleMania XL Sunday
Processing 309302 - Wolf Totem
Processing 642732 - Roadrunner: A Film About Anthony Bourdain
Processing 615904 - Marry Me
Processing 111132 - The Masseur
Processing 78049 - The Scorpion King 3: Battle for Redemption
Processing 9945 - Vampires
Processing 296349 - Female Prisoner Ayaka: Bitch-Training Torment
Processing 7548 - The Libe

IndexError: list index out of range

In [None]:
print("done")

In [4]:
!pip install timm torch torchvision pinecone-client requests

Collecting timm
  Downloading timm-1.0.8-py3-none-any.whl.metadata (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m51.2/53.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m906.3 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9

In [17]:
import torch
import torchvision.transforms as transforms
from torchvision.models import vgg16
from PIL import Image
from urllib.request import urlopen
import numpy as np
import pinecone
from pinecone import Pinecone
import json
from google.colab import userdata

# Load the VGG16 model with its classifier
model = vgg16(pretrained=True)
model.eval()

# Remove the final classification layer
model.classifier = torch.nn.Sequential(*list(model.classifier.children())[:-1])

# Define the transform for input images
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

NAMESPACE = "vgg16-tv-in1k"
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters-v2-vgg16")

json_path = '/content/drive/My Drive/Colab_Notebooks/popular_tmdb_endpoint_until_page_501.json'

# Load the movies list from the JSON file
with open(json_path, 'r') as f:
    movies = json.load(f)

def get_image_embedding(poster_url):
    try:
        poster_url = "https://image.tmdb.org/t/p/original" + poster_url
        img = Image.open(urlopen(poster_url)).convert('RGB')  # Ensure image is in RGB format
        input_tensor = transform(img).unsqueeze(0)  # Add batch dimension

        with torch.no_grad():
            # Forward pass through the model
            features = model(input_tensor)
            embedding = features.squeeze().numpy()  # Remove batch dimension and convert to NumPy array

        #print(f"Embedding shape: {embedding.shape}")  # Print shape to confirm size

        if embedding.shape[0] != 4096:
            raise ValueError(f"Embedding dimension mismatch: Expected 4096, but got {embedding.shape[0]}")

        # Convert to list of floats
        embedding_list = embedding.tolist()  # Convert NumPy array to list of floats

        return embedding_list  # Return as a list of floats
    except Exception as e:
        print(f"Failed to get embedding for {poster_url}: {e}")
        return None

def prepare_metadata(movie):
    return {
        "adult": movie.get("adult", False),
        "backdrop_path": movie.get("backdrop_path", "empty") if movie.get("backdrop_path") is not None else "empty",
        "genre_ids": ','.join(map(str, movie.get("genre_ids", []))),
        "id": str(movie.get("id", "")),
        "original_language": movie.get("original_language", ""),
        "original_title": movie.get("original_title", ""),
        "overview": movie.get("overview", ""),
        "popularity": movie.get("popularity", 0.0),
        "poster_path": movie.get("poster_path", ""),
        "release_date": movie.get("release_date", ""),
        "title": movie.get("title", ""),
        "video": movie.get("video", False),
        "vote_average": movie.get("vote_average", 0.0),
        "vote_count": movie.get("vote_count", 0)
    }

def process_and_upload():
    for i in range(1, 10001):
        movie = movies[i]
        imdb_id = str(movie.get('id', ''))
        poster_url = movie.get('poster_path', '')
        print(f"Processing {i} - {imdb_id} - {movie.get('title')}")

        try:
            embedding = get_image_embedding(poster_url)
            if embedding is None:
                continue

            metadata = prepare_metadata(movie)

            try:
                # Upload to Pinecone with the specified namespace and metadata as a dictionary
                index.upsert(
                    vectors=[(imdb_id, embedding, metadata)],  # Ensure embedding is a list of floats
                    namespace=NAMESPACE
                )
            except Exception as e:
                print(f"Failed to upload to Pinecone: {e}")

        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")

process_and_upload()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing 5025 - 567971 - Vampires vs. the Bronx
Processing 5026 - 15789 - A Goofy Movie
Processing 5027 - 70844 - Superior Firepower: Making 'Aliens'
Processing 5028 - 12481 - The Big Boss
Processing 5029 - 761 - The Wing or the Thigh?
Processing 5030 - 25278 - One Piece: The Desert Princess and the Pirates: Adventure in Alabasta
Processing 5031 - 301959 - Interstellar: Nolan's Odyssey
Processing 5032 - 44865 - The Grandmaster
Processing 5033 - 616747 - Haunted Mansion
Processing 5034 - 362058 - The Snow Queen 3: Fire and Ice
Processing 5035 - 1729 - The Forbidden Kingdom
Processing 5036 - 1229991 - The Abbess
Processing 5037 - 11901 - High Plains Drifter
Processing 5038 - 8676 - Fool's Gold
Processing 5039 - 82695 - Les Misérables
Processing 5040 - 52451 - The Three Musketeers
Processing 5041 - 73475 - Call of the Blonde Goddess
Processing 5042 - 780382 - The Wolf and the Lion
Processing 5043 - 7453 - The Hitchhiker's 

IndexError: list index out of range

resnet50

In [21]:
from urllib.request import urlopen
import torch
from transformers import AutoModel, AutoImageProcessor
from PIL import Image
import json
import pinecone
from google.colab import userdata
import numpy as np

# Load the ResNet-50 model and feature extractor
model_name = "microsoft/resnet-50"
model = AutoModel.from_pretrained(model_name)
image_processor = AutoImageProcessor.from_pretrained(model_name)

# Set model to evaluation mode
model.eval()

NAMESPACE = "resnet50"
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
index = pc.Index("movie-posters-v2-resnet-50")

json_path = '/content/drive/My Drive/Colab_Notebooks/popular_tmdb_endpoint_until_page_501.json'

# Load the movies list from the JSON file
with open(json_path, 'r') as f:
    movies = json.load(f)

def get_image_embedding(poster_url):
    try:
        poster_url = "https://image.tmdb.org/t/p/original" + poster_url
        img = Image.open(urlopen(poster_url))

        # Preprocess the image
        inputs = image_processor(images=img, return_tensors="pt")

        # Generate embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            pooled_output = outputs.pooler_output  # Get the pooled output
            embedding = pooled_output.numpy()

        #print(f"Embedding shape: {embedding.shape}")  # Print shape to confirm size

        # Convert to list of floats
        if isinstance(embedding, np.ndarray):
            embedding_list = embedding.flatten().tolist()
        else:
            embedding_list = list(map(float, embedding))

        return embedding_list
    except Exception as e:
        print(f"Failed to get embedding for {poster_url}: {e}")
        return None

def prepare_metadata(movie):
    return {
        "adult": movie.get("adult", False),
        "backdrop_path": movie.get("backdrop_path", "empty") if movie.get("backdrop_path") is not None else "empty",
        "genre_ids": ','.join(map(str, movie.get("genre_ids", []))),
        "id": str(movie.get("id", "")),
        "original_language": movie.get("original_language", ""),
        "original_title": movie.get("original_title", ""),
        "overview": movie.get("overview", ""),
        "popularity": movie.get("popularity", 0.0),
        "poster_path": movie.get("poster_path", ""),
        "release_date": movie.get("release_date", ""),
        "title": movie.get("title", ""),
        "video": movie.get("video", False),
        "vote_average": movie.get("vote_average", 0.0),
        "vote_count": movie.get("vote_count", 0)
    }

def process_and_upload():
    for i in range(1, 10001):
        movie = movies[i]
        imdb_id = str(movie.get('id', ''))
        poster_url = movie.get('poster_path', '')
        print(f"Processing {imdb_id} - {movie.get('title')}")

        try:
            embedding = get_image_embedding(poster_url)
            if embedding is None:
                continue

        except Exception as e:
            print(f"Failed to get embedding for {poster_url}: {e}")
            continue

        metadata = prepare_metadata(movie)

        try:
            index.upsert(
                vectors=[(imdb_id, embedding, metadata)],
                namespace=NAMESPACE
            )
        except Exception as e:
            print(f"Failed to upload to Pinecone: {e}")

process_and_upload()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing 780382 - The Wolf and the Lion
Processing 7453 - The Hitchhiker's Guide to the Galaxy
Processing 14863 - Anaconda 3: Offspring
Processing 10835 - The Killer
Processing 36685 - The Rocky Horror Picture Show
Processing 1030411 - Last Riot
Processing 1000081 - Sentinelle
Processing 814338 - Lady Chatterley's Lover
Processing 193610 - The Other Woman
Processing 399360 - Alpha
Processing 14199 - The Adventures of Sharkboy and Lavagirl
Processing 5925 - The Great Escape
Processing 56590 - All Star Superman
Processing 9387 - Conan the Barbarian
Processing 10623 - Cradle 2 the Grave
Processing 1040176 - Black Clown
Processing 87826 - Here Comes the Boom
Processing 37645 - 22 Bullets
Processing 2640 - Heathers
Processing 1158915 - Dìdi (弟弟)
Processing 11571 - Journey to the Center of the Earth
Processing 1620 - Hitman
Processing 1039292 - Bad Child
Processing 10851 - Revolver
Processing 295151 - Let It Snow
Processing 2

IndexError: list index out of range

Tried batches below instead of single instance - found that batching does not improve processing time

In [None]:
# from transformers import CLIPProcessor, CLIPModel
# from PIL import Image
# import requests
# import json
# from io import BytesIO
# import torch
# import pinecone
# from pinecone import Pinecone
# from google.colab import userdata
# import time

# # Load the model and processor
# model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
# NAMESPACE = "clip-vit-large-patch14"

# pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))
# index = pc.Index("movie-posters-v2-clip")

# json_path = f'/content/drive/My Drive/Colab_Notebooks/popular_tmdb_endpoint_until_page_501.json'

# # Load the movies list from the JSON file
# with open(json_path, 'r') as f:
#     movies = json.load(f)

# def get_image_embedding(poster_url):
#     # Fetch the image from the URL
#     try:
#       poster_url = "https://image.tmdb.org/t/p/original" + poster_url
#       response = requests.get(poster_url)
#       image = Image.open(BytesIO(response.content)).convert('RGB')

#       # Preprocess the image
#       inputs = processor(images=image, return_tensors="pt")

#       # Generate image embedding
#       with torch.no_grad():
#           outputs = model.get_image_features(**inputs)
#           embedding = outputs.squeeze().numpy()

#       return embedding
#     except Exception as e:
#       print(f"Failed to get embedding for {poster_url}: {e}")
#       return None

# def get_image_embedding_batch(poster_urls):
#     try:
#         images = []
#         for poster_url in poster_urls:
#             poster_url = "https://image.tmdb.org/t/p/original" + poster_url
#             response = requests.get(poster_url)
#             image = Image.open(BytesIO(response.content)).convert('RGB')
#             images.append(image)

#         # Preprocess the images as a batch
#         inputs = processor(images=images, return_tensors="pt")

#         # Generate image embeddings for the batch
#         with torch.no_grad():
#             outputs = model.get_image_features(**inputs)
#             embeddings = outputs.squeeze().numpy()

#         return embeddings
#     except Exception as e:
#         print(f"Failed to get embedding for batch: {e}")
#         return None

# def prepare_metadata(movie):
#     return {
#         "adult": movie.get("adult", False),
#         "backdrop_path": movie.get("backdrop_path", "empty") if movie.get("backdrop_path") is not None else "empty",
#         "genre_ids": ','.join(map(str, movie.get("genre_ids", []))),
#         "id": str(movie.get("id", "")),
#         "original_language": movie.get("original_language", ""),
#         "original_title": movie.get("original_title", ""),
#         "overview": movie.get("overview", ""),
#         "popularity": movie.get("popularity", 0.0),
#         "poster_path": movie.get("poster_path", ""),
#         "release_date": movie.get("release_date", ""),
#         "title": movie.get("title", ""),
#         "video": movie.get("video", False),
#         "vote_average": movie.get("vote_average", 0.0),
#         "vote_count": movie.get("vote_count", 0)
#     }

# def process_and_upload():
#   batch_size = 16
#   for i in range(3904, len(movies), batch_size):
#       batch_movies = movies[i:i+batch_size]
#       poster_urls = [movie.get('poster_path', '') for movie in batch_movies]
#       embeddings = get_image_embedding_batch(poster_urls)

#       if embeddings is not None:
#           for j, movie in enumerate(batch_movies):
#               imdb_id = str(movie.get('id', ''))
#               print(f"Processing {imdb_id} - {movie.get('title')}")
#               metadata = prepare_metadata(movie)
#               index.upsert(
#                   vectors=[(imdb_id, embeddings[j].tolist(), metadata)],
#                   namespace=NAMESPACE
#               )
# process_and_upload()

Processing 760873 - The Colony
Processing 62764 - Mirror Mirror
Processing 87825 - Trouble with the Curve
Processing 1115939 - Jagged Mind
Processing 11825 - Police Academy 5: Assignment Miami Beach
Processing 11082 - The Seventh Sign
Processing 204082 - Homefront
Processing 10990 - Mulholland Falls
Processing 41154 - Men in Black 3
Processing 74998 - Seeking Justice
Processing 728754 - Stand by Me Doraemon 2
Processing 34134 - Barbie in A Mermaid Tale
Processing 708981 - Sex Education for the Sister-in-law
Processing 832262 - Force of Nature: The Dry 2
Processing 1042476 - Wolves of the Night
Processing 747188 - Asteroid City
Processing 487047 - A Witches' Ball
Processing 890244 - A Stalker in the House
Processing 454294 - The Kid Who Would Be King
Processing 257445 - Goosebumps
Processing 10935 - Heaven's Gate
Processing 1245 - The Remains of the Day
Processing 823491 - Out of Darkness
Processing 109584 - Broken
Processing 1042753 - Audition
Processing 2055 - Open Range
Processing 25

KeyboardInterrupt: 