In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np

class DeepMatchNetMM(nn.Module):
    """Multi-modal movie scorer over text, categorical and numeric features.

    Each text field is encoded with a shared pretrained transformer; the pooled
    outputs are concatenated and projected to 256 dims. Categorical ids go
    through one embedding table per column and are projected to 128 dims.
    Numeric features are projected to 64 dims. The concatenation feeds an MLP
    ending in a sigmoid, and the result is multiplied by 100.

    Args:
        text_model_name: Model id passed to ``AutoModel`` / ``AutoTokenizer``.
        num_cat_features: Unused; kept so existing call sites keep working.
        num_num_features: Width of the numeric feature vector.
        embedding_sizes: Iterable of ``(num_categories, embed_dim)`` pairs, one
            per categorical column. Required despite the ``None`` default.

    Raises:
        TypeError: If ``embedding_sizes`` is ``None``.
    """

    def __init__(self, text_model_name="bert-base-uncased", num_cat_features=10, num_num_features=6, embedding_sizes=None):
        super().__init__()
        # Validate before the expensive pretrained-model load; iterating None
        # further down would otherwise fail late with an opaque message.
        if embedding_sizes is None:
            raise TypeError(
                "embedding_sizes must be an iterable of (num_categories, embed_dim) pairs, not None"
            )
        embedding_sizes = list(embedding_sizes)

        # Preferred device only; the module is not moved here.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Text encoder
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
        # hidden_size * 3: the pooled outputs of three text fields are concatenated.
        self.text_proj = nn.Linear(self.text_encoder.config.hidden_size * 3, 256)

        # Categorical embeddings
        self.cat_embeds = nn.ModuleList([
            nn.Embedding(num_categories, embed_dim)
            for num_categories, embed_dim in embedding_sizes
        ])
        self.cat_proj = nn.Linear(sum(e.embedding_dim for e in self.cat_embeds), 128)

        # Numeric features
        self.num_proj = nn.Sequential(
            nn.Linear(num_num_features, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )

        # Prediction head
        self.predictor = nn.Sequential(
            nn.Linear(256 + 128 + 64, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, text_inputs, cat_inputs, num_inputs):
        """Score a batch of movies.

        Args:
            text_inputs: Iterable of text fields, each a list of strings; the
                projection layer expects exactly three fields.
            cat_inputs: Integer tensor indexed as ``[:, i]`` per categorical column.
            num_inputs: Float tensor fed to the numeric projection.

        Returns:
            Tensor of sigmoid outputs scaled by 100, one column per row.
        """
        # Follow the weights' actual device instead of the cached self.device,
        # which is wrong whenever the module has not been moved (or was moved
        # somewhere else) since construction.
        device = self.text_proj.weight.device

        pooled = []
        for field in text_inputs:
            tokens = self.text_tokenizer(field, return_tensors="pt", padding=True, truncation=True).to(device)
            pooled.append(self.text_encoder(**tokens).pooler_output)
        text_out = self.text_proj(torch.cat(pooled, dim=-1))

        cat_outs = [embed(cat_inputs[:, i]) for i, embed in enumerate(self.cat_embeds)]
        cat_out = self.cat_proj(torch.cat(cat_outs, dim=-1))

        num_out = self.num_proj(num_inputs)

        x = torch.cat([text_out, cat_out, num_out], dim=-1)
        return self.predictor(x) * 100

# --- Helper: Encode single movie for input ---
def preprocess_movie(movie, tokenizer, cat_vocab, num_cols):
    """Turn one movie record into the inputs expected by the model.

    Args:
        movie: Mapping-like record supporting ``.get`` (dict or pandas Series).
        tokenizer: Unused; kept for backward compatibility with callers.
        cat_vocab: ``{column: {value: id}}``; values not in a column's
            vocabulary, and missing columns, map to id 0.
        num_cols: Names of the numeric columns, in feature order.

    Returns:
        ``([title], [overview], [tagline], cat_inputs, num_inputs)`` where
        ``cat_inputs`` is a ``(1, len(cat_vocab))`` long tensor and
        ``num_inputs`` is a ``(1, len(num_cols))`` float tensor.
    """
    # All three text fields default to '' when absent; previously a missing
    # title raised KeyError while overview/tagline silently defaulted.
    title, overview, tagline = (movie.get(field, '') for field in ('title', 'overview', 'tagline'))

    # Categorical inputs
    cat_ids = [cat_vocab[col].get(movie.get(col, 'unknown'), 0) for col in cat_vocab]
    cat_inputs = torch.tensor([cat_ids], dtype=torch.long)

    # Numeric features; absent columns contribute 0, matching the later
    # definitions of this helper in the notebook.
    num_inputs = torch.tensor([[movie.get(col, 0) for col in num_cols]], dtype=torch.float)

    return [title], [overview], [tagline], cat_inputs, num_inputs

# --- Recommender Function ---
def recommend_movies(user_query_movie, all_movies, model, tokenizer, cat_vocab, num_cols, top_k=5):
    """Score every row of ``all_movies`` with ``model`` and return the best ``top_k``.

    Args:
        user_query_movie: Accepted for interface compatibility.
            NOTE(review): it does not influence the ranking -- the model
            scores each candidate on its own, so every query produces the
            same list. Conditioning on the query needs a modelling change.
        all_movies: DataFrame of candidate movies; must contain ``title``.
        model: Scorer exposing ``eval()``, ``to()``, ``device`` and ``__call__``.
        tokenizer: Forwarded to ``preprocess_movie`` (unused there).
        cat_vocab: Categorical vocabularies forwarded to ``preprocess_movie``.
        num_cols: Numeric column names forwarded to ``preprocess_movie``.
        top_k: Number of results to return.

    Returns:
        List of ``(title, score)`` tuples sorted by descending score.
    """
    model.eval()
    model.to(model.device)

    scores = []
    with torch.no_grad():
        for _, movie in all_movies.iterrows():
            # Preprocess once per movie (it used to run twice per row, plus
            # twice more for a query result that was never read).
            title, overview, tagline, movie_cat, movie_num = preprocess_movie(movie, tokenizer, cat_vocab, num_cols)
            movie_texts = (title, overview, tagline)
            score = model(movie_texts, movie_cat.to(model.device), movie_num.to(model.device)).item()
            scores.append((movie['title'], score))

    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# --- Example Usage (Mock Data) ---
if __name__ == "__main__":
    # Toy vocabularies: one value -> id table per categorical column.
    cat_vocab = dict(
        original_language={'en': 1, 'fi': 2, 'unknown': 0},
        status={'Released': 1, 'Canceled': 2, 'unknown': 0},
    )

    # Numeric feature columns, in the order they are fed to the model.
    num_cols = [
        'runtime', 'vote_average', 'vote_count',
        'popularity', 'percentage_match', 'movieId',
    ]

    # One (vocabulary size, embedding width) pair per categorical column.
    embedding_sizes = [(len(vocab), 8) for vocab in cat_vocab.values()]

    # Build the (untrained) model and switch it to inference mode.
    model = DeepMatchNetMM(num_num_features=len(num_cols), embedding_sizes=embedding_sizes)
    model.eval()

    # The movie the user is asking about.
    query_movie = {
        'title': "Inception",
        'overview': "A thief who steals corporate secrets through dream-sharing technology is given an inverse task of planting an idea.",
        'tagline': "Your mind is the scene of the crime.",
        'original_language': 'en',
        'status': 'Released',
        'runtime': 148,
        'vote_average': 8.8,
        'vote_count': 21000,
        'popularity': 80,
        'percentage_match': 90,
        'movieId': 1
    }

    # Mock catalogue: the query itself plus two variants that override a few
    # fields. A real run would load the full dataset here instead.
    variant_overrides = [
        {'title': "Shutter Island", 'percentage_match': 70, 'vote_average': 7.5},
        {'title': "Interstellar", 'percentage_match': 95, 'vote_average': 8.6},
    ]
    all_movies = pd.DataFrame(
        [query_movie] + [{**query_movie, **override} for override in variant_overrides]
    )

    # Rank the catalogue and print the winners.
    top_movies = recommend_movies(
        query_movie,
        all_movies,
        model,
        model.text_tokenizer,
        cat_vocab,
        num_cols,
        top_k=3,
    )

    print("Top Recommendations:")
    for title, score in top_movies:
        print(f"{title}: {score:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Top Recommendations:
Inception: 100.00
Shutter Island: 100.00
Interstellar: 100.00


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from google.colab import auth
from google.auth import credentials
from pandas_gbq import read_gbq

# -------------------------------
# DeepMatchNetMM Definition
# -------------------------------
class DeepMatchNetMM(nn.Module):
    """Multi-modal movie scorer over text, categorical and numeric features.

    Text fields share one pretrained transformer encoder (pooled outputs
    concatenated, then projected to 256 dims); categorical ids use one
    embedding table per column (projected to 128 dims); numeric features are
    projected to 64 dims. An MLP with a sigmoid head scores the concatenation
    and the result is multiplied by 100.

    Args:
        text_model_name: Model id passed to ``AutoModel`` / ``AutoTokenizer``.
        num_cat_features: Unused; kept so existing call sites keep working.
        num_num_features: Width of the numeric feature vector.
        embedding_sizes: Iterable of ``(num_categories, embed_dim)`` pairs, one
            per categorical column. Required despite the ``None`` default.

    Raises:
        TypeError: If ``embedding_sizes`` is ``None``.
    """

    def __init__(self, text_model_name="bert-base-uncased", num_cat_features=2, num_num_features=5, embedding_sizes=None):
        super().__init__()
        # Reject a missing spec before downloading/loading the text model,
        # rather than failing later while iterating over None.
        if embedding_sizes is None:
            raise TypeError(
                "embedding_sizes must be an iterable of (num_categories, embed_dim) pairs, not None"
            )
        embedding_sizes = list(embedding_sizes)

        # Preferred device only; the constructor does not move the module.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
        # Three pooled text-field outputs are concatenated before projection.
        self.text_proj = nn.Linear(self.text_encoder.config.hidden_size * 3, 256)

        self.cat_embeds = nn.ModuleList([
            nn.Embedding(num_categories, embed_dim)
            for num_categories, embed_dim in embedding_sizes
        ])
        self.cat_proj = nn.Linear(sum(e.embedding_dim for e in self.cat_embeds), 128)

        self.num_proj = nn.Sequential(
            nn.Linear(num_num_features, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )

        self.predictor = nn.Sequential(
            nn.Linear(256 + 128 + 64, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, text_inputs, cat_inputs, num_inputs):
        """Score a batch of movies.

        Args:
            text_inputs: Iterable of text fields, each a list of strings; the
                projection layer expects exactly three fields.
            cat_inputs: Integer tensor indexed as ``[:, i]`` per categorical column.
            num_inputs: Float tensor fed to the numeric projection.

        Returns:
            Tensor of sigmoid outputs scaled by 100, one column per row.
        """
        # Tokenized text must land where the weights actually are; the cached
        # self.device is only a preference and may not match.
        device = self.text_proj.weight.device

        pooled = []
        for field in text_inputs:
            tokens = self.text_tokenizer(field, return_tensors="pt", padding=True, truncation=True).to(device)
            pooled.append(self.text_encoder(**tokens).pooler_output)
        text_out = self.text_proj(torch.cat(pooled, dim=-1))

        cat_outs = [embed(cat_inputs[:, i]) for i, embed in enumerate(self.cat_embeds)]
        cat_out = self.cat_proj(torch.cat(cat_outs, dim=-1))

        num_out = self.num_proj(num_inputs)

        x = torch.cat([text_out, cat_out, num_out], dim=-1)
        return self.predictor(x) * 100

# -------------------------------
# Preprocess Function
# -------------------------------
def preprocess_movie(movie, cat_vocab, num_cols):
    """Convert one movie record into model-ready inputs.

    Args:
        movie: Mapping-like record supporting ``.get`` (dict or pandas Series).
        cat_vocab: ``{column: {value: id}}``; unseen values and missing
            columns resolve to id 0.
        num_cols: Numeric column names, in feature order; missing ones give 0.

    Returns:
        ``([title], [overview], [tagline], cat_inputs, num_inputs)`` with a
        ``(1, n_cat)`` long tensor and a ``(1, n_num)`` float tensor.
    """
    # Text fields, each wrapped as a single-item batch.
    text_batches = [[movie.get(field, '')] for field in ('title', 'overview', 'tagline')]

    # Vocabulary id for each categorical column, in vocabulary order.
    cat_ids = [
        cat_vocab[column].get(movie.get(column, 'unknown'), 0)
        for column in cat_vocab
    ]
    cat_inputs = torch.tensor([cat_ids], dtype=torch.long)

    # Numeric feature row.
    num_row = [movie.get(column, 0) for column in num_cols]
    num_inputs = torch.tensor([num_row], dtype=torch.float)

    return text_batches[0], text_batches[1], text_batches[2], cat_inputs, num_inputs

# -------------------------------
# Recommendation Function
# -------------------------------
def recommend_movies(user_query_movie, all_movies, model, cat_vocab, num_cols, top_k=5):
    """Score every row of ``all_movies`` with ``model`` and return the best ``top_k``.

    Args:
        user_query_movie: Accepted for interface compatibility.
            NOTE(review): it does not influence the ranking -- each candidate
            is scored on its own, so every query yields the same list.
        all_movies: DataFrame of candidate movies; must contain ``title``.
        model: Scorer exposing ``eval()``, ``to()``, ``device`` and ``__call__``.
        cat_vocab: Categorical vocabularies forwarded to ``preprocess_movie``.
        num_cols: Numeric column names forwarded to ``preprocess_movie``.
        top_k: Number of results to return.

    Returns:
        List of ``(title, score)`` tuples sorted by descending score.
    """
    model.eval()
    model.to(model.device)

    scores = []
    with torch.no_grad():
        for _, movie in all_movies.iterrows():
            # One preprocessing call per row; the old code ran it twice per
            # row and twice more for a query result that was never used.
            title, overview, tagline, movie_cat, movie_num = preprocess_movie(movie, cat_vocab, num_cols)
            movie_texts = (title, overview, tagline)
            score = model(movie_texts, movie_cat.to(model.device), movie_num.to(model.device)).item()
            scores.append((movie['title'], score))

    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# -------------------------------
# Main Execution
# -------------------------------
if __name__ == "__main__":
    # Sign in through Colab so BigQuery can be queried as the current user.
    auth.authenticate_user()

    # Where the movie metadata lives in GCP.
    PROJECT_ID = "virtualization-and-cloud"
    TABLE = "movies.movies-metadata"

    # Pull the whole table into a DataFrame.
    query = f"SELECT * FROM `{TABLE}`"
    df = read_gbq(query, project_id=PROJECT_ID)

    # Column groups by modality.
    text_cols = ['title', 'overview', 'tagline']
    cat_cols = ['original_language', 'status']
    num_cols = ['runtime', 'vote_average', 'vote_count', 'popularity']

    # Fill gaps per modality: empty string, 'unknown' label, or zero.
    for columns, fill_value in ((text_cols, ""), (cat_cols, "unknown"), (num_cols, 0)):
        for col in columns:
            df[col] = df[col].fillna(fill_value)

    # Ids start at 1 so that 0 stays free for values outside the vocabulary;
    # each embedding table therefore needs len(vocab) + 1 rows.
    cat_vocab = {
        col: {
            value: idx
            for idx, value in enumerate(df[col].dropna().unique(), start=1)
        }
        for col in cat_cols
    }
    embedding_sizes = [(1 + len(vocab), 8) for vocab in cat_vocab.values()]

    # Build the (untrained) model.
    model = DeepMatchNetMM(num_num_features=len(num_cols), embedding_sizes=embedding_sizes)

    # The movie the user is asking about.
    query_movie = {
        'title': "Inception",
        'overview': "A thief who steals corporate secrets through dream-sharing technology is given an inverse task of planting an idea.",
        'tagline': "Your mind is the scene of the crime.",
        'original_language': 'en',
        'status': 'Released',
        'runtime': 148,
        'vote_average': 8.8,
        'vote_count': 21000,
        'popularity': 80
    }

    # Rank the full table and print the winners.
    top_movies = recommend_movies(
        query_movie,
        df,
        model,
        cat_vocab,
        num_cols,
        top_k=5,
    )

    print("\nTop Movie Recommendations:")
    for title, score in top_movies:
        print(f"{title}: {score:.2f}")

Downloading: 100%|[32m██████████[0m|

Top Movie Recommendations:
Pokemon no Uchi Atsumaru?: 52.54
ReBroken: 51.50
Shogun's Ninja: 50.49
The Blue Elephant: Part III: 50.32
Sherlock Holmes and Doctor Watson: 50.18


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the save directory used by the
# following cells lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Directory on the mounted Drive that receives the model weights, config,
# vocabulary and metadata written by the cells below.
save_dir = "/content/drive/MyDrive/DeepMatchModel"
# exist_ok=True makes this cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Persist only the weights (state_dict), not the module object; relies on
# `model` and `save_dir` defined by earlier cells.
torch.save(model.state_dict(), os.path.join(save_dir, "deep_match_model.pt"))

In [None]:
import json

# Constructor arguments needed to rebuild the model before loading weights.
# `embedding_sizes` and `num_cols` come from an earlier cell; tuples inside
# `embedding_sizes` are written as JSON arrays.
config = {
    "embedding_sizes": embedding_sizes,
    "num_num_features": len(num_cols),
    "text_model_name": "bert-base-uncased"
}

with open(os.path.join(save_dir, "deep_match_config.json"), "w") as f:
    json.dump(config, f)

In [None]:
# Save the categorical value -> id tables. JSON object keys are always
# strings, so non-string category values come back as strings on load.
with open(os.path.join(save_dir, "cat_vocab.json"), "w") as f:
    json.dump(cat_vocab, f)

In [None]:
# Snapshot the movie DataFrame (`df` from an earlier cell) next to the
# model artifacts so the loading cells can restore it without re-querying.
df.to_pickle(os.path.join(save_dir, "movie_metadata.pkl"))

In [None]:
# Visual confirmation that the save cells above have been run.
print("Saved")

Saved


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import json
import os

# Mount Google Drive so the saved artifacts are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# -------------------------------
# Load Saved Components
# -------------------------------
# Same directory the save cells wrote to.
save_dir = "/content/drive/MyDrive/DeepMatchModel"

# Load the constructor arguments saved alongside the weights.
with open(os.path.join(save_dir, "deep_match_config.json")) as f:
    config = json.load(f)

# embedding_sizes comes back as a list of 2-item lists (JSON has no tuples).
embedding_sizes = config["embedding_sizes"]
num_num_features = config["num_num_features"]
text_model_name = config["text_model_name"]

# Load categorical vocab and force every inner key to str, so lookups can
# consistently use str(value).
with open(os.path.join(save_dir, "cat_vocab.json")) as f:
    cat_vocab = json.load(f)
cat_vocab = {k: {str(inner_k): v for inner_k, v in inner.items()} for k, inner in cat_vocab.items()}

# Load movie metadata.
# NOTE(review): read_pickle can execute arbitrary code from the file; only
# load pickles this notebook wrote itself.
movie_df = pd.read_pickle(os.path.join(save_dir, "movie_metadata.pkl"))

# Column groups by modality (redefined here so this cell works on a fresh kernel).
text_cols = ['title', 'overview', 'tagline']
cat_cols = ['original_language', 'status']
num_cols = ['runtime', 'vote_average', 'vote_count', 'popularity']

Mounted at /content/drive


In [None]:
class DeepMatchNetMM(nn.Module):
    """Multi-modal movie scorer over text, categorical and numeric features.

    Attribute names (``text_encoder``, ``text_proj``, ``cat_embeds``,
    ``cat_proj``, ``num_proj``, ``predictor``) determine the state_dict keys
    and must stay as they are for saved weights to load.

    Args:
        text_model_name: Model id passed to ``AutoModel`` / ``AutoTokenizer``.
        num_cat_features: Unused; kept so existing call sites keep working.
        num_num_features: Width of the numeric feature vector.
        embedding_sizes: Iterable of ``(num_categories, embed_dim)`` pairs, one
            per categorical column. Required despite the ``None`` default.

    Raises:
        TypeError: If ``embedding_sizes`` is ``None``.
    """

    def __init__(self, text_model_name="bert-base-uncased", num_cat_features=2, num_num_features=5, embedding_sizes=None):
        super().__init__()
        # Reject a missing spec before loading the text model, instead of
        # failing later while iterating over None.
        if embedding_sizes is None:
            raise TypeError(
                "embedding_sizes must be an iterable of (num_categories, embed_dim) pairs, not None"
            )
        embedding_sizes = list(embedding_sizes)

        # Preferred device only; the constructor does not move the module.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
        # Three pooled text-field outputs are concatenated before projection.
        self.text_proj = nn.Linear(self.text_encoder.config.hidden_size * 3, 256)

        self.cat_embeds = nn.ModuleList([
            nn.Embedding(num_categories, embed_dim)
            for num_categories, embed_dim in embedding_sizes
        ])
        self.cat_proj = nn.Linear(sum(e.embedding_dim for e in self.cat_embeds), 128)

        self.num_proj = nn.Sequential(
            nn.Linear(num_num_features, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )

        self.predictor = nn.Sequential(
            nn.Linear(256 + 128 + 64, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, text_inputs, cat_inputs, num_inputs):
        """Score a batch of movies.

        Args:
            text_inputs: Iterable of text fields, each a list of strings; the
                projection layer expects exactly three fields.
            cat_inputs: Integer tensor indexed as ``[:, i]`` per categorical column.
            num_inputs: Float tensor fed to the numeric projection.

        Returns:
            Tensor of sigmoid outputs scaled by 100, one column per row.
        """
        # Send tokens to wherever the weights actually are; the cached
        # self.device is only a preference and can disagree with them.
        device = self.text_proj.weight.device

        pooled = []
        for field in text_inputs:
            tokens = self.text_tokenizer(field, return_tensors="pt", padding=True, truncation=True).to(device)
            pooled.append(self.text_encoder(**tokens).pooler_output)
        text_out = self.text_proj(torch.cat(pooled, dim=-1))

        cat_outs = [embed(cat_inputs[:, i]) for i, embed in enumerate(self.cat_embeds)]
        cat_out = self.cat_proj(torch.cat(cat_outs, dim=-1))

        num_out = self.num_proj(num_inputs)

        x = torch.cat([text_out, cat_out, num_out], dim=-1)
        return self.predictor(x) * 100

In [None]:
# Rebuild the architecture from the saved config, then restore its weights.
model = DeepMatchNetMM(
    text_model_name=text_model_name,
    num_cat_features=len(embedding_sizes),
    num_num_features=num_num_features,
    embedding_sizes=embedding_sizes
)

# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(os.path.join(save_dir, "deep_match_model.pt"), map_location=model.device))
model.to(model.device)
# Inference mode (disables dropout, uses BatchNorm running statistics).
# The trailing semicolon stops the notebook from echoing the full module repr.
model.eval();

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DeepMatchNetMM(
  (text_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
def preprocess_movie(movie, cat_vocab, num_cols):
    """Convert one movie record into model-ready inputs.

    Categorical values are stringified before the vocabulary lookup, so a
    vocabulary whose keys are all strings (e.g. one loaded from JSON) matches
    non-string values too.

    Args:
        movie: Mapping-like record supporting ``.get`` (dict or pandas Series).
        cat_vocab: ``{column: {str(value): id}}``; unseen values and missing
            columns resolve to id 0.
        num_cols: Numeric column names, in feature order; missing ones give 0.

    Returns:
        ``([title], [overview], [tagline], cat_inputs, num_inputs)`` with a
        ``(1, n_cat)`` long tensor and a ``(1, n_num)`` float tensor.
    """
    # Text fields, each wrapped as a single-item batch.
    text_batches = [[movie.get(field, '')] for field in ('title', 'overview', 'tagline')]

    # Vocabulary id for each categorical column, in vocabulary order.
    cat_ids = [
        cat_vocab[column].get(str(movie.get(column, 'unknown')), 0)
        for column in cat_vocab
    ]
    cat_inputs = torch.tensor([cat_ids], dtype=torch.long)

    # Numeric feature row.
    num_row = [movie.get(column, 0) for column in num_cols]
    num_inputs = torch.tensor([num_row], dtype=torch.float)

    return text_batches[0], text_batches[1], text_batches[2], cat_inputs, num_inputs

def recommend_movies(query_movie, all_movies, model, cat_vocab, num_cols, top_k=5):
    """Score every row of ``all_movies`` with ``model`` and return the best ``top_k``.

    The model is used as-is: this function does not switch it to eval mode or
    move it between devices.

    Args:
        query_movie: Accepted for interface compatibility.
            NOTE(review): it is never read -- each candidate is scored on its
            own, so every query yields the same list.
        all_movies: DataFrame of candidate movies; must contain ``title``.
        model: Callable scorer exposing a ``device`` attribute.
        cat_vocab: Categorical vocabularies forwarded to ``preprocess_movie``.
        num_cols: Numeric column names forwarded to ``preprocess_movie``.
        top_k: Number of results to return.

    Returns:
        List of ``(title, score)`` tuples sorted by descending score.
    """
    scores = []
    with torch.no_grad():
        for _, movie in all_movies.iterrows():
            # One preprocessing call per row (it used to run twice per row).
            title, overview, tagline, movie_cat, movie_num = preprocess_movie(movie, cat_vocab, num_cols)
            movie_texts = (title, overview, tagline)
            score = model(movie_texts, movie_cat.to(model.device), movie_num.to(model.device)).item()
            scores.append((movie['title'], score))

    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

In [None]:
# Query record using the same text, categorical and numeric fields as the
# stored movie metadata.
query_movie = {
    'title': "Interstellar",
    'overview': "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.",
    'tagline': "Mankind was born on Earth. It was never meant to die here.",
    'original_language': 'en',
    'status': 'Released',
    'runtime': 169,
    'vote_average': 8.6,
    'vote_count': 15000,
    'popularity': 78
}

# NOTE(review): the recommend_movies defined in the cell above never reads
# query_movie, so the ranking below is the same for any query.
top_movies = recommend_movies(query_movie, movie_df, model, cat_vocab, num_cols, top_k=5)

# Print each recommended title with its score, two decimals.
print("\nTop Recommendations:")
for title, score in top_movies:
    print(f"{title}: {score:.2f}")

In [2]:
# Mount Google Drive at /content/drive; the recommender cells below read
# their saved artifacts from a directory under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import json
import os
from typing import Dict, List, Tuple
import numpy as np

class DeepMatchNetMM(nn.Module):
    """Multi-modal movie scorer over text, categorical and numeric features.

    Title, overview and tagline are tokenized separately (see
    ``preprocess_text_batch``), encoded with one shared pretrained
    transformer, and their pooled outputs are concatenated and projected to
    256 dims. Categorical ids use one embedding table per column (projected to
    128 dims); numeric features are projected to 64 dims. An MLP with a
    sigmoid head scores the concatenation and the result is multiplied by 100.

    Attribute names determine the state_dict keys and must stay unchanged for
    previously saved weights to load.

    Args:
        text_model_name: Model id passed to ``AutoModel`` / ``AutoTokenizer``.
        num_cat_features: Unused; kept so existing call sites keep working.
        num_num_features: Width of the numeric feature vector.
        embedding_sizes: Iterable of ``(num_categories, embed_dim)`` pairs, one
            per categorical column. Required despite the ``None`` default.

    Raises:
        TypeError: If ``embedding_sizes`` is ``None``.
    """

    # Token-length caps for (title, overview, tagline), in that order.
    _TEXT_MAX_LENGTHS = (128, 256, 64)

    def __init__(self, text_model_name="bert-base-uncased", num_cat_features=2,
                 num_num_features=5, embedding_sizes=None):
        super().__init__()
        # Reject a missing spec before loading the text model, instead of
        # failing later while iterating over None.
        if embedding_sizes is None:
            raise TypeError(
                "embedding_sizes must be an iterable of (num_categories, embed_dim) pairs, not None"
            )
        embedding_sizes = list(embedding_sizes)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize text components
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)

        # IMPORTANT: keep 3 * hidden_size -- the title, overview and tagline
        # embeddings are concatenated, and saved weights have this shape.
        self.text_proj = nn.Linear(self.text_encoder.config.hidden_size * 3, 256)

        # Initialize categorical embeddings
        self.cat_embeds = nn.ModuleList([
            nn.Embedding(num_categories, embed_dim)
            for num_categories, embed_dim in embedding_sizes
        ])
        self.cat_proj = nn.Linear(sum(e.embedding_dim for e in self.cat_embeds), 128)

        # Initialize numerical projection
        self.num_proj = nn.Sequential(
            nn.Linear(num_num_features, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )

        # Initialize predictor
        self.predictor = nn.Sequential(
            nn.Linear(256 + 128 + 64, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

        # Move model to device
        self.to(self.device)

    def preprocess_text_batch(self, titles: List[str], overviews: List[str], taglines: List[str]) -> List[Dict]:
        """Tokenize the three text fields separately.

        Args:
            titles: Title strings, one per movie.
            overviews: Overview strings, one per movie.
            taglines: Tagline strings, one per movie.

        Returns:
            ``[title_inputs, overview_inputs, tagline_inputs]``: tokenizer
            outputs truncated to 128 / 256 / 64 tokens respectively and moved
            to the device holding the model weights.
        """
        # Use the weights' current device so tokens stay usable even if the
        # module was moved after construction.
        device = self.text_proj.weight.device
        return [
            self.text_tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
                return_attention_mask=True
            ).to(device)
            for texts, max_length in zip((titles, overviews, taglines), self._TEXT_MAX_LENGTHS)
        ]

    def forward(self, text_inputs: List[Dict], cat_inputs: torch.Tensor, num_inputs: torch.Tensor) -> torch.Tensor:
        """Score a batch from already-tokenized text.

        Args:
            text_inputs: Tokenizer outputs, one per text field, as returned by
                ``preprocess_text_batch``.
            cat_inputs: Integer tensor indexed as ``[:, i]`` per categorical column.
            num_inputs: Float tensor fed to the numeric projection.

        Returns:
            Tensor of sigmoid outputs scaled by 100, one column per row.
        """
        # Encode each text field separately and concatenate the pooled outputs.
        text_features = [self.text_encoder(**text_input).pooler_output for text_input in text_inputs]
        text_out = self.text_proj(torch.cat(text_features, dim=-1))

        # Process categorical features
        cat_outs = [embed(cat_inputs[:, i]) for i, embed in enumerate(self.cat_embeds)]
        cat_out = self.cat_proj(torch.cat(cat_outs, dim=-1))

        # Process numerical features
        num_out = self.num_proj(num_inputs)

        # Combine and predict
        x = torch.cat([text_out, cat_out, num_out], dim=-1)
        return self.predictor(x) * 100


class MovieRecommender:
    """Serve top-k movie rankings from a saved DeepMatchNetMM checkpoint.

    Loads the model config, categorical vocabulary, movie metadata and weights
    from ``model_dir``, pre-tokenizes every stored movie once, and then scores
    the whole catalogue on each ``recommend_movies`` call.

    NOTE(review): the model scores each catalogue movie on its own, so the
    ranking is identical for every query. Making results depend on the query
    requires a modelling change (e.g. similarity between query and candidate
    representations).
    """

    def __init__(self, model_dir: str, batch_size: int = 32):
        """Load all artifacts and precompute the catalogue inputs.

        Args:
            model_dir: Directory containing ``deep_match_config.json``,
                ``cat_vocab.json``, ``movie_metadata.pkl`` and
                ``deep_match_model.pt``.
            batch_size: Number of movies per forward pass; must be positive.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        import warnings  # function-scope: only needed for the checkpoint check

        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size

        # Load configuration
        with open(os.path.join(model_dir, "deep_match_config.json")) as f:
            config = json.load(f)

        # Load categorical vocabulary; keys are normalised to str so lookups
        # can always use str(value).
        with open(os.path.join(model_dir, "cat_vocab.json")) as f:
            raw_vocab = json.load(f)
        self.cat_vocab = {
            col: {str(value): idx for value, idx in mapping.items()}
            for col, mapping in raw_vocab.items()
        }

        # Load movie data.
        # NOTE(review): read_pickle can execute arbitrary code from the file;
        # only point model_dir at artifacts you produced yourself.
        self.movie_df = pd.read_pickle(os.path.join(model_dir, "movie_metadata.pkl"))

        # Define columns
        self.text_cols = ['title', 'overview', 'tagline']
        self.cat_cols = ['original_language', 'status']
        self.num_cols = ['runtime', 'vote_average', 'vote_count', 'popularity']

        # Initialize model
        self.model = DeepMatchNetMM(
            text_model_name=config["text_model_name"],
            num_cat_features=len(config["embedding_sizes"]),
            num_num_features=config["num_num_features"],
            embedding_sizes=config["embedding_sizes"]
        )

        # Load weights. strict=False is kept so small key differences do not
        # abort loading, but mismatches are now reported instead of silently
        # leaving layers at their random initial values.
        state_dict = torch.load(os.path.join(model_dir, "deep_match_model.pt"),
                                map_location=self.device)
        load_result = self.model.load_state_dict(state_dict, strict=False)
        if load_result.missing_keys or load_result.unexpected_keys:
            warnings.warn(
                "Checkpoint does not fully match the model: "
                f"missing keys={list(load_result.missing_keys)}, "
                f"unexpected keys={list(load_result.unexpected_keys)}"
            )
        self.model.eval()

        # Preprocess all movies for faster recommendation
        self._preprocess_all_movies()

    def _preprocess_all_movies(self):
        """Build the categorical, numeric and tokenized text inputs for every stored movie."""
        cat_rows, num_rows = [], []
        # Single pass over the frame (it used to be iterated twice).
        for _, movie in self.movie_df.iterrows():
            cat_rows.append(torch.tensor([
                self.cat_vocab[col].get(str(movie.get(col, 'unknown')), 0)
                for col in self.cat_cols
            ], dtype=torch.long))
            num_rows.append(torch.tensor([
                movie.get(col, 0)
                for col in self.num_cols
            ], dtype=torch.float))

        if cat_rows:
            self.all_cat = torch.stack(cat_rows).to(self.device)
            self.all_num = torch.stack(num_rows).to(self.device)
        else:
            # torch.stack rejects an empty list; keep well-shaped empty tensors.
            self.all_cat = torch.empty((0, len(self.cat_cols)), dtype=torch.long, device=self.device)
            self.all_num = torch.empty((0, len(self.num_cols)), dtype=torch.float, device=self.device)

        # Tokenize text in batches; batch i covers rows
        # [i * batch_size, (i + 1) * batch_size).
        self.all_text_inputs = []
        for start in range(0, len(self.movie_df), self.batch_size):
            batch = self.movie_df.iloc[start:start + self.batch_size]
            self.all_text_inputs.append(self.model.preprocess_text_batch(
                batch['title'].tolist(),
                batch['overview'].tolist(),
                batch['tagline'].tolist()
            ))

    def preprocess_query(self, movie: Dict) -> Tuple[List[Dict], torch.Tensor, torch.Tensor]:
        """Preprocess a single query movie.

        Args:
            movie: Mapping with the text, categorical and numeric fields;
                missing fields fall back to '', 'unknown' and 0.

        Returns:
            ``(text_inputs, cat_inputs, num_inputs)`` for a batch of one.
        """
        # Process categorical features
        cat_inputs = torch.tensor([
            self.cat_vocab[col].get(str(movie.get(col, 'unknown')), 0)
            for col in self.cat_cols
        ], dtype=torch.long).unsqueeze(0).to(self.device)

        # Process numerical features
        num_inputs = torch.tensor([
            [movie.get(col, 0) for col in self.num_cols]
        ], dtype=torch.float).to(self.device)

        # Process text features
        text_inputs = self.model.preprocess_text_batch(
            [movie.get('title', '')],
            [movie.get('overview', '')],
            [movie.get('tagline', '')]
        )

        return text_inputs, cat_inputs, num_inputs

    def recommend_movies(self, query_movie: Dict, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return the ``top_k`` highest-scoring stored movies.

        Args:
            query_movie: Accepted for interface compatibility.
                NOTE(review): not used for ranking -- see the class docstring.
            top_k: Maximum number of results; values <= 0 give an empty list.

        Returns:
            List of ``(title, score)`` tuples in descending score order.
        """
        # top_k == 0 previously returned the whole catalogue, because the
        # slice [-0:] selects everything; an empty catalogue crashed in
        # np.concatenate.
        if top_k <= 0 or not self.all_text_inputs:
            return []

        # Score the catalogue batch by batch.
        batch_scores = []
        with torch.no_grad():
            for i, text_inputs in enumerate(self.all_text_inputs):
                start_idx = i * self.batch_size
                end_idx = min(start_idx + self.batch_size, len(self.movie_df))
                output = self.model(text_inputs,
                                    self.all_cat[start_idx:end_idx],
                                    self.all_num[start_idx:end_idx])
                batch_scores.append(output.cpu().numpy())

        scores = np.concatenate(batch_scores).flatten()

        # Indices of the best scores, highest first.
        top_indices = np.argsort(scores)[::-1][:top_k]
        titles = self.movie_df['title']
        return [(titles.iloc[idx], float(scores[idx])) for idx in top_indices]


if __name__ == "__main__":
    # Build the recommender from the artifacts saved on Google Drive; the
    # constructor loads the model and pre-tokenizes the whole catalogue.
    recommender = MovieRecommender("/content/drive/MyDrive/DeepMatchModel")

    # Query record with the same fields as the stored movie metadata.
    query_movie = {
        'title': "Interstellar",
        'overview': "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.",
        'tagline': "Mankind was born on Earth. It was never meant to die here.",
        'original_language': 'en',
        'status': 'Released',
        'runtime': 169,
        'vote_average': 8.6,
        'vote_count': 15000,
        'popularity': 78
    }

    # Get recommendations.
    # NOTE(review): the printed list matches the one produced earlier for a
    # different query; the final ranking is not derived from query_movie.
    top_movies = recommender.recommend_movies(query_movie, top_k=5)

    # Print each recommended title with its score, two decimals.
    print("\nTop Recommendations:")
    for title, score in top_movies:
        print(f"{title}: {score:.2f}")


Top Recommendations:
Pokemon no Uchi Atsumaru?: 52.54
ReBroken: 51.50
Shogun's Ninja: 50.49
The Blue Elephant: Part III: 50.32
Sherlock Holmes and Doctor Watson: 50.18


In [26]:
# Alternative query record ("The Final Season").
# NOTE(review): the next cell reassigns query_movie before any recommend
# call, so on a top-to-bottom run this value is never used; this cell also
# carries a later execution count than the cells that follow it.
query_movie = {
        'title': "The Final Season",
        'overview': "True story of Kent Stock, who in the early \u002790s gives up a job and ditches his wedding plans to take over as head coach of the Norway High School baseball team. Kent must win over his players and convince them and himself that he can fill their former coach\u0027s shoes and that they can go out winners. In the summer of 1991 Norway High\u0027s baseball tradition ended on a triumphant but sombre note.",
        'tagline': "How Do You Want To Be Remembered?",
        'original_language': 'en',
        'status': 'Released',
        'runtime': 119,
        'vote_average': 5.3,
        'vote_count': 28,
        'popularity': 1.3
    }

In [13]:
# Query record ("Toy Story") passed to the recommend call in the next cell.
query_movie = {
        'title': "Toy Story",
        'overview': "Led by Woody, Andy\u0027s toys live happily in his room until Andy\u0027s birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy\u0027s heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
        'tagline': "Hang on for the comedy that goes to infinity and beyond!",
        'original_language': 'en',
        'status': 'Released',
        'runtime': 81,
        'vote_average': 7.968,
        'vote_count': 18690,
        'popularity': 0.0
    }

In [14]:
# Ask for the 20 best-scoring catalogue movies; relies on `recommender` and
# `query_movie` from earlier cells.
top_movies = recommender.recommend_movies(query_movie, top_k=20)

In [15]:
# Print each (title, score) pair from the previous cell, score to two decimals.
print("\nTop Recommendations:")
for title, score in top_movies:
  print(f"{title}: {score:.2f}")


Top Recommendations:
Pokemon no Uchi Atsumaru?: 52.54
ReBroken: 51.50
Shogun's Ninja: 50.49
The Blue Elephant: Part III: 50.32
Sherlock Holmes and Doctor Watson: 50.18
Circumstances 2: The Chase: 50.13
Severus Snape and the Marauders: 49.95
Shaitani Ilaaka: 49.91
Star of Ulugbek: 49.72
Jaane Hoga Kya: 49.72
Soldier's tale: 49.71
DVS - European Vacation: 49.64
На Байкал 2: На абордаж: 49.64
Ghost Planes and the Mysteries of Flight 370: 49.61
Лиса, медведь и мужик: 49.60
Entre Bateas: 49.59
Frank Blue: 49.56
Bogowie pamięci: 49.55
Normaal: Ik Kom Altied Weer Terug: 49.53
Prawdziwa historia Listów do M.: 49.47


In [20]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import json
import os
import numpy as np
from typing import Dict, List, Tuple
from sklearn.preprocessing import normalize
import pickle

class DeepMatchNetMM(nn.Module):
    """Multi-modal scoring network over text, categorical and numeric inputs.

    Text inputs are encoded by a pretrained transformer and projected, categorical
    ids go through one embedding table per feature, and numeric values through a
    small projection. The three projections are concatenated and mapped by the
    prediction head (ending in a sigmoid) to a score scaled by 100.
    """

    def __init__(self, text_model_name="bert-base-uncased", num_cat_features=2,
                 num_num_features=5, embedding_sizes=None):
        """Create every sub-network and move the module to the selected device.

        Args:
            text_model_name: Identifier passed to ``AutoModel`` / ``AutoTokenizer``.
            num_cat_features: Accepted but not read here; the number of embedding
                tables is taken from ``embedding_sizes``.
            num_num_features: Width of the numeric input vector.
            embedding_sizes: Iterable of ``(num_categories, embed_dim)`` pairs, one
                per categorical feature. Must be provided (it is iterated).
        """
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # --- text branch: encoder, tokenizer, projection of three pooled vectors ---
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
        pooled_width = self.text_encoder.config.hidden_size * 3
        self.text_proj = nn.Linear(pooled_width, 256)

        # --- categorical branch: one table per feature, then a joint projection ---
        tables = []
        for num_categories, embed_dim in embedding_sizes:
            tables.append(nn.Embedding(num_categories, embed_dim))
        self.cat_embeds = nn.ModuleList(tables)
        cat_width = sum(table.embedding_dim for table in self.cat_embeds)
        self.cat_proj = nn.Linear(cat_width, 128)

        # --- numeric branch ---
        numeric_layers = [
            nn.Linear(num_num_features, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
        ]
        self.num_proj = nn.Sequential(*numeric_layers)

        # --- prediction head over the concatenated branches ---
        fused_width = 256 + 128 + 64
        head_layers = [
            nn.Linear(fused_width, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.predictor = nn.Sequential(*head_layers)

        self.to(self.device)

    def encode_text(self, text_inputs: List[Dict]) -> torch.Tensor:
        """Encode each tokenized field and concatenate the pooled outputs.

        Args:
            text_inputs: One keyword-argument mapping per text field, each passed
                to the encoder as ``self.text_encoder(**mapping)``.

        Returns:
            The encoder's ``pooler_output`` of every field joined along the last
            dimension, in the order the fields were given.
        """
        pooled = [self.text_encoder(**field).pooler_output for field in text_inputs]
        return torch.cat(pooled, dim=-1)

    def forward(self, text_inputs: List[Dict], cat_inputs: torch.Tensor,
               num_inputs: torch.Tensor) -> torch.Tensor:
        """Score a batch of items.

        Args:
            text_inputs: Tokenized text fields, see ``encode_text``.
            cat_inputs: Integer ids; column ``i`` is looked up in embedding table ``i``.
            num_inputs: Numeric features fed to the numeric projection.

        Returns:
            The prediction head's sigmoid output multiplied by 100.
        """
        # Text branch
        text_out = self.text_proj(self.encode_text(text_inputs))

        # Categorical branch: column i goes through table i
        looked_up = []
        for column, table in enumerate(self.cat_embeds):
            looked_up.append(table(cat_inputs[:, column]))
        cat_out = self.cat_proj(torch.cat(looked_up, dim=-1))

        # Numeric branch
        num_out = self.num_proj(num_inputs)

        # Fuse and score
        fused = torch.cat([text_out, cat_out, num_out], dim=-1)
        return self.predictor(fused) * 100

class MovieRecommender:
    """Embedding-based movie recommender built on a trained ``DeepMatchNetMM``.

    Every catalog movie is represented by the concatenation of the model's
    projected text, categorical and numeric features. The vectors are
    L2-normalised, so a dot product with the equally normalised query vector is a
    cosine similarity. The catalog matrix can be cached on disk in ``model_dir``.
    """

    def __init__(self, model_dir: str, batch_size: int = 64, use_cache: bool = True):
        """Load configuration, data and weights, then build or load the embeddings.

        Args:
            model_dir: Directory holding ``deep_match_config.json``,
                ``cat_vocab.json``, ``movie_metadata.pkl`` and
                ``deep_match_model.pt``; the embedding cache is stored here too.
            batch_size: Number of movies encoded per forward pass.
            use_cache: Whether to read/write ``movie_embeddings_cache.pkl``.
        """
        import warnings  # local import: only used for the checkpoint sanity check

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size
        self.use_cache = use_cache
        self.cache_file = os.path.join(model_dir, "movie_embeddings_cache.pkl")

        # Load configuration and data
        self._load_config_and_data(model_dir)

        # Initialize model
        self.model = DeepMatchNetMM(
            text_model_name=self.config["text_model_name"],
            num_cat_features=len(self.config["embedding_sizes"]),
            num_num_features=self.config["num_num_features"],
            embedding_sizes=self.config["embedding_sizes"]
        )

        # Load model weights. strict=False tolerates key mismatches, so report
        # them instead of silently running with partly initialised weights.
        state_dict = torch.load(os.path.join(model_dir, "deep_match_model.pt"),
                                map_location=self.device)
        load_result = self.model.load_state_dict(state_dict, strict=False)
        if load_result.missing_keys or load_result.unexpected_keys:
            warnings.warn(
                "Checkpoint does not fully match DeepMatchNetMM: "
                f"{len(load_result.missing_keys)} missing and "
                f"{len(load_result.unexpected_keys)} unexpected keys; "
                "missing parameters keep their initial values.",
                RuntimeWarning,
            )
        self.model.eval()

        # Initialize cache
        self.movie_embeddings = None
        self.movie_titles = None
        self.movie_ids = None
        self._preprocess_non_text_features()
        self._initialize_cache()

    def _load_config_and_data(self, model_dir: str):
        """Read the model config, the categorical vocabularies and the movie table.

        Also fixes the column lists used for text, categorical and numeric features.
        """
        with open(os.path.join(model_dir, "deep_match_config.json")) as f:
            self.config = json.load(f)

        with open(os.path.join(model_dir, "cat_vocab.json")) as f:
            raw_vocab = json.load(f)
        # Lookups are done with str(value), so make every vocabulary key a string.
        self.cat_vocab = {k: {str(inner_k): v for inner_k, v in inner.items()}
                          for k, inner in raw_vocab.items()}

        self.movie_df = pd.read_pickle(os.path.join(model_dir, "movie_metadata.pkl"))
        self.text_cols = ['title', 'overview', 'tagline']
        self.cat_cols = ['original_language', 'status']
        # NOTE(review): the model's numeric input width comes from
        # config["num_num_features"]; it must equal len(self.num_cols).
        self.num_cols = ['runtime', 'vote_average', 'vote_count', 'popularity']

    def _initialize_cache(self):
        """Load cached embeddings when usable, otherwise compute (and save) them.

        A cache whose row count differs from ``movie_df`` is treated as stale and
        rebuilt, because results are looked up in ``movie_df`` by row position.
        """
        if self.use_cache and os.path.exists(self.cache_file):
            # NOTE(security): pickle.load can execute arbitrary code; only point
            # model_dir at directories whose contents you trust.
            with open(self.cache_file, 'rb') as f:
                cache_data = pickle.load(f)
            embeddings = cache_data['embeddings']
            titles = cache_data['titles']
            if len(embeddings) == len(self.movie_df):
                self.movie_embeddings = embeddings
                self.movie_titles = titles
                self.movie_ids = cache_data.get('ids', [None] * len(titles))
                return

        self._precompute_embeddings()
        if self.use_cache:
            self._save_cache()

    def _save_cache(self):
        """Save computed embeddings, titles and ids to the cache file."""
        with open(self.cache_file, 'wb') as f:
            pickle.dump({
                'embeddings': self.movie_embeddings,
                'titles': self.movie_titles,
                'ids': self.movie_ids
            }, f)

    def _precompute_embeddings(self):
        """Compute normalised embeddings for every movie in ``movie_df``.

        Uses the categorical/numeric tensors prepared by
        ``_preprocess_non_text_features`` (called from ``__init__`` and
        ``refresh_cache``), so they are not rebuilt a second time here.
        """
        num_movies = len(self.movie_df)
        all_embeddings = []
        self.movie_titles = []
        self.movie_ids = []

        for i in range(0, num_movies, self.batch_size):
            batch = self.movie_df.iloc[i:i+self.batch_size]
            text_inputs = self._process_text_batch(batch)

            with torch.no_grad():
                embeddings = self._combine_features(
                    text_inputs,
                    self.all_cat[i:i+len(batch)],
                    self.all_num[i:i+self.batch_size],
                )
            all_embeddings.append(embeddings.cpu().numpy())
            self.movie_titles.extend(batch['title'].tolist())
            self.movie_ids.extend(batch['movieId'].tolist())

        # Normalize embeddings so a dot product is a cosine similarity
        self.movie_embeddings = normalize(np.concatenate(all_embeddings), axis=1)

    def _preprocess_non_text_features(self):
        """Turn the categorical and numeric columns of ``movie_df`` into tensors.

        Categorical values are mapped through ``cat_vocab`` (unknown values -> 0);
        numeric values are used as stored.

        NOTE(review): numeric columns are fed to the model unscaled; confirm this
        matches the preprocessing used in training. The recorded demo output shows
        every hit at 100.00, which would be consistent with one feature group
        dominating the cosine similarity.
        """
        self.all_cat = torch.stack([
            torch.tensor([
                self.cat_vocab[col].get(str(movie.get(col, 'unknown')), 0)
                for col in self.cat_cols
            ], dtype=torch.long)
            for _, movie in self.movie_df.iterrows()
        ]).to(self.device)

        self.all_num = torch.stack([
            torch.tensor([
                movie.get(col, 0) for col in self.num_cols
            ], dtype=torch.float)
            for _, movie in self.movie_df.iterrows()
        ]).to(self.device)

    def _process_text_batch(self, batch: pd.DataFrame) -> List[Dict]:
        """Tokenize title, overview and tagline for a batch of movies.

        Missing values, and text columns absent from ``batch`` (e.g. a query dict
        without a tagline), are treated as empty strings.

        Returns:
            ``[title_inputs, overview_inputs, tagline_inputs]`` on ``self.device``.
        """
        def column_as_text(name: str) -> List[str]:
            if name in batch.columns:
                return batch[name].fillna('').tolist()
            return [''] * len(batch)

        # (column, max token length) per text field, in the order the model expects
        field_limits = (('title', 64), ('overview', 128), ('tagline', 32))
        return [
            self.model.text_tokenizer(
                column_as_text(name), return_tensors="pt", padding=True,
                truncation=True, max_length=limit).to(self.device)
            for name, limit in field_limits
        ]

    def _embed_categorical(self, cat_inputs: torch.Tensor) -> torch.Tensor:
        """Embed column ``i`` of ``cat_inputs`` with table ``i`` and project the result."""
        cat_outs = [embed(cat_inputs[:, i])
                    for i, embed in enumerate(self.model.cat_embeds)]
        return self.model.cat_proj(torch.cat(cat_outs, dim=-1))

    def _get_categorical_embeddings(self, batch_idx: int, batch: pd.DataFrame) -> torch.Tensor:
        """Get categorical embeddings for the catalog rows starting at ``batch_idx``.

        Only valid for catalog batches: the ids are read from ``self.all_cat``.
        """
        return self._embed_categorical(self.all_cat[batch_idx:batch_idx+len(batch)])

    def _combine_features(self, text_inputs: List[Dict], cat_inputs: torch.Tensor,
                          num_inputs: torch.Tensor) -> torch.Tensor:
        """Run the three model branches and concatenate their projections."""
        text_out = self.model.text_proj(self.model.encode_text(text_inputs))
        cat_out = self._embed_categorical(cat_inputs)
        num_out = self.model.num_proj(num_inputs)
        return torch.cat([text_out, cat_out, num_out], dim=-1)

    def _get_query_embedding(self, movie: Dict) -> np.ndarray:
        """Get the normalised embedding for a single query movie.

        Returns:
            Array with a single row.
        """
        text_inputs = self._process_text_batch(pd.DataFrame([movie]))
        cat_inputs = torch.tensor([
            [self.cat_vocab[col].get(str(movie.get(col, 'unknown')), 0)
             for col in self.cat_cols]
        ], dtype=torch.long).to(self.device)

        num_inputs = torch.tensor([
            [movie.get(col, 0) for col in self.num_cols]
        ], dtype=torch.float).to(self.device)

        with torch.no_grad():
            # Use the query's own categorical ids. Previously the categories of
            # catalog row 0 were embedded here and the query's were ignored.
            query_embedding = self._combine_features(text_inputs, cat_inputs, num_inputs)
        return normalize(query_embedding.cpu().numpy(), axis=1)

    def recommend_movies(self, query_movie: Dict, top_k: int = 5) -> List[Tuple[str,int, float]]:
        """Return the ``top_k`` catalog movies most similar to ``query_movie``.

        Args:
            query_movie: Mapping with the text, categorical and numeric fields.
            top_k: Maximum number of results; values <= 0 yield an empty list and
                values above the catalog size return the whole catalog.

        Returns:
            ``(title, movieId, score)`` tuples, best first, where ``score`` is the
            cosine similarity multiplied by 100.
        """
        if top_k <= 0 or len(self.movie_embeddings) == 0:
            return []

        # Get query embedding
        query_embedding = self._get_query_embedding(query_movie)

        # Cosine similarity (both sides are L2-normalised)
        scores = np.dot(self.movie_embeddings, query_embedding.T).flatten()

        # Clamp k: a slice of [-0:] would otherwise return every row
        k = min(top_k, scores.shape[0])
        top_indices = np.argsort(scores)[-k:][::-1]

        results = []
        for idx in top_indices:
            row = self.movie_df.iloc[idx]
            results.append((row['title'], int(row['movieId']), float(scores[idx] * 100)))
        return results

    def refresh_cache(self):
        """Force a rebuild of the feature tensors and cached embeddings."""
        self._preprocess_non_text_features()
        self._precompute_embeddings()
        if self.use_cache:
            self._save_cache()


if __name__ == "__main__":
    # Build the recommender; cached catalog embeddings are used when available.
    recommender = MovieRecommender(
        "/content/drive/MyDrive/DeepMatchModel",
        batch_size=128,
        use_cache=True,
    )

    # Example query
    query_movie = {
        # Free-text fields
        'title': "Interstellar",
        'overview': "A team of explorers travel through a wormhole in space...",
        'tagline': "Mankind was born on Earth...",
        # Categorical fields
        'original_language': 'en',
        'status': 'Released',
        # Numeric fields
        'runtime': 169,
        'vote_average': 8.6,
        'vote_count': 15000,
        'popularity': 78,
    }

    # The first run is slower (the cache is built); later runs reuse it.
    top_movies = recommender.recommend_movies(query_movie, top_k=5)

    # One "<movieId> - <title>: <score>" line per hit.
    print("\nTop Recommendations:")
    for title, movie_id, score in top_movies:
        print(f"{movie_id} - {title}: {score:.2f}")



Top Recommendations:
594 - Snow White and the Seven Dwarfs: 100.00
5444 - Lilo & Stitch: 100.00
66097 - Coraline: 100.00
216167 - 365 Days: 100.00
185231 - The Chronicles of Narnia: The Lion, the Witch and the Wardrobe: 100.00


In [12]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import json
import os
from typing import Dict, List, Tuple
import numpy as np

class DeepMatchNetMM(nn.Module):
    """Multi-modal scoring network with a frozen text encoder.

    Pooled encoder outputs of three text fields, embedded categorical ids and
    projected numeric features are concatenated and mapped by the prediction head
    (ending in a sigmoid) to a score scaled by 100.
    """

    def __init__(self, text_model_name="bert-base-uncased", num_cat_features=2,
                 num_num_features=5, embedding_sizes=None):
        """Create every sub-network, freeze the encoder and move to the device.

        Args:
            text_model_name: Identifier passed to ``AutoModel`` / ``AutoTokenizer``.
            num_cat_features: Accepted but not read here; the number of embedding
                tables is taken from ``embedding_sizes``.
            num_num_features: Width of the numeric input vector.
            embedding_sizes: Iterable of ``(num_categories, embed_dim)`` pairs, one
                per categorical feature. Must be provided (it is iterated).
        """
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # --- text branch ---
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
        # The encoder is used as a fixed feature extractor: no gradients for it.
        for weight in self.text_encoder.parameters():
            weight.requires_grad = False
        pooled_width = self.text_encoder.config.hidden_size * 3
        self.text_proj = nn.Linear(pooled_width, 256)

        # --- categorical branch ---
        tables = []
        for num_categories, embed_dim in embedding_sizes:
            tables.append(nn.Embedding(num_categories, embed_dim))
        self.cat_embeds = nn.ModuleList(tables)
        cat_width = sum(table.embedding_dim for table in self.cat_embeds)
        self.cat_proj = nn.Linear(cat_width, 128)

        # --- numeric branch ---
        numeric_layers = [
            nn.Linear(num_num_features, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
        ]
        self.num_proj = nn.Sequential(*numeric_layers)

        # --- prediction head over the concatenated branches ---
        fused_width = 256 + 128 + 64
        head_layers = [
            nn.Linear(fused_width, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.predictor = nn.Sequential(*head_layers)
        self.to(self.device)

    def preprocess_text_batch(self, titles: List[str], overviews: List[str], taglines: List[str]) -> List[Dict]:
        """Tokenize the three text fields of a batch.

        Titles are truncated to 128 tokens, overviews to 256 and taglines to 64;
        each field is padded within the batch and moved to ``self.device``.

        Returns:
            ``[title_inputs, overview_inputs, tagline_inputs]``.
        """
        fields = ((titles, 128), (overviews, 256), (taglines, 64))
        tokenized = []
        with torch.no_grad():
            for texts, limit in fields:
                encoded = self.text_tokenizer(
                    texts, padding=True, truncation=True, max_length=limit,
                    return_tensors="pt"
                )
                tokenized.append(encoded.to(self.device))
        return tokenized

    def forward(self, text_inputs: List[Dict], cat_inputs: torch.Tensor, num_inputs: torch.Tensor) -> torch.Tensor:
        """Score a batch of items.

        Args:
            text_inputs: Tokenized fields, each passed to the encoder as keyword
                arguments.
            cat_inputs: Integer ids; column ``i`` is looked up in embedding table ``i``.
            num_inputs: Numeric features fed to the numeric projection.

        Returns:
            The prediction head's sigmoid output multiplied by 100.
        """
        # The encoder runs without gradient tracking; the projections do not.
        with torch.no_grad():
            pooled = [self.text_encoder(**field).pooler_output for field in text_inputs]

        text_out = self.text_proj(torch.cat(pooled, dim=-1))

        looked_up = []
        for column, table in enumerate(self.cat_embeds):
            looked_up.append(table(cat_inputs[:, column]))
        cat_out = self.cat_proj(torch.cat(looked_up, dim=-1))

        num_out = self.num_proj(num_inputs)

        fused = torch.cat([text_out, cat_out, num_out], dim=-1)
        return self.predictor(fused) * 100


class MovieRecommender:
    """Ranks catalog movies by the score ``DeepMatchNetMM`` assigns to each one.

    All catalog features are tokenized/tensorised once at construction time.

    NOTE(review): the ranking does not depend on the query movie. The model scores
    one movie at a time, and the original code computed the query's score only to
    discard it; the recorded outputs for two different queries are identical.
    Making the ranking query-aware needs a design decision (e.g. similarity
    between intermediate features) and is deliberately not attempted here.
    """

    def __init__(self, model_dir: str, batch_size: int = 32):
        """Load config, vocabularies, movie table and weights, then preprocess the catalog.

        Args:
            model_dir: Directory holding ``deep_match_config.json``,
                ``cat_vocab.json``, ``movie_metadata.pkl`` and ``deep_match_model.pt``.
            batch_size: Number of movies per tokenization / scoring batch.
        """
        import warnings  # local import: only used for the checkpoint sanity check

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size

        with open(os.path.join(model_dir, "deep_match_config.json")) as f:
            config = json.load(f)
        with open(os.path.join(model_dir, "cat_vocab.json")) as f:
            raw_vocab = json.load(f)
        # Lookups are done with str(value), so make every vocabulary key a string.
        self.cat_vocab = {k: {str(inner_k): v for inner_k, v in inner.items()}
                          for k, inner in raw_vocab.items()}
        self.movie_df = pd.read_pickle(os.path.join(model_dir, "movie_metadata.pkl"))

        self.text_cols = ['title', 'overview', 'tagline']
        self.cat_cols = ['original_language', 'status']
        # NOTE(review): the model's numeric input width comes from
        # config["num_num_features"]; it must equal len(self.num_cols).
        self.num_cols = ['runtime', 'vote_average', 'vote_count', 'popularity']

        self.model = DeepMatchNetMM(
            text_model_name=config["text_model_name"],
            num_cat_features=len(config["embedding_sizes"]),
            num_num_features=config["num_num_features"],
            embedding_sizes=config["embedding_sizes"]
        )

        # strict=False tolerates key mismatches, so report them instead of
        # silently running with partly initialised weights.
        state_dict = torch.load(os.path.join(model_dir, "deep_match_model.pt"), map_location=self.device)
        load_result = self.model.load_state_dict(state_dict, strict=False)
        if load_result.missing_keys or load_result.unexpected_keys:
            warnings.warn(
                "Checkpoint does not fully match DeepMatchNetMM: "
                f"{len(load_result.missing_keys)} missing and "
                f"{len(load_result.unexpected_keys)} unexpected keys; "
                "missing parameters keep their initial values.",
                RuntimeWarning,
            )
        self.model.eval()

        self._catalog_scores = None
        self._preprocess_all_movies()

    def _preprocess_all_movies(self):
        """Tensorise categorical/numeric columns and tokenize text for the whole catalog.

        Sets ``all_cat`` and ``all_num`` (one row per movie) and ``all_text_inputs``
        (one tokenized entry per batch of ``batch_size`` movies). Missing text is
        replaced by an empty string before tokenization. Any cached catalog scores
        are invalidated.
        """
        n_movies = len(self.movie_df)
        columns = self.movie_df.columns

        # Column-wise lookups: one pass per column instead of iterrows() per column.
        cat_data = []
        for col in self.cat_cols:
            vocab = self.cat_vocab[col]
            if col in columns:
                cat_data.append([vocab.get(str(value), 0)
                                 for value in self.movie_df[col].tolist()])
            else:
                cat_data.append([vocab.get('unknown', 0)] * n_movies)
        self.all_cat = torch.tensor(cat_data, dtype=torch.long).T.to(self.device)

        num_data = []
        for col in self.num_cols:
            if col in columns:
                num_data.append(self.movie_df[col].tolist())
            else:
                num_data.append([0] * n_movies)
        self.all_num = torch.tensor(num_data, dtype=torch.float).T.to(self.device)

        self.all_text_inputs = []
        for i in range(0, n_movies, self.batch_size):
            batch = self.movie_df.iloc[i:i+self.batch_size]
            # NaN/None text would make the tokenizer fail, so use '' instead.
            text_inputs = self.model.preprocess_text_batch(
                batch['title'].fillna('').tolist(),
                batch['overview'].fillna('').tolist(),
                batch['tagline'].fillna('').tolist()
            )
            self.all_text_inputs.append(text_inputs)

        self._catalog_scores = None

    def preprocess_query(self, movie: Dict) -> Tuple[List[Dict], torch.Tensor, torch.Tensor]:
        """Convert one movie dict into model inputs.

        Returns:
            ``(text_inputs, cat_inputs, num_inputs)`` with a batch dimension of one.
            Missing or ``None`` text fields are tokenized as empty strings.
        """
        cat_inputs = torch.tensor([[
            self.cat_vocab[col].get(str(movie.get(col, 'unknown')), 0)
            for col in self.cat_cols
        ]], dtype=torch.long).to(self.device)

        num_inputs = torch.tensor([[
            movie.get(col, 0) for col in self.num_cols
        ]], dtype=torch.float).to(self.device)

        text_inputs = self.model.preprocess_text_batch(
            [movie.get('title') or ''],
            [movie.get('overview') or ''],
            [movie.get('tagline') or '']
        )

        return text_inputs, cat_inputs, num_inputs

    def _score_catalog(self) -> np.ndarray:
        """Return the model score of every catalog movie as a 1-D array.

        The scores do not depend on the query, so they are computed once and reused.
        """
        cached = getattr(self, "_catalog_scores", None)
        if cached is not None:
            return cached

        batch_scores = []
        for i, text_inputs in enumerate(self.all_text_inputs):
            start = i * self.batch_size
            end = min(start + self.batch_size, len(self.movie_df))
            with torch.no_grad():
                out = self.model(text_inputs, self.all_cat[start:end], self.all_num[start:end])
            # reshape(-1) keeps a 1-D result even for a single-movie batch,
            # where squeeze() would collapse to a 0-d array.
            batch_scores.append(out.cpu().reshape(-1))

        if batch_scores:
            scores = torch.cat(batch_scores).numpy()
        else:
            scores = np.empty(0, dtype=np.float32)
        self._catalog_scores = scores
        return scores

    def recommend_movies(self, query_movie: Dict, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return the ``top_k`` highest-scoring catalog movies.

        Args:
            query_movie: Accepted for interface compatibility.
                NOTE(review): currently not used for ranking (see class docstring).
            top_k: Maximum number of results; values <= 0 yield an empty list and
                values above the catalog size return the whole catalog.

        Returns:
            ``(title, score)`` tuples, best first.
        """
        scores = self._score_catalog()

        # Clamp k: argpartition raises when k exceeds the array length, and a
        # slice of [-0:] would return every row.
        k = min(top_k, scores.shape[0])
        if k <= 0:
            return []

        top_indices = np.argpartition(scores, -k)[-k:]
        top_indices = top_indices[np.argsort(scores[top_indices])][::-1]

        titles = self.movie_df['title']
        return [(titles.iloc[idx], float(scores[idx])) for idx in top_indices]


if __name__ == "__main__":
    # Location of the exported model artefacts (config, vocab, metadata, weights).
    model_path = "/content/drive/MyDrive/DeepMatchModel"
    recommender = MovieRecommender(model_path)

    query_movie = {
        # Free-text fields
        'title': "Interstellar",
        'overview': "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.",
        'tagline': "Mankind was born on Earth. It was never meant to die here.",
        # Categorical fields
        'original_language': 'en',
        'status': 'Released',
        # Numeric fields
        'runtime': 169,
        'vote_average': 8.6,
        'vote_count': 15000,
        'popularity': 78,
    }

    top_movies = recommender.recommend_movies(query_movie, top_k=5)

    # One "<title>: <score>" line per hit, scores with two decimals.
    print("\nTop Recommendations:")
    for title, score in top_movies:
        print(f"{title}: {score:.2f}")


Top Recommendations:
Pokemon no Uchi Atsumaru?: 52.54
ReBroken: 51.50
Shogun's Ninja: 50.49
The Blue Elephant: Part III: 50.32
Sherlock Holmes and Doctor Watson: 50.18


In [22]:
# Record the NumPy version of the running kernel (environment check).
print(np.__version__)

2.0.2
