In [7]:
import pandas as pd
import ast
import os
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch

In [8]:
# BigQuery client bound to the project that hosts the movies dataset queried below.
PROJECT_ID = "virtualization-and-cloud"
client = bigquery.Client(project=PROJECT_ID)

In [9]:
# Fetch every column for movies that have both an overview and a title.
query = """
SELECT * FROM `virtualization-and-cloud.movies.movies-metadata` WHERE overview IS NOT NULL AND title IS NOT NULL
"""

# Submit the query, then materialise its result set as a pandas DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

In [10]:
# Use the GPU when torch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [11]:
# Show which device string ('cuda' or 'cpu') was selected for the model.
print(device)

cpu


In [13]:
def extract_genre_names(genre_list_str):
    """Return the genre names of one `genres` cell as a space-separated string.

    Args:
        genre_list_str: Either a string holding a Python-literal list of
            genre dicts (e.g. "[{'id': 18, 'name': 'Drama'}]") or an
            already-parsed iterable of such dicts.

    Returns:
        The `name` values joined by single spaces, in their original order.
        Returns '' when the value is missing, cannot be parsed, or is not
        iterable. Entries without a non-empty string `name` are skipped, so
        one malformed entry neither raises nor discards the row's valid genres.
    """
    try:
        genre_list = ast.literal_eval(genre_list_str) if isinstance(genre_list_str, str) else genre_list_str
        entries = iter(genre_list)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable text, or a non-iterable value such as None / NaN.
        return ''

    names = []
    for entry in entries:
        try:
            name = entry['name']
        except (KeyError, TypeError, IndexError):
            # Entry is not a mapping, or has no 'name' key: ignore just this entry.
            continue
        if isinstance(name, str) and name:
            names.append(name)
    return ' '.join(names)

# Flatten each row's genre payload into a plain string of genre names.
df['genre_text'] = df['genres'].apply(extract_genre_names)

# Text fed to the encoder: overview, tagline and genre names, space-separated.
# Missing pieces are replaced by empty strings before concatenating.
overview_part = df['overview'].fillna('')
tagline_part = df['tagline'].fillna('')
genre_part = df['genre_text'].fillna('')
df['text'] = overview_part + ' ' + tagline_part + ' ' + genre_part

# Sentence-embedding model, placed on the device selected earlier.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [14]:
# ✅ Generate embeddings
# Encode all texts with a single batched `encode` call instead of one call per
# row: the model then processes many texts per forward pass, which is far
# faster than a per-row `.apply`.
print("Encoding texts...")
text_embeddings = model.encode(
    df['text'].tolist(),
    device=device,
    batch_size=64,  # texts per forward pass; lower this if memory is tight
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Store one 1-D vector per row (assigned positionally) so later cells can keep
# reading df['embedding'] exactly as before.
df['embedding'] = list(text_embeddings)

Encoding texts...


In [15]:
# ✅ Build cosine similarity matrix
# Entry [i, j] is the cosine similarity between the embeddings of rows i and j.
# The result is a dense (n_rows x n_rows) array, so memory grows quadratically
# with the number of movies.
print("Building similarity matrix...")
embeddings = df['embedding'].tolist()
similarity_matrix = cosine_similarity(embeddings)

Building similarity matrix...


In [16]:
# ✅ Create a recommendation function
def recommend_movies(title, top_k=5, data=None, similarities=None):
    """Recommend the movies most similar to `title`.

    Args:
        title: Movie title to look up; matched case-insensitively against the
            `title` column. If several rows share the title, the first is used.
        top_k: Maximum number of recommendations to return.
        data: DataFrame with `title`, `overview` and `genre_text` columns.
            Defaults to the notebook-level `df`.
        similarities: Square array-like in which row i holds the similarity of
            row i of `data` to every row of `data`. Defaults to the
            notebook-level `similarity_matrix`.

    Returns:
        A DataFrame (`title`, `overview`, `genre_text`) with up to `top_k`
        rows, most similar first, never containing the query row itself; or
        the string "Movie '<title>' not found." when no title matches.
    """
    import numpy as np  # local import: numpy is only imported further down the notebook

    frame = df if data is None else data
    sims = similarity_matrix if similarities is None else similarities

    # Work with row *positions*, not index labels: the similarity matrix and
    # `.iloc` are both positional, so labels are only correct for a 0..n-1 index.
    is_match = (frame['title'].str.lower() == title.lower()).to_numpy()
    match_positions = np.flatnonzero(is_match)
    if match_positions.size == 0:
        return f"Movie '{title}' not found."
    idx = int(match_positions[0])

    # Rank all rows by descending similarity; the stable sort keeps tied
    # scores in row order.
    scores = np.asarray(sims[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly instead of assuming it sorts first: with
    # tied scores (e.g. duplicate texts) another row can come before it.
    ranked = ranked[ranked != idx][:max(top_k, 0)]

    return frame.iloc[ranked][['title', 'overview', 'genre_text']]

# ✅ Try it out!
# Top 3 recommendations for "Ariel". recommend_movies returns a message string
# rather than a table when the title is not found, so print handles both cases.
print(recommend_movies("Ariel", top_k=3))

                           title  \
22397               Land of Love   
18292            Boy Upside Down   
45850  I Hired a Contract Killer   

                                                overview  \
22397  A comedy of a Finnish man who falls in love, g...   
18292  A Finnish dramedy about an 11-year-old boy dea...   
45850  After losing his job and realizing that he is ...   

                       genre_text  
22397              Comedy Romance  
18292                Drama Comedy  
45850  Comedy Crime Drama Romance  


In [18]:
# Top 5 recommendations for "Toy Story" (title matching is case-insensitive).
print(recommend_movies("Toy Story", top_k=5))

                                                   title  \
75034                                        Toy Story 2   
75633                                        Toy Story 3   
56971  Buzz Lightyear of Star Command: The Adventure ...   
82451                                        Toy Story 4   
17642                                      A Goofy Movie   

                                                overview  \
75034  Andy heads off to Cowboy Camp, leaving his toy...   
75633  Woody, Buzz, and the rest of Andy's toys haven...   
56971  Buzz Lightyear must battle Emperor Zurg with t...   
82451  Woody has always been confident about his plac...   
17642  An endearing modern-day story about how the lo...   

                                              genre_text  
75034                            Animation Comedy Family  
75633                            Animation Family Comedy  
56971  Animation Family Comedy Science Fiction Adventure  
82451          Family Adventure Animation 

In [19]:
# Persist the whole DataFrame, including the derived `genre_text`, `text` and
# `embedding` columns, so it can be reloaded without re-running the encoder.
# NOTE: pickle files should only ever be loaded from trusted sources.
df.to_pickle("movies_nlp_df.pkl")

In [20]:
import numpy as np

# Stack the per-row embedding vectors into a single 2-D array (one row per
# DataFrame row) and write it to disk in .npy format.
embedding_rows = df['embedding'].tolist()
embeddings = np.vstack(embedding_rows)
np.save("embeddings.npy", embeddings)

In [21]:
# Write the pairwise cosine-similarity matrix to disk in .npy format.
np.save("nlp_similarity_matrix.npy", similarity_matrix)

In [22]:
!pip install faiss-cpu

Defaulting to user installation because normal site-packages is not writeable
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-win_amd64.whl (13.7 MB)
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
   -- ------------------------------------- 0.8/13.7 MB 11.4 MB/s eta 0:00:02
   ------- -------------------------------- 2.6/13.7 MB 8.9 MB/s eta 0:00:02
   ------------ --------------------------- 4.2/13.7 MB 8.7 MB/s eta 0:00:02
   --------------- ------------------------ 5.2/13.7 MB 8.0 MB/s eta 0:00:02
   ---------------------- ----------------- 7.6/13.7 MB 7.8 MB/s eta 0:00:01
   --------------------------- ------------ 9.4/13.7 MB 7.9 MB/s eta 0:00:01
   -------------------------------- ------- 11.3/13.7 MB 8.0 MB/s eta 0:00:01
   ------------------------------------- -- 12.8/13.7 MB 8.1 MB/s eta 0:00:01
   ---------------------------------------- 13.7/13.7 MB 8.0 MB/s eta 0:

In [23]:
import faiss

# FAISS expects C-contiguous float32 vectors. `astype` returns a copy by
# default, so the in-place normalisation below leaves `embeddings` untouched.
index_vectors = embeddings.astype('float32', order='C')

# IndexFlatIP ranks by raw inner product, which equals cosine similarity only
# for unit-length vectors. Normalise so the index agrees with the cosine
# similarity used for the recommendations above.
# NOTE: query vectors must be L2-normalised the same way before searching.
faiss.normalize_L2(index_vectors)

index = faiss.IndexFlatIP(index_vectors.shape[1])
index.add(index_vectors)
faiss.write_index(index, "faiss.index")