In [98]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from sklearn.decomposition import PCA
import requests

load_dotenv()

True

In [99]:
client = MongoClient(os.getenv("MONGO_URI"), server_api=ServerApi("1"))

db_name = "sample_mflix"
collection_name = "embedded_movies"

collection = client[db_name][collection_name]

pipeline = [
    {"$match": {"plot_embedding": {"$exists": 1}, "title": {"$exists": 1}, "genres": {"$exists": 1}}},
    {"$sample": {"size": 300}},
    {"$project": {"plot_embedding": 1, "title": 1, "genres": 1}},
]

In [100]:
embeddings = []
titles = []
genres = []
for doc in collection.aggregate(pipeline):
    embeddings.append(doc["plot_embedding"])
    titles.append(doc["title"])
    genres.append(doc["genres"])

In [101]:
embeddings = np.array(embeddings)
pca = PCA(n_components=3).fit(embeddings)
pcad_embeddings = pca.transform(embeddings)

In [102]:
df = pd.DataFrame(pcad_embeddings)
df['title'] = titles
df['genres'] = genres


In [103]:
# Step 1: Get all unique strings from the 'tags' column
unique_genres = set(genre for sublist in df['genres'] for genre in sublist)

# Step 2: Expand the 'tags' column into new columns
for tag in unique_genres:
    df[tag] = df['genres'].apply(lambda x: 1 if tag in x else 0)

# Step 3: Drop the original 'tags' column if you don't need it
df = df.drop(columns=['genres'])

In [104]:
df.head()

Unnamed: 0,0,1,2,title,Biography,Crime,Romance,Animation,Horror,Music,...,History,Western,Family,War,Thriller,Documentary,Musical,Fantasy,Short,Mystery
0,0.066961,0.067713,-0.053641,Detective Dee: Mystery of the Phantom Flame,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.10901,-0.164084,-0.111305,Don't Look Under the Bed,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,-0.15967,-0.064904,0.011324,Peter Pan,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,0.005492,-0.130533,-0.007381,Eagle Eye,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,0.16367,0.008592,-0.053351,Fulltime Killer,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
# Multiply the embedding values to exaggerate the differences between them
n = 10
df[0] = df[0] * n
df[1] = df[1] * n
df[2] = df[2] * n

In [106]:
df.head()

Unnamed: 0,0,1,2,title,Biography,Crime,Romance,Animation,Horror,Music,...,History,Western,Family,War,Thriller,Documentary,Musical,Fantasy,Short,Mystery
0,0.133923,0.135427,-0.107281,Detective Dee: Mystery of the Phantom Flame,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.21802,-0.328168,-0.222609,Don't Look Under the Bed,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,-0.319341,-0.129809,0.022648,Peter Pan,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,0.010985,-0.261066,-0.014762,Eagle Eye,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,0.32734,0.017185,-0.106702,Fulltime Killer,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
print(unique_genres)

{'Biography', 'Crime', 'Romance', 'Animation', 'Horror', 'Music', 'Comedy', 'Sci-Fi', 'Drama', 'Adventure', 'Action', 'History', 'Western', 'Family', 'War', 'Thriller', 'Documentary', 'Musical', 'Fantasy', 'Short', 'Mystery'}


In [108]:
# Fetch query embedding
query = "What movies take place in italy?"

url = "https://iltvectorsearch.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-05-15"
headers = {
            "Content-Type": "application/json",
            "api-key": os.getenv('MDB_API_KEY')
        }
repsonse = requests.post(url, headers=headers, json={"input": query}).json()
query_embedding = repsonse['data'][0]['embedding']

In [120]:
query_embedding_pcad = pca.transform(np.array(query_embedding).reshape(1, -1))[0] * n

In [121]:
query_embedding_pcad

array([-0.11959344,  0.14666784,  0.61419797])

In [122]:
fig = px.scatter_3d(df, x=0, y=1, z=2, hover_data="title", color="Short")
fig.add_scatter3d(
    x=[0], y=[0], z=[0], marker=dict(color="orange"), name="origin (0,0,0)"
)
fig.add_scatter3d(
    x=[query_embedding_pcad[0]],
    y=[query_embedding_pcad[1]],
    z=[query_embedding_pcad[2]],
    marker=dict(color="#16FF32"),
    name="query",
)

## Normalize the vectors to visualize how the Cosine Similairty function might find vectors close to it

In [137]:
# Normalize the vectors 
# Step 1: Compute the magnitude (norm) for each row (vector)
df['magnitude'] = np.linalg.norm(df[[0, 1, 2]], axis=1)

# Step 2: Normalize the x, y, z columns by dividing each by the magnitude
df_normalized = df[[0, 1, 2]].div(df['magnitude'], axis=0)

df_normalized = pd.concat([df_normalized, df[df.columns[3:]]], axis=1)

# Normalize the query vector
query_magnitude = np.linalg.norm(query_embedding_pcad)
query_normalized = query_embedding_pcad / query_magnitude

In [144]:
genre_to_color = "Fantasy"

fig = px.scatter_3d(df_normalized, x=0, y=1, z=2, hover_data="title", color=genre_to_color)
fig.add_scatter3d(
    x=[0], y=[0], z=[0], marker=dict(color="orange"), name="origin (0,0,0)"
)
fig.add_scatter3d(
    x=[query_normalized[0]],
    y=[query_normalized[1]],
    z=[query_normalized[2]],
    marker=dict(color="#16FF32"),
    name="query",
)