In [146]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from sklearn.decomposition import PCA
import requests
from plotly.offline import plot, iplot, init_notebook_mode

load_dotenv()

True

In [147]:
client = MongoClient(os.getenv("MONGO_URI"), server_api=ServerApi("1"))

db_name = "sample_mflix"
collection_name = "embedded_movies"

collection = client[db_name][collection_name]

pipeline = [
    {"$match": {"plot_embedding": {"$exists": 1}, "title": {"$exists": 1}, "genres": {"$exists": 1}}},
    {"$sample": {"size": 300}},
    {"$project": {"plot_embedding": 1, "title": 1, "genres": 1}},
]

In [148]:
embeddings = []
titles = []
genres = []
for doc in collection.aggregate(pipeline):
    embeddings.append(doc["plot_embedding"])
    titles.append(doc["title"])
    genres.append(doc["genres"])

In [149]:
embeddings = np.array(embeddings)
pca = PCA(n_components=3).fit(embeddings)
pcad_embeddings = pca.transform(embeddings)

In [150]:
df = pd.DataFrame(pcad_embeddings)
df['title'] = titles
df['genres'] = genres


In [151]:
# Step 1: Get all unique strings from the 'tags' column
unique_genres = set(genre for sublist in df['genres'] for genre in sublist)

# Step 2: Expand the 'tags' column into new columns
for tag in unique_genres:
    df[tag] = df['genres'].apply(lambda x: 1 if tag in x else 0)

# Step 3: Drop the original 'tags' column if you don't need it
df = df.drop(columns=['genres'])

In [152]:
df.head()

Unnamed: 0,0,1,2,title,Biography,Crime,Romance,Animation,Horror,Music,...,History,Western,Family,War,Thriller,Documentary,Musical,Fantasy,Short,Mystery
0,-0.00984,-0.100139,-0.035375,2000 AD,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,-0.160988,0.048163,0.034459,Hercules,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.069607,0.050623,0.052116,"Hey, Happy!",0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,-0.021946,0.035216,-0.073218,Ip Man,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.044506,-0.1234,-0.119202,Assembly,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [153]:
# Multiply the embedding values to exaggerate the differences between them
n = 10
df[0] = df[0] * n
df[1] = df[1] * n
df[2] = df[2] * n

In [154]:
df.head()

Unnamed: 0,0,1,2,title,Biography,Crime,Romance,Animation,Horror,Music,...,History,Western,Family,War,Thriller,Documentary,Musical,Fantasy,Short,Mystery
0,-0.098404,-1.001392,-0.35375,2000 AD,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,-1.609879,0.481632,0.344586,Hercules,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.696073,0.50623,0.521165,"Hey, Happy!",0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,-0.219463,0.352155,-0.732178,Ip Man,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.445062,-1.234004,-1.192016,Assembly,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [155]:
print(unique_genres)

{'Biography', 'Crime', 'Romance', 'Animation', 'Horror', 'Music', 'Comedy', 'Sci-Fi', 'Drama', 'Adventure', 'Action', 'Sport', 'History', 'Western', 'Family', 'War', 'Thriller', 'Documentary', 'Musical', 'Fantasy', 'Short', 'Mystery'}


In [156]:
# Fetch query embedding
query = "What movies take place in italy?"

url = "https://iltvectorsearch.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-05-15"
headers = {
            "Content-Type": "application/json",
            "api-key": os.getenv('MDB_API_KEY')
        }
repsonse = requests.post(url, headers=headers, json={"input": query}).json()
query_embedding = repsonse['data'][0]['embedding']

In [157]:
query_embedding_pcad = pca.transform(np.array(query_embedding).reshape(1, -1))[0] * n

In [158]:
query_embedding_pcad

array([-0.31507605,  0.33755311, -1.00439486])

In [159]:
fig = px.scatter_3d(df, x=0, y=1, z=2, hover_data="title", color="Short")
fig.add_scatter3d(
    x=[0], y=[0], z=[0], marker=dict(color="orange"), name="origin (0,0,0)"
)
fig.add_scatter3d(
    x=[query_embedding_pcad[0]],
    y=[query_embedding_pcad[1]],
    z=[query_embedding_pcad[2]],
    marker=dict(color="#16FF32"),
    name="query",
)

## Normalize the vectors to visualize how the Cosine Similairty function might find vectors close to it

In [160]:
# Normalize the vectors 
# Step 1: Compute the magnitude (norm) for each row (vector)
df['magnitude'] = np.linalg.norm(df[[0, 1, 2]], axis=1)

# Step 2: Normalize the x, y, z columns by dividing each by the magnitude
df_normalized = df[[0, 1, 2]].div(df['magnitude'], axis=0)

df_normalized = pd.concat([df_normalized, df[df.columns[3:]]], axis=1)

# Normalize the query vector
query_magnitude = np.linalg.norm(query_embedding_pcad)
query_normalized = query_embedding_pcad / query_magnitude

In [161]:
genre_to_color = "Fantasy"

fig = px.scatter_3d(df_normalized, x=0, y=1, z=2, hover_data="title", color=genre_to_color)
fig.add_scatter3d(
    x=[0], y=[0], z=[0], marker=dict(color="orange"), name="origin (0,0,0)"
)
fig.add_scatter3d(
    x=[query_normalized[0]],
    y=[query_normalized[1]],
    z=[query_normalized[2]],
    marker=dict(color="#16FF32"),
    name="query",
)