In [192]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from sklearn.decomposition import PCA
import requests
from plotly.offline import plot, iplot, init_notebook_mode
import json
load_dotenv()

True

In [187]:
embeddings = np.load("data/embeddings.npy")
pca = PCA(n_components=3).fit(embeddings)
pcad_embeddings = pca.transform(embeddings)

In [193]:
df = pd.DataFrame(pcad_embeddings)
metadata = pd.read_csv("data/metadata.csv")
df = pd.concat([df, metadata], axis=1, ignore_index=False)
df.head()

Unnamed: 0,0,1,2,titles,genres
0,-0.155683,0.005991,0.042268,Orlando,"['Drama', 'Fantasy', 'Romance']"
1,0.015299,-0.146868,-0.085495,Small Soldiers,"['Action', 'Adventure', 'Comedy']"
2,-0.01363,-0.06008,0.193336,Lone Rider,['Western']
3,0.05699,-0.039466,-0.058687,High Noon,"['Drama', 'Thriller', 'Western']"
4,0.007569,-0.106919,-0.084748,Devil's Playground,"['Action', 'Horror']"


In [199]:
df['genres'][0].replace("'", '"')

'["Drama", "Fantasy", "Romance"]'

In [200]:
json.loads(df['genres'][0].replace("'", '"'))

['Drama', 'Fantasy', 'Romance']

In [201]:
df['genres'] = df['genres'].apply(lambda x: json.loads(x.replace("'", '"')))

# Step 1: Get all unique strings from the 'tags' column
unique_genres = set(genre for sublist in df['genres'] for genre in sublist)

# Step 2: Expand the 'tags' column into new columns
for tag in unique_genres:
    df[tag] = df['genres'].apply(lambda x: 1 if tag in x else 0)

# Step 3: Drop the original 'tags' column if you don't need it
df = df.drop(columns=['genres'])

In [202]:
df.head()

Unnamed: 0,0,1,2,titles,Film-Noir,Biography,Crime,Romance,Animation,Horror,...,History,Western,Family,War,Thriller,Documentary,Musical,Fantasy,Short,Mystery
0,-0.155683,0.005991,0.042268,Orlando,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.015299,-0.146868,-0.085495,Small Soldiers,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.01363,-0.06008,0.193336,Lone Rider,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0.05699,-0.039466,-0.058687,High Noon,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,0.007569,-0.106919,-0.084748,Devil's Playground,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [203]:
print(unique_genres)

{'Film-Noir', 'Biography', 'Crime', 'Romance', 'Animation', 'Horror', 'Comedy', 'Sci-Fi', 'Drama', 'Adventure', 'Action', 'Sport', 'History', 'Western', 'Family', 'War', 'Thriller', 'Documentary', 'Musical', 'Fantasy', 'Short', 'Mystery'}


### Get the embeddings for a custom query to compare

In [179]:
# # Fetch query embedding
# query = "What movies take place in italy?"

# url = os.getenv('OPEN_AI_URL')
# headers = {
#             "Content-Type": "application/json",
#             "api-key": os.getenv('API_KEY')
#         }
# repsonse = requests.post(url, headers=headers, json={"input": query}).json()
# query_embedding = repsonse['data'][0]['embedding']
# query_embedding_pcad = pca.transform(np.array(query_embedding).reshape(1, -1))[0] * n

In [210]:
genre_to_color = "Fantasy"

fig = px.scatter_3d(df, x=0, y=1, z=2, hover_data="titles", color=genre_to_color)
fig.add_scatter3d(
    x=[0], y=[0], z=[0], marker=dict(color="orange"), name="origin (0,0,0)"
)
# fig.add_scatter3d(
#     x=[query_embedding_pcad[0]],
#     y=[query_embedding_pcad[1]],
#     z=[query_embedding_pcad[2]],
#     marker=dict(color="#16FF32"),
#     name="query",
# )

## Normalize the vectors to visualize how the Cosine Similairty function might find vectors close to it

In [205]:
# Normalize the vectors 
# Step 1: Compute the magnitude (norm) for each row (vector)
df['magnitude'] = np.linalg.norm(df[[0, 1, 2]], axis=1)

# Step 2: Normalize the x, y, z columns by dividing each by the magnitude
df_normalized = df[[0, 1, 2]].div(df['magnitude'], axis=0)

df_normalized = pd.concat([df_normalized, df[df.columns[3:]]], axis=1)

# # Normalize the query vector
# query_magnitude = np.linalg.norm(query_embedding_pcad)
# query_normalized = query_embedding_pcad / query_magnitude

In [207]:
genre_to_color = "Fantasy"

fig = px.scatter_3d(df_normalized, x=0, y=1, z=2, hover_data="titles", color=genre_to_color)
fig.add_scatter3d(
    x=[0], y=[0], z=[0], marker=dict(color="orange"), name="origin (0,0,0)"
)
# fig.add_scatter3d(
#     x=[query_normalized[0]],
#     y=[query_normalized[1]],
#     z=[query_normalized[2]],
#     marker=dict(color="#16FF32"),
#     name="query",
# )