# Save to ChromaDB with just embedding-id pairs

Reduces space usage by ~3x by pre-computing the embeddings and storing only those, rather than the documents themselves, since the documents are already stored in the Azure DB anyway.

By adding in comp585_movies.csv (from Kaggle) and azure_movies_oct22_filtered.csv (from Azure movies dump on that date), we have a total of 23,909 movies in ChromaDB already (out of ~27K total).

In [21]:
import chromadb
from chromadb.utils import embedding_functions
import json
import pandas as pd

In [22]:
# Open the on-disk ChromaDB store and fetch (or create) the "movies"
# collection, configured with the cosine HNSW space.
client = chromadb.PersistentClient(path="../inference/chromadb_test2")
movies = client.get_or_create_collection(
    name="movies",
    metadata={"hnsw:space": "cosine"},
)

In [23]:
# Show how many entries the collection currently holds (cell output below).
movies.count()

0

In [24]:
# Read the Kaggle export and keep a single row per movie_id.
df = pd.read_csv("./kaggle_dataset/comp585_movies_final.csv").drop_duplicates(
    subset=['movie_id']
)

In [25]:
# Columns kept for the embedding text; a row missing any of them is dropped.
cols_to_check = [
    'movie_id',
    'overview',
    'title',
    'genres',
    'production_companies',
    'production_countries',
    'spoken_languages',
]
df_cleaned = df[cols_to_check].dropna(subset=cols_to_check)
def convert(text):
    """Extract the ``'name'`` values from a serialized list of dicts.

    Args:
        text: Either a list of dicts, or a string holding such a list written
            as JSON or as a Python literal (e.g. ``"[{'id': 1, 'name': 'x'}]"``).

    Returns:
        The ``'name'`` value of every dict in the list, in order. An empty
        list is returned when the input cannot be parsed, is not a list, or
        is not a string/list at all.
    """
    import ast  # local import so the cell does not depend on the import cell

    if isinstance(text, list):
        parsed = text
    elif isinstance(text, str):
        # Try the strict parsers first. Blindly swapping ' for " (the last
        # parser) corrupts any value that itself contains an apostrophe or a
        # double quote, so it is kept only as a fallback for inputs that mix
        # single-quoted keys with JSON literals such as true/null.
        parsers = (
            json.loads,
            ast.literal_eval,
            lambda raw: json.loads(raw.replace("'", "\"")),
        )
        for parse in parsers:
            try:
                parsed = parse(text)
            except (ValueError, SyntaxError):
                continue
            break
        else:
            return []
    else:
        return []

    if not isinstance(parsed, list):
        return []
    # Skip malformed entries instead of failing the whole row.
    return [
        item['name']
        for item in parsed
        if isinstance(item, dict) and 'name' in item
    ]
# Turn each serialized list-of-dicts column into a plain list of names.
for _col in ('genres', 'production_countries', 'production_companies', 'spoken_languages'):
    df_cleaned[_col] = df_cleaned[_col].apply(convert)

# Remove inner spaces so each multi-word name becomes a single token.
# spoken_languages is left as-is, matching the original processing.
for _col in ('genres', 'production_countries', 'production_companies'):
    df_cleaned[_col] = df_cleaned[_col].apply(
        lambda names: [name.replace(' ', '') for name in names]
    )

# Lower-case the overview and split it into a list of words; anything that is
# not a string becomes an empty list.
lowered_overview = df_cleaned['overview'].str.lower()
df_cleaned['overview'] = lowered_overview.apply(
    lambda value: value.split() if isinstance(value, str) else []
)

In [26]:
# Join a list of strings with single spaces; pass anything else through.
def list_to_string(lst):
    """Return ``lst`` joined by spaces if it is a list, else ``lst`` unchanged."""
    return ' '.join(lst) if isinstance(lst, list) else lst

# Apply the function to every cell of the DataFrame, one column at a time.
# Series.map is used instead of DataFrame.applymap because applymap is
# deprecated in recent pandas releases; the per-column form behaves the same
# on both old and new versions.
df_cleaned = df_cleaned.apply(lambda column: column.map(list_to_string))

  df_cleaned = df_cleaned.applymap(list_to_string)


In [27]:
# Build one space-separated text per movie: the overview followed by the
# genre, country, company and language names.
_tags = df_cleaned['overview']
for _col in ('genres', 'production_countries', 'production_companies', 'spoken_languages'):
    _tags = _tags + " " + df_cleaned[_col]
df_cleaned['tags'] = _tags

# --- TEXT PROCESSING ---
movie_combined = df_cleaned[['movie_id', 'tags']]
# Texts to embed and their ids, both in row order.
jsons = df_cleaned['tags'].tolist()
ids = df_cleaned['movie_id'].tolist()

In [28]:
# Get embeddings for all
# The embedding function is called once on the full list of tag strings; the
# result is what gets stored in ChromaDB further down, in place of the texts.
default_ef = embedding_functions.DefaultEmbeddingFunction()
embeddings = default_ef(jsons)

In [29]:
# Number of ids (one per remaining movie row) about to be stored.
len(ids)

17491

In [30]:
# Store all movies to ChromaDB with just embeddings and ids, no documents.
# Insert in fixed-size batches rather than one call: ChromaDB limits how many
# records a single add() may carry, and that limit varies between builds
# (NOTE(review): 5000 is chosen to stay under the smallest limit commonly
# reported; confirm against the deployed ChromaDB version).
ADD_BATCH_SIZE = 5000
for _start in range(0, len(ids), ADD_BATCH_SIZE):
    _stop = _start + ADD_BATCH_SIZE
    movies.add(
        embeddings=embeddings[_start:_stop],
        ids=ids[_start:_stop],
    )