In [None]:
import numpy as np
import pandas as pd
import yaml

# Load the notebook configuration. The context manager closes the file handle
# deterministically, and the explicit encoding avoids locale-dependent decoding.
with open("../config.yml", encoding="utf-8") as config_file:
    CONFIG = yaml.safe_load(config_file)

from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model settings are all read from the loaded configuration.
model_name = CONFIG["EmbeddingModel"]
model_kwargs = dict(device=CONFIG["EmbeddingDevice"])  # e.g. "cuda"
encode_kwargs = dict(normalize_embeddings=CONFIG["EmbeddingNormalized"])

# Shared embedding model instance used by the cells below.
model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

# Movies vectorstore

In [None]:
def create_movie_collection(model: HuggingFaceEmbeddings):
    """Embed every movie plot and store it in the persistent "movies" Chroma collection.

    Reads the CSV at ``CONFIG['MovieCsv']``, keeps a fixed set of columns, and
    upserts the rows in batches: the ``Plot`` text is both the stored document
    and the text that gets embedded, the remaining columns become metadata,
    and ``movieId`` (as a string) is the record id.

    Args:
        model: Embedding model; only its ``embed_documents`` method is used.
    """
    from tqdm.auto import tqdm
    import chromadb

    client = chromadb.PersistentClient(path=f"../{CONFIG['ChromaDBPath']}")
    # get_or_create keeps this function re-runnable: plain create_collection
    # raises once the collection exists, while the upserts below are already
    # safe to repeat.
    movie_collection = client.get_or_create_collection("movies", metadata={"hnsw:space": "cosine"})

    # movies data preprocessing
    COLS = ["movieId", "Title", "Release Year", "Origin/Ethnicity", "Director", "Cast", "genres", "Wiki Page", "Plot"]
    # .copy() so the column assignments below act on an independent frame
    # rather than on a selection of the frame returned by read_csv.
    movie_df = pd.read_csv(f"../{CONFIG['MovieCsv']}")[COLS].copy()
    movie_df['movieId'] = movie_df['movieId'].astype(int).astype(str)
    movie_df['Cast'] = movie_df['Cast'].fillna('Unknown')

    # upsert in batches of 64 rows
    batch_size = 64

    for start in tqdm(range(0, len(movie_df), batch_size)):
        # iloc clips the slice at the end of the frame, so the last batch may be shorter
        batch = movie_df.iloc[start:start + batch_size]

        # the plot text is embedded and also stored as the document
        docs = batch["Plot"].tolist()

        movie_collection.upsert(
            ids=batch["movieId"].tolist(),
            embeddings=model.embed_documents(docs),
            metadatas=batch.drop(columns="Plot").to_dict(orient='records'),
            documents=docs,
        )

In [None]:
# Build the persistent "movies" collection from the movie CSV using the embedding model loaded above.
create_movie_collection(model)

# Users vectorstore

In [None]:
def calculate_average_embedding(movieIds: list[str], timestamps: list[int], movie_collection, decay_factor: float = 0.1):
    """Return the recency-weighted average of the stored embeddings of ``movieIds``.

    Each movie gets the weight ``exp(decay_factor * (timestamp - newest_timestamp))``
    and the weights are normalized to sum to 1, so the most recent entry counts
    the most.

    Args:
        movieIds: Ids to look up in ``movie_collection``; paired element-wise
            with ``timestamps``.
        timestamps: One timestamp per entry of ``movieIds``.
        movie_collection: Collection object whose ``get(ids=..., include=["embeddings"])``
            returns a mapping with ``"ids"`` and ``"embeddings"``.
        decay_factor: Exponential decay rate per timestamp unit.
            NOTE(review): if timestamps are Unix seconds, 0.1 per unit makes
            everything but the newest entries negligible -- confirm the
            intended unit.

    Returns:
        1-D ``numpy.ndarray`` with the weighted average embedding.

    Raises:
        ValueError: If none of ``movieIds`` has an embedding in the collection
            (including when the input is empty).
    """
    # Request each id once; the same movie may appear several times in the input.
    unique_ids = list(dict.fromkeys(movieIds))
    result = movie_collection.get(ids=unique_ids, include=["embeddings"])

    # Pair embeddings with the ids reported by the collection instead of
    # assuming the result has one row per requested id in request order.
    embedding_by_id = dict(zip(result["ids"], result["embeddings"]))

    # Keep only (embedding, timestamp) pairs whose movie is actually stored,
    # so weights and vectors always line up.
    pairs = [
        (embedding_by_id[movie_id], timestamp)
        for movie_id, timestamp in zip(movieIds, timestamps)
        if movie_id in embedding_by_id
    ]
    if not pairs:
        raise ValueError("no embeddings found for the given movieIds")

    vectors = np.asarray([vector for vector, _ in pairs], dtype=float)  # (n_movies, dim)
    times = np.asarray([timestamp for _, timestamp in pairs], dtype=float)

    # Shifting by the newest timestamp keeps the exponent <= 0 (no overflow);
    # the newest entry always has weight exp(0) = 1 before normalization.
    weights = np.exp(decay_factor * (times - times.max()))
    weights = weights / weights.sum()

    # (n_movies,) @ (n_movies, dim) -> (dim,): a true weighted sum of the rows.
    # The earlier reshape(dim, -1) reinterpreted the buffer instead of
    # transposing it, mixing components of different movies.
    return weights @ vectors
    
def create_user_collection(model: HuggingFaceEmbeddings):
    """Build one embedding per user and upsert it into the persistent "users" collection.

    Reads the CSV at ``CONFIG['UserCsv']``, groups its rows by ``userId``, and
    for every user combines the stored embeddings of that user's movies with
    ``calculate_average_embedding``. The "movies" collection must already
    exist because it is opened with ``get_collection``.

    Args:
        model: Not used in this function; kept so the call signature mirrors
            ``create_movie_collection``.
    """
    from tqdm.auto import tqdm
    import chromadb

    # preprocess user data; astype returns a new frame, so no column is
    # assigned onto a selection of the frame returned by read_csv
    COLS = ["userId", "movieId", "timestamp"]
    users_df = pd.read_csv(f"../{CONFIG['UserCsv']}")[COLS].astype({"userId": str, "movieId": str})

    # load chromadb client and collections
    client = chromadb.PersistentClient(path=f"../{CONFIG['ChromaDBPath']}")
    movie_collection = client.get_collection("movies")
    user_collection = client.get_or_create_collection("users", metadata={"hnsw:space": "cosine"})

    # One groupby pass instead of re-filtering the whole frame for every user.
    # sort=False keeps users in order of first appearance, like unique() did.
    user_groups = users_df.groupby("userId", sort=False)

    for userId, user_data in tqdm(user_groups, total=user_groups.ngroups):

        # all (movieId, timestamp) pairs of this user, in file order
        timestamps = user_data["timestamp"].tolist()
        movieIds = user_data["movieId"].tolist()

        # recency-weighted average of the corresponding movie embeddings
        embedding = calculate_average_embedding(movieIds, timestamps, movie_collection).tolist()

        # Both lists are stored as str(list): str() of a NumPy array elides
        # long arrays with "..." and would silently drop timestamps.
        meta = {"userId": userId,
                "movieIds": str(movieIds),
                "timestamps": str(timestamps)
               }

        user_collection.upsert(ids=userId, embeddings=embedding, metadatas=meta)

In [None]:
# Build the "users" collection; this opens the existing "movies" collection, so the movie cell must have run first.
create_user_collection(model)