In [1]:
import json
import os
from tqdm import tqdm


# Chroma database
import chromadb

# Llamaindex framework
from llama_index.core import Document
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

from utils.read_config import read_config

# Create prompt to fit to embedding
def get_document_string(df_current_movie):
    embedding_string = f'''Embedding this movie with the information:
- Title: {df_current_movie["title"]}.
- Release Year: {df_current_movie["year"]}.
- Country: {df_current_movie["country"]}.
- Genre: {df_current_movie["genre"]}.
- Duration: {df_current_movie["runtime"]}.
- Writer: {df_current_movie["writer"]}.
- Director: {df_current_movie["director"]}.
- Cast: {df_current_movie["actors"]}.
- Description: {df_current_movie["plot"]}.
'''
    return embedding_string


# Get all document string
def get_document_list(movie_data_path: os.PathLike):
    movie_data = []
    for line in open(movie_data_path, "r"):
        movie_data.append(json.loads(line))
        
    print(len(movie_data))
        
    documents = [] # Drop all na value of movieId
    for i in tqdm(range(len(movie_data))):
        df_current_movie = movie_data[i]
        if len(df_current_movie) > 2:# Get current row
            embedding_string = get_document_string(df_current_movie)  # Get prompt to fit
            documents.append(Document(text=embedding_string,
                                      doc_id=str(df_current_movie['movieId'])))  # Append to llamaindex document
    return documents


# Generate embedding
def create_embedding_db(db_path, collection_name, embedding_model, api_key, movie_data_path):
    documents = get_document_list(movie_data_path=movie_data_path)
    gemini_embedding_model = GeminiEmbedding(api_key=api_key, 
                                             model_name=embedding_model)
    client = chromadb.PersistentClient(path=db_path)
    chroma_collection = client.get_or_create_collection(collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(documents, 
                                            storage_context=storage_context, 
                                            embed_model=gemini_embedding_model)
    return index


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config_value = read_config()

chromadb_path = config_value['insp_chroma_db_path'] 
collection_name = config_value['insp_collection_name'] 
google_api_key = config_value['google_api_key']
embedding_model_name = config_value['model_embedding']
movie_data_path = config_value['processed_insp_movie_data_path'] 

# 

# # Create embedding model
# gemini_embedding_model = GeminiEmbedding(api_key= google_api_key, model_name= embedding_model_name)

# # Load meta data from disk
# movie_data = []
# for line in open(config_value['file_path_output_movie'], "r"):
#     movie_data.append(json.loads(line))

# Create document
# documents = get_document_list(movie_data_path=movie_data_path)

# Call and save embedding
create_embedding_db(db_path=chromadb_path, 
                    collection_name=collection_name, 
                    embedding_model=embedding_model_name,
                    movie_data_path=movie_data_path,
                    api_key=google_api_key)

16684


100%|██████████| 16684/16684 [00:00<00:00, 25046.01it/s]


<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x215f2c27150>

In [None]:
# %pip install -r requirements.txt

In [3]:
file = open('dataset/preprocessed_data/INSPIRED/movie_data/movie_database_no_missing.json', 'r', encoding='utf-8')
df_movie = json.load(file)

df_movie

JSONDecodeError: Extra data: line 2 column 1 (char 982)

In [46]:
import pandas as pd

data = pd.read_csv('output\INSPIRED\output_600.tsv', 
                   delimiter='\t',
                   names=['recall', 'id', 'target', 'summary', 'top_movie', 'candidate'])
data

Unnamed: 0,recall,id,target,summary,top_movie,candidate
0,1.0,20191126-061250_477_live.pkl,Avengers: Endgame,"The seeker likes action, comedies, and animate...",Black Widow|The Avengers|Avengers: Age of Ultr...,[Captain America: The Winter Soldier|Snow Whit...
1,1.0,20191126-112230_370_live.pkl,Joker,The seeker likes psychological thrillers and e...,Joker (2019)|Taxi Driver (1976)|The Silence of...,[{}]
2,1.0,20191126-114841_816_live.pkl,Zombieland,The seeker is looking for a good action comedy...,Shaun of the Dead|Hot Fuzz|The World's End|Zom...,[Hot Fuzz|Shaun of the Dead|Dead Heat|Lost in ...
3,1.0,20191126-170006_197_live.pkl,Superbad,The seeker likes comedy and horror movies and ...,Ted|Superbad|This Is the End|Sausage Party|Kno...,[Superbad|Ted|Ted 2|Neighbors|Knocked Up|Role ...
4,1.0,20191126-170349_558_live.pkl,Black Panther,The seeker is interested in sci-fi and action ...,Black Panther|The Avengers|Avengers: Infinity ...,[Toy Story 4|Avengers: Age of Ultron|Toy Story...
...,...,...,...,...,...,...
796,1.0,20200325-125027_114_live.pkl,Okja,The seeker enjoys movies that are unpredictabl...,The Nice Guys|Knives Out|Seven Psychopaths|Swi...,[The Host|Hide and Seek|The Suspect|The Guest|...
797,0.0,20200325-125530_234_live.pkl,Open Windows,"The seeker enjoys horror, love, and fantasy mo...",Zombieland|Happy Death Day|Happy Death Day 2U|...,[Zombieland|Zombieland: Double Tap|Fantastic B...
798,1.0,20200325-163433_682_live.pkl,Annihilation,"The seeker enjoys horror movies, as evidenced ...",Annihilation|Under the Skin|Black Swan|Ex Mach...,[Annihilation|Hollow Man|Planetarium|Hide and ...
799,1.0,20200325-190432_840_live.pkl,Nocturnal Animals,"The seeker enjoys comedy, serious movies with ...",Nocturnal Animals|Gone Girl|The Nice Guys|Game...,[Nocturnal Animals|Nocturnal Animals|The Nice ...
