In [None]:
import sys
from pathlib import Path

# Project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to be launched from a sub-folder of the repo).
ROOT_DIR = Path().resolve().parent

# Put the project root on the import path so that the `src` package resolves.
# Guarded so that re-running this cell does not keep appending duplicates.
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))

In [None]:
import numpy as np
import pandas as pd

from src.loader_dataset import load_names, get_caption_by_image_name
from src.embeddings import generate_text_embeddings, generate_image_embeddings, combine_img_embeddings_text_embeddings
from src.vector import build_vector_index, search_similar_vectors, save_index, load_index
from src.preprocessing import merge_captions_by_image, preprocess_documents

In [None]:
# Input locations, all relative to the notebook's working directory:
# three review CSV files and the folder holding the images.
paths = [
    '../data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv',
    '../data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv',
    '../data/1429_1.csv',
]
img_dir = '../data/img'

# Load the raw rows, then merge the captions per image using the
# 'image_path' and 'caption' columns.
# NOTE(review): the merged frame is expected to expose a 'combined_caption'
# column (it is read right after this cell) — confirm against
# src.preprocessing.merge_captions_by_image.
df_raw = load_names(paths, img_dir)
df = merge_captions_by_image(
    df_raw,
    image_col='image_path',
    caption_col='caption',
)

# Preview the first rows of the merged frame.
df.head()

In [None]:
# Run the merged captions through the preprocessing helper and keep its
# 'prep_doc' result as a new column on df.
# NOTE(review): this column assignment aligns on index if df_processed is a
# DataFrame — presumably both frames share a default index; confirm.
df_processed = preprocess_documents(df['combined_caption'].tolist())
df['prep_doc'] = df_processed['prep_doc']

# Embed the preprocessed documents and store one vector per row.
# NOTE(review): device is hard-coded to 'cuda' — presumably needs a GPU.
text_embeddings = generate_text_embeddings(df['prep_doc'].tolist(), device='cuda')
df['text_embedding'] = list(text_embeddings)

In [None]:
# Embed the images found under img_dir. The helper returns two values: the
# embeddings and the image paths (presumably in matching order — they are
# paired positionally when building the image DataFrame; confirm).
# NOTE(review): device is hard-coded to 'cuda' — presumably needs a GPU.
img_embeddings, img_paths = generate_image_embeddings(img_dir, device = 'cuda')

In [None]:
# One row per embedded image: its path and its embedding vector.
df_img = pd.DataFrame({
    'image_path': list(img_paths),
    'img_embedding': list(img_embeddings),
})

# Attach the image embeddings to the caption frame by image path.
# Any 'img_embedding' column left over from a previous run of this cell is
# dropped first; otherwise a re-run would merge into a frame that already has
# that column and pandas would emit 'img_embedding_x' / 'img_embedding_y',
# breaking every later read of df['img_embedding'].
# NOTE(review): how='left' keeps caption rows with no matching image, leaving
# NaN in 'img_embedding' for them — confirm every row has an image.
df = (
    df
    .drop(columns='img_embedding', errors='ignore')
    .merge(df_img, on='image_path', how='left')
)

# Preview the first rows of the merged frame.
df.head()

In [None]:
# Combine the per-row text and image embeddings into a single representation.
# (combine_img_embeddings_text_embeddings is already imported in the notebook's
# import cell, so it is not re-imported here.)
# NOTE(review): np.array() on a column whose cells are vectors presumably
# yields a 1-D object array rather than a 2-D matrix — confirm the helper
# accepts that, or switch to np.stack(...) if it expects (n_rows, dim).
combined_emb = combine_img_embeddings_text_embeddings(
    np.array(df['text_embedding']),
    np.array(df['img_embedding']),
)

# Build the vector index over the combined embeddings and hand it to
# save_index (presumably persisted to a default location — confirm in
# src.vector).
index_faiss = build_vector_index(np.array(combined_emb))
save_index(index_faiss)

In [None]:
from src.search import retrieve_by_text, retrieve_by_image

# Query the `index_faiss` index with a free-text phrase. The call is the last
# expression in the cell, so whatever it returns is shown as the cell output.
retrieve_by_text('red kindle', index_faiss)

In [None]:
# Query the `index_faiss` index with an image file; the return value is shown
# as the cell output.
# NOTE(review): 'test_3.png' is a bare file name — where it is resolved from
# depends on retrieve_by_image (presumably the working directory); confirm.
retrieve_by_image('test_3.png', index_faiss)