In [1]:
# # %%capture
# !pip install sentence-transformers langchain_cohere cohere

In [4]:
# Import
import os
import sys

# Project root: the notebook must run from here so that `src.*` imports resolve.
# Defaults to the original author's checkout; set SEEZ_PROJECT_ROOT to run the
# notebook from a different machine without editing this cell.
PROJECT_ROOT = os.environ.get("SEEZ_PROJECT_ROOT", '/home/alaa/repos/seez-assignment')

# Change to the project root directory and make it importable.
os.chdir(PROJECT_ROOT)
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

# Sanity-check that the project package is importable. This stays best-effort
# (the cell keeps running on failure), but it catches `Exception` rather than
# using a bare `except:` so Ctrl-C / SystemExit are not swallowed, and it
# reports *why* the import failed instead of a bare "Failed".
try:
    from src.utils.tools import read_dialogue
    print("Import successful!")
except Exception as exc:
    print(f"Failed: {exc!r}")

from src.api_key import OPENAI_API_KEY

# Expose the key through the environment for libraries that read it from there.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

Import successful!


In [5]:
from pydantic import BaseModel, Field
from typing import Dict, Any, List

class SearchEngineResponse(BaseModel):
    """Result envelope returned by a search engine's ``search`` call.

    Bundles a success flag with the list of hits so callers can tell
    "the search could not run" apart from "the search ran and found nothing".
    """

    # False when the search could not be performed (in this notebook: the
    # embeddings could not be loaded); True otherwise, even with no hits.
    status: bool
    # The hits. In this notebook these are (Document, relevance_score) tuples
    # as returned by the FAISS vector store; an empty list means no hits.
    response: List
    # [SearchResult]

    class Config:
        # Allow field values whose types pydantic has no built-in validator for.
        arbitrary_types_allowed = True

In [8]:
# pip install cohere

In [9]:
import re
import os
import warnings
import numpy as np
from typing import Optional, List, Tuple

from langchain_openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain_community.vectorstores.faiss import FAISS

from src.common.logger import logger
from src.common.config import ConfigBase

from src.utils.tools import read_dialogue, read_user_data, read_jsonl, read_json, get_conversation_by_id

# from src.infrastructure.embeddings.base_embedding import EmbeddingBase
# from src.infrastructure.search_engine.base_engines import SimilaritySearchEngine
# from ..types import SearchEngineResponse

class FAISS_Search(SimilaritySearchEngine):
    """Similarity search over dialogue documents backed by a LangChain FAISS index.

    The index is loaded lazily from ``config.embeddings.vectordb_path`` on the
    first search; when no index exists on disk it is built from the dialogue
    file at ``config.data.conversations`` and then saved to that path.

    NOTE(review): ``SimilaritySearchEngine`` is not imported in the visible
    cells (its import is commented out above this class) - confirm it is
    defined before this cell runs on a fresh kernel.
    """

    def __init__(
        self,
        config: Optional[ConfigBase] = None,
        encoder = None,
        **kwargs
    ) -> None:
        """Initialise the engine.

        Args:
            config: Project configuration. ``similarity_search.top_k`` is read
                here; ``embeddings.vectordb_path`` and ``data.conversations``
                are read later by the other methods.
            encoder: Embedding model handed to FAISS for indexing and queries.
            **kwargs: Forwarded unchanged to the base class.
        """
        super().__init__(config, encoder, **kwargs)

        self.logger = logger
        self.data = None
        self.vectorstore = None
        self.embeddings_loaded = False

        # `config` defaults to None, so dereferencing it directly would raise.
        # Prefer the explicit argument; otherwise fall back to whatever config
        # the base class stored on the instance (the other methods already
        # rely on `self.config` being set by the base class).
        effective_config = config if config is not None else getattr(self, "config", None)
        self.top_k = effective_config.similarity_search.top_k

    def load_embeddings(self) -> bool:
        """Load the FAISS index from disk, building it first if it is missing.

        Returns:
            True when a vector store is available afterwards, False otherwise.
        """
        if os.path.exists(self.config.embeddings.vectordb_path):
            # SECURITY: `allow_dangerous_deserialization=True` lets FAISS
            # unpickle the stored docstore. Only point `vectordb_path` at
            # index files this project created itself.
            self.vectorstore = FAISS.load_local(
                self.config.embeddings.vectordb_path,
                self.encoder,
                allow_dangerous_deserialization=True)
            self.embeddings_loaded = True
            return True
        else:
            self.logger.error(f"Embeddings not found: {self.config.embeddings.vectordb_path}")
            # No saved index yet: build it from the raw dialogues.
            return self.embed_data()

    def _read_data(self) -> List[Document]:
        """Parse the conversations file into one ``Document`` per dialogue.

        The text is matched as a numeric dialogue id, a blank line, and then
        one or more ``User: ... / Agent: ...`` turn pairs separated by blank
        lines. The id is kept in ``metadata["dialogue_number"]``.

        Returns:
            The parsed documents; empty when nothing matches the pattern.
        """
        path = self.config.data.conversations
        docs = read_dialogue(path)

        pattern = r"(\d+)\n\n((?:User:.*?\n\nAgent:.*?(?:\n\n|$))+)"
        matches = re.findall(pattern, docs, re.DOTALL)

        return [
            Document(
                page_content=dialogue_content.strip(),
                metadata={"dialogue_number": dialogue_number},
            )
            for dialogue_number, dialogue_content in matches
        ]

    def embed_data(self) -> bool:
        """Read the dialogues, embed them into a new FAISS index and save it.

        Returns:
            True when an in-memory vector store was built, False when reading
            or embedding failed. A failure while *saving* the index is logged
            but does not make this return False: the in-memory index is still
            usable, and reporting failure while leaving ``embeddings_loaded``
            set (as happened before) made the status and the state disagree.
        """
        try:
            self.logger.info("Reading the data")
            docs = self._read_data()
            self.logger.info("Embedding the data")
            self.vectorstore = FAISS.from_documents(docs, self.encoder)
        except Exception as e:
            self.logger.error(f"Failed to embed data: {e}")
            return False

        self.embeddings_loaded = True

        try:
            self.logger.info("Saving the embeddings")
            self.save_embeddings()
        except Exception as e:
            # Best-effort persistence: keep serving from memory.
            self.logger.error(f"Failed to save embeddings: {e}")

        return True

    # TODO: optional reranking of the hits (e.g. via Cohere when
    # COHERE_API_KEY is set) is not implemented yet.

    def save_embeddings(self) -> None:
        """Write the current vector store to ``config.embeddings.vectordb_path``.

        Logs an error and does nothing when no vector store has been built.
        """
        if self.vectorstore:
            self.vectorstore.save_local(self.config.embeddings.vectordb_path)
            self.logger.info("Embeddings saved locally.")
        else:
            self.logger.error("No vectorstore to save embeddings from.")

    def search(
        self,
        query: str,
        filter_neg: bool = True,
    ) -> SearchEngineResponse:
        """Return up to ``top_k`` dialogues most similar to ``query``.

        Args:
            query: Free-text query; surrounding whitespace is stripped.
            filter_neg: When True (default), drop hits whose relevance score
                is negative. When False, return every hit.

        Returns:
            ``SearchEngineResponse`` with ``status=False`` and no hits when the
            embeddings could not be loaded; otherwise ``status=True`` and a
            list of ``(Document, score)`` tuples sorted by descending score
            (possibly empty).

        Errors raised by the vector store itself are not caught here.
        """
        if not self.embeddings_loaded:
            if not self.load_embeddings():
                return SearchEngineResponse(status=False, response=[])

        query = query.strip()
        self.logger.info(f"Searching for: {query[:100]} ...")

        # Silence warnings raised by the vector store during scoring.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            docs = self.vectorstore.similarity_search_with_relevance_scores(query, k=self.top_k)

        docs = sorted(docs, key=lambda x: x[1], reverse=True)

        if filter_neg:
            qdocs = [d for d in docs if d[1] >= 0.0]
            # Covers both "every score is negative" and "the store returned
            # nothing"; the latter used to raise IndexError on docs[0].
            if not qdocs:
                self.logger.info("No similar items found in the item list for item `{}`.".format(query))
                return SearchEngineResponse(status=True, response=[])
        else:
            # Previously `qdocs` was never assigned on this path, so calling
            # search(..., filter_neg=False) raised UnboundLocalError.
            qdocs = docs

        return SearchEngineResponse(status=True, response=qdocs)

In [10]:
from src.common.config import load_config

# Load the project configuration (the log output below reports it being read
# from config/config.yaml).
config = load_config()

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the search engine. No encoder is passed here, so the engine receives
# encoder=None - presumably the base class derives one from `config`; confirm.
# The FAISS index itself is only loaded on the first search.
ss = FAISS_Search(config)

[2024-11-20 13:15:43,688: INFO: config: Config loaded successfully: config/config.yaml]
[2024-11-20 13:15:43,839: INFO: 3707362177: Initialized SearchEngineBase!]


In [37]:
# ss.search("looking for a classic romantic movie")
# Probe the Python type of the relevance score returned by the vector store.
# The index is loaded lazily by `ss.search`, so on a fresh kernel
# `ss.vectorstore` is still None here; load it explicitly so this cell does
# not depend on a search having been run first.
if not ss.embeddings_loaded:
    ss.load_embeddings()
type(ss.vectorstore.similarity_search_with_relevance_scores("action thrilling for tom hanks", k=25)[0][1])

[2024-11-20 10:15:24,558: INFO: _client: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"]


numpy.float64

In [32]:
# End-to-end check through the public API: returns a SearchEngineResponse whose
# `response` is a list of (Document, score) tuples, best match first.
ss.search("recommend me a mystery movie")

[2024-11-19 22:24:13,925: INFO: 234237106: Searching for: recommend me a mystery movie ...]
[2024-11-19 22:24:14,680: INFO: _client: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"]


SearchEngineResponse(status=True, response=[(Document(metadata={'dialogue_number': '7807'}, page_content='User: Hey, I\'m a book lover and my name is John. I really enjoyed watching the movie "Only Angels Have Wings" on VHS. It had a great atmosphere and multiple storylines. Cary Grant\'s performance in a darker role was impressive. I would definitely watch it again.\n\nAgent: Hi John! I\'m glad you liked "Only Angels Have Wings". It\'s a fantastic movie with a captivating atmosphere and talented actors. Is there anything specific you\'d like a recommendation for?\n\nUser: Can you recommend another movie for me?\n\nAgent: Sure! How about "And Then There Were None" on VHS? It\'s a gripping murder mystery. I read a review that described it as deliciously amusing with a perfect blend of comedy and eerie atmosphere. \n\nUser: Oh, I\'ve already seen that movie before. Do you have any other recommendations?\n\nAgent: No problem! How about "Gazebo" on VHS? It\'s a comedy involving a televisio