In [63]:
from langchain.text_splitter import MarkdownTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import FAISS
from langchain.docstore.document import Document
from langchain.document_loaders import DirectoryLoader
import os
import glob
import openai

In [64]:
openai.api_key = "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" ## To configure OpenAI API
os.environ["OPENAI_API_KEY"] = "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" ## To configure langchain connections with OpenAI

In [30]:
dir_path = "../content/"
loader = DirectoryLoader(dir_path)
data = loader.load()

15

In [56]:
chunk_size=256
chunk_overlap=20
text_splitter = MarkdownTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
upf_splits = text_splitter.split_documents(data)

In [59]:
# get only chunk longer than 1 sentence, 10 words
filtered_upf_splits = []
for chunk in upf_splits:
    if len(chunk.page_content.split()) > 5:
        filtered_upf_splits.append(chunk)

In [67]:
chunks = []
for item in filtered_upf_splits:
    chunks.append(item.page_content)

In [70]:
embeddings = OpenAIEmbeddings()
knowledge_base = FAISS.from_texts(chunks[:10], embeddings)

In [74]:
knowledge_base.similarity_search_with_score("what is ultra-processed food?")

[(Document(page_content='Confronting the dangers of ultra-processed food\n\nA cocktail of additives and preservatives poses a risk to people’s health'),
  0.25893632),
 (Document(page_content='Mr van Tulleken, a doctor and television presenter, draws a distinction between “ultra-processed food” (upf) and “processed food”. Almost everything people consume is processed in some form: rice is harvested and hulled, animals are butchered. He uses a'),
  0.2813555),
 (Document(page_content='there is much to cheer about calories being cheap and abundant, when for most of human history they were neither. But as Chris van Tulleken’s new book, “Ultra-Processed People”, explains, that cheapness and abundance come at a cost.'),
  0.32375494),
 (Document(page_content='nutrient-poor, upf contributes to obesity in part because its palatability and soft texture foster overconsumption, overriding satiety signals from the brain.'),
  0.34942997)]

In [75]:
knowledge_base.save_local("faiss_index")

In [76]:
test = FAISS.load_local("faiss_index", embeddings)

In [77]:
test.similarity_search_with_score("what is ultra processed food")

[(Document(page_content='Mr van Tulleken, a doctor and television presenter, draws a distinction between “ultra-processed food” (upf) and “processed food”. Almost everything people consume is processed in some form: rice is harvested and hulled, animals are butchered. He uses a'),
  0.32386637),
 (Document(page_content='Confronting the dangers of ultra-processed food\n\nA cocktail of additives and preservatives poses a risk to people’s health'),
  0.33703643),
 (Document(page_content='there is much to cheer about calories being cheap and abundant, when for most of human history they were neither. But as Chris van Tulleken’s new book, “Ultra-Processed People”, explains, that cheapness and abundance come at a cost.'),
  0.39965206),
 (Document(page_content='and technology”. A pizza made from scratch contains minimally processed food (wheat turned into flour, tomatoes into sauce, milk into cheese). The one in the freezer, with its thiamine mononitrate and sodium phosphate, is upf.'),
  0.

In [80]:
class Knowledge_base():
    def __init__(self, knowledge_base, threshold=0.5):
        self.threshold=threshold
        self.knowledge_base = knowledge_base
    
    def search_doc_from_knowledge_base(self,knowledge_base, question):
        # Return the closet doc to question
        docs = self.knowledge_base.similarity_search_with_score(question)
        closest_doc = self._get_closet_doc_from_docs(docs)
        if closest_doc:
            return [closest_doc]
        else:
            return None
    
    # Only return closest doc
    def _get_closet_doc_from_docs(self, docs):
        # Return the doc having the min score below threshold
        min_score = self.threshold
        min_id = -1
        for id, item in enumerate(docs):
            doc, score = item
            if min_score > score:
                min_id = id
                min_score = score
        if min_score < threshold:
            return docs[min_id][0]  # return doc
        else:
            return None
        
class Text_processor():
    def __init__(self, folder_path, chunk_size=256, chunk_overlap=20):
        self.folder_path = folder_path
        self.folder_list = self._get_recursive_folder()
        self.doc_list = self._get_doc_path()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embeddings=OpenAIEmbeddings()
        self.data = None
        self.knowledge_base=None
    
    def _get_rescursive_folder(self):
        for folder in glob.iglob(f"../{self.folder_path}/**"):
            self.folder_list.append(folder)
    
    def _get_doc_path(self):
        for filename in glob.iglob(f'../{self.folder_path}/**/*.md', recursive=True):
            self.doc_list.append(filename)
    
    def _load_docs(self):
        loader = DirectoryLoader(self.dir_path)
        data = loader.load()
        return data
        
    def _split_docs(self):
        # Split the text into chunks using Langchain's CharacterTextSplitter
        text_splitter = MarkdownTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        )
        upf_splits = text_splitter.split_documents(self.data)
        return upf_splits
    
    def embed_docs(self):
        # Load docs
        self.data = self._load_docs()
        
        # Split docs
        self.upf_splits = self._split_docs()
        
        # Filter chunks shorter than 1 sentence or 10 words
        self.upf_splits = self._filter_chunk(self)
        
        # Embed chunks
        self.knowledge_base = FAISS.from_texts(documents=self.upf_splits, embedding=self.embeddings)
        return Knowledge_base(self.knowledge_base)

    def _filter_chunk(self):
        filtered_upf_splits = []
        for chunk in self.upf_splits:
            if len(chunk.page_content.split()) > 5:
                filtered_upf_splits.append(chunk)
        return filtered_upf_splits
    
    def _get_doc_len(self, text, threshold_char=10):
        # Check if the split is longer than 1 sentences
        return len(text.split()) >= threshold_char

    def get_n_doc(self):
        return len(self.doc_list)

    def save_knowledge_base(self, output_path):
        self.knowledge_base.save_local(output_path)
    
    def load_knowledge_base(self, input_path):
        self.knowledge_base.load_local(input_path, self.embeddings)
        return Knowledge_base(self.knowledge_base)

SyntaxError: invalid syntax (1828340610.py, line 51)