In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import os

In [None]:
# Load every .txt file found directly under docs_path. The raw text goes into
# `documents` and a small metadata record into `document_infos`; the two lists
# are parallel (same index = same file).
docs_path = "../data/raw/"
documents = []

document_infos = []

# os.listdir returns entries in arbitrary order, so sort the names to make the
# document order (and therefore the order of chunks handed to the index)
# reproducible from run to run.
for filename in sorted(os.listdir(docs_path)):
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(docs_path, filename), "r", encoding="utf-8") as f:
        text = f.read()
    documents.append(text)

    # Classify by case-insensitive substring of the filename: "cv" wins over
    # "project"; anything else is treated as a portfolio section.
    lowered_name = filename.lower()
    if "cv" in lowered_name:
        doc_type = "cv"
    elif "project" in lowered_name:
        doc_type = "project"
    else:
        doc_type = "portfolio_section"

    document_infos.append({"filename": filename, "type": doc_type})

In [None]:
# Split each loaded document into overlapping chunks and build two parallel
# lists: the chunk texts and, for each chunk, the metadata of its source file.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm against the LangChain docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

docs_chunks = []
metadatas = []

for doc, meta in zip(documents, document_infos):
    for chunk in text_splitter.split_text(doc):
        docs_chunks.append(chunk)
        # Give every chunk its own copy of the metadata. Appending `meta`
        # itself would make all chunks of a file (and the entry in
        # document_infos) share one dict, so editing one chunk's metadata
        # later would silently change all of them.
        metadatas.append(dict(meta))

In [None]:
# Embedding model used to vectorize the text chunks; the model id is kept in a
# named constant so it is easy to spot and change.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# Embed every chunk and build the FAISS index, attaching the per-chunk metadata.
# Fail early with an actionable message when there is nothing to index (no .txt
# files were found, or all of them were empty) instead of handing FAISS an
# empty list.
# NOTE(review): FAISS.from_texts appears to need at least one embedding to
# determine the vector dimension and errors out opaquely otherwise — confirm.
if not docs_chunks:
    raise ValueError(
        "No text chunks to index: no non-empty .txt documents were loaded."
    )

vectorstore = FAISS.from_texts(docs_chunks, embeddings_model, metadatas=metadatas)

In [None]:
# Persist the index to disk. The output location is defined once so the path
# that is written and the path that is reported cannot drift apart.
VECTORSTORE_DIR = "../data/vectorstore"
vectorstore.save_local(VECTORSTORE_DIR)

print(f"Vector store created and saved in {VECTORSTORE_DIR}")