In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os

In [2]:
# Load every ".txt" file found directly inside docs_path.
# Produces two index-aligned lists:
#   documents      - the raw text of each file
#   document_infos - one metadata dict per file: {"filename": ..., "type": ...}
docs_path = "../data/raw/"
documents = []

document_infos = []


def _infer_doc_type(filename):
    """Return the "type" metadata value for a source file, based on its name.

    Case-insensitive substring match, checked in this order: "cv" first,
    then "project"; anything else is tagged "portfolio_section".
    NOTE(review): because this is a substring test, a name such as
    "opencv_project.txt" would be tagged "cv" - confirm that is acceptable.
    """
    lowered = filename.lower()
    if "cv" in lowered:
        return "cv"
    if "project" in lowered:
        return "project"
    return "portfolio_section"


# os.listdir() returns entries in arbitrary order; sorting makes the document
# order (and therefore the chunk order fed to the index) reproducible.
for filename in sorted(os.listdir(docs_path)):
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(docs_path, filename), "r", encoding="utf-8") as f:
        text = f.read()
    documents.append(text)

    metadata = {
        "filename": filename,
        "type": _infer_doc_type(filename),
    }
    document_infos.append(metadata)

In [3]:
# Display the per-file metadata (filename + inferred type) collected above.
document_infos

[{'filename': 'about_me_portfolio.txt', 'type': 'portfolio_section'},
 {'filename': 'bachelor_project.txt', 'type': 'project'},
 {'filename': 'cv.txt', 'type': 'cv'},
 {'filename': 'diablo_project.txt', 'type': 'project'},
 {'filename': 'education_portfolio.txt', 'type': 'portfolio_section'},
 {'filename': 'experience_portfolio.txt', 'type': 'portfolio_section'},
 {'filename': 'facial_recognition_project.txt', 'type': 'project'},
 {'filename': 'skills_portfolio.txt', 'type': 'portfolio_section'},
 {'filename': 'smart_chess_project.txt', 'type': 'project'},
 {'filename': 'splash_project.txt', 'type': 'project'},
 {'filename': 'sweepresort_project.txt', 'type': 'project'},
 {'filename': 'taskify_project.txt', 'type': 'project'},
 {'filename': 'vclinic_project.txt', 'type': 'project'},
 {'filename': 'zaki_project.txt', 'type': 'project'},
 {'filename': 'zombie_invasion_project.txt', 'type': 'project'}]

In [4]:
# Display the raw text of every loaded file (this prints the full contents of the list).
documents

["Hi, I'm Ahmed\nSoftware Engineer\n\nI'm a software engineer with a strong background in building scalable applications.\nMy experience includes both frontend and backend, working with modern JavaScript frameworks. \nI focus on delivering clean, efficient code and solving real world problems.",
 'Title: Breast Cancer Diagnosis System\nCompany: Bachelor Project\nGitHub: https://github.com/Ahmedmk11/breast-cancer-diagnosis\nLink: https://doi.org/10.13140/RG.2.2.28574.24644\nDescription: An embedded system for real-time breast cancer diagnosis using ultrasound imaging and a multi-stage CNN approach.\nTools: Python, PyTorch, TensorFlow, Scikit-learn, Pandas, NumPy, Transfer Learning, ResNet-101, InceptionV3, Raspberry Pi 3, Bioinformatics, Computer Vision, Deep Learning, Embedded Systems\nSignificance: 1\nCompleted: True\n',
 'Summary\nI am a full-stack software engineer with experience in React, Next.js, Node.js, Express, and NestJS. I focus on building responsive frontends and scalable 

In [5]:
# Split every loaded document into overlapping chunks and build a metadata
# list that is index-aligned with the chunk list (one entry per chunk).
# Sizes are in the units of the splitter's length function
# (characters by default - see the LangChain text-splitter docs).
CHUNK_SIZE = 350
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

docs_chunks = []
metadatas = []

for doc, meta in zip(documents, document_infos):
    for chunk in text_splitter.split_text(doc):
        docs_chunks.append(chunk)
        # Store a copy: appending `meta` itself would make every chunk of a
        # file share one dict object, so mutating one chunk's metadata
        # would silently change all of its siblings.
        metadatas.append(dict(meta))

In [6]:
# Embedding model used to vectorise the chunks.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Display the embedding model's configuration.
embeddings_model

HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [8]:
# Embed every chunk and build the FAISS index; metadatas[i] is attached to docs_chunks[i].
vectorstore = FAISS.from_texts(
    texts=docs_chunks,
    embedding=embeddings_model,
    metadatas=metadatas,
)

In [9]:
# Persist the index to disk. The path is defined once so the save call and
# the confirmation message cannot drift apart.
vectorstore_path = "../data/vectorstore"
vectorstore.save_local(vectorstore_path)

print(f"Vector store created and saved in {vectorstore_path}")

Vector store created and saved in ../data/vectorstore
