In [21]:
#imports
import os
import fitz
import re
from sentence_transformers import SentenceTransformer
import math
from typing import List, Tuple

In [22]:
#reading files
def read_txt(file_path): #reading .txt files
    with open(file_path, 'r') as f:
        content = f.read()
        f.close()
    return content


def read_pdf(file_path): #reading .pdf files
    document = fitz.open(file_path)
    text = ""

    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    return text

def read_file(file_path): #one function to read all supported file types
    _, file_extention = os.path.splitext(file_path)

    if file_extention.lower() == '.txt':
        return read_txt(file_path=file_path)
    elif file_extention.lower() == '.pdf':
        return read_pdf(file_path=file_path)
    else:
        return "Unsupported file type"

In [23]:
#storing the data as a corpus
def store_data(data_dir):
    data = []
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            data.append([file, read_file(os.path.join(root, file))])
    return data


def chunk_data(data, max_chunk_length=500):
    print("Chunking the data...")
    chunks = []
    for file in data:
        content = file[1]
        pdf_length = len(content)
        for i in range(pdf_length//max_chunk_length + 5):
            if content == "":
                break
            contentL = len(content)
            temp_length = min(max_chunk_length, contentL)
            temp_chunk = content[:temp_length]

            dotIndex = temp_chunk.rfind(". ")
            dotIndex2 = temp_chunk.rfind(".\n")
            qIndex = temp_chunk.rfind("? ")
            qIndex2 = temp_chunk.rfind("?\n")
            excIndex = temp_chunk.rfind("! ")
            excIndex2 = temp_chunk.rfind("!\n")
            entIndex = temp_chunk.rfind("\n")

            lastIndex = max(dotIndex, dotIndex2, qIndex, qIndex2, excIndex, excIndex2, entIndex)
            chunk = [file[0], content[:lastIndex]]
            content = content.replace(chunk[1], "", 1)
            chunks.append(chunk)
    return chunks


In [24]:
#embedding the chunks
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(chunks):
    print("Generating embeddings...")
    texts = [chunk[1] for chunk in chunks]
    chunk_embeddings = model.encode(texts)
    return list(zip([chunk[0] for chunk in chunks], chunk_embeddings))
    

In [25]:
#storing the embeddings in a vector DB
class VectorDB:
    def __init__(self):
        self.embeddings = []
        self.metadata = []

    def add_embeddings(self, embeddings: List[Tuple[str, List[float]]]):
        for source, embedding in embeddings:
            self.embeddings.append(embedding)
            self.metadata.append({"source": source})

    def _euclidean_distance(self, v1, v2):
        score = 0
        for i in range(len(v1)):
            score += (v1[i] - v2[i])**2
        return score * (1 / len(v1))

    def search(self, query_embedding: List[float], top_k: int = 5) -> List[Tuple[float, dict]]:
        if not self.embeddings:
            return []

        distances = [self._euclidean_distance(query_embedding, emb) for emb in self.embeddings]
        
        # Sort distances and get top_k (smallest distances)
        sorted_results = sorted(enumerate(zip(distances, self.metadata)), key=lambda x: x[1][0])
        return [(distance, metadata, index) for index, (distance, metadata) in sorted_results[:top_k]]

    def __len__(self):
        return len(self.embeddings)
    
    def save(self, filepath: str):
        with open(filepath, 'w') as f:
            for embedding, metadata in zip(self.embeddings, self.metadata):
                embedding_str = ','.join(map(str, embedding))
                f.write(f"{metadata['source']}|{embedding_str}\n")

    @classmethod
    def load(cls, filepath: str):
        vector_store = cls()
        with open(filepath, 'r') as f:
            for line in f:
                source, embedding_str = line.strip().split('|')
                embedding = list(map(float, embedding_str.split(',')))
                vector_store.embeddings.append(embedding)
                vector_store.metadata.append({"source": source})
        return vector_store


In [26]:
if __name__ == "__main__":
    vector_store = VectorDB()
    print("Reading Files...")
    data = store_data('Data')
    chunked_data = chunk_data(data=data)

    if not(os.path.exists("vector_store.txt")):    
        embeddings = generate_embeddings(chunked_data)
        vector_store.add_embeddings(embeddings=embeddings)
        print(f"Number of embeddings stored: {len(vector_store)}")
        vector_store.save('vector_store.txt')


    else:
        shouldUpdate = input("Do you wish to update the DB? (y/n)")
        if shouldUpdate == 'y':
            embeddings = generate_embeddings(chunked_data)
            vector_store.add_embeddings(embeddings=embeddings)
            print(f"Number of embeddings stored: {len(vector_store)}")
            vector_store.save('vector_store.txt')

        else:
            vector_store = VectorDB.load('vector_store.txt')


    query = input('->')
    query_embedding = model.encode([query])[0]

    results = vector_store.search(query_embedding, top_k=3)

    for distance, metadata, chunk_num in results:
            print(f"Source: {metadata['source']}, Distance: {distance:.4f}")
            print('\n'.join(chunked_data[chunk_num]))
            print()

Reading Files...
Chunking the data...
Source: 02.Preparation.pdf, Distance: 0.0037
02.Preparation.pdf
Modern C++
Programming
2. Preparation
Federico Busato
2024-04-10
Table of Contents
1 Books and References
2 Slide Legend
3 What Editor/ IDE/Compiler Should I Use?
4 How to compile?
5 Hello World
I/O Stream
1/22
Books and
References
Suggested Books
Programming and Principles
using C++ (2nd)
B. Stroustrup, 2014
Professional C++ (5th)
S. J. Kleper, N. A. Solter, 2021
Absolute C++ (6th)
W. Savitch, 2015
2/22
More Advanced Books
Effective Modern C++
S. Meyer, 2014
Embracing Modern C++
Safely
J

Source: modern-cpp.pdf, Distance: 0.0040
modern-cpp.pdf
. Savitch, 2015
2/22
More Advanced Books
Eﬀective Modern C++
S. Meyer, 2014
Embracing Modern C++
Safely
J. Lakos, V. Romeo, R.
Khlebnikov, A. Meredith, 2021
Beautiful C++: 30 Core
Guidelines for Writing Clean,
Safe, and Fast Code
J. G. Davidson, K. Gregory, 2021
3/22
References
1/3
(Un)oﬃcial C++ reference:*
• en.cppreference.com
• C++ Standard 

In [27]:
len(chunked_data[0][1])

446