#### Setup
uv add "fastapi[standard]" uvicorn

#### RUN
Testing only code runnig on main.py

uvicorn main:app --reload
uv run uvicorn main:app --reload
http://127.0.0.1:8000/search/?query=admissions&department=Electrical&year=2023

#### Result
http://127.0.0.1:8000/search/?query=admissions&department=CSE&year=2023<br>
{<br>
  "0": "Engineering as its flagship academic course. The intake of this program has grown steadily from 16 students in the inaugural",<br>
  "1": "The CSE Department currently has about 380 students total in the three batches following a four-year honours program leading to",<br>
  "Supplemental": "Top Related Results:\n"<br>
}

In [11]:
import os, glob, json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI

In [3]:
documents = {
    "uom-about.txt": ["General", "2025"],
    "uom-ee-about.txt" : ["EE", "2025", "Electrical", "Electrical Engineering"],
    "uom-cse-about.txt" : ["CSE", "2025", "Computer", "Computer Science" ]
}

corpus = {}
for doc in documents.keys():
    with open(os.path.join("docs", doc), 'r', encoding='utf-8') as file:
        corpus[doc] = file.read()
        
print(f"Loaded {len(corpus)} documents.")


Loaded 3 documents.


In [5]:
chunk_size = 500  # Number of words per chunk
overlap = 50      # Number of overlapping words between chunks

def chunk_text(text, chunk_size=500, overlap=50):
    """
    Splits the input text into chunks of specified size with overlap.
    """

    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

chunked_corpus = []

for doc_id, text in corpus.items():
    chunks = chunk_text(text, chunk_size, overlap)
    for i, chunk in enumerate(chunks):
        chunked_corpus.append({
            'doc_id': doc_id,
            'chunk_id': f"{doc_id}_chunk_{i}",
            'text': chunk
        })

chunked_corpus[:2]

[{'doc_id': 'uom-about.txt',
  'chunk_id': 'uom-about.txt_chunk_0',
  'text': 'About the University The University of Moratuwa (UOM), the successor to the Institute of Practical Technology, Katubedda set up in 1960 and Ceylon College of Technology set up in 1966, commenced functioning as Katubedda Campus on 15th February 1972. Under the provisions of Universities Act No. 16 of 1978, the Katubedda Campus of University of Sri Lanka acquired the status of an independent University with its present corporate name "The University of Moratuwa, Sri Lanka". At present , the UOM consists of five faculties namely Architecture, Business, Engineering, Graduate Studies and Information Technology. It has twenty two (22) academic departments offering twelve (12) Bachelors degree programs to students selected by the University Grants Commission (UGC) and fifty six (56) postgraduate programs conducted on cost recovery basis. The University of Moratuwa has gained reputation as the overall best universit

In [10]:
chucked_metadata = []
for doc in documents.keys():
    chunks = chunk_text(' '.join(documents[doc]), chunk_size, overlap)
    for i, chunk in enumerate(chunks):
        chucked_metadata.append({
            'doc_id': doc_id,
            'chunk_id': f"{doc_id}_chunk_{i}",
            'text': chunk
        })

chucked_metadata

[{'doc_id': 'uom-cse-about.txt',
  'chunk_id': 'uom-cse-about.txt_chunk_0',
  'text': 'General 2025'},
 {'doc_id': 'uom-cse-about.txt',
  'chunk_id': 'uom-cse-about.txt_chunk_0',
  'text': 'EE 2025 Electrical Electrical Engineering'},
 {'doc_id': 'uom-cse-about.txt',
  'chunk_id': 'uom-cse-about.txt_chunk_0',
  'text': 'CSE 2025 Computer Computer Science'}]

In [13]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode([chunk['text'] for chunk in chunked_corpus], show_progress_bar=True, convert_to_numpy=True)

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.50it/s]


In [14]:
def normalize(vec):
    norm = np.linalg.norm(vec, axis=1, keepdims=True)
    return vec / norm

vec = normalize(embeddings).astype('float32')
dimension = vec.shape[1]
print(f"Dimension of embeddings: {dimension}")

Dimension of embeddings: 384


In [15]:
index = faiss.IndexFlatIP(dimension)
index.add(vec)
print(f"Total vectors in index: {index.ntotal}")

Total vectors in index: 5


In [22]:
def search_items(query_string: str | None = None):
    if query_string:
        parsed_dict = urllib.parse.parse_qs(query_string)
        query = {k: v[0] for k, v in parsed_dict.items()}


        query_embedding = model.encode([query.get("query", 'None')], convert_to_numpy=True)

        department = query.get("department", 'all')
        year = query.get("year", '2025')


        

        query_vec = normalize(query_embedding).astype('float32')

        # Retrieve from the index
        result = index.search(query_vec, 2)

        
        return result
    
    
    return {"message": "No query provided"}

# Example usage
search_items("search?query=admissions&department=Electrical&year=2025")

(array([[0.0766115 , 0.04713733]], dtype=float32), array([[2, 0]]))