# mRAG-01
test

//----------------------------------------------------------------------------------------------------------------------------------------//

## Imports

In [2]:
import asyncio
import requests
import httpx
import faiss
import pandas as pd # for DataFrame
import numpy as np # for normalizing vectors
from datetime import datetime

In [4]:
!pip install pymupdf
!pip install ollama

Collecting ollama
  Downloading ollama-0.6.1-py3-none-any.whl.metadata (4.3 kB)
Downloading ollama-0.6.1-py3-none-any.whl (14 kB)
Installing collected packages: ollama
Successfully installed ollama-0.6.1


In [5]:
import os
import pymupdf
import ollama

## Variables

### Ollama

In [3]:
# Ollama endpoints
ollama_base_url = "http://127.0.0.1:11434"
ollama_chat = "/api/chat"
ollama_embedding = "/api/embeddings"

# Ollama models
embedding_model = "mxbai-embed-large"
llm_model = "llama3.1:8b"

### FAISS

In [None]:
base = read_fvecs('vec_test/siftsmall/sift_base.fvecs') # For testing
queries = read_fvecs('vec_test/siftsmall/sift_query.fvecs') # For testing
groundtruth = read_ivecs('vec_test/siftsmall/sift_groundtruth.ivecs') # For testing

## Key Functions

### LLM

In [4]:
async def get_llm_response(in_text, timeout=60.0): # Handles routing to Ollama server to process chat completion requests
    data = {
        "model": llm_model,
        "messages": [
            {"role": "user", "content": in_text}
        ],
        "stream": False
    }
    
    try:
        # Create a client with an increased timeout
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
            response = await client.post((ollama_base_url + ollama_chat), json=data)
            response.raise_for_status()
            
            result = response.json()
            
            # Extract the response text based on Ollama's API structure
            if "message" in result and "content" in result["message"]:
                return result["message"]["content"].strip()
            else:
                print('Unexpected response structure:', result)
                return None
                
    except httpx.ReadTimeout:
        print("Request timed out. The Ollama server might be busy or the model is taking too long to respond.")
        return None
    except httpx.ConnectError:
        print("Could not connect to the Ollama server. Make sure it's running at http://127.0.0.1:11434.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [5]:
async def get_embeddings(text, timeout=60.0): # Handles routing to Ollama server to generate embeddings from a user or assistant prompt
    """
    Get embeddings for the provided text using Ollama's API.
    
    Args:
        text: The text to generate embeddings for
        model: The model to use for embeddings (default: "llama3.1:8b")
        timeout: Timeout in seconds (default: 60 seconds)
    
    Returns:
        A list of embedding values or None if an error occurs
    """
    data = {
        "model": embedding_model,
        "prompt": text,
        "options": {
            "temperature": 0.0  # Lower temperature for more deterministic embeddings
        }
    }
    
    try:
        # Create a client with an increased timeout
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
            response = await client.post((ollama_base_url + ollama_embedding), json=data)
            response.raise_for_status()
            
            result = response.json() # TODO Normalize embeddings for FlatIP np.linalg.qnorm(result)
            
            # Extract the embeddings from the response
            if "embedding" in result:
                return result["embedding"]
            else:
                print('No embeddings found in response:', result)
                return None
                
    except httpx.ReadTimeout:
        print("Request timed out. The Ollama server might be busy or the model is taking too long to respond.")
        return None
    except httpx.ConnectError:
        print("Could not connect to the Ollama server. Make sure it's running at http://127.0.0.1:11434.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [13]:
def search_text_files(keyword: str) -> str:

    directory = os.listdir('./files/')
    for name in directory: 

        # Look through all the files in our directory that aren't hidden files
        if os.path.isfile('./files/' +fname) and not fname.startswith('.'):
            # Check if file is pdf format
            if(fname.endswith('.pdf')):

                document_text = ""
                doc = pymupdf.open('./files/' + fname)

                for page in doc: # TODO Consider splitting function to handle one page, then making another to iterate through pages
                    document_text += page.get_text()

                doc.close()

                prompt = "Respond only 'yes' or 'no', do not add any additional information. Is the followinbg text about " + keyword + "? " + document_text

                res = ollama.chat(
                    model="granite3.3:2b",
                    messsages=[{'role': 'user', 'content': prompt}]
                )

                if 'Yes' in res['message']['content']:
                    f.close()
                    return "./files/" + fname

    return "None"

In [14]:
def search_image_files(keyword:str) -> str:

    directory = os.listdir('./files/')
    image_file_types = ("jpg","png","jpeg")
    
    for name in directory: 
        # Look through all the files in our directory that aren't hidden files
        if os.path.isfile('./files/' +fname) and not fname.startswith('.') and fname.endswith(image_file_types):
            res = ollama.chat(
                model="llava",
                messages=[
                    {
                        'role': 'user',
                        'content': 'Describe this image in short sentences. Use simple phrases first and then describe it more fully.',
                        'images': ["./files/"+ fname]
                    }
                ]
            )

            if keyword in res['message']['content']:
                return "./files/" + fname
                
    return "None"
            

### Vector Database

In [85]:
async def load_matrix(user):
    # Pull memories from user JSON
    memories = await get_memories('memories/' + user + '.json')
    print('DataFrame \n', memories)
    
    # Create np array of the embeddings
    faiss_data = np.vstack(memories['embeddings'].values).astype('float32')
    print('Array \n', faiss_data)
    
    # Normalize array to optimize for inner product search
    faiss_data /= np.linalg.norm(faiss_data, axis=1, keepdims=True)
    print('Normalized array \n', faiss_data)
    
    
    # Build index
    dim = faiss_data.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(faiss_data)
    print('Index \n', index)
    
    return index

In [None]:
async def read_fvecs(path):
    data = np.fromfile(path, dtype=np.float32)
    dim = data.view(np.int32)[0]
    return data.reshape(-1, dim +1)[:, 1:]

In [None]:
async def read_ivecs(path):
    data = np.fromfile(path, dtype=np.int32)
    dim = data.view(np.int32)[0]
    return data.reshape(-1, dim +1)[:, 1:]

In [89]:
memories = await load_matrix('test')

DataFrame 
                              text                 timestamp  \
0        This is a sample message 2025-12-13 15:57:00+00:00   
1  This is another sample message 2025-12-13 16:17:00+00:00   

                                          embeddings  
0  [0.12345678900000001, 0.23456789, 0.3456789010...  
1  [0.12345678900000001, 0.23456789, 0.3456789010...  
Array 
 [[0.12345679 0.2345679  0.3456789  0.654321   0.7654321  0.8765432 ]
 [0.12345679 0.2345679  0.3456789  0.654321   0.7654321  0.8765432 ]]
Normalized array 
 [[0.0879123  0.16703336 0.24615435 0.4659352  0.5450563  0.62417734]
 [0.0879123  0.16703336 0.24615435 0.4659352  0.5450563  0.62417734]]
Index 
 <faiss.swigfaiss.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x746fdb79de60> >


In [78]:
async def append_matrix():
    return null

In [81]:
async def find_nearest_k(query, k):
    return null

### DataFrame

In [6]:
async def create_memory(inText): #Handles the assignment of embeddings to the dataframe stored in pandas

    # Create the memory date/time element of the memory object
    timestamp = datetime.now()
    # Create the embedding element of the memory object
    new_embeddings = await get_embeddings(inText)
    
    # Create the new memory object with the time, embeddings, and inText
    new_memory = (inText, timestamp, new_embeddings)

    print(new_memory) # Sanity check
    
    # Concat the new memory onto the existing dataframe
    data.loc[len(data)] = new_memory

In [65]:
async def get_memories(path):
    data = pd.DataFrame(pd.read_json(path))
    #print(data)
    return data

In [None]:
async def get_user(user): # Returns a memories.json file #TODO whenever I decide to implement multiple user matrices
    return null 

### File

CSV and JSON will be critically important during the initial phases of development, though their continued presence is still up in the air. Overall I will come to prefer JSON for its robusticity, but CSV may come into play as I load in the initial conversation samples for processing. I intend for the final version of all of these testing objects to be JSON formatted

As a side note: I recognize that these functions are entirely redundant, given this is python and read_csv is already native to the csv library, but defining these functions here is a mearsure to keep my thinking organized and clear, and any specific changes I will need to make to either csv or JSON can be made universally here

In [10]:
# Save dataframe to output.json
# TODO Create naming convention for new files to prevent redundancy
data.to_json("memories/output.json", orient="records", indent=4)

## Testing

In [None]:
data = pd.read_json("memories/test.json")
data = pd.DataFrame(data)
data.head()

### Embedding

In [11]:
prompt = "This is another test prompt"
response = await get_llm_response(prompt)
print(response)

It looks like you're just testing the waters. How can I assist you today? Would you like to simulate a conversation, ask a question, or try something specific? I'm here to help!


In [12]:
embeddings = await get_embeddings(prompt)
print(embeddings)

[0.5756283402442932, 0.025145139545202255, 0.017586376518011093, 0.2662537097930908, -0.9467823505401611, -0.378476619720459, -0.19218437373638153, -0.05940169841051102, 0.22314892709255219, 1.241896629333496, -0.1529102921485901, 0.5244695544242859, -0.2150721251964569, -0.4756650924682617, -0.4049847424030304, -0.7585371732711792, -0.22175472974777222, -0.043068669736385345, -0.5012223720550537, -0.9111211895942688, -0.026720233261585236, 0.5962637662887573, -0.7997405529022217, -0.26611700654029846, -0.48238348960876465, -0.07176170498132706, -0.26466166973114014, 0.40670204162597656, 0.30144816637039185, 0.5829399824142456, -0.42821004986763, 0.5520590543746948, 0.02861153334379196, -0.6617875099182129, -0.13064660131931305, -0.6627650856971741, 1.2395473718643188, -0.12948891520500183, -0.02363390475511551, -0.4530979096889496, -0.49942171573638916, 0.33722245693206787, -0.047672830522060394, -0.788583517074585, -1.2296503782272339, -0.8267309665679932, -0.5478063225746155, -0.203

In [13]:
data = await get_memories('memories/test.json')

In [14]:
await create_memory("This is a third test")

('This is a third test', datetime.datetime(2025, 12, 21, 15, 28, 17, 350015), [0.664160430431366, 0.41141027212142944, 0.397152841091156, -0.1774732619524002, -0.8042889833450317, 0.11293699592351913, -0.3364812731742859, -0.07498068362474442, 0.20418491959571838, 0.7869546413421631, 0.3839385211467743, 0.2576581537723541, 0.2883857190608978, -1.1344465017318726, -0.5203862190246582, -0.7002549767494202, -0.4981207251548767, 0.3260970413684845, -0.24095642566680908, -0.14679613709449768, -0.31740114092826843, 0.32165834307670593, -0.5628244280815125, 0.03677096962928772, -0.3574230670928955, 0.2994132339954376, -0.14778786897659302, 0.25451719760894775, 0.197573721408844, 0.2618364691734314, 0.0037433169782161713, -0.8016471266746521, 0.4618974030017853, -0.6337639689445496, 0.05799247324466705, -0.4812524616718292, 1.196499228477478, 0.03362645208835602, -0.23210477828979492, -0.5161462426185608, -0.158101886510849, 0.31100982427597046, -0.6174250841140747, -0.27724432945251465, -0.92

In [15]:
data.head()

Unnamed: 0,text,timestamp,embeddings
0,This is a sample message,2025-12-13 15:57:00+00:00,"[0.12345678900000001, 0.23456789, 0.3456789010..."
1,This is another sample message,2025-12-13 16:17:00+00:00,"[0.12345678900000001, 0.23456789, 0.3456789010..."
2,This is a third test,2025-12-21 15:28:17.350015,"[0.664160430431366, 0.41141027212142944, 0.397..."


In [16]:
embeddings = data['embeddings']
embeddings.head()

0    [0.12345678900000001, 0.23456789, 0.3456789010...
1    [0.12345678900000001, 0.23456789, 0.3456789010...
2    [0.664160430431366, 0.41141027212142944, 0.397...
Name: embeddings, dtype: object

### FAISS

Following instructions from: https://shayan-fazeli.medium.com/faiss-a-quick-tutorial-to-efficient-similarity-search-595850e08473

In [17]:
import numpy as np

In [18]:
# Create simulation of data (This will be replaced with a function that creates a df of embeddings from memories.json)
faiss_data = np.random.rand(100000,768) # (Number of vectors, dimension of vectors) 
# TODO add .astype('float32')

In [19]:
index = faiss.IndexFlatL2(768) # Matches the number of dimensions in the matrix

In [20]:
assert index.is_trained # it is true

In [21]:
index.add(faiss_data)

In [22]:
index.ntotal
# prints 100000, as it is = data.shape[0]

100000

In [23]:
# Generating mock data to add to the index
query = np.random.rand(128, 768) # 128 queries, each with 768 embeddings

In [26]:
D, I = index.search(query, 10)
print(I) # Indices for Nearest Vectors

[[41540  9720 26022 ... 55357 43959 47280]
 [67675   481 12388 ... 56614 56119 32671]
 [65628 83515 78075 ... 20496 15851 74303]
 ...
 [63172 65880 12215 ... 99486 60816 90804]
 [33063  4219 10158 ... 43674 57412 59413]
 [96749 81008 50296 ... 36442 32010 75103]]


In [27]:
print(D) # Distance Values for Nearest Vectors

[[104.27838  106.25299  106.27136  ... 106.86594  107.41339  107.613434]
 [106.15222  107.22699  107.37073  ... 108.354004 108.37744  108.42474 ]
 [109.70032  109.98169  110.11923  ... 110.856415 110.87152  111.23999 ]
 ...
 [106.86914  107.27759  107.842896 ... 109.23193  109.29477  109.38745 ]
 [107.54193  107.80859  107.91632  ... 109.13617  109.2619   109.43204 ]
 [109.32019  109.643555 109.914764 ... 111.55847  111.795654 111.83508 ]]


In [44]:
n_seeds = 50
quantizer = faiss.IndexFlatL2(768)
index = faiss.IndexIVFFlat(quantizer, 768, n_seeds)
print(quantizer)
print(index)

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7e381e0e4240> >
<faiss.swigfaiss.IndexIVFFlat; proxy of <Swig Object of type 'faiss::IndexIVFFlat *' at 0x7e381e0e7fc0> >


In [43]:
index_f = faiss.index_factory(768, "IVF50,Flat")
print(index_f)

<faiss.swigfaiss.IndexIVFFlat; proxy of <Swig Object of type 'faiss::IndexIVFFlat *' at 0x7e387bd58e70> >


In [45]:
assert not index.is_trained  # it is False first
index.train(faiss_data)  # finding the Voronoi cells
print(index)
assert index.is_trained # it is trained now
index.add(faiss_data) # adding the data
print(index)

<faiss.swigfaiss.IndexIVFFlat; proxy of <Swig Object of type 'faiss::IndexIVFFlat *' at 0x7e381e0e7fc0> >
<faiss.swigfaiss.IndexIVFFlat; proxy of <Swig Object of type 'faiss::IndexIVFFlat *' at 0x7e381e0e7fc0> >


In [48]:
D, I = index.search(query, 10)
print(D,I)

[[109.23565  110.8861   112.41859  ... 113.63674  113.771576 113.9578  ]
 [107.902176 109.66973  112.14169  ... 113.61433  113.82791  113.89395 ]
 [111.6258   111.71193  112.93569  ... 113.950645 114.07269  114.666016]
 ...
 [108.92247  109.907906 110.89156  ... 114.73294  114.81004  114.883484]
 [110.25907  111.82844  112.61605  ... 114.07134  114.112045 114.13245 ]
 [106.9693   108.412796 108.89107  ... 110.151024 110.220375 110.38115 ]] [[27891 67576 23204 ... 20735 55955 50593]
 [57171 65568  7453 ... 89486 40352 69603]
 [34892 10258 30038 ... 43358 62702 18522]
 ...
 [68480 53462 76223 ... 66342  5615  9437]
 [14029  4647 19300 ... 68795 92662 96796]
 [67061 51155  9677 ... 56289 51427 44575]]


Let’s pick an index, say I[0,0] , the value it has is 18786. Let’s try to see what that value actually was:

In [49]:
index.reconstruct(18786)

# RuntimeError: Error in faiss::DirectMap::idx_t faiss::DirectMap::get(faiss::DirectMap::idx_t) 
# const at /project/faiss/faiss/invlists/DirectMap.cpp:82: direct map not initialized

RuntimeError: Error in faiss::idx_t faiss::DirectMap::get(faiss::idx_t) const at /home/runner/miniconda3/conda-bld/faiss-pkg_1728491247778/work/faiss/invlists/DirectMap.cpp:83: direct map not initialized

The reason why the error above is what you encounter is that the direct mapping between the vectors and positions is not established yet. To do so, you have to first run index.make_direct_map() method:

In [50]:
index.make_direct_map()
index.reconstruct(18786).shape
# (768,)

(768,)

## More/Notes
Current Tasks: 
- create larger test.json with full embeddings added to allow for more robust faiss evaluation
- - No longer needed, as FAISS has testing datasets to verify FAISS scalability on current system hardware
- It seems that the implementation of filesearch is going to make the functions I use to handle ollama calls more or less deprecated, as I can use ollama's internal functions to call it. On the other hand, I could avoid needing the ollama library altogether and refactor the filesearch functions to use the ollama calls I've already defined. The questions is what are the costs and benefits of doing this kind of switch. On one hand, it continues building on a codebase I'm familiar with, but on the other, it could impact readability moving forward. As with all decisions, what I decide now will have far reaching implications
