### LLM Setup

In [3]:
from langchain_community.llms import Ollama

# LLM handle shared by the chains defined later in the notebook
# (Ollama wrapper configured with the "llama3" model).
llm = Ollama(model="llama3")

### Utility functions

In [4]:
# streaming helper

async def astream(invocable, inputs, *args, **kwargs):
    """Print an invocable's streamed output to stdout as it is produced.

    Args:
        invocable: Object exposing an ``astream`` method that returns an
            async iterator of chunks.
        inputs: Value passed as the first argument to ``invocable.astream``.
        *args: Extra positional arguments forwarded to ``invocable.astream``.
        **kwargs: Extra keyword arguments forwarded to ``invocable.astream``.

    Returns:
        None. Chunks are written back to back, with no separator and no
        trailing newline.
    """
    async for chunk in invocable.astream(inputs, *args, **kwargs):
        # flush=True pushes every chunk out immediately; without it the
        # output may sit in the stdout buffer and show up in bursts.
        print(chunk, end="", flush=True)

### Vectorstore database generation

#### Split code into chunks

In [10]:

import os, os.path as osp
import glob
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

# Root folder of the C++ project to index. Built with os.path.join rather than
# a hard-coded ".\\PathTracerAP" so the path is also valid on non-Windows
# hosts; on Windows the joined value is the same string as before.
repo_path = osp.join(".", "PathTracerAP")

# Splitter settings, named so they can be tuned in one place.
CHUNK_SIZE = 200
CHUNK_OVERLAP = 10

# Load every file matching "*.*" under repo_path through TextLoader.
loader = DirectoryLoader(
    repo_path,
    "*.*",
    loader_cls=TextLoader,
    show_progress=True,
    use_multithreading=True,
)
source_files = loader.load()

# Split the loaded documents with the splitter's C++ language preset.
cpp_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.CPP,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
code_chunks = cpp_splitter.split_documents(source_files)
print(len(code_chunks))

100%|██████████| 11/11 [00:00<00:00, 1257.32it/s]

513





### Information chain creation

In [6]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

# Embedding model used by the Chroma stores created further down.
gpt4all_embd = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
from langchain_core.prompts import ChatPromptTemplate

# System instructions: summarize one code chunk as '•'-prefixed bullet lines only.
INFO_SYSTEM_MESSAGE = (
    "You are an AI agent tasked with processing chunks from a codebase. "
    "Your objective is to thoroughly understand the information contained in each chunk and return a CONCISE summary. "
    "You must provide your answer strictly in BULLET POINTS - all output lines must start with the character '•'. "
    "Do NOT use any other sentence formatting, and do NOT include any introductory or concluding statements."
)

# User turn: the single template variable is {code_chunk}.
INFO_USER_MESSAGE = "The code chunk you will need to process is:\n{code_chunk}"

info_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", INFO_SYSTEM_MESSAGE),
        ("user", INFO_USER_MESSAGE),
    ]
)

# Chunk-level summarization chain: prompt piped into the LLM.
info_chain = info_prompt_template | llm
# print(code_chunks[0].page_content, '\n')
# await astream(info_chain, code_chunks[0].page_content)
# print(code_chunks[1].page_content, '\n')
# await astream(info_chain, code_chunks[1].page_content)

In [7]:
def parse_bullet_points(text, bullet_char='•'):
    """Keep only the bullet-point lines of an LLM response.

    A line counts as a bullet when its first non-whitespace character
    sequence is ``bullet_char``. Leading indentation is removed from the
    kept lines so nested bullets are not lost; everything else (headings,
    blank lines, lines using other markers) is discarded.

    Args:
        text: Raw multi-line text. Both ``\\n`` and ``\\r\\n`` line endings
            are handled.
        bullet_char: Marker that identifies a bullet line. Defaults to '•'.

    Returns:
        The bullet lines joined with ``"\\n"``, or an empty string when no
        line qualifies.
    """
    bullet_lines = []
    # splitlines() also copes with "\r\n", which split('\n') would leave
    # behind as a trailing "\r" on every line.
    for raw_line in text.splitlines():
        candidate = raw_line.lstrip()
        if candidate.startswith(bullet_char):
            bullet_lines.append(candidate)

    return "\n".join(bullet_lines)

# Smoke check for parse_bullet_points: a sample response with an introductory
# line and surrounding blank lines, which should not appear in the output.
sample_lines = [
    "",
    "Summary is:",
    "• The code chunk includes several header files and defines a macro to enable CUDA functionality.",
    "• The namespace Common defines an entity index type and structs for Vertex, Triangle, and BoundingBox.",
    "• The namespace Geometry defines structs for Vertex and Triangle.",
    "• The BoundingBox struct has methods for initializing and updating its bounds.",
    "",
]
text = "\n".join(sample_lines)

print(parse_bullet_points(text))

• The code chunk includes several header files and defines a macro to enable CUDA functionality.
• The namespace Common defines an entity index type and structs for Vertex, Triangle, and BoundingBox.
• The namespace Geometry defines structs for Vertex and Triangle.
• The BoundingBox struct has methods for initializing and updating its bounds.


In [8]:
# System instructions for merging two summaries into one bullet list.
# The text (including its trailing newline) is kept exactly as authored.
HEIRARCHY_SYSTEM_MESSAGE = """You are an AI agent tasked with combining 2 code summaries. Your task is to merge two summaries of different code base parts into a single, clear, and comprehensive knowledge base. Combine overlapping details, organize content logically by grouping related functionalities, include all unique information from both summaries, use concise technical language, maintain context and purpose of each code chunk, and ensure consistency in terminology and formatting.You must provide your response strictly in BULLET POINTS - all output lines must start with the character '•'. Do NOT use any other sentence formatting. Do NOT include any introductory or concluding statements. DO NOT use the words like - 'summary' or 'chunk' in your response.
"""

# User turn: template variables are {summary_1} and {summary_2}.
HEIRARCHY_USER_MESSAGE = "The two summaries you will need to process are:\n summary 1:{summary_1} \n summary 2:{summary_2}"

heirarchy_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", HEIRARCHY_SYSTEM_MESSAGE),
        ("user", HEIRARCHY_USER_MESSAGE),
    ]
)

# Summary-merging chain: prompt piped into the LLM.
heirarchy_chain = heirarchy_prompt_template | llm

# Sample inputs for trying out heirarchy_chain by hand: two bullet-point
# summaries in the format the chunk-level prompt asks for.

# First sample summary (header directives plus the Common namespace structs).
summary_1 = """
• The code chunk includes several directives and includes, indicating that it is a C++ header file.
• The pragma once directive indicates that the file should be included only once in the entire program.
• The #include <iostream> directive includes the standard output stream functions from the C++ standard library.
• The #include <vector> directive includes the vector data structure from the C++ standard library.
• The #define GLM_FORCE_CUDA directive enables NVIDIA CUDA optimization for the glm library.
• The #include <glm/glm.hpp> and #include <glm/gtc/matrix_transform.hpp> directives include the glm library, which provides geometric computations and transformations.
• The #include "Config.h" directive includes a configuration file.
• The namespace Common defines a new namespace named Common.
• The struct IndexRange defines an index range structure.
• The struct Vertex defines a vertex structure with position, normal, and uv components.
• The struct Triangle defines a triangle structure with vertex indices.
• The struct BoundingBox defines a bounding box structure with minimum and maximum coordinates in each dimension.
• The BoundingBox constructor initializes the bounding box with arbitrary large values, and the update method updates the bounds based on the position of a vertex.namespace SceneElements
"""
# Second sample summary (the SceneElements namespace structs and enums).
summary_2 = """
• The code chunk is a part of a 3D graphics library, specifically the namespace "SceneElements".
• The code defines several structures: Material, Mesh, Model, EntityType, Voxel3DIndex, Voxel, Grid.
• The Material structure has fields for material type (DIFFUSE, SPECULAR, etc.), refractive index, and phong exponent.
• The Mesh structure has fields for vertex indices, triangle indices, and bounding box.
• The Model structure has fields for grid index, mesh index, model-to-world and world-to-model matrices, and material.
• The EntityType enum defines four values: MODEL, SCENE, TRIANGLE, SPHERE.
• The Voxel3DIndex struct has fields for x, y, and z indices.
• The Voxel struct has fields for entity index range and entity type.
• The Grid struct has fields for voxel indices, entity type, and entity index.
• The code also defines an enum for spatial acceleration.
"""
# Uncomment to stream the merged result for the two samples above:
#await astream(heirarchy_chain, {"summary_1":summary_1, "summary_2":summary_2})

In [22]:
# Build global vector store, part 1 - within files
# Per file: summarize chunks, combine summaries, put summaries across all levels in vector store

# Accumulator for the summary Documents of every level. Re-running this cell
# resets it to an empty list, discarding anything collected so far.
global_summary_doc_list = []

In [21]:

# 1. Organize chunks by file
# Maps each chunk's "source" metadata value to the list of chunks sharing it,
# in the order they appear in code_chunks.
chunks_by_file = {}
for chunk in code_chunks:
    chunks_by_file.setdefault(chunk.metadata["source"], []).append(chunk)

# Cell output: number of chunks per source file.
{source: len(file_chunks) for source, file_chunks in chunks_by_file.items()}

{'PathTracerAP\\Debug_Visualizer.h': 3,
 'PathTracerAP\\Config.h': 3,
 'PathTracerAP\\Experimentation.h': 102,
 'PathTracerAP\\GPUMemoryPool.h': 6,
 'PathTracerAP\\main.cpp': 4,
 'PathTracerAP\\Primitive.h': 17,
 'PathTracerAP\\Scene.cpp': 130,
 'PathTracerAP\\Renderer.cpp': 187,
 'PathTracerAP\\Renderer.h': 9,
 'PathTracerAP\\Scene.h': 7,
 'PathTracerAP\\utility.h': 45}

In [24]:
# 2. For each file, build hierarchical summaries and store in global list
#
# Per file:
#   a) summarize every chunk with info_chain (level 0),
#   b) repeatedly merge neighbouring summaries pairwise with heirarchy_chain
#      until a single summary is left,
#   c) append the Documents of every level to global_summary_doc_list.

from langchain.docstore.document import Document
from tqdm import tqdm

for file_path, chunks in chunks_by_file.items():
    # Debug restriction: only this one file is processed for now.
    if file_path != 'PathTracerAP\\Debug_Visualizer.h':
        continue

    print(f"Processing file: {file_path} ({len(chunks)} chunks)...")

    print("- Summarizing individual chunks")

    # Level-0 summaries of THIS file only. They are collected separately so
    # the merge step below never mixes in summaries that are already in
    # global_summary_doc_list from other files or from an earlier run.
    file_summary_docs = []
    for i, chunk in enumerate(chunks):
        print(f"\tChunk {i+1} / {len(chunks)}")
        summary = info_chain.invoke(chunk.page_content)
        file_summary_docs.append(Document(page_content=summary, metadata=chunk.metadata))
    global_summary_doc_list.extend(file_summary_docs)

    print("- Building hierarchical summaries")

    read_buffer = list(file_summary_docs)

    level = 1
    while len(read_buffer) > 1:
        print(f"\tLevel {level}")

        # Fresh list per level: Documents created while merging this level.
        merge_buffer = []
        for i in range(0, len(read_buffer)-1, 2):
            print(f"\t\tEntries ({i},{i+1}) out of {len(read_buffer)}")
            doc1, doc2 = read_buffer[i], read_buffer[i+1]
            # Both documents come from the same file, so either metadata works.
            common_meta = doc1.metadata
            info = heirarchy_chain.invoke({
                "summary_1": doc1.page_content,
                "summary_2": doc2.page_content
            })
            info_points = parse_bullet_points(info)

            merge_buffer.append(Document(
                page_content=common_meta["source"]+"\n"+info_points,
                metadata=common_meta
            ))

        # Only the newly created merges are stored; a carried-over document
        # is already in the global list from the level that produced it.
        global_summary_doc_list.extend(merge_buffer)

        # With an odd number of entries the last one has no partner. Carry it
        # into the next level instead of silently dropping its content.
        unpaired = read_buffer[-1:] if len(read_buffer) % 2 else []
        read_buffer = merge_buffer + unpaired
        level += 1

# Dump everything collected so far for manual inspection.
for doc in global_summary_doc_list:
    print(doc.page_content)
    print()
    print(doc.metadata)
    print('*********************')

Processing file: PathTracerAP\Debug_Visualizer.h (3 chunks)...
- Summarizing individual chunks
	Chunk 1 / 3
	Chunk 2 / 3
	Chunk 3 / 3
- Building hierarchical summaries
	Level 1
	Level 2
PathTracerAP\Debug_Visualizer.h
• #pragma once
• #include "GL/glew.h"
• #include "GLFW/glfw3.h"
• #include <glm/glm.hpp>
• #include <glm/gtc/matrix_transform.hpp>
• #include <glm/gtc/type_ptr.hpp>
• #include "Renderer.h"
• #ifdef ENABLE_VISUALIZER
•  #endif

{'source': 'PathTracerAP\\Debug_Visualizer.h'}
*********************
• The code chunk includes various headers and libraries for graphics programming:
	+ OpenGL (GLEW)
	+ GLFW (for windowing and input handling)
	+ GLM (a mathematics library for vectors, matrices, and transformations)
	• Additionally, it includes a custom header file "Renderer.h"

{'source': 'PathTracerAP\\Debug_Visualizer.h'}
*********************
• This code snippet is a preprocessor directive that checks if a constant `ENABLE_VISUALIZER` is defined. 
• If `ENABLE_VISUALIZER` is en

In [None]:

from langchain.docstore.document import Document
from tqdm import tqdm

# Only this slice of the chunks is summarized. Keeping it in a variable lets
# the progress bar and the progress message report the real total.
selected_chunks = code_chunks[:2]

info_docs = []

# tqdm wraps the sized list (not the enumerate iterator) so it knows the total.
for index, chunk in enumerate(tqdm(selected_chunks)):
    print(f"Processing chunk: {index+1} / {len(selected_chunks)}...")

    info = info_chain.invoke(chunk.page_content)
    info_points = parse_bullet_points(info)
    source_file_name = chunk.metadata["source"]
    # Stored text: the source file name on the first line, then the bullets.
    info_docs.append(Document(page_content=source_file_name+"\n"+info_points, metadata=chunk.metadata))

# persistence
print(info_docs)

info_store = Chroma.from_documents(info_docs, embedding=gpt4all_embd, persist_directory='.\\temp\\info_store_lvl_1')

In [12]:
# Open the Chroma store at the level-1 persist directory with the same
# embedding function, then display its contents via get() as the cell output.
info_store_1 = Chroma(persist_directory=".\\temp\\info_store_lvl_1", embedding_function=gpt4all_embd)
info_store_1.get()

{'ids': ['4f149d5f-e5b5-4a84-a700-7d613f05ad2e',
  '5dc05819-3459-47fb-9397-4ae6b35dac0a'],
 'embeddings': None,
 'metadatas': [{'source': 'PathTracerAP\\Debug_Visualizer.h'},
  {'source': 'PathTracerAP\\Debug_Visualizer.h'}],
 'documents': ['PathTracerAP\\Debug_Visualizer.h\n• Includes the GLEW, GLFW, and GLM libraries for OpenGL functionality.\n• Includes the Renderer header file.',
  'PathTracerAP\\Debug_Visualizer.h\n• This code snippet defines a preprocessor directive #ifdef, indicating that the following block of code should only be compiled if the symbol "ENABLE_VISUALIZER" is defined.'],
 'uris': None,
 'data': None}

In [52]:
#Hierarchical summarization
#
# Starting from the level-1 documents in info_docs, merge neighbouring
# documents pairwise with heirarchy_chain until one document is left. The
# merged documents of each level are written to their own Chroma directory
# ('.\\temp\\info_store_lvl_<level>').

read_buffer = info_docs.copy()
level_index = 2
while len(read_buffer) > 1:
    print("Processing level: "+str(level_index))

    # Fresh list per level: Documents created while merging this level.
    merge_buffer = []
    for i in range(0, len(read_buffer)-1, 2):
        first_doc, second_doc = read_buffer[i], read_buffer[i+1]
        info = heirarchy_chain.invoke({"summary_1":first_doc.page_content, "summary_2":second_doc.page_content})
        info_points = parse_bullet_points(info)

        # "source" holds one or more paths joined by "|". Take the union of
        # both sides and sort it so the joined string is the same on every
        # run (plain set iteration order is not stable for strings).
        source_set = set(first_doc.metadata["source"].split("|")) | set(second_doc.metadata["source"].split("|"))
        source_list_str = "|".join(sorted(source_set))
        combined_meta = {"source": source_list_str}

        merge_buffer.append(Document(page_content=source_list_str+"\n"+info_points, metadata=combined_meta))

    info_store_nm = '.\\temp\\info_store_lvl_'+str(level_index)
    info_store = Chroma.from_documents(merge_buffer, embedding=gpt4all_embd, persist_directory=info_store_nm)

    # With an odd number of entries the last one has no partner. Carry it
    # into the next level instead of silently dropping its content.
    unpaired = read_buffer[-1:] if len(read_buffer) % 2 else []
    read_buffer = merge_buffer + unpaired
    level_index = level_index + 1


Processing level: 2


### UI

In [None]:
import gradio as gr
import time

def process_user_query(message, history):
    """Placeholder chat handler that alternates between two canned replies.

    Args:
        message: Text of the latest user message.
        history: Sized collection of earlier conversation entries; only its
            length is used.

    Returns:
        An agreeing reply quoting ``message`` when ``history`` has an even
        number of entries, otherwise a fixed disagreeing reply.
    """
    even_history = len(history) % 2 == 0
    return f"Yes, I do think that '{message}'" if even_history else "I don't think so"

def process_source_code(dir_path, progress=gr.Progress()):
    """Placeholder for source-code processing that only simulates the work.

    Args:
        dir_path: Directory path typed into the UI. It is not read yet and
            is only echoed back in the returned message.
        progress: Progress tracker. Gradio tracks progress only when the
            tracker is declared as a default argument of the event handler,
            which is why it is a parameter rather than a local variable.

    Returns:
        A short status string for the output textbox wired to the button.
    """
    total_steps = 10
    # Simulate file processing with a delay
    for step in range(total_steps):
        time.sleep(0.5)
        # Report the completed fraction so the last update reaches 1.0.
        progress((step + 1) / total_steps)

    return f"Finished processing: {dir_path}"

# Define Gradio interface
# Components are created inside the Blocks context in the order they should
# appear: title, directory input, button, output box.
with gr.Blocks() as demo:
    gr.Markdown("# CodeDocBot")
    dir_path = gr.Textbox(label="Source-code directory path", placeholder="Enter the path to the folder")
    process_button = gr.Button("Process File")
    output_text = gr.Textbox(label="")
    
    # Clicking the button calls process_source_code with the textbox value and
    # routes its return value to output_text.
    process_button.click(fn=process_source_code, inputs=dir_path, outputs=output_text)

# Two separate apps are launched: the Blocks demo above, then a standalone
# chat interface backed by process_user_query.
# NOTE(review): the chat UI is not part of `demo` - confirm this is intended.
demo.launch()
gr.ChatInterface(process_user_query).launch()
    