In [None]:
!pip install tree_sitter_python
!pip install tree_sitter_java
!pip install tree_sitter_cpp
!pip install tree_sitter_javascript
!pip install tree_sitter_go
!pip install tree_sitter_html

Collecting tree_sitter_python
  Downloading tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (112 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/112.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m81.9/112.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.3/112.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree_sitter_python
Successfully installed tree_sitter_python-0.23.6
Collecting tree_sitter_java
  Downloading tree_sitter_java-0.23.5-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading tree_s

In [None]:
import tiktoken
import json

def count_tokens(string: str, encoding_name: str) -> int:
    """Return the number of tokens ``string`` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name used to select the tokenizer. It is first tried
            as a model name via ``tiktoken.encoding_for_model`` (this is how
            the rest of the notebook calls it, e.g. ``"gpt-4"``). If tiktoken
            does not know it as a model, it is then tried as a raw encoding
            name via ``tiktoken.get_encoding`` (e.g. ``"cl100k_base"``).

    Returns:
        The length of the token sequence produced by the selected encoding.

    Raises:
        Whatever ``tiktoken.get_encoding`` raises when the name is neither a
        known model nor a known encoding.
    """
    try:
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError:
        # Not a model name. The parameter is called ``encoding_name``, so
        # also accept an actual encoding name instead of failing outright.
        encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))

def load_json(json_file):
    """Read ``json_file`` and return the parsed JSON value.

    Args:
        json_file: Path (or other ``open``-compatible target) of a JSON file.

    Returns:
        The Python object produced by ``json.load``.

    The file is opened explicitly as UTF-8 so that non-ASCII content is
    decoded the same way regardless of the platform's default locale
    encoding.
    """
    with open(json_file, encoding="utf-8") as f:
        return json.load(f)

In [None]:
import tree_sitter_python as tspython
import tree_sitter_java as tsjava
import tree_sitter_cpp as tscpp
import tree_sitter_javascript as tsjs
import tree_sitter_go as tsgo
import tree_sitter_html as tshtml
from tree_sitter import Parser, Language

# Load language parsers
# Each grammar package exposes language(); wrap its result in a tree_sitter
# Language object so it can be handed to a Parser.
PY_LANGUAGE = Language(tspython.language())
JAVA_LANGUAGE = Language(tsjava.language())
CPP_LANGUAGE = Language(tscpp.language())
JS_LANGUAGE = Language(tsjs.language())
GO_LANGUAGE = Language(tsgo.language())
HTML_LANGUAGE = Language(tshtml.language())

# Create Parsers
# One Parser per supported file extension. CodeParser looks its parser up in
# this dict by extension and rejects extensions that are not keys here.
parsers = {
    "py": Parser(PY_LANGUAGE),
    "java": Parser(JAVA_LANGUAGE),
    "cpp": Parser(CPP_LANGUAGE),
    "js": Parser(JS_LANGUAGE),
    "go": Parser(GO_LANGUAGE),
    "html": Parser(HTML_LANGUAGE),
}

class CodeParser:
    """Locate chunk boundaries and comments in source code via tree-sitter.

    An instance is bound to one file extension and uses the matching parser
    from the module-level ``parsers`` dict.
    """

    # Node types whose starting row is a candidate chunk boundary, keyed by
    # file extension.
    _BREAKPOINT_NODE_TYPES = {
        "py": ["import_statement", "function_definition", "class_definition"],
        "java": ["method_declaration", "class_declaration", "annotation"],
        "cpp": ["function_definition", "class_specifier"],
        "js": ["function_declaration", "class_declaration", "arrow_function"],
        "go": ["function_declaration", "method_declaration"],
        "html": ["tag_name"],
    }

    # Node types that represent comments, keyed by file extension.
    _COMMENT_NODE_TYPES = {
        "py": ["comment"],
        "java": ["line_comment", "block_comment"],
        "cpp": ["comment"],
        "js": ["comment"],
        "go": ["comment"],
        "html": ["comment"],
    }

    def __init__(self, file_extension):
        """Bind to the parser registered for ``file_extension``.

        Raises:
            ValueError: if ``file_extension`` is not a key of ``parsers``.
        """
        self.file_extension = file_extension
        if file_extension not in parsers:
            raise ValueError(f"Unsupported language: {file_extension}")
        self.parser = parsers[file_extension]

    def parse_code(self, code):
        """Parse ``code`` (a str, encoded as UTF-8) and return the root node."""
        tree = self.parser.parse(bytes(code, "utf8"))
        return tree.root_node

    def _collect_start_rows(self, code, wanted_types):
        """Return the sorted, unique starting rows of nodes of the given types.

        The tree is walked with an explicit stack rather than recursion, so
        deeply nested syntax trees cannot exceed Python's recursion limit.
        Visit order does not matter because the rows are collected in a set
        and sorted on return.
        """
        wanted = frozenset(wanted_types)
        rows = set()
        pending = [self.parse_code(code)]
        while pending:
            node = pending.pop()
            if node.type in wanted:
                # start_point[0] is the row; the chunker uses it as an index
                # into code.split("\n").
                rows.add(node.start_point[0])
            pending.extend(node.children)
        return sorted(rows)

    def extract_breakpoints(self, code):
        """Return sorted unique rows where a function/class-like node starts.

        Extensions with no entry in the breakpoint table yield an empty list.
        """
        wanted = self._BREAKPOINT_NODE_TYPES.get(self.file_extension, [])
        return self._collect_start_rows(code, wanted)

    def extract_comments(self, code):
        """Return sorted unique rows on which a comment node starts."""
        wanted = self._COMMENT_NODE_TYPES.get(self.file_extension, [])
        return self._collect_start_rows(code, wanted)


In [None]:
class CodeChunker:
    """Split source code into chunks that respect a per-chunk token budget.

    Chunks are cut only at breakpoints reported by ``CodeParser`` (starts of
    functions, classes, etc.). Each breakpoint is moved up to cover the run
    of comment lines directly above it, so a comment stays with the
    definition that follows it.
    """

    def __init__(self, file_extension, encoding_name="gpt-4"):
        """Store the language extension and the name passed to ``count_tokens``."""
        self.file_extension = file_extension
        self.encoding_name = encoding_name

    @staticmethod
    def _attach_leading_comments(breakpoints, comment_lines):
        """Move each breakpoint up to the top of the comment run just above it.

        Returns the adjusted breakpoints, de-duplicated and sorted. Line 0 is
        a valid comment line here, so the result is compared by walking up
        from the breakpoint rather than by a truthiness test on the line
        number.
        """
        adjusted = set()
        for bp in breakpoints:
            first = bp
            while first - 1 in comment_lines:
                first -= 1
            adjusted.add(first)
        return sorted(adjusted)

    def chunk(self, code, token_limit):
        """Split ``code`` into chunks of at most ``token_limit`` tokens where possible.

        Args:
            code: Source text; it is split on ``"\\n"``.
            token_limit: Token budget per chunk, measured by summing
                ``count_tokens`` over the chunk's lines.

        Returns:
            dict mapping 1-based chunk numbers to chunk text. A chunk may
            exceed ``token_limit`` when no breakpoint lies between its start
            and the line that overflows the budget. The trailing chunk is
            omitted when it is empty or whitespace only.
        """
        parser = CodeParser(self.file_extension)
        lines = code.split("\n")

        comment_lines = set(parser.extract_comments(code))
        breakpoints = self._attach_leading_comments(
            parser.extract_breakpoints(code), comment_lines
        )
        breakpoint_set = set(breakpoints)

        # The scan rewinds to the cut line after each emitted chunk, so
        # tokenize every line once up front instead of on each visit.
        line_tokens = [count_tokens(line, self.encoding_name) for line in lines]

        chunks = {}
        chunk_number = 1
        start_line = 0
        token_count = 0
        i = 0

        while i < len(lines):
            new_token_count = line_tokens[i]

            if token_count + new_token_count > token_limit:
                # Prefer cutting right here; otherwise fall back to the last
                # breakpoint strictly inside the current chunk.
                if i in breakpoint_set:
                    stop_line = i
                else:
                    stop_line = max(
                        (bp for bp in breakpoints if start_line < bp < i),
                        default=start_line,
                    )

                if stop_line == start_line:
                    # No usable cut point yet: let the chunk grow past the limit.
                    token_count += new_token_count
                    i += 1
                else:
                    chunks[chunk_number] = "\n".join(lines[start_line:stop_line])
                    chunk_number += 1
                    token_count = 0
                    start_line = stop_line
                    i = stop_line  # rescan from the cut line
            else:
                token_count += new_token_count
                i += 1

        # Add any remaining lines
        remaining_chunk = "\n".join(lines[start_line:])
        if remaining_chunk.strip():
            chunks[chunk_number] = remaining_chunk

        return chunks

    def get_chunk(self, chunked_codebase, chunk_number):
        """Return chunk ``chunk_number`` from a ``chunk()`` result, or ``""`` if absent."""
        return chunked_codebase.get(chunk_number, "")


In [None]:
# Define multi-language test cases
# Small sample sources, one per language, fed to test_chunking below.
# Python sample: an import, a commented function and a class with one method.
python_code = """import math
# This function adds two numbers
def add(a, b):
    return a + b
class Calculator:
    def multiply(self, a, b):
        return a * b
"""

# JavaScript sample: a commented function and a commented class.
javascript_code = """// Function declaration
function add(a, b) {
    return a + b;
}
// Class declaration
class Calculator {
    multiply(a, b) {
        return a * b;
    }
}
"""

# Java sample: one class with two methods and no comments.
java_code = """public class Calculator {
    public int add(int a, int b) {
        return a + b;
    }
    public int multiply(int a, int b) {
        return a * b;
    }
}
"""

# C++ sample: one class with an inline method. Its test_chunking call further
# down in this cell is commented out.
cpp_code = """#include <iostream>
class Calculator {
public:
    int add(int a, int b) {
        return a + b;
    }
};
"""

# Run tests for multiple languages
def test_chunking(code, file_extension, token_limit=20):
    print(f"\n🔹 Chunking Code for *.{file_extension} Files 🔹\n")

    # Initialize the chunker
    chunker = CodeChunker(file_extension=file_extension, encoding_name="gpt-4")

    # Generate chunks
    chunks = chunker.chunk(code, token_limit)

    # Print the results
    for chunk_id, chunk_text in chunks.items():
        print(f"\nChunk {chunk_id}:\n{'='*40}")
        print(chunk_text)
        print("="*40)

# Run tests
# Each call uses test_chunking's default token_limit of 20.
test_chunking(python_code, "py")
test_chunking(javascript_code, "js")
test_chunking(java_code, "java")
# NOTE(review): the C++ run is commented out; the reason is not recorded here — confirm.
#test_chunking(cpp_code, "cpp")



🔹 Chunking Code for *.py Files 🔹


Chunk 1:
import math
# This function adds two numbers
def add(a, b):
    return a + b

Chunk 2:
class Calculator:
    def multiply(self, a, b):
        return a * b


🔹 Chunking Code for *.js Files 🔹


Chunk 1:
// Function declaration
function add(a, b) {
    return a + b;
}

Chunk 2:
// Class declaration
class Calculator {
    multiply(a, b) {
        return a * b;
    }
}


🔹 Chunking Code for *.java Files 🔹


Chunk 1:
public class Calculator {

Chunk 2:
    public int add(int a, int b) {
        return a + b;
    }

Chunk 3:
    public int multiply(int a, int b) {
        return a * b;
    }
}



In [None]:
# Longer Python sample: four commented functions followed by a commented
# `if __name__ == "__main__":` block, with blank lines between them.
python_code_2="""import math

# Function to add two numbers
def add(a, b):
    return a + b

# Function to subtract two numbers
def subtract(a, b):
    return a - b

# Function to multiply two numbers
def multiply(a, b):
    return a * b

# Function to divide two numbers
def divide(a, b):
    if b == 0:
        return "Cannot divide by zero"
    return a / b

# Main execution block
if __name__ == "__main__":
    x = 10
    y = 5

    print("Addition:", add(x, y))
    print("Subtraction:", subtract(x, y))
    print("Multiplication:", multiply(x, y))
    print("Division:", divide(x, y))
"""

In [None]:
# Chunk the longer Python sample with the default token_limit of 20.
test_chunking(python_code_2,"py")


🔹 Chunking Code for *.py Files 🔹


Chunk 1:
import math

# Function to add two numbers
def add(a, b):
    return a + b


Chunk 2:
# Function to subtract two numbers
def subtract(a, b):
    return a - b


Chunk 3:
# Function to multiply two numbers
def multiply(a, b):
    return a * b


Chunk 4:
# Function to divide two numbers
def divide(a, b):
    if b == 0:
        return "Cannot divide by zero"
    return a / b

# Main execution block
if __name__ == "__main__":
    x = 10
    y = 5

    print("Addition:", add(x, y))
    print("Subtraction:", subtract(x, y))
    print("Multiplication:", multiply(x, y))
    print("Division:", divide(x, y))



In [None]:
# Longer Java sample: an abstract base class, two subclasses, a Runnable and
# a Main class, each preceded by a line comment.
# NOTE(review): the string ends mid-statement ("Shape rectangle = ne"), so the
# parser receives syntactically incomplete Java — confirm whether that is
# intended or the snippet was cut off.
java_code_1="""import java.util.Scanner;

// Base class
abstract class Shape {
    abstract double area();
}

// Rectangle class
class Rectangle extends Shape {
    private double length, width;

    public Rectangle(double length, double width) {
        this.length = length;
        this.width = width;
    }

    @Override
    double area() {
        return length * width;
    }
}

// Circle class
class Circle extends Shape {
    private double radius;

    public Circle(double radius) {
        this.radius = radius;
    }

    @Override
    double area() {
        return Math.PI * radius * radius;
    }
}

// Threading example
class CounterThread implements Runnable {
    private int count;

    public CounterThread(int count) {
        this.count = count;
    }

    @Override
    public void run() {
        for (int i = 0; i < count; i++) {
            System.out.println("Counter: " + i);
        }
    }
}

// Main class
public class Main {
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        System.out.print("Enter radius of circle: ");
        double radius = scanner.nextDouble();

        Shape circle = new Circle(radius);
        System.out.println("Circle area: " + circle.area());

        System.out.print("Enter length and width of rectangle: ");
        double length = scanner.nextDouble();
        double width = scanner.nextDouble();

        Shape rectangle = ne
"""

In [None]:
# Chunk the longer Java sample with the default token_limit of 20.
test_chunking(java_code_1,"java")


🔹 Chunking Code for *.java Files 🔹


Chunk 1:
import java.util.Scanner;

// Base class
abstract class Shape {
    abstract double area();
}


Chunk 2:
// Rectangle class
class Rectangle extends Shape {
    private double length, width;

    public Rectangle(double length, double width) {
        this.length = length;
        this.width = width;
    }


Chunk 3:
    @Override
    double area() {
        return length * width;
    }
}


Chunk 4:
// Circle class
class Circle extends Shape {
    private double radius;

    public Circle(double radius) {
        this.radius = radius;
    }


Chunk 5:
    @Override
    double area() {
        return Math.PI * radius * radius;
    }
}


Chunk 6:
// Threading example
class CounterThread implements Runnable {
    private int count;
    
    public CounterThread(int count) {
        this.count = count;
    }


Chunk 7:
    @Override
    public void run() {
        for (int i = 0; i < count; i++) {
            System.out.println("Counter: " + i)

In [None]:
# Longer C++ sample: includes, a base class with two derived classes, a
# template function, two free functions and main(), each preceded by a
# line comment.
cpp_code_1="""#include <iostream>
#include <fstream>
#include <vector>
#include <thread>

// Base class
class Shape {
public:
    virtual double area() = 0;
};

// Derived class: Rectangle
class Rectangle : public Shape {
private:
    double length, width;
public:
    Rectangle(double l, double w) : length(l), width(w) {}

    double area() override {
        return length * width;
    }
};

// Derived class: Circle
class Circle : public Shape {
private:
    double radius;
public:
    Circle(double r) : radius(r) {}

    double area() override {
        return 3.14159 * radius * radius;
    }
};

// Generic function using templates
template <typename T>
void displayValue(T value) {
    std::cout << "Value: " << value << std::endl;
}

// Function to write data to a file
void writeToFile(const std::string &filename, const std::string &data) {
    std::ofstream file(filename);
    if (file.is_open()) {
        file << data;
        file.close();
    } else {
        std::cerr << "Unable to open file!" << std::endl;
    }
}

// Function for multithreading
void printNumbers(int max) {
    for (int i = 1; i <= max; i++) {
        std::cout << "Number: " << i << std::endl;
    }
}

// Main function
int main() {
    std::cout << "Enter radius of circle: ";
    double radius;
    std::cin >> radius;

    Shape *circle = new Circle(radius);
    std::cout << "Circle area: " << circle->area() << std::endl;
    delete circle;

    std::cout << "Enter length and width of rectangle: ";
    double length, width;
    std::cin >> length >> width;

    Shape *rectangle = new Rectangle(length, width);
    std::cout << "Rectangle area: " << rectangle->area() << std::endl;
    delete rectangle;

    // Multithreading example
    std::thread t1(printNumbers, 5);
    t1.join();

    // File handling example
    writeToFile("output.txt", "This is a test file!");

    // Exception handling
    try {
        int divisor;
        std::cout << "Enter divisor: ";
        std::cin >> divisor;
        if (divisor == 0)
            throw std::runtime_error("Division by zero!");
        std::cout << "Result: " << (10 / divisor) << std::endl;
    } catch (const std::exception &e) {
        std::cerr << "Error: " << e.what() << std::endl;
    }

    return 0;
}
"""

In [None]:
# Chunk the longer C++ sample with the default token_limit of 20.
test_chunking(cpp_code_1, "cpp")


🔹 Chunking Code for *.cpp Files 🔹


Chunk 1:
#include <iostream>
#include <fstream>
#include <vector>
#include <thread>


Chunk 2:
// Base class
class Shape {
public:
    virtual double area() = 0;
};


Chunk 3:
// Derived class: Rectangle
class Rectangle : public Shape {
private:
    double length, width;
public:

Chunk 4:
    Rectangle(double l, double w) : length(l), width(w) {}


Chunk 5:
    double area() override {
        return length * width;
    }
};


Chunk 6:
// Derived class: Circle
class Circle : public Shape {
private:
    double radius;
public:

Chunk 7:
    Circle(double r) : radius(r) {}


Chunk 8:
    double area() override {
        return 3.14159 * radius * radius;
    }
};

// Generic function using templates
template <typename T>

Chunk 9:
void displayValue(T value) {
    std::cout << "Value: " << value << std::endl;
}


Chunk 10:
// Function to write data to a file
void writeToFile(const std::string &filename, const std::string &data) {
    std::ofstream file(f

In [None]:
import tree_sitter_python as tspython
import tree_sitter_java as tsjava
import tree_sitter_cpp as tscpp
import tree_sitter_javascript as tsjs
import tree_sitter_go as tsgo
import tree_sitter_html as tshtml
from tree_sitter import Parser, Language

# Load language parsers
# Each grammar package exposes language(); wrap its result in a tree_sitter
# Language object so it can be handed to a Parser.
# NOTE(review): this cell rebinds the same names as the earlier setup cell.
PY_LANGUAGE = Language(tspython.language())
JAVA_LANGUAGE = Language(tsjava.language())
CPP_LANGUAGE = Language(tscpp.language())
JS_LANGUAGE = Language(tsjs.language())
GO_LANGUAGE = Language(tsgo.language())
HTML_LANGUAGE = Language(tshtml.language())

# Create Parsers
# One Parser per supported file extension. CodeParser looks its parser up in
# this dict by extension and rejects extensions that are not keys here.
parsers = {
    "py": Parser(PY_LANGUAGE),
    "java": Parser(JAVA_LANGUAGE),
    "cpp": Parser(CPP_LANGUAGE),
    "js": Parser(JS_LANGUAGE),
    "go": Parser(GO_LANGUAGE),
    "html": Parser(HTML_LANGUAGE),
}

class CodeParser:
    """Locate chunk boundaries and comments in source code via tree-sitter.

    An instance is bound to one file extension and uses the matching parser
    from the module-level ``parsers`` dict.
    """

    # Node types whose starting row is a candidate chunk boundary, keyed by
    # file extension.
    _BREAKPOINT_NODE_TYPES = {
        "py": ["import_statement", "function_definition", "class_definition"],
        "java": ["method_declaration", "class_declaration", "annotation"],
        "cpp": ["function_definition", "class_specifier"],
        "js": ["function_declaration", "class_declaration", "arrow_function"],
        "go": ["function_declaration", "method_declaration"],
        "html": ["tag_name"],
    }

    # Node types that represent comments, keyed by file extension.
    _COMMENT_NODE_TYPES = {
        "py": ["comment"],
        "java": ["line_comment", "block_comment"],
        "cpp": ["comment"],
        "js": ["comment"],
        "go": ["comment"],
        "html": ["comment"],
    }

    def __init__(self, file_extension):
        """Bind to the parser registered for ``file_extension``.

        Raises:
            ValueError: if ``file_extension`` is not a key of ``parsers``.
        """
        self.file_extension = file_extension
        if file_extension not in parsers:
            raise ValueError(f"Unsupported language: {file_extension}")
        self.parser = parsers[file_extension]

    def parse_code(self, code):
        """Parse ``code`` (a str, encoded as UTF-8) and return the root node."""
        tree = self.parser.parse(bytes(code, "utf8"))
        return tree.root_node

    def _collect_start_rows(self, code, wanted_types):
        """Return the sorted, unique starting rows of nodes of the given types.

        The tree is walked with an explicit stack rather than recursion, so
        deeply nested syntax trees cannot exceed Python's recursion limit.
        Visit order does not matter because the rows are collected in a set
        and sorted on return.
        """
        wanted = frozenset(wanted_types)
        rows = set()
        pending = [self.parse_code(code)]
        while pending:
            node = pending.pop()
            if node.type in wanted:
                # start_point[0] is the row; the chunker uses it as an index
                # into code.split("\n").
                rows.add(node.start_point[0])
            pending.extend(node.children)
        return sorted(rows)

    def extract_breakpoints(self, code):
        """Return sorted unique rows where a function/class-like node starts.

        Python import statements need no special handling: their rows go
        into the same de-duplicated set as every other breakpoint type.
        Extensions with no entry in the breakpoint table yield an empty list.
        """
        wanted = self._BREAKPOINT_NODE_TYPES.get(self.file_extension, [])
        return self._collect_start_rows(code, wanted)

    def extract_comments(self, code):
        """Return sorted unique rows on which a comment node starts."""
        wanted = self._COMMENT_NODE_TYPES.get(self.file_extension, [])
        return self._collect_start_rows(code, wanted)


In [None]:
class CodeChunker:
    """Split source code into chunks that respect a per-chunk token budget.

    Chunks are cut at breakpoints reported by ``CodeParser`` (starts of
    functions, classes, etc.). Each breakpoint is moved up to cover the run
    of comment lines directly above it, so a comment stays with the
    definition that follows it. For Python, every import statement line
    additionally forces a new chunk to start when the current one already
    has content.
    """

    def __init__(self, file_extension, encoding_name="gpt-4"):
        """Store the language extension and the name passed to ``count_tokens``."""
        self.file_extension = file_extension
        self.encoding_name = encoding_name

    @staticmethod
    def _attach_leading_comments(breakpoints, comment_lines):
        """Move each breakpoint up to the top of the comment run just above it.

        Returns the adjusted breakpoints, de-duplicated and sorted. Line 0 is
        a valid comment line here, so the result is found by walking up from
        the breakpoint rather than by a truthiness test on the line number.
        """
        adjusted = set()
        for bp in breakpoints:
            first = bp
            while first - 1 in comment_lines:
                first -= 1
            adjusted.add(first)
        return sorted(adjusted)

    @staticmethod
    def _is_python_import(line):
        """Return True when ``line`` starts an ``import``/``from … import`` statement.

        This is a textual check on the line's leading words. A bare substring
        test for "import" would also match comments and identifiers such as
        "# important" or "importlib".
        """
        words = line.split()
        if len(words) < 2:
            return False
        if words[0] == "import":
            return True
        return (
            words[0] == "from"
            and len(words) > 2
            and words[2].startswith("import")
        )

    def chunk(self, code, token_limit):
        """Split ``code`` into chunks of at most ``token_limit`` tokens where possible.

        Args:
            code: Source text; it is split on ``"\\n"``.
            token_limit: Token budget per chunk, measured by summing
                ``count_tokens`` over the chunk's lines.

        Returns:
            dict mapping 1-based chunk numbers to chunk text. A chunk may
            exceed ``token_limit`` when no breakpoint lies between its start
            and the line that overflows the budget. The trailing chunk is
            omitted when it is empty or whitespace only.
        """
        parser = CodeParser(self.file_extension)
        lines = code.split("\n")

        comment_lines = set(parser.extract_comments(code))
        breakpoints = self._attach_leading_comments(
            parser.extract_breakpoints(code), comment_lines
        )
        breakpoint_set = set(breakpoints)

        # The scan rewinds to the cut line after each emitted chunk, so
        # tokenize every line once up front instead of on each visit.
        line_tokens = [count_tokens(line, self.encoding_name) for line in lines]
        split_on_imports = self.file_extension == "py"

        chunks = {}
        chunk_number = 1
        start_line = 0
        token_count = 0
        i = 0

        while i < len(lines):
            new_token_count = line_tokens[i]

            # A Python import closes the chunk in progress (if it has any
            # tokens) and becomes the first line of the next chunk.
            if split_on_imports and token_count > 0 and self._is_python_import(lines[i]):
                chunks[chunk_number] = "\n".join(lines[start_line:i])
                chunk_number += 1
                start_line = i
                token_count = 0

            if token_count + new_token_count > token_limit:
                # Prefer cutting right here; otherwise fall back to the last
                # breakpoint strictly inside the current chunk. Breakpoints at
                # or before start_line are ignored: start_line may have been
                # set by an import split, and cutting before it would emit an
                # empty chunk and rewind the scan indefinitely.
                if i in breakpoint_set:
                    stop_line = i
                else:
                    stop_line = max(
                        (bp for bp in breakpoints if start_line < bp < i),
                        default=start_line,
                    )

                if stop_line == start_line:
                    # No usable cut point yet: let the chunk grow past the limit.
                    token_count += new_token_count
                    i += 1
                else:
                    chunks[chunk_number] = "\n".join(lines[start_line:stop_line])
                    chunk_number += 1
                    token_count = 0
                    start_line = stop_line
                    i = stop_line  # rescan from the cut line
            else:
                token_count += new_token_count
                i += 1

        # Add any remaining lines
        remaining_chunk = "\n".join(lines[start_line:])
        if remaining_chunk.strip():
            chunks[chunk_number] = remaining_chunk

        return chunks

    def get_chunk(self, chunked_codebase, chunk_number):
        """Return chunk ``chunk_number`` from a ``chunk()`` result, or ``""`` if absent."""
        return chunked_codebase.get(chunk_number, "")


Embeddings

In [None]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
class CodeEmbedder:
  def __init__(self,model)