<a href="https://colab.research.google.com/github/Bigizic/get_real/blob/main/coding_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pathlib openai numpy



In [None]:
from pathlib import Path

def read_files(paths, max_chars=15000):
    """
    read project files from a list of paths and return a single string.

    each entry in paths may be a file or a directory. directories are walked
    recursively in sorted order so that the same tree always produces the
    same text. every file contributes a "--- file: <path> ---" header
    followed by its content.

    reading stops once max_chars characters of file content have been
    collected (headers are not counted), to avoid sending too much to gpt.
    the last file is truncated to fit.

    args:
        paths: iterable of str or Path. entries that are neither a file nor
            a directory are ignored.
        max_chars: maximum number of file-content characters to collect.

    returns:
        the collected sections joined with newlines, or "" when nothing
        was read.
    """
    collected = []
    total = 0

    for entry in paths:
        if total >= max_chars:
            break

        entry = Path(entry)
        if entry.is_file():
            candidates = [entry]
        elif entry.is_dir():
            # sorted so the files kept under the character budget do not
            # depend on the order the filesystem happens to list them in
            candidates = sorted(f for f in entry.rglob("*") if f.is_file())
        else:
            continue

        for file in candidates:
            if total >= max_chars:
                break
            try:
                # undecodable bytes are dropped instead of aborting the read
                text = file.read_text(encoding="utf-8", errors="ignore")
            except OSError as e:
                print(f"could not read {file}: {e}")
                continue

            # keep only what still fits into the remaining budget
            text = text[: max_chars - total]
            collected.append(f"\n--- file: {file} ---\n{text}")
            total += len(text)

    return "\n".join(collected)

In [None]:
#!/usr/bin/env python3
import os
import re
import cmd
import numpy as np
from pathlib import Path
from openai import OpenAI
from google.colab import userdata

# read the key stored under 'OPENAI_API_KEY' in Colab's userdata and export it
# through the environment; OpenAI() below is created without arguments, so it
# presumably picks the key up from that variable — TODO confirm against the SDK
OPEN_API_KEY = userdata.get('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = OPEN_API_KEY
# shared API client used for the embedding and chat-completion requests
client = OpenAI()

# working directory at start-up (not referenced by the code visible in this file)
BASE_DIR = Path.cwd()
CHUNK_SIZE = 2000  # number of characters per chunk
CODE_INDEX = {}    # { "relative/path.js": [ (chunk_text, embedding), ... ] }


def chunk_text(text, size=CHUNK_SIZE):
    """cut text into consecutive pieces of at most size characters.

    the last piece holds whatever is left over and may be shorter; an empty
    text produces an empty list.
    """
    pieces = []
    for start in range(0, len(text), size):
        stop = start + size
        pieces.append(text[start:stop])
    return pieces


def embed_text(text):
    """request an embedding for text and return it as a numpy array.

    the request goes through the module-level client using the
    "text-embedding-3-small" model; only the first entry of the response
    data is used.
    """
    result = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    first_entry = result.data[0]
    return np.array(first_entry.embedding)


def index_codebase(project_path, exts=(".js", ".ts", ".jsx", ".tsx", ".json", ".py")):
    """index all code files by splitting into chunks and embedding them.

    args:
        project_path: a single file, or a directory that is searched
            recursively. a single file is indexed regardless of its suffix.
        exts: file suffixes to include when project_path is a directory.

    returns:
        a dict mapping each file's path relative to the project root to a
        list of (chunk, embedding) tuples. for a single-file project the
        root is the file's parent directory, so the key is the file name.
        files that cannot be read or embedded are reported and left out.
    """
    project_path = Path(project_path)
    if project_path.is_file():
        files = [project_path]
        # relative_to(project_path) would yield "." for the file itself,
        # which loses the file name, so key it relative to its directory
        root = project_path.parent
    else:
        # is_file() keeps out directories whose name ends in a listed suffix
        files = [
            f for f in project_path.rglob("*")
            if f.is_file() and f.suffix in exts
        ]
        root = project_path

    index = {}
    for file in files:
        try:
            content = file.read_text(encoding="utf-8")
            index[str(file.relative_to(root))] = [
                (chunk, embed_text(chunk)) for chunk in chunk_text(content)
            ]
        except Exception as e:
            # deliberate best effort: a file that fails to decode or embed
            # must not abort indexing of the remaining files
            print(f"skipping {file}: {e}")
    return index


def cosine_similarity(vec1, vec2):
    """return the cosine of the angle between vec1 and vec2.

    args:
        vec1, vec2: one-dimensional numeric vectors of equal length.

    returns:
        the dot product divided by the product of the two norms. when either
        vector has zero length the similarity is undefined, and 0.0 is
        returned instead of dividing by zero (which would give nan and make
        any ranking built on the score unreliable).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator


def retrieve_relevant_chunks(query, top_k=10):
    """rank every indexed chunk against query and return the best top_k.

    the query is embedded once and compared with the stored embedding of
    every chunk in CODE_INDEX. the result is a list of
    (score, file, chunk) tuples ordered from highest to lowest score.
    """
    query_vector = embed_text(query)

    ranked = [
        (cosine_similarity(query_vector, vector), path, snippet)
        for path, entries in CODE_INDEX.items()
        for snippet, vector in entries
    ]

    # best matches first; only the score takes part in the ordering
    best_first = sorted(ranked, key=lambda item: item[0], reverse=True)
    return best_first[:top_k]


class GptCodeAgent(cmd.Cmd):
    """interactive console agent that asks gpt to create or edit project files.

    the start-up menu offers two workflows:

    * import codebase: index a file or directory, then repeatedly ask gpt for
      project-wide changes and write the returned file blocks to disk.
    * chat: a free-form conversation whose file blocks are written to disk.

    a "file block" in a model reply is a "file: <path>" line followed by a
    fenced code block. replies without any file block are saved verbatim to
    a "*_raw.txt" file instead.
    """

    # chat model settings shared by every completion request
    _MODEL = "gpt-4o"
    _TEMPERATURE = 0.3
    _SYSTEM_PROMPT = "you are a senior full-stack coding assistant."

    # group 1 is the path after "file:", group 2 the code between the fences
    _FILE_BLOCK_RE = re.compile(r"file:\s*(.+?)\n```[a-zA-Z]*\n(.*?)```", re.DOTALL)

    def __init__(self):
        super().__init__()
        self.project_path = None  # file or directory being worked on
        self.output_dir = None    # where generated files are written
        self.use_rawinput = True

        # running conversation of chat(), as chat-completion message dicts
        self.chat_history = []

        self.intro = "Coding agent started. type 'help' for commands.\n"
        self.global_prompt = ""
        self.prompt = "Set up your coding agent. Lets get started.\n"
        self.prompter = "Select: "
        self.prompt1 = "1. Import codebase"
        self.prompt2 = "2. Chat with coding agent"
        self.prompt3 = "Enter path to codebase or path to file: "
        self.prompt4 = ("Enter a new project directory to save changes in a separate\n"
                        "location, or press enter to save changes directly in\n"
                        "the existing codebase:\n")

        self.prompt5 = "Select 1 to continue working on project\n"
        self.prompt6 = "Select 2 to choose from a new file or directory"

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    def _complete(self, messages):
        """send messages to the chat model and return the reply text ("" if none)."""
        response = client.chat.completions.create(
            model=self._MODEL,
            messages=messages,
            temperature=self._TEMPERATURE,
        )
        return response.choices[0].message.content or ""

    def _output_root(self):
        """return the directory under which generated files are written.

        falls back to the current working directory when no output location
        has been chosen yet (chat started straight from the menu), and to
        the parent directory when the output location is a single file
        (a one-file project edited in place).
        """
        root = self.output_dir if self.output_dir is not None else Path.cwd()
        return root.parent if root.is_file() else root

    @staticmethod
    def _split_model_path(rel_path):
        """turn a path named by the model into components below the output root.

        absolute paths lose their leading "" and first directory, as before
        (e.g. "/content/project/a.js" -> project/a.js); if nothing is left
        the file name alone is used. relative paths are kept whole. empty,
        "." and ".." components are dropped so the result can never climb
        out of the output root.
        """
        parts = rel_path.split('/')
        if rel_path.startswith('/'):
            parts = parts[2:] or parts[-1:]
        return [part for part in parts if part not in ("", ".", "..")]

    def _target_below_root(self, rel_path):
        """return the output file for rel_path, or None if the path is unusable."""
        parts = self._split_model_path(rel_path)
        if not parts:
            return None
        return self._output_root().joinpath(*parts)

    @staticmethod
    def _write_code(output_file, code):
        """write code to output_file, creating missing parent directories."""
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(code.strip() + "\n")

    # ------------------------------------------------------------------
    # workflows
    # ------------------------------------------------------------------

    def preloop(self):
        """show the start-up menu until a valid workflow is chosen, then run it."""
        # a loop instead of self-recursion, so repeated typos cannot grow the stack
        while True:
            print(self.intro)
            print(self.prompt)
            print(self.prompt1)
            print(self.prompt2)
            choice = input(self.prompter).strip()

            if choice == "1":
                self.import_codebase()
                return
            if choice == "2":
                self.chat()
                return
            print("invalid choice.")

    def import_codebase(self, should_continue_chat=False):
        """select a project, index it and start a project-wide update.

        args:
            should_continue_chat: when true, first ask whether to keep
                working on the current project ("1") or pick a new file or
                directory ("2"). any other answer leaves the workflow.

        returns:
            True when the entered project path does not exist (the value of
            do_exit), otherwise None.
        """
        global CODE_INDEX

        if should_continue_chat:
            print("Select a new file or directory or continue to work on current project\n")
            print(self.prompt5)
            print(self.prompt6)
            continue_chat_choice = input(self.prompter).strip()
            if continue_chat_choice == "1":
                return self.project_wide_update()
            if continue_chat_choice != "2":
                return None

        # project path
        self.project_path = Path(input(self.prompt3).strip()).resolve()
        if not self.project_path.exists():
            print("invalid project path, exiting.")
            return self.do_exit("")

        # build index
        print("\nindexing project files and generating embeddings...")
        CODE_INDEX = index_codebase(self.project_path)
        print(f"indexed {len(CODE_INDEX)} files.\n")

        # output directory: a separate location if one was entered,
        # otherwise the project itself is edited in place
        user_input = input(self.prompt4).strip()
        if user_input:
            self.output_dir = Path(user_input).resolve()
            self.output_dir.mkdir(parents=True, exist_ok=True)
        else:
            self.output_dir = self.project_path
        print(f"Output will be stored in: {self.output_dir}")
        self.project_wide_update()
        return None

    def chat(self):
        """continue chatting with gpt to edit or create files.

        every instruction is sent together with the global prompt (if set)
        and the full chat history. file blocks in the reply are written
        below the output root; a reply without file blocks is appended to
        "chat_output_raw.txt". entering "exit" or "quit" ends the chat.
        """
        print("type 'exit' or 'quit' to leave the chat.")
        while True:
            user_prompt = input("enter your instruction:\n")
            if user_prompt.strip().lower() in ("exit", "quit"):
                return

            # add user message to history
            self.chat_history.append({"role": "user", "content": user_prompt})

            messages = [{"role": "system", "content": self._SYSTEM_PROMPT}]
            # include global prompt at the start if set
            if self.global_prompt:
                messages.append({"role": "user", "content": f"global project instructions:\n{self.global_prompt}"})
            # include history
            messages.extend(self.chat_history)

            gpt_output = self._complete(messages)
            self.chat_history.append({"role": "assistant", "content": gpt_output})

            matches = self._FILE_BLOCK_RE.findall(gpt_output)
            if not matches:
                # fallback: save whole thing in raw
                output_file = self._output_root() / "chat_output_raw.txt"
                with open(output_file, "a", encoding="utf-8") as f:
                    f.write(gpt_output + "\n\n")
                print(f"raw chat output saved to {output_file}")
                continue

            for rel_path, code in matches:
                output_file = self._target_below_root(rel_path.strip())
                if output_file is None:
                    print(f"skipping block with unusable path: {rel_path!r}")
                    continue
                self._write_code(output_file, code)
                print(f"wrote changes to {output_file}")

    def send_to_gpt(self, instruction, retrieved_chunks=None):
        """send one instruction to gpt, optionally with retrieved code chunks.

        args:
            instruction: the task for the model.
            retrieved_chunks: optional iterable of (score, file, chunk)
                tuples; each chunk is added as its own user message.

        returns:
            the text of the model's reply.
        """
        messages = [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            {"role": "user", "content": f"global project instructions:\n{self.global_prompt}"}
        ]

        if retrieved_chunks:
            for _, file, chunk in retrieved_chunks:
                messages.append({"role": "user", "content": f"file: {file}\ncode:\n{chunk}"})

        messages.append({"role": "user", "content": f"instruction:\n{instruction}"})
        return self._complete(messages)

    def project_wide_update(self):
        """ask gpt for a project-wide change and write the returned files.

        the entered prompt becomes the global prompt. when the output
        location is the project itself, files are written where the model
        names them (relative names are resolved against the project
        directory); otherwise they are recreated below the output
        directory. a reply without file blocks is saved to
        "project_wide_changes_raw.txt" and the workflow stops; after a
        successful write the user is asked how to continue.
        """
        # global prompt
        self.global_prompt = input("\nEnter your prompt:\n")
        print("\nworking on full project update...")

        files_context = read_files([self.project_path])

        messages = [
            {
                "role": "system",
                "content": (
                    "you are a senior full-stack developer. the project is a full-stack javascript web app "
                    "with react (redux) frontend and express + mongodb backend. "
                    "when outputting changes, you must strictly follow this format:\n\n"
                    "file: path/to/file.js\n"
                    "```js\n"
                    "// full file code here\n"
                    "```\n\n"
                    "file: another/file.js\n"
                    "```js\n"
                    "// full file code here\n"
                    "```\n\n"
                    "only output file blocks in this format. do not include explanations, comments, or text outside of this structure."
                )
            },
            {
                "role": "user",
                "content": f"my project includes these files:\n{files_context[:12000]}..."
            },
            {
                "role": "user",
                "content": f"implement this feature:\n\n{self.global_prompt}"
            }
        ]

        full_suggestion = self._complete(messages)

        # parse gpt output: capture file path and code inside fences
        matches = self._FILE_BLOCK_RE.findall(full_suggestion)

        if not matches:
            print("no valid file blocks detected, saving raw output...")
            output_file = self._output_root() / "project_wide_changes_raw.txt"
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(full_suggestion)
            print(f"raw output saved to: {output_file}")
            return

        in_place = self.output_dir == self.project_path
        for rel_path, code in matches:
            rel_path = rel_path.strip()
            if in_place and rel_path:
                # edit file or files in project here; joining an absolute
                # path keeps it unchanged, a relative one lands in the project
                print(rel_path)
                output_file = self._output_root() / rel_path
            else:
                output_file = self._target_below_root(rel_path)
            if output_file is None:
                print(f"skipping block with unusable path: {rel_path!r}")
                continue
            self._write_code(output_file, code)
            print(f"\nwrote changes to {output_file}")
        self.import_codebase(True)

    def do_exit(self, arg):
        """leave the agent."""
        print("goodbye")
        return True


def start():
    """create the coding agent and hand control to its command loop."""
    GptCodeAgent().cmdloop()


# launch the agent when this file or cell is executed directly rather than imported
if __name__ == "__main__":
    start()


Coding agent started. type 'help' for commands.

Set up your coding agent. Lets get started.

1. Import codebase
2. Chat with coding agent
Select: 1
Enter path to codebase or path to file: /content/niox/TLH/server/models/cart.js

indexing project files and generating embeddings...
indexed 1 files.

Enter a new project directory to save changes in a separate
location, or press enter to save changes directly in
the existing codebase:

Output will be stored in: /content/niox/TLH/server/models/cart.js

Enter your prompt:
the _id field of the user here  in the cart should be of type ObjectId and it should ref to 'User'

working on full project update...
/content/niox/TLH/server/models/cart.js

wrote changes to /content/niox/TLH/server/models/cart.js
Select a new file or directory or continue to work on current project
Select 1 to continue working on project

Select 2 to choose from a new file or directory
Select: 2
Enter path to codebase or path to file: /content/niox/TLH/server/models/orde

KeyboardInterrupt: Interrupted by user

In [None]:
import shutil
from google.colab import files

# zip the contents of /content/niox; make_archive appends the ".zip" suffix
# to the base name, so the archive is written to /content/niox.zip
shutil.make_archive("/content/niox", 'zip', "/content/niox")

# pass the archive to Colab's files helper — presumably this starts a
# browser download of the zip; confirm against the Colab docs
files.download("/content/niox.zip")
