<a href="https://colab.research.google.com/github/Bigizic/get_real/blob/main/coding_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
pip install pathlib openai numpy



In [5]:
#!/usr/bin/env python3
from pathlib import Path
from typing import List

def smart_sample_js_files(base_path: Path, inputs: List[str]):
    print("Smart-sampling JS files...")

    input_paths = [(base_path / i).resolve() for i in inputs]
    js_files = []

    for path in input_paths:
        if not path.exists():
            print(f"Path does not exist: {path}")
            continue

        if path.is_file() and path.suffix in [".js", ".jsx", ".ts", ".tsx"]:
            js_files.append(path)
        elif path.is_dir():
            for f in path.rglob("*"):
                if (
                    f.suffix in [".js", ".jsx", ".ts", ".tsx"]
                    and all(skip not in f.parts for skip in ["node_modules", "build", "dist", "Gpt"])
                ):
                    js_files.append(f)

    summaries = []
    for file_path in js_files:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
                print(lines)

            total_lines = len(lines)
            # if total_lines <= 10000:
            sample = lines
            """else:
                sample = (
                    lines[:40]
                    + ["\n// ...\n"]
                    + lines[total_lines // 2 - 15: total_lines // 2 + 15]
                    + ["\n// ...\n"]
                    + lines[-30:]
                )
            """

            summary = f"\n\n// === FILE: {file_path.relative_to(base_path)} ===\n"
            summary += "".join(sample)
            summaries.append(summary)
        except Exception as e:
            print(f"Skipping file {file_path}: {e}")

    print(f"{len(summaries)} files summarized.")
    return "\n".join(summaries)


In [11]:
from pathlib import Path

def read_files(paths, max_chars=15000):
    """
    read project files from a list of paths and return a single string.
    it stops once it hits max_chars to avoid sending too much to gpt.
    """
    collected = []
    total = 0

    for p in paths:
        p = Path(p)
        if p.is_file():
            try:
                text = p.read_text(encoding="utf-8", errors="ignore")
                if total + len(text) > max_chars:
                    text = text[: max_chars - total]
                collected.append(f"\n--- file: {p} ---\n{text}")
                total += len(text)
                if total >= max_chars:
                    break
            except Exception as e:
                print(f"could not read {p}: {e}")
        elif p.is_dir():
            for file in p.rglob("*"):
                if file.is_file():
                    try:
                        text = file.read_text(encoding="utf-8", errors="ignore")
                        if total + len(text) > max_chars:
                            text = text[: max_chars - total]
                        collected.append(f"\n--- file: {file} ---\n{text}")
                        total += len(text)
                        if total >= max_chars:
                            break
                    except Exception as e:
                        print(f"could not read {file}: {e}")
            if total >= max_chars:
                break

    return "\n".join(collected)

In [12]:
#!/usr/bin/env python3
import os
import re
import cmd
import numpy as np
from pathlib import Path
from openai import OpenAI
from google.colab import userdata

OPEN_API_KEY = userdata.get('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = OPEN_API_KEY
client = OpenAI()

BASE_DIR = Path.cwd()
CHUNK_SIZE = 2000  # number of characters per chunk
CODE_INDEX = {}    # { "relative/path.js": [ (chunk_text, embedding), ... ] }


def chunk_text(text, size=CHUNK_SIZE):
    """split text into chunks of given size"""
    return [text[i:i+size] for i in range(0, len(text), size)]


def embed_text(text):
    """generate embedding for a given string"""
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return np.array(response.data[0].embedding)


def index_codebase(project_path, exts=(".js", ".ts", ".jsx", ".tsx", ".json", ".py")):
    """index all code files by splitting into chunks and embedding them"""
    index = {}
    project_path = Path(project_path)
    files = [f for f in project_path.rglob("*") if f.suffix in exts]

    for file in files:
        try:
            with open(file, "r", encoding="utf-8") as f:
                content = f.read()
            chunks = chunk_text(content)

            indexed_chunks = []
            for chunk in chunks:
                embedding = embed_text(chunk)
                indexed_chunks.append((chunk, embedding))

            index[str(file.relative_to(project_path))] = indexed_chunks
        except Exception as e:
            print(f"skipping {file}: {e}")
    return index


def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


def retrieve_relevant_chunks(query, top_k=10):
    """return the most relevant code chunks for a given query prompt"""
    query_embedding = embed_text(query)
    scored_chunks = []

    for file, chunks in CODE_INDEX.items():
        for chunk_text, embedding in chunks:
            score = cosine_similarity(query_embedding, embedding)
            scored_chunks.append((score, file, chunk_text))

    # sort by score (highest first)
    scored_chunks.sort(key=lambda x: x[0], reverse=True)
    return scored_chunks[:top_k]


class GptCodeAgent(cmd.Cmd):
    intro = "gpt project coding agent started. type 'help' for commands.\n"
    prompt = "(gpt) "
    use_rawinput = True

    def preloop(self):
        print("welcome, let's set up your coding agent.\n")

        # project path
        self.project_path = Path(input("enter path to your existing project folder: ").strip()).resolve()
        if not self.project_path.exists():
            print("invalid project path, exiting.")
            return self.do_exit("")

        # output directory
        self.output_dir = Path(input("enter new project directory name (for generated files): ").strip()).resolve()
        self.output_dir.mkdir(parents=True, exist_ok=True)
        print(f"output will be stored in: {self.output_dir}")

        # global prompt
        self.global_prompt = input("enter your global prompt (instructions for gpt):\n")

        # build index
        global CODE_INDEX
        print("\nindexing project files and generating embeddings...")
        CODE_INDEX = index_codebase(self.project_path)
        print(f"indexed {len(CODE_INDEX)} files.")

        # continue
        self.ask_action()

    def ask_action(self):
        print("\nwhat do you want to do?")
        print("1. implement feature across project")
        print("2. create a new file")
        print("3. update existing file")
        choice = input("enter 1, 2, or 3: ").strip()

        if choice == "1":
            self.project_wide_update()
        elif choice == "2":
            self.create_file()
        elif choice == "3":
            self.update_file()
        else:
            print("invalid choice.")
            self.ask_action()

    def send_to_gpt(self, instruction, retrieved_chunks=None):
        messages = [
            {"role": "system", "content": "you are a senior full-stack coding assistant."},
            {"role": "user", "content": f"global project instructions:\n{self.global_prompt}"}
        ]

        if retrieved_chunks:
            for _, file, chunk in retrieved_chunks:
                messages.append({"role": "user", "content": f"file: {file}\ncode:\n{chunk}"})

        messages.append({"role": "user", "content": f"instruction:\n{instruction}"})

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            temperature=0.3
        )
        return response.choices[0].message.content

    def project_wide_update(self):
      print("working on full project update...")
      """def project_wide_update(self):
        feature_prompt = input("describe the feature you want to implement:\n")

        relevant_chunks = retrieve_relevant_chunks(feature_prompt, top_k=10)
        gpt_response = self.send_to_gpt(feature_prompt, retrieved_chunks=relevant_chunks)

        output_file = self.output_dir / "project_wide_changes.txt"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(gpt_response)
        print(f"project-wide suggestions saved to: {output_file}")
        self.ask_action()
      """
      prompt = self.global_prompt

      files_context = read_files(self.project_path)

      messages = [
          {
              "role": "system",
              "content": (
                  "you are a senior full-stack developer. the project is a full-stack javascript web app "
                  "with react (redux) frontend and express + mongodb backend. "
                  "when outputting changes, you must strictly follow this format:\n\n"
                  "file: relative/path/to/file.js\n"
                  "```js\n"
                  "// full file code here\n"
                  "```\n\n"
                  "file: another/file.js\n"
                  "```js\n"
                  "// full file code here\n"
                  "```\n\n"
                  "only output file blocks in this format. do not include explanations, comments, or text outside of this structure."
              )
          },
          {
              "role": "user",
              "content": f"my project includes these files:\n{files_context[:12000]}..."
          },
          {
              "role": "user",
              "content": f"implement this feature:\n\n{prompt}"
          }
      ]

      response = self.client.chat.completions.create(
          model="gpt-4o",
          messages=messages,
          temperature=0.3
      )

      full_suggestion = response.choices[0].message.content

      # regex to parse gpt output: capture file path and code inside fences
      file_pattern = r"file:\s*(.+?)\n```[a-zA-Z]*\n(.*?)```"
      matches = re.findall(file_pattern, full_suggestion, re.DOTALL)

      if not matches:
          print("no valid file blocks detected, saving raw output...")
          output_file = self.output_dir / "project_wide_changes_raw.txt"
          with open(output_file, "w", encoding="utf-8") as f:
              f.write(full_suggestion)
          print(f"raw output saved to: {output_file}")
          return

      for rel_path, code in matches:
          rel_path = rel_path.strip()
          output_file = self.output_dir / rel_path
          output_file.parent.mkdir(parents=True, exist_ok=True)
          with open(output_file, "w", encoding="utf-8") as f:
              f.write(code.strip() + "\n")
          print(f"wrote changes to {output_file}")

      print("project-wide update complete")
      self.ask_action()

    def create_file(self):
        file_name = input("enter new file path (relative to project): ").strip()
        prompt = input("describe what the file should contain:\n")
        gpt_response = self.send_to_gpt(prompt)

        output_file = self.output_dir / file_name
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(gpt_response)
        print(f"new file created at: {output_file}")
        self.ask_action()

    def update_file(self):
        rel_path = input("enter file path (relative to project): ").strip()
        if rel_path not in CODE_INDEX:
            print("file not found in index.")
            return self.ask_action()

        prompt = input("enter update instruction:\n")

        # retrieve most relevant chunks from this specific file
        file_chunks = CODE_INDEX[rel_path]
        query_embedding = embed_text(prompt)
        scored = [(cosine_similarity(query_embedding, emb), chunk) for chunk, emb in file_chunks]
        scored.sort(key=lambda x: x[0], reverse=True)
        best_chunks = [("file", rel_path, chunk) for _, chunk in scored[:3]]

        gpt_response = self.send_to_gpt(prompt, retrieved_chunks=best_chunks)

        output_file = self.output_dir / rel_path
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(gpt_response)
        print(f"updated file saved at: {output_file}")
        self.ask_action()

    def do_exit(self, arg):
        print("goodbye")
        return True


def start():
    cli = GptCodeAgent()
    cli.cmdloop()


if __name__ == "__main__":
    start()


welcome, let's set up your coding agent.

enter path to your existing project folder: TLH/server
enter new project directory name (for generated files): niox1
output will be stored in: /content/niox1
enter your global prompt (instructions for gpt):
create this models i've structured here: {   "nioxDB": [     {       "user": {         "name": [           {             "type": "String",             "required": true           }         ],         "phone_number": [           {             "type": "String",             "required": true           }         ],         "email": [           {             "type": "String",             "unique": true,             "required": true           }         ],         "password": [           {             "type": "String",             "require": true,             "select": false           }         ],         "google_id": [           {             "type": "String",             "default": null           }         ],         "apple_id": [           {      

TypeError: 'PosixPath' object is not iterable