In [7]:
import os
import shutil
import ast
import pandas as pd
from pathlib import Path
from typing import List
import sys
import nbformat
from git import Repo, GitCommandError
from langchain.vectorstores import FAISS
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from docx import Document as DocxDocument

from openai import OpenAI
client = OpenAI()



In [2]:
def is_low_value_file(filepath):
    low_value_exts = [
        '.css', '.min.js', '.json', '.svg', '.csv', '.xlsx', '.xls',
        '.log', '.lock', '.pyc', '.pyo', '.pyd', '.class', '.jar', '.war',
        '.o', '.obj', '.dll', '.exe', '.so', '.a', '.db', '.sqlite', '.sqlite3',
        '.bak', '.tmp', '.ico', '.icns', '.pdf', '.docx', '.pptx',
        '.7z', '.zip', '.tar', '.gz', '.rar', '.iml'
    ]

    low_value_files = [
        'readme.md', 'license', '.gitignore', '.gitattributes', 'post-update.sample',
        'fsmonitor-watchman.sample', 'pre-commit', 'pre-push', 'commit-msg',
        'tags', 'head', 'config', 'description', 'index', '.editorconfig',
        '.prettierrc', '.eslintrc', '.gitmodules', '.mailmap', '.clang-format',
        'pipfile.lock', 'yarn.lock', 'package-lock.json', '.env', '.env.example', '.npmrc',
        'update.sample'
    ]

    low_value_dirs = {
        '.git', '.vscode', '.idea', '__pycache__',
        'node_modules', 'dist', 'build', '.pytest_cache'
    }

    filepath_str = str(filepath).lower()
    parts = set(Path(filepath).parts)

    return (
        Path(filepath).suffix.lower() in low_value_exts or
        os.path.basename(filepath).lower() in low_value_files or
        any(d in parts for d in low_value_dirs) or
        'mock' in filepath_str
    )


In [3]:
def clone_repo(repo_url, clone_path="tmp_repo") -> str:
    """
    Shallow-clone the repo at the latest commit only and return the clone path.
    """
    if os.path.exists(clone_path):
        shutil.rmtree(clone_path, ignore_errors=True)
    Repo.clone_from(repo_url, clone_path, depth=1)
    return clone_path

In [4]:
def extract_all_chunks_to_faiss(repo_path: str, index_dir: str = "docs_index") -> list[Document]:
    chunks = []
    repo_path = Path(repo_path)

    # 1. README files
    for readme_name in ("README.md", "README.rst", "README.txt"):
        readme_path = repo_path / readme_name
        if readme_path.exists():
            content = readme_path.read_text(encoding="utf-8").strip()
            if 50 < len(content) < 5000:
                chunks.append(Document(
                    page_content=content,
                    metadata={
                        "source": str(readme_path),
                        "file_ext": "text",
                        "type": "readme",
                        "name": readme_name,
                        "lines": f"1-{content.count(chr(10)) + 1}"
                    }
                ))
            break

    # 2. Other Markdown files
    for md_path in repo_path.rglob("*.md"):
        if md_path.name.lower() == "readme.md":
            continue
        try:
            content = md_path.read_text(encoding="utf-8").strip()
            if 50 < len(content) < 5000:
                chunks.append(Document(
                    page_content=content,
                    metadata={
                        "source": str(md_path),
                        "file_ext": "markdown",
                        "type": "markdown",
                        "name": md_path.name,
                        "lines": f"1-{content.count(chr(10)) + 1}"
                    }
                ))
        except Exception:
            continue

    # 3. Code and misc files
    for filepath in repo_path.rglob("*.*"):
        if is_low_value_file(filepath):
            continue

        try:
            suffix = filepath.suffix.lower()

            # 3a. Python files
            if suffix == ".py":
                code = filepath.read_text(encoding="utf-8")
                tree = ast.parse(code)

                mod_doc = ast.get_docstring(tree)
                if mod_doc and 50 < len(mod_doc) < 5000:
                    chunks.append(Document(
                        page_content=mod_doc,
                        metadata={
                            "source": str(filepath),
                            "file_ext": "code",
                            "type": "module_docstring",
                            "name": filepath.name,
                            "lines": f"1-{code.count(chr(10)) + 1}"
                        }
                    ))

                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                        content = ast.get_source_segment(code, node)
                        if content and 50 < len(content) < 5000:
                            chunks.append(Document(
                                page_content=content,
                                metadata={
                                    "source": str(filepath),
                                    "file_ext": "code",
                                    "type": type(node).__name__.lower(),
                                    "name": node.name,
                                    "lines": f"{node.lineno}-{getattr(node, 'end_lineno', node.lineno)}"
                                }
                            ))

                        doc = ast.get_docstring(node)
                        if doc and 50 < len(doc) < 5000:
                            chunks.append(Document(
                                page_content=doc,
                                metadata={
                                    "source": str(filepath),
                                    "file_ext": "code",
                                    "type": f"{type(node).__name__.lower()}_docstring",
                                    "name": node.name,
                                    "lines": f"{node.lineno}-{getattr(node, 'end_lineno', node.lineno)}"
                                }
                            ))

            # 3b. Notebooks
            elif suffix == ".ipynb":
                nb = nbformat.read(filepath, as_version=4)
                for i, cell in enumerate(nb.cells):
                    if cell.cell_type in ("markdown", "code"):
                        content = cell.source.strip()
                        if 50 < len(content) < 5000:
                            chunks.append(Document(
                                page_content=content,
                                metadata={
                                    "source": str(filepath),
                                    "file_ext": "code",
                                    "type": f"{cell.cell_type}_cell",
                                    "name": f"{filepath.name} - cell {i}",
                                    "lines": f"cell_{i}"
                                }
                            ))

            # 3c. Other code/text files
            else:
                code = filepath.read_text(encoding="utf-8")
                if 50 < len(code) < 5000:
                    chunks.append(Document(
                        page_content=code,
                        metadata={
                            "source": str(filepath),
                            "file_ext": "code",
                            "type": suffix,
                            "name": filepath.name,
                            "lines": f"1-{code.count(chr(10)) + 1}"
                        }
                    ))

        except Exception:
            continue

    # Optional FAISS index creation
    # embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    # vectordb = FAISS.from_documents(chunks, embedding=embeddings)
    # vectordb.save_local(index_dir)

    return chunks

In [21]:
repo_url = "https://github.com/adarshlearnngrow/StepUpYourCareer.AI"
repo_path = clone_repo(repo_url)

# 1️⃣ Build the FAISS DB from your repo
chunks = extract_all_chunks_to_faiss(repo_path)



In [23]:
len(chunks)

59

In [None]:
def multi_label_chunk(text: str, model: str = "gpt-3.5-turbo-0125") -> dict:
    """
    Classify a chunk into one or more documentation sections.
    Returns: dict with {"sections": [...], "rationale": ..., "tags": [...]}
    """
    text = text.strip()

    layout = [
        "Project Overview", "Objective & Scope", "System Architecture",
        "Tech Stack", "Installation & Setup", "Usage Instructions",
        "API Documentation", "Others"
    ]

    system_prompt = (
        "You are an expert technical writer. Given a code or text chunk, "
        "identify ALL documentation sections it may contribute to, based on the following layout:\n\n"
        + "\n".join(f"- {s}" for s in layout) +
        "\n\nOutput must be in JSON format with these fields:\n"
        "- sections: a list of one or more relevant section names\n"
        "- rationale: 1–3 sentence explanation of why these sections are appropriate\n"
        "- tags: 5–10 keywords from the chunk (e.g., FastAPI, endpoint, config, Docker, class, train loop)\n\n"
        "Be analytical. If a chunk helps define a function and also contains setup instructions, include both."
    )

    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text[:16_000]},
        ],
        temperature=0.3,
        max_tokens=500,
    )

    import json
    try:
        return json.loads(resp.choices[0].message.content.strip())
    except json.JSONDecodeError:
        return {
            "sections": ["Others"],
            "rationale": "Failed to parse response",
            "tags": []
        }


In [24]:
for doc in chunks:
    label = multi_label_chunk(doc.page_content)
    doc.metadata["sections"] = label["sections"]        # <-- fix here
    doc.metadata["tags"] = label["tags"]
    doc.metadata["rationale"] = label["rationale"] 


In [28]:
chunks

[Document(metadata={'source': 'tmp_repo\\README.md', 'file_ext': 'text', 'type': 'readme', 'name': 'README.md', 'lines': '1-29', 'sections': ['Others'], 'tags': [], 'rationale': 'Failed to parse response'}, page_content='# StepUpYourCareer.ai: Elevate Your Future\n\nAn AI-powered career assistant that helps students and job seekers identify **skill gaps**, receive **personalized learning roadmaps**, and connect with **industry mentors**—all from a single resume upload.\n\n### Link to the website: https://stepupyourcareer.streamlit.app/\n\n---\n\n## Problem\n\nGraduates often leave university with degrees but **lack clarity on what employers actually expect**. They spend months applying for jobs, facing rejections without knowing **what skills they’re missing** or **how to upskill efficiently**.\n\n---\n\n## Solution\n\n**StepUpYourCareer.ai** transforms your resume into a personalized upskilling journey.\n\n- **Skill Gap Analyzer**: Extracts skills from your resume and compares them to

In [1]:
selected_chunks = [
    doc for doc in chunks
    if "Usage Instructions" in doc.metadata.get("sections", [])


_IncompleteInputError: incomplete input (3361902121.py, line 3)

## Testing

In [None]:
# Query for architecture-related info
arch_query = "System Architecture"
arch_docs = db.similarity_search(arch_query, k=20)

# Preview content and metadata for each chunk
for i, doc in enumerate(arch_docs, start=1):
    print(f"\n🔹 Result {i}:")
    print("📄 File:", doc.metadata.get("source"))
    print("📌 Type:", doc.metadata.get("type"))
    print("🔢 Lines:", doc.metadata.get("lines"))
    print("📛 Name:", doc.metadata.get("name"))
    print("🧠 Content preview:\n", doc.page_content[:300], "...\n")


🔹 Result 1:
📄 File: tmp_repo\Job_Description_JD_Manupulation.ipynb
📌 Type: markdown_cell
🔢 Lines: cell_0
📛 Name: Job_Description_JD_Manupulation.ipynb - cell 0
🧠 Content preview:
 <a href="https://colab.research.google.com/github/adarshlearnngrow/StepUp-AI/blob/main/Job_Description_JD_Manupulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> ...


🔹 Result 2:
📄 File: tmp_repo\Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb
📌 Type: markdown_cell
🔢 Lines: cell_0
📛 Name: Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb - cell 0
🧠 Content preview:
 <a href="https://colab.research.google.com/github/adarshlearnngrow/StepUp-AI/blob/main/Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> ...


🔹 Result 3:
📄 File: tmp_repo\Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb
📌 Type: markdown_cell
🔢 Lines:

In [30]:
arch_docs[0]

Document(id='6c647709-aed9-4f66-b4a6-c5830eba1e38', metadata={'source': 'tmp_repo\\Job_Description_JD_Manupulation.ipynb', 'type': 'markdown_cell', 'name': 'Job_Description_JD_Manupulation.ipynb - cell 0', 'lines': 'cell_0'}, page_content='<a href="https://colab.research.google.com/github/adarshlearnngrow/StepUp-AI/blob/main/Job_Description_JD_Manupulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>')

In [108]:
# Query for architecture-related info
arch_query = "Architecture Overview"
arch_docs = db.similarity_search(arch_query, k=1000)

# Preview content and metadata for each chunk
for i, doc in enumerate(arch_docs, start=1):
    print(f"\n🔹 Result {i}:")
    print("📄 File:", doc.metadata.get("source"))
    print("📌 Type:", doc.metadata.get("type"))
    print("🔢 Lines:", doc.metadata.get("lines"))
    print("📛 Name:", doc.metadata.get("name"))
    print("🧠 Content preview:\n", doc.page_content[:300], "...\n")



🔹 Result 1:
📄 File: tmp_repo\Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb
📌 Type: markdown_cell
🔢 Lines: cell_23
📛 Name: Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb - cell 23
🧠 Content preview:
 ### Action Plan Generation (Testing for 3 Candidates) ...


🔹 Result 2:
📄 File: tmp_repo\Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb
📌 Type: markdown_cell
🔢 Lines: cell_0
📛 Name: Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb - cell 0
🧠 Content preview:
 <a href="https://colab.research.google.com/github/adarshlearnngrow/StepUp-AI/blob/main/Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> ...


🔹 Result 3:
📄 File: tmp_repo\README.md
📌 Type: .md
🔢 Lines: 1-32
📛 Name: README.md
🧠 Content preview:
 # StepUpYourCareer.ai: Elevate Your Future

An AI-powered career assistant that helps students and job seekers identify **skill gaps**, receive **personalized le

In [34]:
arch_docs[0].page_content

'<a href="https://colab.research.google.com/github/adarshlearnngrow/StepUp-AI/blob/main/Job_Description_JD_Manupulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>'

In [97]:

import os

from openai import OpenAI
client = OpenAI()

relevance_judgments = []

for i, chunk in enumerate(arch_docs):
    prompt = f"""
You are an expert documentation reviewer.

Your task is to **judge** whether the following chunk is relevant to writing the section:  
**"{arch_query}"**

📌 Rules:
- Prioritise **code cells**, functions, or implementation details over high-level marketing or introductory text (e.g., README).
- Only select markdown if it describes a technical process, setup instruction, or logic explanation.
- then understand the code or text and then tell if this can be used to generate the documentation.
- Return a JSON with only the following fields: `chunk_id`, `relevant`, and `reason`.
- `chunk_id` is the numeric index: {i}
- If the chunk is related to {arch_query}, set `"relevant": true` and give a brief reason.
- If not, set `"relevant": false` and explain why.
- Do **not** repeat the chunk content.
- Keep the explanation short and factual.

Example:
{{
  "chunk_id": 3,
  "relevant": true,
  "reason": "Loads required JSON files for skill gap analysis, which is part of setup."
}}

Evaluate this chunk:
{chunk.page_content}
"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.5,
        messages=[{"role": "user", "content": prompt}]
    )

    result = response.choices[0].message.content.strip()
    
    try:
        # Optionally use json.loads() if you're confident it's valid JSON
        relevance_judgments.append({
            "chunk_id": i,
            "relevant": '"relevant": true' in result.lower(),
            "reason": result.split('"reason":', 1)[-1].strip().rstrip('}').strip('"')
        })
    except Exception as e:
        print(f"Failed to parse chunk {i}: {e}")
        relevance_judgments.append({
            "chunk_id": i,
            "relevant": False,
            "reason": "Parsing failed or invalid format."
        })

In [99]:
[i for i in relevance_judgments if i['relevant']]

[{'chunk_id': 12,
  'relevant': True,
  'reason': 'Contains implementation details for analyzing resumes and generating skill gaps, which are key components of the system architecture."\n'},
 {'chunk_id': 13,
  'relevant': True,
  'reason': 'Contains implementation details for sending emails and formatting results, which are part of the system\'s architecture and functionality."\n'},
 {'chunk_id': 14,
  'relevant': True,
  'reason': 'Includes code for loading data, analyzing resumes, and generating skill gap analysis, detailing the system\'s functionality and architecture."\n'},
 {'chunk_id': 16,
  'relevant': True,
  'reason': 'The function details the process of generating a hybrid action plan, including skill categorization and resource extraction, which are key components of the system architecture."\n'},
 {'chunk_id': 17,
  'relevant': True,
  'reason': 'Contains functions and implementation details related to skill prioritization and resource loading, which are part of the system

In [105]:
for i, doc in enumerate(arch_docs, start=12):
    print(f"\n🔹 Result {i}:")
    print("📄 File:", doc.metadata.get("source"))


    print("📛 Name:", doc.metadata.get("name"))
    print("🧠 Content preview:\n", doc.page_content, "...\n")


🔹 Result 12:
📄 File: tmp_repo\Job_Description_JD_Manupulation.ipynb
📛 Name: Job_Description_JD_Manupulation.ipynb - cell 0
🧠 Content preview:
 <a href="https://colab.research.google.com/github/adarshlearnngrow/StepUp-AI/blob/main/Job_Description_JD_Manupulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> ...


🔹 Result 13:
📄 File: tmp_repo\Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb
📛 Name: Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb - cell 0
🧠 Content preview:
 <a href="https://colab.research.google.com/github/adarshlearnngrow/StepUp-AI/blob/main/Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> ...


🔹 Result 14:
📄 File: tmp_repo\Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb
📛 Name: Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb - cell 23
🧠 Content preview:
 ### Action Pl

# maybe we can use this deterministic approach to fethc folder structure instead of LLM

In [None]:
import os

def generate_clean_repo_tree(repo_path):
    exclude_dirs = {'.git', '.devcontainer', '__pycache__', '.ipynb_checkpoints'}
    exclude_ext = {'.pdf', '.png', '.jpg', '.jpeg', '.log', '.tmp'}

    tree_lines = []

    for root, dirs, files in os.walk(repo_path):
        # Remove excluded dirs
        dirs[:] = [d for d in dirs if d not in exclude_dirs]

        # Relative indent
        indent_level = root.replace(repo_path, "").count(os.sep)
        indent = "    " * indent_level
        tree_lines.append(f"{indent}{os.path.basename(root)}/")

        for f in files:
            ext = os.path.splitext(f)[1].lower()
            if f.startswith('.') or ext in exclude_ext:
                continue  # skip hidden/unwanted files
            tree_lines.append(f"{indent}    {f}")

    return "\n".join(tree_lines)

# ✅ Generate a clean repo tree
repo_tree = generate_clean_repo_tree("tmp_repo")
print(repo_tree)


In [None]:
from pathlib import Path
import ast
import nbformat
from langchain.docstore.document import Document

def extract_chunks(repo_path: str):
    all_chunks = []
    ast_chunks = []

    for filepath in Path(repo_path).rglob("*.*"):
        try:
            if filepath.suffix == ".py":
                code = filepath.read_text(encoding="utf-8")
                tree = ast.parse(code)

                mod_doc = ast.get_docstring(tree)
                if mod_doc and 50 < len(mod_doc) < 5000:
                    doc = Document(
                        page_content=mod_doc,
                        metadata={
                            "source": str(filepath),
                            "type": "module_docstring",
                            "name": filepath.name,
                            "lines": "1-{}".format(code.count("\n") + 1)
                        }
                    )
                    all_chunks.append(doc)
                    ast_chunks.append(doc)

                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                        content = ast.get_source_segment(code, node)
                        if content and 50 < len(content) < 5000:
                            chunk = Document(
                                page_content=content,
                                metadata={
                                    "source": str(filepath),
                                    "type": type(node).__name__.lower(),
                                    "name": node.name,
                                    "lines": f"{node.lineno}-{getattr(node, 'end_lineno', node.lineno)}"
                                }
                            )
                            all_chunks.append(chunk)
                            ast_chunks.append(chunk)

                        doc = ast.get_docstring(node)
                        if doc and 50 < len(doc) < 5000:
                            doc_chunk = Document(
                                page_content=doc,
                                metadata={
                                    "source": str(filepath),
                                    "type": f"{type(node).__name__.lower()}_docstring",
                                    "name": node.name,
                                    "lines": f"{node.lineno}-{getattr(node, 'end_lineno', node.lineno)}"
                                }
                            )
                            all_chunks.append(doc_chunk)
                            ast_chunks.append(doc_chunk)

        except Exception:
            continue

    return all_chunks, ast_chunks


In [None]:
import hashlib
import json
from pathlib import Path
from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

def summarise_ast_chunks_with_cache(ast_chunks, cache_path="summary_cache.json"):
    # Load cache
    cache_file = Path(cache_path)
    if cache_file.exists():
        with open(cache_file, "r", encoding="utf-8") as f:
            summary_cache = json.load(f)
    else:
        summary_cache = {}

from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(""" You are an expert technical writer. Summarise the following code or docstring for technical documentation:
- Describe its purpose and functionality
- Mention any inputs/outputs
- Be precise and concise

Code:
```python
{code}


In [None]:
summariser = LLMChain(llm=llm, prompt=prompt)
summarised_docs = []

for doc in ast_chunks:
    summary = summariser.run(code=doc.page_content)
    summarised_docs.append(Document(
        page_content=summary.strip(),
        metadata=doc.metadata
    ))

return summarised_docs


In [None]:
repo_path = "/path/to/your/repo"

# Step 1: Extract all + AST chunks
all_chunks, ast_chunks = extract_chunks(repo_path)

# Step 2: Summarise AST chunks
summarised_ast_chunks = summarise_ast_chunks(ast_chunks)

# Step 3: Save summarised AST content to FAISS
vectordb = save_summaries_to_faiss(summarised_ast_chunks, index_dir="summarised_ast_index")
