# Working code


In [25]:
import os
import shutil
import ast
import pandas as pd
from pathlib import Path
from typing import List
from git import Repo, GitCommandError

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI

from docx import Document as DocxDocument

# --- Utility: Determine low-value files ---
def is_low_value_file(filepath):
    """Return True when a file should be skipped during chunk extraction.

    A file is skipped when its lower-cased base name ends with one of the
    asset/data suffixes below, or contains the substring ``mock``.
    """
    skipped_suffixes = ('.css', '.min.js', '.json', '.svg', '.csv', '.xlsx', '.xls')
    base_name = os.path.basename(filepath).lower()
    if 'mock' in base_name:
        return True
    # str.endswith accepts a tuple, so one call covers every suffix.
    return base_name.endswith(skipped_suffixes)

# --- Step 1: Clone the repo (latest commit only) ---
def clone_repo(repo_url, clone_path="tmp_repo") -> str:
    """Shallow-clone ``repo_url`` (latest commit only) into ``clone_path``.

    Any existing directory at ``clone_path`` is removed first so the clone
    always starts from a clean location.

    Args:
        repo_url: URL of the Git repository to clone.
        clone_path: Local directory to clone into.

    Returns:
        ``clone_path``, so callers can chain the result into later steps.
    """
    if os.path.exists(clone_path):
        shutil.rmtree(clone_path, ignore_errors=True)
    Repo.clone_from(repo_url, clone_path, depth=1)
    return clone_path

# --- Step 2: Extract code chunks via AST ---
def extract_ast_chunks(repo_path: str) -> List[Document]:
    """Split a repository into documentable chunks.

    Python files contribute one chunk per top-level function (sync or async)
    or class; every other text file contributes a single whole-file chunk.
    Only chunks whose text is longer than 50 and shorter than 5000 characters
    are kept. Files that cannot be read or parsed are skipped.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        A list of ``Document`` objects whose metadata carries ``source``
        (file path), ``type`` (AST node class name or file suffix) and
        ``name`` (definition name or file name).
    """
    definition_nodes = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    root = Path(repo_path)
    chunks = []
    # Sorted so the chunk order (and everything derived from it) is stable.
    for filepath in sorted(root.rglob("*.*")):
        # "*.*" also matches dotted directory names, and Git's own files
        # (e.g. hooks/*.sample) are not project content.
        if not filepath.is_file() or ".git" in filepath.relative_to(root).parts:
            continue
        if is_low_value_file(filepath):
            continue
        try:
            code = filepath.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue  # unreadable or not UTF-8 text

        if filepath.suffix != ".py":
            if 50 < len(code) < 5000:
                meta = {"source": str(filepath), "type": filepath.suffix, "name": os.path.basename(filepath)}
                chunks.append(Document(page_content=code, metadata=meta))
            continue

        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError, RecursionError):
            continue  # not valid Python for this interpreter
        for node in tree.body:
            if not isinstance(node, definition_nodes):
                continue
            content = ast.get_source_segment(code, node)
            if content and 50 < len(content) < 5000:
                meta = {"source": str(filepath), "type": type(node).__name__, "name": node.name}
                chunks.append(Document(page_content=content, metadata=meta))
    return chunks

# --- Step 3: Build FAISS vector store ---
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    """Embed ``chunks`` and persist them as a FAISS index.

    Args:
        chunks: Documents to embed; must contain at least one item.
        db_path: Directory the index is saved to.

    Returns:
        The in-memory FAISS vector store that was saved.

    Raises:
        ValueError: If ``chunks`` is empty. An index cannot be sized without
            at least one embedding, so fail here with a clear message rather
            than deep inside the vector-store library.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index: no chunks were provided.")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    return vectordb

# --- LLM client ---
# Module-level chat client shared by the LLM-backed section functions below
# (they call `llm.invoke(prompt)`).
# NOTE(review): temperature=1.5 is a high sampling temperature for factual
# documentation; the later cell in this file builds its client with
# temperature=0 -- confirm which is intended.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=1.5)

# --- Section functions ---
def generate_business_overview(repo_path: str) -> str:
    """Ask the LLM for a business overview based on the project's file list.

    The prompt lists every file path (POSIX-style, relative to the repo
    root). Git's internal ``.git`` directory is left out because its object
    and hook files say nothing about the product and only inflate the prompt.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        The model's overview text with surrounding whitespace stripped.
    """
    root = Path(repo_path)
    relative_files = (p.relative_to(root) for p in root.rglob("*") if p.is_file())
    # Sorted so the same checkout always produces the same prompt.
    files = sorted(rel.as_posix() for rel in relative_files if ".git" not in rel.parts)
    context = "\n".join(files)
    prompt = (
        "You are a technical writer for enterprise software. Provide a high-level business overview of this solution, "
        "including purpose, scope, and value delivered.\n"
        f"Files in project:\n{context}\n---\nOverview:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_technical_specifications(repo_path: str) -> str:
    """List the technologies detected in a repository, one bullet per line.

    Detection is purely file-based (no LLM, so nothing is hallucinated):
    known source-file suffixes map to a language, and a few well-known file
    names (Dockerfile, requirements.txt, pyproject.toml, package.json) map to
    tooling.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        Alphabetically sorted ``- <technology>`` lines joined by newlines,
        or an empty string when nothing is recognised.
    """
    ext_map = {'.py':'Python','.js':'JavaScript','.ts':'TypeScript','.java':'Java', '.html':'HTML','.css':'CSS', '.go':'Go', '.rs':'Rust'}
    techs = set()
    for p in Path(repo_path).rglob('*'):
        suffix = p.suffix.lower()  # treat ".PY" the same as ".py"
        if suffix in ext_map:
            techs.add(ext_map[suffix])
        name = p.name.lower()
        if name == 'dockerfile':
            techs.add('Docker')
        if name in ('requirements.txt', 'pyproject.toml'):
            techs.add('Python (dependencies)')
        if name == 'package.json':
            techs.add('Node.js (npm)')
    # One bullet per line; joining with "" would fuse them into one line.
    return "\n".join(f"- {tech}" for tech in sorted(techs))


def generate_folder_structure(repo_path: str) -> str:
    """Ask the LLM to describe the project's folder layout.

    An indented directory tree (two spaces per level, directories suffixed
    with ``/``) is rendered and embedded in the prompt.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        The model's description with surrounding whitespace stripped.
    """
    # Normalise first so a trailing separator does not yield an empty root
    # label or shift every nesting level.
    root_dir = os.path.normpath(repo_path)
    lines = []
    for current, dirs, files in os.walk(root_dir):
        # Prune Git internals and fix the traversal order; os.walk honours
        # in-place edits of `dirs` when walking top-down.
        dirs[:] = sorted(d for d in dirs if d != ".git")
        relative = os.path.relpath(current, root_dir)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = '  ' * level
        lines.append(f"{indent}{os.path.basename(current)}/")
        lines.extend(f"{indent}  {f}" for f in sorted(files))
    tree = "\n".join(lines)
    prompt = (
        "You are a software architect. Describe the folder structure and modular organization of this project.\n"
        f"Directory tree:\n{tree}\n---\nDescription:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_code_flow(chunks: List[Document]) -> str:
    """Ask the LLM for a step-by-step execution flow of the codebase.

    Each chunk contributes one ``<type> <name>`` line (read from its
    metadata) to the component listing embedded in the prompt.
    """
    component_lines = []
    for chunk in chunks:
        info = chunk.metadata
        component_lines.append(f"{info['type']} {info['name']}")
    context = '\n'.join(component_lines)

    prompt = (
        "You are a senior software engineer. Provide a step-by-step execution flow of this codebase, "
        "referencing functions and classes.\n"
        f"Components:\n{context}\n---\nFlow:\n"
    )
    reply = llm.invoke(prompt)
    return reply.content.strip()


def summarize_chunk(chunk: Document) -> str:
    """Ask the LLM for concise documentation of a single code chunk.

    The chunk's ``type`` and ``name`` metadata identify it in the prompt and
    its ``page_content`` is included as the code to document.
    """
    kind = chunk.metadata['type']
    identifier = chunk.metadata['name']
    header = f"You are a senior software engineer. Generate concise documentation for the {kind} '{identifier}'.\n"
    body = f"Code:\n{chunk.page_content}\n---\nDocumentation:\n"
    reply = llm.invoke(header + body)
    return reply.content.strip()

# --- Step 4: Generate multi-section DOCX ---
def generate_technical_doc(chunks: List[Document], repo_path: str, output_path="Documents\\technical_documentation.docx", *, include_all_sections: bool = False):
    """Write the technical documentation DOCX for a repository.

    The "Business Overview" section is always written. The remaining
    sections (technical specifications, folder structure, code flow and
    per-definition documentation) make one LLM call each -- plus one per
    function/class chunk -- so they are opt-in.

    Args:
        chunks: Code chunks produced by ``extract_ast_chunks``.
        repo_path: Root directory of the checked-out repository.
        output_path: Destination ``.docx`` file.
        include_all_sections: When True, also write every section after the
            business overview. Defaults to False (overview only).
    """
    doc = DocxDocument()

    # Business Overview
    doc.add_heading("Business Overview", level=1)
    doc.add_paragraph(generate_business_overview(repo_path))

    if include_all_sections:
        # Technical Specifications
        doc.add_heading("Technical Specifications", level=1)
        doc.add_paragraph(generate_technical_specifications(repo_path))

        # Folder Structure
        doc.add_heading("Folder Structure", level=1)
        doc.add_paragraph(generate_folder_structure(repo_path))

        # Code Flow
        doc.add_heading("Code Flow", level=1)
        doc.add_paragraph(generate_code_flow(chunks))

        # Detailed Function/Class Documentation
        doc.add_heading("Detailed Documentation", level=1)
        for chunk in chunks:
            if chunk.metadata.get("type") in ('FunctionDef', 'AsyncFunctionDef', 'ClassDef'):
                doc.add_heading(chunk.metadata.get("name", "Unnamed"), level=2)
                doc.add_paragraph(f"Source: {chunk.metadata.get('source')}")
                doc.add_paragraph(summarize_chunk(chunk))
                doc.add_paragraph("—" * 30)

    # Saving fails if the target folder is missing, so create it first.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    doc.save(output_path)
    print(f"Saved technical documentation to `{output_path}`")

# --- Step 5: Main runner ---
def main(repo_url: str):
    """Run the documentation pipeline for one repository.

    Steps: obtain a local checkout in ``tmp_repo``, extract code chunks,
    write a CSV summary of them, build the FAISS index, and generate the
    DOCX report.

    Args:
        repo_url: Git URL of the repository; cloned only when ``tmp_repo``
            does not already exist, so an existing checkout is reused.
    """
    repo_path = "tmp_repo"
    if not os.path.isdir(repo_path):
        clone_repo(repo_url, repo_path)
    chunks = extract_ast_chunks(repo_path)

    summary_rows = [
        {
            "name": d.metadata.get("name"),
            "type": d.metadata.get("type"),
            "source": d.metadata.get("source"),
            "length": len(d.page_content),
        }
        for d in chunks
    ]
    # Explicit columns keep the CSV header even when no chunks were found.
    summary = pd.DataFrame(summary_rows, columns=["name", "type", "source", "length"])
    summary.to_csv("ast_chunk_summary.csv", index=False)

    # A vector index needs at least one document; skip it otherwise.
    if chunks:
        build_faiss_from_ast_chunks(chunks)
    else:
        print(f"No code chunks found under `{repo_path}`; skipping FAISS index build.")
    generate_technical_doc(chunks, repo_path)

if __name__ == "__main__":
    # Run the full pipeline against the StepUpYourCareer.AI repository.
    main("https://github.com/adarshlearnngrow/StepUpYourCareer.AI")


Saved technical documentation to `Documents\technical_documentation.docx`


## Overview


In [None]:
import os
import shutil
import ast
import pandas as pd
from pathlib import Path
from typing import List
import sys

from git import Repo, GitCommandError
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI

from docx import Document as DocxDocument

# --- Utility: Determine low-value files ---
def is_low_value_file(filepath):
    """Tell whether a file carries too little signal to be worth indexing.

    Matches on the lower-cased base name: stylesheet, minified-JS, JSON, SVG
    and spreadsheet/CSV suffixes, or any name containing ``mock``.
    """
    ignored_endings = ('.css', '.min.js', '.json', '.svg', '.csv', '.xlsx', '.xls')
    name = os.path.basename(filepath).lower()
    has_ignored_ending = name.endswith(ignored_endings)
    return has_ignored_ending or 'mock' in name

# --- Step 1: Clone the repo (latest commit only) ---
def clone_repo(repo_url, clone_path="tmp_repo") -> str:
    """
    Make a depth-1 clone of ``repo_url`` at ``clone_path`` and return that path.

    A pre-existing ``clone_path`` is deleted first (deletion errors are
    ignored) so the clone starts from a clean location.
    """
    already_present = os.path.exists(clone_path)
    if already_present:
        shutil.rmtree(clone_path, ignore_errors=True)

    Repo.clone_from(repo_url, clone_path, depth=1)
    return clone_path

# --- Step 2: Extract code chunks via AST ---
def extract_ast_chunks(repo_path: str) -> List[Document]:
    """Split a repository into documentable chunks.

    Python files contribute one chunk per top-level function (sync or async)
    or class; every other text file contributes a single whole-file chunk.
    Only chunks whose text is longer than 50 and shorter than 5000 characters
    are kept. Files that cannot be read or parsed are skipped.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        A list of ``Document`` objects whose metadata carries ``source``
        (file path), ``type`` (AST node class name or file suffix) and
        ``name`` (definition name or file name).
    """
    definition_nodes = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    root = Path(repo_path)
    chunks = []
    # Sorted so the chunk order (and everything derived from it) is stable.
    for filepath in sorted(root.rglob("*.*")):
        # "*.*" also matches dotted directory names, and Git's own files
        # (e.g. hooks/*.sample) are not project content.
        if not filepath.is_file() or ".git" in filepath.relative_to(root).parts:
            continue
        if is_low_value_file(filepath):
            continue
        try:
            code = filepath.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue  # unreadable or not UTF-8 text

        if filepath.suffix != ".py":
            if 50 < len(code) < 5000:
                meta = {"source": str(filepath), "type": filepath.suffix, "name": os.path.basename(filepath)}
                chunks.append(Document(page_content=code, metadata=meta))
            continue

        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError, RecursionError):
            continue  # not valid Python for this interpreter
        for node in tree.body:
            if not isinstance(node, definition_nodes):
                continue
            content = ast.get_source_segment(code, node)
            if content and 50 < len(content) < 5000:
                meta = {"source": str(filepath), "type": type(node).__name__, "name": node.name}
                chunks.append(Document(page_content=content, metadata=meta))
    return chunks

# --- Step 3: Build FAISS vector store ---
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    """Embed ``chunks`` and persist them as a FAISS index.

    Args:
        chunks: Documents to embed; must contain at least one item.
        db_path: Directory the index is saved to.

    Returns:
        The in-memory FAISS vector store that was saved.

    Raises:
        ValueError: If ``chunks`` is empty. An index cannot be sized without
            at least one embedding, so fail here with a clear message rather
            than deep inside the vector-store library.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index: no chunks were provided.")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    return vectordb

# --- Step 4: Load README and docstrings into vectorstore ---
def load_readme(repo_path: str) -> str:
    """Return the text of the repository's top-level README, or "" if none.

    Candidates are tried in priority order: ``README.md``, ``README.rst``,
    ``README.txt``. Exact names win; if none exists, the same names are
    matched case-insensitively so ``Readme.md`` or ``readme.rst`` are found
    on case-sensitive file systems too.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        The README contents decoded as UTF-8, or an empty string when no
        README file is present (or ``repo_path`` is not a directory).
    """
    root = Path(repo_path)
    candidates = ('README.md', 'README.rst', 'README.txt')

    for name in candidates:
        exact = root / name
        if exact.is_file():
            return exact.read_text(encoding='utf-8')

    if not root.is_dir():
        return ""

    # Case-insensitive fallback; sorting keeps the choice deterministic when
    # several spellings of the same name exist.
    by_lower_name = {}
    for entry in sorted(root.iterdir()):
        if entry.is_file():
            by_lower_name.setdefault(entry.name.lower(), entry)
    for name in candidates:
        match = by_lower_name.get(name.lower())
        if match is not None:
            return match.read_text(encoding='utf-8')
    return ""

def extract_python_docstrings(repo_path: str) -> List[str]:
    """Collect every non-empty docstring from the ``.py`` files under a repo.

    Module, class, function and async-function docstrings are gathered in
    the order ``ast.walk`` visits them. A file that cannot be read or parsed
    is skipped.
    """
    documented_kinds = (ast.Module, ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    collected: List[str] = []
    for source_file in Path(repo_path).rglob('*.py'):
        try:
            module = ast.parse(source_file.read_text(encoding='utf-8'))
            for candidate in ast.walk(module):
                if not isinstance(candidate, documented_kinds):
                    continue
                text = ast.get_docstring(candidate)
                if text:
                    collected.append(text)
        except Exception:
            # Best effort: move on to the next file.
            continue
    return collected

def load_all_markdown(repo_path: str) -> str:
    """
    Concatenate every .md file under the repo (including subdirectories).

    Files are joined in sorted path order, separated by a blank line, so the
    result is the same on every run. Bytes that are not valid UTF-8 are
    replaced with U+FFFD instead of aborting the whole concatenation.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        The combined Markdown text, or an empty string when there is none.
    """
    markdown_files = sorted(p for p in Path(repo_path).rglob("*.md") if p.is_file())
    return "\n\n".join(
        md.read_text(encoding="utf-8", errors="replace") for md in markdown_files
    )

def build_doc_vectorstore(readme: str, docstrings: List[str], index_dir: str = 'docs_index') -> "FAISS | None":
    """Index the README and docstrings in a FAISS store saved to ``index_dir``.

    Each text becomes one document tagged ``source='readme'`` or
    ``source='docstring'``. The tag follows what the text actually is, so
    the first docstring is not mislabelled when there is no README.

    Args:
        readme: README contents; blank or empty means "no README".
        docstrings: Docstring texts to index.
        index_dir: Directory the index is saved to.

    Returns:
        The FAISS vector store, or ``None`` when there is nothing to index
        (an index cannot be built from zero documents). ``None`` lets
        ``generate_business_overview`` fall through to its next context
        source.
    """
    docs = []
    if readme and readme.strip():
        docs.append(Document(page_content=readme, metadata={'source': 'readme'}))
    docs.extend(
        Document(page_content=text, metadata={'source': 'docstring'})
        for text in docstrings
    )
    if not docs:
        return None

    embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    vectordb = FAISS.from_documents(docs, embedder)
    vectordb.save_local(index_dir)
    return vectordb

# --- Step 5: Generate Business Overview ---
def generate_business_overview(readme_text: str, repo_path: str, vectordb: FAISS = None, k: int = 5) -> str:
    """
    Produce a high-level business overview of the codebase with the LLM.

    Context is chosen in priority order:
      1. the README text, when it is not blank;
      2. the top ``k`` "business overview" similarity hits from ``vectordb``;
      3. a description typed by the user at an interactive prompt.

    Args:
        readme_text: README contents; may be empty.
        repo_path: Path of the analysed repository. Not read by this
            function; kept so existing callers continue to work.
        vectordb: Optional FAISS store searched when there is no README.
        k: Number of snippets to retrieve from ``vectordb``.

    Returns:
        The model's overview text with surrounding whitespace stripped.

    Raises:
        SystemExit: With exit status 1 when there is no README, no
            retrievable snippet, and the user enters no description.
    """
    # Prefer README content
    context = readme_text if readme_text and readme_text.strip() else ""

    if not context and vectordb:
        snippets = vectordb.similarity_search("business overview", k=k)
        context = "\n\n".join(doc.page_content for doc in snippets)

    if not context.strip():
        context = input(
            "No README or docstrings found. Please provide high-level context or project description: "
        )
        # Use what the user typed; only give up when they typed nothing.
        if not context.strip():
            sys.exit("No project context provided; cannot generate a business overview.")

    # Construct prompt with proper quoting
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    prompt = f"""
You are a technical writer for enterprise software. Given the following context, provide a high-level business overview of this codebase, including purpose, scope, and value delivered.

Context:
{context}

---
Overview:
"""
    return llm.invoke(prompt).content.strip()


# --- Entry Point Example ---
if __name__ == "__main__":
    # Shallow-clone the target repository; clone_repo returns the local path.
    repo_url = "https://github.com/adarshlearnngrow/StepUpYourCareer.AI"
    repo_path = clone_repo(repo_url)

    # Gather natural-language context (README plus Python docstrings), index
    # it, and generate the overview from it.
    readme_text = load_readme(repo_path)
    docstrings = extract_python_docstrings(repo_path)
    docs_vectordb = build_doc_vectorstore(readme_text, docstrings)
    overview = generate_business_overview(readme_text, repo_path, docs_vectordb)
    print("Business Overview:", overview)

    # Code chunks are extracted here; nothing in the visible part of this
    # cell consumes them yet.
    chunks = extract_ast_chunks(repo_path)

Business Overview: The StepUpYourCareer.ai codebase is a comprehensive AI-powered career assistant designed to address the common problem faced by graduates and job seekers of not knowing what skills are required by employers and how to efficiently upskill. The purpose of this codebase is to transform resumes into personalized upskilling journeys by analyzing skill gaps, generating action plans for upskilling, and matching users with industry mentors.

The scope of this codebase includes a Skill Gap Analyzer that extracts skills from resumes and compares them to target roles, an Action Plan Generator that recommends online courses and resources for missing skills, and a Mentor Matching feature that connects users with industry experts through clustering algorithms.

The value delivered by this codebase is the ability to provide personalized learning roadmaps, identify skill gaps, and facilitate connections with industry mentors, all from a single resume upload. This not only helps user

IndexError: list index out of range