# Working code


In [2]:
import os
import shutil
import ast
import pandas as pd
from pathlib import Path
from typing import List
from git import Repo, GitCommandError

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI

from docx import Document as DocxDocument

# --- Utility: Determine low-value files ---
def is_low_value_file(filepath):
    low_value_exts = ['.css', '.min.js', '.json', '.svg', '.csv', '.xlsx', '.xls']
    filename = os.path.basename(filepath).lower()
    return any(filename.endswith(ext) for ext in low_value_exts) or 'mock' in filename

# --- Step 1: Clone the repo (latest commit only) ---
def clone_repo(repo_url, clone_path="tmp_repo"):
    if os.path.exists(clone_path):
        shutil.rmtree(clone_path, ignore_errors=True)
    Repo.clone_from(repo_url, clone_path, depth=1)

# --- Step 2: Extract code chunks via AST ---
def extract_ast_chunks(repo_path: str) -> List[Document]:
    chunks = []
    for filepath in Path(repo_path).rglob("*.*"):
        if is_low_value_file(filepath):
            continue
        try:
            code = filepath.read_text(encoding="utf-8")
            if filepath.suffix == ".py":
                tree = ast.parse(code)
                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                        content = ast.get_source_segment(code, node)
                        if content and 50 < len(content) < 5000:
                            meta = {"source": str(filepath), "type": type(node).__name__, "name": node.name}
                            chunks.append(Document(page_content=content, metadata=meta))
            else:
                if 50 < len(code) < 5000:
                    meta = {"source": str(filepath), "type": filepath.suffix, "name": os.path.basename(filepath)}
                    chunks.append(Document(page_content=code, metadata=meta))
        except Exception:
            pass
    return chunks

# --- Step 3: Build FAISS vector store ---
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    return vectordb

# --- LLM client ---
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# --- Section functions ---
def generate_business_overview(repo_path: str) -> str:
    files = [p.relative_to(repo_path).as_posix() for p in Path(repo_path).rglob("*") if p.is_file()]
    context = "\n".join(files)
    print(context)
    prompt = (
        "You are a technical writer for enterprise software. Provide a high-level business overview of this solution, "
        "including purpose, scope, and value delivered.\n"
        f"Files in project:\n{context}\n---\nOverview:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_technical_specifications(repo_path: str) -> str:
    # Deterministically list technologies used without LLM to avoid hallucination
    ext_map = {'.py':'Python','.js':'JavaScript','.ts':'TypeScript','.java':'Java', '.html':'HTML','.css':'CSS', '.go':'Go', '.rs':'Rust'}
    techs = set()
    for p in Path(repo_path).rglob('*'):
        if p.suffix in ext_map:
            techs.add(ext_map[p.suffix])
        if p.name.lower() == 'dockerfile':
            techs.add('Docker')
        if p.name.lower() in ('requirements.txt', 'pyproject.toml'):
            techs.add('Python (dependencies)')
        if p.name.lower() == 'package.json':
            techs.add('Node.js (npm)')
    # Format as bullet list
    lines = [f"- {tech}" for tech in sorted(techs)]
    return "".join(lines)


def generate_folder_structure(repo_path: str) -> str:
    lines = []
    for root, dirs, files in os.walk(repo_path):
        level = root.replace(repo_path, '').count(os.sep)
        indent = '  ' * level
        lines.append(f"{indent}{os.path.basename(root)}/")
        for f in files:
            lines.append(f"{indent}  {f}")
    tree = "\n".join(lines)
    prompt = (
        "You are a software architect. Describe the folder structure and modular organization of this project.\n"
        f"Directory tree:\n{tree}\n---\nDescription:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_code_flow(chunks: List[Document]) -> str:
    entries = [f"{d.metadata['type']} {d.metadata['name']}" for d in chunks]
    context = '\n'.join(entries)
    prompt = (
        "You are a senior software engineer. Provide a step-by-step execution flow of this codebase, "
        "referencing functions and classes.\n"
        f"Components:\n{context}\n---\nFlow:\n"
    )
    return llm.invoke(prompt).content.strip()


def summarize_chunk(chunk: Document) -> str:
    prompt = (
        f"You are a senior software engineer. Generate concise documentation for the {chunk.metadata['type']} '{chunk.metadata['name']}'.\n"
        f"Code:\n{chunk.page_content}\n---\nDocumentation:\n"
    )
    return llm.invoke(prompt).content.strip()

# --- Step 4: Generate multi-section DOCX ---
def generate_technical_doc(chunks: List[Document], repo_path: str, output_path="technical_documentation.docx"):
    doc = DocxDocument()

    # Business Overview
    doc.add_heading("Business Overview", level=1)
    doc.add_paragraph(generate_business_overview(repo_path))
    '''
    # Technical Specifications
    doc.add_heading("Technical Specifications", level=1)
    doc.add_paragraph(generate_technical_specifications(repo_path))

    # Folder Structure
    doc.add_heading("Folder Structure", level=1)
    doc.add_paragraph(generate_folder_structure(repo_path))

    # Code Flow
    doc.add_heading("Code Flow", level=1)
    doc.add_paragraph(generate_code_flow(chunks))

    # Detailed Function/Class Documentation
    doc.add_heading("Detailed Documentation", level=1)
    for chunk in chunks:
        if chunk.metadata.get("type") in ['FunctionDef', 'ClassDef']:
            doc.add_heading(chunk.metadata.get("name", "Unnamed"), level=2)
            doc.add_paragraph(f"Source: {chunk.metadata.get('source')}")
            doc.add_paragraph(summarize_chunk(chunk))
            doc.add_paragraph("—" * 30)
    '''

    doc.save(output_path)
    print(f"✅ Saved technical documentation to `{output_path}`")

# --- Step 5: Main runner ---
def main(repo_url: str):
    repo_path = "tmp_repo"
    #clone_repo(repo_url, repo_path)
    chunks = extract_ast_chunks(repo_path)
    pd.DataFrame([{
        "name": d.metadata.get("name"),
        "type": d.metadata.get("type"),
        "source": d.metadata.get("source"),
        "length": len(d.page_content)
    } for d in chunks]).to_csv("ast_chunk_summary.csv", index=False)
    build_faiss_from_ast_chunks(chunks)
    generate_technical_doc(chunks, repo_path)

if __name__ == "__main__":
    main("https://github.com/adarshlearnngrow/StepUpYourCareer.AI")


ClusteringMentorModelTraining.ipynb
Job_Description_JD_Manupulation.ipynb
Presentation - StepUpYourCareer.ai Elevate Your Future.pdf
README.md
Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb
.devcontainer/devcontainer.json
.git/config
.git/description
.git/HEAD
.git/index
.git/packed-refs
.git/shallow
data/all_roles_student_resumes.json
data/req_job_desc.csv
data/role_skills.csv
data/role_skills.json
data/sim_resume.json
data/skill_gap_analysis.csv
data/skill_gap_analysis.json
StepUpAI/app.py
StepUpAI/generated_mentors.json
StepUpAI/mentors_final_data.json
StepUpAI/requirements.txt
StepUpAI/role_skills.json
StepUpAI/skill_gap_analysis.json
StepUpAI/skill_resource_mapping.json
StepUpAI/models/fitted_vectorizer.pkl
StepUpAI/models/mentor_clustering_model.pkl
.git/hooks/applypatch-msg.sample
.git/hooks/commit-msg.sample
.git/hooks/fsmonitor-watchman.sample
.git/hooks/post-update.sample
.git/hooks/pre-applypatch.sample
.git/hooks/pre-commit.sample
.git/hooks/pre-merge-commit.sample
.gi