In [None]:
import os
from pathlib import Path
from git import Repo

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


# Step 1: Clone GitHub Repository
def clone_repo(repo_url, clone_path="tmp_repo"):
    if not os.path.exists(clone_path):
        print(f"Cloning {repo_url} ...")
        Repo.clone_from(repo_url, clone_path)
        print("✅ Repo cloned.")
    else:
        print("⚠️ Repo already exists.")


# Step 2: Extract .py, .md, .txt files
def load_code_files(repo_path):
    code_files = []
    for ext in [".py", ".md", ".txt"]:
        code_files.extend(Path(repo_path).rglob(f"*{ext}"))
    return code_files


# Step 3: Read files and split into chunks
def read_and_split(files, chunk_size=500, chunk_overlap=50):
    docs = []
    for fpath in files:
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                content = f.read()
            doc = Document(page_content=content, metadata={"source": str(fpath)})
            docs.append(doc)
        except Exception as e:
            print(f"❌ Failed to read {fpath}: {e}")
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)


# Step 4: Generate embeddings and store in FAISS
def build_vector_db(chunks, db_path="faiss_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    print(f"✅ Vector DB saved at `{db_path}`.")


# Main Pipeline
if __name__ == "__main__":
    repo_url = "https://github.com/YOUR_USERNAME/YOUR_REPO.git"  # ⬅️ Replace with your repo
    repo_path = "tmp_repo"

    clone_repo(repo_url, repo_path)
    files = load_code_files(repo_path)
    print(f"📁 Loaded {len(files)} code files.")

    chunks = read_and_split(files)
    print(f"📄 Split into {len(chunks)} text chunks.")

    build_vector_db(chunks, db_path="faiss_index")


⏳ Loading embedding model …
⏳ Loading LLM … (first time may take a while)


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is en

In [None]:
from openai import OpenAI
client = OpenAI()

query = "What are the key skills missing in this resume?"
docs = vectordb.similarity_search(query, k=5)
context = "\n\n".join([doc.page_content for doc in docs])

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You're a helpful career advisor."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{query}"}
    ]
)
print(response.choices[0].message.content)

In [None]:
# 1. embed & index  ----------------------------------------------------------
from git import Repo
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from glob import glob, iglob
import os, textwrap

repo_path = "repos/requests"
Repo.clone_from("https://github.com/psf/requests", repo_path, depth=1)

chunks, metadatas = [], []
for path in iglob(f"{repo_path}/**/*", recursive=True):
    if path.endswith(('.py', '.md', '.txt', '.json', '.ppt')):
        with open(path, 'r', errors='ignore') as f:
            content = f.read()
        for part in textwrap.wrap(content, 800):           # simple chunk
            chunks.append(part)
            metadatas.append({"path": path})

emb = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectordb = FAISS.from_texts(chunks, emb, metadatas=metadatas)
vectordb.save_local("vector_db/requests.faiss")


In [None]:
import os
from pathlib import Path
from git import Repo

from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


# Step 1: Clone GitHub Repository
def clone_repo(repo_url, clone_path="tmp_repo"):
    if not os.path.exists(clone_path):
        print(f"Cloning {repo_url} ...")
        Repo.clone_from(repo_url, clone_path)
        print("Repo cloned.")
    else:
        print("Repo already exists.")


# Step 2: Extract .py, .md, .txt files
def load_code_files(repo_path):
    code_files = []
    for ext in [".py", ".md", ".txt", ".html", ".js", ".json"]:
        code_files.extend(Path(repo_path).rglob(f"*{ext}"))
    return code_files


# Step 3: Read files and split into chunks
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    PythonCodeTextSplitter,
    MarkdownTextSplitter,
    HTMLHeaderTextSplitter
)
from langchain_core.documents import Document

def get_splitter(file_path):
    ext = Path(file_path).suffix.lower()

    if ext == ".py":
        return PythonCodeTextSplitter(chunk_size=500, chunk_overlap=50)
    elif ext == ".md":
        return MarkdownTextSplitter(chunk_size=500, chunk_overlap=50)
    elif ext in [".html", ".htm"]:
        return HTMLHeaderTextSplitter(headers_to_split_on=[("h1", "Title"), ("h2", "Section")])
    else:
        # Fallback for .js, .json, .txt, etc.
        return RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

def read_and_split(files):
    chunks = []
    for fpath in files:
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                content = f.read()
            doc = Document(page_content=content, metadata={"source": str(fpath)})
            splitter = get_splitter(fpath)
            split_chunks = splitter.split_documents([doc])
            chunks.extend(split_chunks)
        except Exception as e:
            print(f"Failed to read {fpath}: {e}")
    return chunks



# Step 4: Generate embeddings and store in FAISS
def build_vector_db(chunks, db_path="faiss_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    print(f"✅ Vector DB saved at `{db_path}`.")


# Main Pipeline
if __name__ == "__main__":
    repo_url = "https://github.com/adarshlearnngrow/StepUpYourCareer.AI"  # ⬅️ Replace with your repo
    repo_path = "tmp_repo"

    clone_repo(repo_url, repo_path)
    files = load_code_files(repo_path)
    print(f"Loaded {len(files)} code files.")

    chunks = read_and_split(files)
    print(f"Split into {len(chunks)} text chunks.")

    chunk_lengths = [len(chunk.page_content) for chunk in chunks]
    longest_idx = chunk_lengths.index(max(chunk_lengths))

    print(f"\n📊 Total Chunks: {len(chunks)}")
    print(f"📏 Longest Chunk Length: {chunk_lengths[longest_idx]} characters")
    print(f"📂 Source File: {chunks[longest_idx].metadata['source']}")
    print(f"🧾 Preview:\n{chunks[longest_idx].page_content[:300]}...")

    build_vector_db(chunks, db_path="faiss_index")
    

Repo already exists.
Loaded 3 code files.
Split into 68 text chunks.

📊 Total Chunks: 68
📏 Longest Chunk Length: 483 characters
📂 Source File: tmp_repo\StepUpAI\app.py
🧾 Preview:
def generate_skill_gap(resume_text, target_role, retrieved_examples, fallback_skills):
    examples_prompt = "\n\n".join([
    f"Example for role {ex['target_role']}:\nResume: {ex['resume_summary']}\nSkill Gaps: tech={ex['technical_skill_gap']}, soft={ex['soft_skill_gap']}, transferable={ex['transfe...
✅ Vector DB saved at `faiss_index`.


In [15]:
import pandas as pd

chunk_data = [{
    "source": chunk.metadata["source"],
    "length": len(chunk.page_content),
    "content": chunk.page_content
} for chunk in chunks]

df = pd.DataFrame(chunk_data)
df.to_csv("chunk_inspection.csv", index=False, encoding='utf-8')
print("✅ Saved chunk details to `chunk_inspection.csv`.")

✅ Saved chunk details to `chunk_inspection.csv`.


In [16]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# 1. Load the vector DB
db_path = "faiss_index"
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = FAISS.load_local(db_path, embedder, allow_dangerous_deserialization=True)

# 2. Define your query
query = "Give me an technical overview of the project."

# 3. Perform similarity search
results = vectordb.similarity_search(query, k=5)  # k is number of results

# 4. Print the top results
for i, doc in enumerate(results, 1):
    print(f"\n--- Result {i} ---")
    print(doc.page_content[:500])  # Show only first 500 characters
    print(f"Source: {doc.metadata.get('source')}")



--- Result 1 ---
![Notes_250516_042908_4](https://github.com/user-attachments/assets/ba3b34ec-57ec-4bf5-906a-84af82342b8b)

![Notes_250516_042908_3](https://github.com/user-attachments/assets/4b7e5f11-7923-427c-be95-cbcd7d919c5b)
Source: tmp_repo\README.md

--- Result 2 ---
# StepUpYourCareer.ai: Elevate Your Future

An AI-powered career assistant that helps students and job seekers identify **skill gaps**, receive **personalized learning roadmaps**, and connect with **industry mentors**—all from a single resume upload.

### Link to the website: https://stepupyourcareer.streamlit.app/

---

## Problem
Source: tmp_repo\README.md

--- Result 3 ---
- **Skill Gap Analyzer**: Extracts skills from your resume and compares them to your target role
- **Action Plan Generator**: Recommends curated online courses and resources for each missing skill
- **Mentor Matching**: Clusters users and mentors using K-Means to connect you with experts in your domain

---

![Notes_250516_042908_1](https://gi

In [22]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI

# Load vector DB
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = FAISS.load_local("faiss_index", embedder, allow_dangerous_deserialization=True)

# LLM
llm = ChatOpenAI(model="gpt-3.5-turbo")

# Define multiple prompt templates for different doc sections
sections = {
    "Overview": """
    You are a technical writer. Given this code chunk, write a high-level overview of the entire application.
    Focus on what the system does and its purpose.
    
    ### Code:
    {code_chunk}
    
    ### Overview:
    """,
    
    "Installation Instructions": """
    From the following code, infer any setup steps or dependencies.
    Generate clear installation instructions for the user, like a README install section.
    
    ### Code:
    {code_chunk}
    
    ### Installation:
    """,

    "Usage Instructions": """
    Based on the code below, write a usage guide or how-to for running the app or module.
    
    ### Code:
    {code_chunk}
    
    ### Usage:
    """,

    "File/Module Description": """
    Explain what this specific file or module is responsible for.
    Mention any functions or logic it contains.
    
    ### Code:
    {code_chunk}
    
    ### File Purpose:
    """,

    "Core Logic Explanation": """
    Explain the core logic or algorithm in this code block.
    Keep it technical, but understandable to junior developers.
    
    ### Code:
    {code_chunk}
    
    ### Logic Explanation:
    """,
}

# Run similarity search
query = "project summary"
top_chunks = vectordb.similarity_search(query, k=3)

# Loop through chunks and generate each doc section
for i, doc in enumerate(top_chunks):
    print(f"\n\n==================== Code Chunk {i+1} ====================")
    for section_title, prompt_template in sections.items():
        prompt = prompt_template.format(code_chunk=doc.page_content)
        response = llm.invoke(prompt)
        print(f"\n🔹 {section_title}:\n{response.content}")





🔹 Overview:
The Action Plan Generator is a software application designed to create action plans for various tasks or projects. Users can input specific details about the task or project, such as objectives, deadlines, and resources, and the application will generate a detailed action plan outlining the steps needed to achieve the desired outcome. This tool is intended to streamline the planning process and help users stay organized and focused on their goals.

🔹 Installation Instructions:
To use the Action Plan Generator, you will need to have Python installed on your computer. You can download Python from the official website: https://www.python.org/downloads/

1. Clone the repository or download the code files to your local machine.
2. Open a terminal or command prompt and navigate to the directory where the code files are located.
3. Run the following command to install the required dependencies:
   
   ```
   pip install pandas
   ```

4. Once the dependencies are installed, you

In [23]:
import os
import ast
from pathlib import Path
from typing import List, Dict
import pandas as pd

# Step 1: Extract function/class-based code blocks using AST
def extract_code_blocks_from_repo(repo_path: str) -> List[Dict]:
    blocks = []
    for filepath in Path(repo_path).rglob("*.py"):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                code = f.read()
            tree = ast.parse(code)
            for node in tree.body:
                if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                    block_code = ast.get_source_segment(code, node)
                    blocks.append({
                        "name": getattr(node, 'name', 'unknown'),
                        "type": type(node).__name__,
                        "code": block_code,
                        "source": str(filepath)
                    })
        except Exception as e:
            print(f"⚠️ Error reading {filepath}: {e}")
    return blocks

# Extract blocks from the user's cloned repo
repo_path = "tmp_repo"
code_blocks = extract_code_blocks_from_repo(repo_path)

# Convert to DataFrame for inspection
df_blocks = pd.DataFrame(code_blocks)
import ace_tools as tools; tools.display_dataframe_to_user(name="AST Code Blocks", dataframe=df_blocks.head(10))

# Return metadata for confirmation
{
    "total_blocks": len(code_blocks),
    "unique_files": df_blocks['source'].nunique()
}


ModuleNotFoundError: No module named 'ace_tools'

# AST

In [None]:
# Full pipeline for AST-based chunking + FAISS + LLM summarisation

import os
import ast
from pathlib import Path
from typing import List, Dict
import pandas as pd
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI


def clone_repo(repo_url, clone_path="tmp_repo"):
    if not os.path.exists(clone_path):
        print(f"Cloning {repo_url} ...")
        Repo.clone_from(repo_url, clone_path)
        print("✅ Repo cloned.")
    else:
        print("⚠️ Repo already exists.")


# Step 2: Extract .py, .md, .txt files
def load_code_files(repo_path):
    code_files = []
    for ext in [".py", ".md", ".txt"]:
        code_files.extend(Path(repo_path).rglob(f"*{ext}"))
    return code_files

# Step 1: Extract code blocks using AST
def extract_ast_chunks(repo_path: str) -> List[Document]:
    documents = []
    for filepath in Path(repo_path).rglob("*.py"):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                code = f.read()
            tree = ast.parse(code)
            for node in tree.body:
                if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                    block_code = ast.get_source_segment(code, node)
                    if block_code:
                        metadata = {
                            "source": str(filepath),
                            "type": type(node).__name__,
                            "name": getattr(node, 'name', 'unknown')
                        }
                        documents.append(Document(page_content=block_code, metadata=metadata))
        except Exception as e:
            print(f"⚠️ Error parsing {filepath}: {e}")
    return documents

# Step 2: Embed and store chunks in FAISS
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    print(f"✅ FAISS vector DB created at: {db_path}")
    return vectordb

# Step 3: Define prompt for LLM doc generation
def generate_doc_for_code_chunk(chunk: Document, model_name="gpt-3.5-turbo") -> str:
    llm = ChatOpenAI(model=model_name)
    prompt = f"""
You are a senior software engineer.

Please generate a clean, technical, and concise docstring or explanation for the following {chunk.metadata.get('type')} named `{chunk.metadata.get('name')}`.

### Code:
{chunk.page_content}

### Documentation:
"""
    return llm.invoke(prompt).content

# Run the pipeline
repo_path = "tmp_repo"
ast_chunks = extract_ast_chunks(repo_path)
vectordb = build_faiss_from_ast_chunks(ast_chunks, db_path="faiss_ast_index")

# Return summary for user
chunk_summaries = [{
    "name": doc.metadata.get("name"),
    "type": doc.metadata.get("type"),
    "source": doc.metadata.get("source"),
    "length": len(doc.page_content)
} for doc in ast_chunks]

df_summary = pd.DataFrame(chunk_summaries)
df_summary.to_csv("ast_chunk_summary.csv", index=False)
print("✅ Saved summary to `ast_chunk_summary.csv`.")



✅ FAISS vector DB created at: faiss_ast_index
✅ Saved summary to `ast_chunk_summary.csv`.


In [29]:
import os
import shutil
import ast
from pathlib import Path
from typing import List
import pandas as pd

from git import Repo
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI

# Step 1: Clone the repository (removes old copy)
def clone_repo(repo_url, clone_path="tmp_repo"):
    if os.path.exists(clone_path):
        print("🧹 Removing existing repo...")
        import stat
        def handle_remove_readonly(func, path, exc):
            os.chmod(path, stat.S_IWRITE)
            func(path)
        shutil.rmtree(clone_path, onerror=handle_remove_readonly)
    print(f"🔄 Cloning {repo_url} ...")
    Repo.clone_from(repo_url, clone_path)
    print("✅ Repo cloned.")

# Step 2a: Extract Python function/class chunks using AST
def extract_ast_chunks(repo_path: str) -> List[Document]:
    documents = []
    for filepath in Path(repo_path).rglob("*.py"):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                code = f.read()
            tree = ast.parse(code)
            for node in tree.body:
                if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                    block_code = ast.get_source_segment(code, node)
                    if block_code:
                        metadata = {
                            "source": str(filepath),
                            "type": type(node).__name__,
                            "name": getattr(node, 'name', 'unknown')
                        }
                        documents.append(Document(page_content=block_code, metadata=metadata))
        except Exception as e:
            print(f"⚠️ Error parsing {filepath}: {e}")
    return documents

# Step 2b: Load full content of documentation and config files
def extract_other_chunks(repo_path: str) -> List[Document]:
    extensions = [".md", ".txt", ".html", ".css", ".js", ".json", ".yml", ".yaml", ".ini", ".cfg"]
    special_files = ["README", "requirements.txt", "setup.py"]

    documents = []

    for filepath in Path(repo_path).rglob("*"):
        if filepath.is_file() and (
            filepath.suffix in extensions or
            filepath.name in special_files or
            filepath.name.lower().startswith("readme")
        ):
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    content = f.read()
                metadata = {
                    "source": str(filepath),
                    "type": "File",
                    "name": filepath.name
                }
                documents.append(Document(page_content=content, metadata=metadata))
            except Exception as e:
                print(f"⚠️ Error reading {filepath}: {e}")
    return documents

# Step 3: Embed and store in FAISS
def build_faiss_from_documents(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    print(f"✅ FAISS vector DB created at: {db_path}")
    return vectordb

# Step 4 (Optional): Generate summary docstring using LLM
def generate_doc_for_code_chunk(chunk: Document, model_name="gpt-3.5-turbo") -> str:
    llm = ChatOpenAI(model=model_name)
    prompt = f"""
You are a senior software engineer.

Please generate a clean, technical, and concise docstring or explanation for the following {chunk.metadata.get('type')} named `{chunk.metadata.get('name')}`.

### Code:
{chunk.page_content}

### Documentation:
"""
    return llm.invoke(prompt).content

# Step 5: Pipeline Runner
def main(repo_url: str):
    repo_path = "tmp_repo"

    # Clone and extract
    clone_repo(repo_url, repo_path)
    ast_chunks = extract_ast_chunks(repo_path)
    other_chunks = extract_other_chunks(repo_path)
    all_chunks = ast_chunks + other_chunks

    print(f"🧩 Extracted {len(all_chunks)} chunks ({len(ast_chunks)} Python, {len(other_chunks)} others)")

    # Save to FAISS
    vectordb = build_faiss_from_documents(all_chunks, db_path="faiss_ast_index")

    # Save chunk summary
    chunk_summaries = [{
        "name": doc.metadata.get("name"),
        "type": doc.metadata.get("type"),
        "source": doc.metadata.get("source"),
        "length": len(doc.page_content)
    } for doc in all_chunks]

    df_summary = pd.DataFrame(chunk_summaries)
    df_summary.to_csv("ast_chunk_summary.csv", index=False)
    print("📄 Saved summary to `ast_chunk_summary.csv`")

# Example usage
if __name__ == "__main__":
    repo_url = "https://github.com/VESIT-CMPN-Projects/2023-24-BE26"  # 🔁 Replace as needed
    main(repo_url)


🔄 Cloning https://github.com/VESIT-CMPN-Projects/2023-24-BE26 ...
✅ Repo cloned.
🧩 Extracted 9811 chunks (18 Python, 9793 others)


KeyboardInterrupt: 

In [34]:
import os
import shutil
import ast
from pathlib import Path
from typing import List
import pandas as pd

from git import Repo
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI


# --------- Filter logic ---------
low_value_extensions = [
    ".class", ".o", ".exe", ".dll", ".jar", ".zip", ".png", ".jpg", ".ico", ".log", ".tmp", ".min.js", ".css"
]
skip_dirs = ["node_modules", "venv", ".git", "__pycache__"]

def is_low_value(filepath: Path):
    if any(skip in filepath.parts for skip in skip_dirs):
        return True
    if filepath.suffix.lower() in low_value_extensions:
        return True
    try:
        size = os.path.getsize(filepath)
        return size < 50 or size > 1_000_000  # Skip tiny or huge files
    except:
        return True


# --------- Clone GitHub repo ---------
def clone_repo(repo_url, clone_path="tmp_repo"):
    if os.path.exists(clone_path):
        print("🧹 Removing existing repo...")
        shutil.rmtree(clone_path)
    print(f"🔄 Cloning {repo_url} ...")
    Repo.clone_from(repo_url, clone_path)
    print("✅ Repo cloned.")


# --------- AST-based chunking for .py ---------
def extract_ast_chunks(repo_path: str) -> List[Document]:
    documents = []
    for filepath in Path(repo_path).rglob("*.py"):
        if is_low_value(filepath):
            continue
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                code = f.read()
            tree = ast.parse(code)
            for node in tree.body:
                if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                    block_code = ast.get_source_segment(code, node)
                    if block_code:
                        metadata = {
                            "source": str(filepath),
                            "type": type(node).__name__,
                            "name": getattr(node, 'name', 'unknown')
                        }
                        documents.append(Document(page_content=block_code, metadata=metadata))
        except Exception as e:
            print(f"⚠️ Error parsing {filepath}: {e}")
    return documents


# --------- Text file chunking (.md, .txt, .html, .css, .js) ---------
def extract_text_chunks(repo_path: str) -> List[Document]:
    text_exts = [".md", ".txt", ".html", ".js"]
    documents = []
    for ext in text_exts:
        for filepath in Path(repo_path).rglob(f"*{ext}"):
            if is_low_value(filepath):
                continue
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    content = f.read()
                metadata = {
                    "source": str(filepath),
                    "type": ext,
                    "name": filepath.name
                }
                documents.append(Document(page_content=content, metadata=metadata))
            except Exception as e:
                print(f"⚠️ Error reading {filepath}: {e}")
    return documents


# --------- Build FAISS vector store ---------
def build_faiss_from_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    print(f"✅ FAISS vector DB created at: {db_path}")
    return vectordb


# --------- (Optional) Summarise a code chunk ---------
def generate_doc_for_code_chunk(chunk: Document, model_name="gpt-3.5-turbo") -> str:
    llm = ChatOpenAI(model=model_name)
    prompt = f"""
You are a senior software engineer.

Please generate a clean, technical, and concise docstring or explanation for the following {chunk.metadata.get('type')} named `{chunk.metadata.get('name')}`.

### Code:
{chunk.page_content}

### Documentation:
"""
    return llm.invoke(prompt).content


# --------- Main pipeline ---------
def main(repo_url: str):
    repo_path = "tmp_repo"
    clone_repo(repo_url, repo_path)

    # Collect AST and text chunks
    ast_chunks = extract_ast_chunks(repo_path)
    text_chunks = extract_text_chunks(repo_path)
    all_chunks = ast_chunks + text_chunks

    print(f"🧩 Extracted {len(ast_chunks)} Python AST chunks, {len(text_chunks)} text chunks")

    # Save FAISS
    vectordb = build_faiss_from_chunks(all_chunks)

    # Save metadata summary
    chunk_summaries = [{
        "name": doc.metadata.get("name"),
        "type": doc.metadata.get("type"),
        "source": doc.metadata.get("source"),
        "length": len(doc.page_content)
    } for doc in all_chunks]

    df_summary = pd.DataFrame(chunk_summaries)
    df_summary.to_csv("ast_chunk_summary.csv", index=False)
    print("📄 Saved summary to `ast_chunk_summary.csv`")


# --------- Run ---------
if __name__ == "__main__":
    repo_url = "https://github.com/VESIT-CMPN-Projects/2023-24-BE26"  # 🔁 Change this
    main(repo_url)


🔄 Cloning https://github.com/VESIT-CMPN-Projects/2023-24-BE26 ...
✅ Repo cloned.
🧩 Extracted 18 Python AST chunks, 12 text chunks
✅ FAISS vector DB created at: faiss_ast_index
📄 Saved summary to `ast_chunk_summary.csv`


In [38]:
import os
import shutil
import ast
from pathlib import Path
from typing import List
import pandas as pd
from git import Repo

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI
from docx import Document as DocxDocument

# --- Utility: Determine low-value files ---
def is_low_value_file(filepath):
    low_value_exts = ['.css', '.min.js', '.json', '.svg']
    filename = os.path.basename(filepath).lower()
    return any(filename.endswith(ext) for ext in low_value_exts) or 'mock' in filename

# --- Step 1: Clone the repo ---
def clone_repo(repo_url, clone_path="tmp_repo"):
    if os.path.exists(clone_path):
        print("🧹 Removing existing repo...")
        shutil.rmtree(clone_path, ignore_errors=True)
    print(f"🔄 Cloning {repo_url} ...")
    Repo.clone_from(repo_url, clone_path)
    print("✅ Repo cloned.")

# --- Step 2: Extract chunks using AST or direct code ---
def extract_ast_chunks(repo_path: str) -> List[Document]:
    chunks = []
    for filepath in Path(repo_path).rglob("*.*"):
        if is_low_value_file(filepath):
            continue
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                code = f.read()
            if filepath.suffix == ".py":
                tree = ast.parse(code)
                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                        content = ast.get_source_segment(code, node)
                        if content and 50 < len(content) < 5000:
                            meta = {
                                "source": str(filepath),
                                "type": type(node).__name__,
                                "name": node.name
                            }
                            chunks.append(Document(page_content=content, metadata=meta))
            else:
                if 50 < len(code) < 5000:
                    meta = {
                        "source": str(filepath),
                        "type": filepath.suffix,
                        "name": os.path.basename(filepath)
                    }
                    chunks.append(Document(page_content=code, metadata=meta))
        except Exception as e:
            print(f"⚠️ Skipping {filepath}: {e}")
    return chunks

# --- Step 3: Create FAISS vector DB ---
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    print(f"✅ FAISS DB saved to `{db_path}`")
    return vectordb

# --- Step 4: Generate documentation using LLM ---
def generate_doc_for_code_chunk(chunk: Document, model_name="gpt-3.5-turbo") -> str:
    llm = ChatOpenAI(model=model_name)
    prompt = f"""
You are a senior software engineer.

Generate a clean, concise, and technical docstring or explanation for the following {chunk.metadata.get('type')} named `{chunk.metadata.get('name')}`.

### Code:
{chunk.page_content}

### Documentation:
"""
    return llm.invoke(prompt).content

# --- Step 5: Generate DOCX with summaries ---
def generate_technical_doc(chunks: List[Document], output_path="technical_documentation.docx"):
    doc = DocxDocument()
    doc.add_heading("📘 Technical Documentation Summary", 0)

    for chunk in chunks:
        if chunk.metadata.get("type") in ['FunctionDef', 'ClassDef', '.py']:
            explanation = generate_doc_for_code_chunk(chunk)
            doc.add_heading(chunk.metadata.get("name", "Unnamed"), level=2)
            doc.add_paragraph(f"📄 Source: {chunk.metadata.get('source')}")
            doc.add_paragraph(explanation)
            doc.add_paragraph("—" * 30)

    doc.save(output_path)
    print(f"✅ Saved technical documentation to `{output_path}`")

# --- Step 6: Main runner ---
def main(repo_url: str):
    repo_path = "tmp_repo"
    clone_repo(repo_url, repo_path)
    ast_chunks = extract_ast_chunks(repo_path)
    print(f"🧩 Extracted {len(ast_chunks)} chunks")

    df_summary = pd.DataFrame([{
        "name": d.metadata.get("name"),
        "type": d.metadata.get("type"),
        "source": d.metadata.get("source"),
        "length": len(d.page_content)
    } for d in ast_chunks])
    df_summary.to_csv("ast_chunk_summary.csv", index=False)
    print("📄 Saved summary to `ast_chunk_summary.csv`")

    build_faiss_from_ast_chunks(ast_chunks, db_path="faiss_ast_index")
    generate_technical_doc(ast_chunks, output_path="technical_documentation.docx")

# 🔁 Replace URL as needed
if __name__ == "__main__":
    repo_url = "https://github.com/VESIT-CMPN-Projects/2023-24-BE26"
    main(repo_url)


🔄 Cloning https://github.com/VESIT-CMPN-Projects/2023-24-BE26 ...
✅ Repo cloned.
⚠️ Skipping tmp_repo\.git: [Errno 13] Permission denied: 'tmp_repo\\.git'
⚠️ Skipping tmp_repo\26_Big Data Analytics and Machine Learning_Indu Dokare.docx: 'utf-8' codec can't decode byte 0x90 in position 11: invalid start byte
⚠️ Skipping tmp_repo\26_Big Data Analytics and Machine Learning_Indu Dokare.pdf: 'utf-8' codec can't decode byte 0xa7 in position 10: invalid start byte
⚠️ Skipping tmp_repo\BE Group 26 Video.zip: 'utf-8' codec can't decode byte 0xeb in position 10: invalid continuation byte
⚠️ Skipping tmp_repo\backend\.ipynb_checkpoints: [Errno 13] Permission denied: 'tmp_repo\\backend\\.ipynb_checkpoints'
⚠️ Skipping tmp_repo\node_modules\.bin: [Errno 13] Permission denied: 'tmp_repo\\node_modules\\.bin'
⚠️ Skipping tmp_repo\node_modules\.cache: [Errno 13] Permission denied: 'tmp_repo\\node_modules\\.cache'
⚠️ Skipping tmp_repo\public\favicon.ico: 'utf-8' codec can't decode byte 0xe3 in position 

KeyboardInterrupt: 

In [None]:
import os
import shutil
import ast
from pathlib import Path
from typing import List
import pandas as pd
from git import Repo

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI
from docx import Document as DocxDocument

# --- Utility: Determine low-value files ---
def is_low_value_file(filepath):
    low_value_exts = ['.css', '.min.js', '.json', '.svg']
    filename = os.path.basename(filepath).lower()
    return any(filename.endswith(ext) for ext in low_value_exts) or 'mock' in filename

# --- Step 1: Clone the repo ---
def clone_repo(repo_url, clone_path="tmp_repo"):
    if os.path.exists(clone_path):
        print("🧹 Removing existing repo...")
        shutil.rmtree(clone_path, ignore_errors=True)
    print(f"🔄 Cloning {repo_url} ...")
    Repo.clone_from(repo_url, clone_path)
    print("✅ Repo cloned.")

# --- Step 2: Extract chunks using AST or direct code ---
def extract_ast_chunks(repo_path: str) -> List[Document]:
    chunks = []
    for filepath in Path(repo_path).rglob("*.*"):
        if is_low_value_file(filepath):
            continue
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                code = f.read()
            if filepath.suffix == ".py":
                tree = ast.parse(code)
                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                        content = ast.get_source_segment(code, node)
                        if content and 50 < len(content) < 5000:
                            meta = {
                                "source": str(filepath),
                                "type": type(node).__name__,
                                "name": node.name
                            }
                            chunks.append(Document(page_content=content, metadata=meta))
            else:
                if 50 < len(code) < 5000:
                    meta = {
                        "source": str(filepath),
                        "type": filepath.suffix,
                        "name": os.path.basename(filepath)
                    }
                    chunks.append(Document(page_content=code, metadata=meta))
        except Exception as e:
            print(f"⚠️ Skipping {filepath}: {e}")
    return chunks

# --- Step 3: Create FAISS vector DB ---
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    print(f"✅ FAISS DB saved to `{db_path}`")
    return vectordb

# --- Step 4: Generate documentation using LLM ---
def generate_doc_for_code_chunk(chunk: Document, model_name="gpt-3.5-turbo") -> str:
    llm = ChatOpenAI(model=model_name)
    prompt = f"""
You are a senior software engineer.

Generate a clean, concise, and business overview for the following {chunk.metadata.get('type')} named `{chunk.metadata.get('name')}` on what it is doing.

### Code:
{chunk.page_content}

### Documentation:
"""
    return llm.invoke(prompt).content

# --- Step 5: Generate DOCX with summaries ---
def generate_technical_doc(chunks: List[Document], output_path="technical_documentation.docx"):
    doc = DocxDocument()
    doc.add_heading("📘 Technical Documentation Summary", 0)

    for chunk in chunks:
        if chunk.metadata.get("type") in ['FunctionDef', 'ClassDef', '.py']:
            explanation = generate_doc_for_code_chunk(chunk)
            doc.add_heading(chunk.metadata.get("name", "Unnamed"), level=2)
            doc.add_paragraph(f"📄 Source: {chunk.metadata.get('source')}")
            doc.add_paragraph(explanation)
            doc.add_paragraph("—" * 30)

    doc.save(output_path)
    print(f"✅ Saved technical documentation to `{output_path}`")
    

# --- Step 6: Main runner ---
def main(repo_url: str):
    repo_path = "tmp_repo"
    clone_repo(repo_url, repo_path)
    ast_chunks = extract_ast_chunks(repo_path)
    print(f"🧩 Extracted {len(ast_chunks)} chunks")

    df_summary = pd.DataFrame([{
        "name": d.metadata.get("name"),
        "type": d.metadata.get("type"),
        "source": d.metadata.get("source"),
        "length": len(d.page_content)
    } for d in ast_chunks])
    df_summary.to_csv("ast_chunk_summary.csv", index=False)
    print("📄 Saved summary to `ast_chunk_summary.csv`")

    build_faiss_from_ast_chunks(ast_chunks, db_path="faiss_ast_index")
    generate_technical_doc(ast_chunks, output_path="technical_documentation_2.docx")

# 🔁 Replace URL as needed
if __name__ == "__main__":
    repo_url = "https://github.com/adarshlearnngrow/StepUpYourCareer.AI"
    main(repo_url)


🔄 Cloning https://github.com/adarshlearnngrow/StepUpYourCareer.AI ...
✅ Repo cloned.
⚠️ Skipping tmp_repo\.devcontainer: [Errno 13] Permission denied: 'tmp_repo\\.devcontainer'
⚠️ Skipping tmp_repo\.git: [Errno 13] Permission denied: 'tmp_repo\\.git'
⚠️ Skipping tmp_repo\Presentation - StepUpYourCareer.ai Elevate Your Future.pdf: 'utf-8' codec can't decode byte 0xf6 in position 10: invalid start byte
⚠️ Skipping tmp_repo\StepUpAI\models\fitted_vectorizer.pkl: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
⚠️ Skipping tmp_repo\StepUpAI\models\mentor_clustering_model.pkl: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
⚠️ Skipping tmp_repo\.git\objects\pack\pack-a5b318ad826ebceb70d739baaee5679d9a54860a.idx: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
⚠️ Skipping tmp_repo\.git\objects\pack\pack-a5b318ad826ebceb70d739baaee5679d9a54860a.pack: 'utf-8' codec can't decode byte 0x86 in position 11: invalid start byte
⚠️ S

In [5]:
import os
import shutil
import ast
import pandas as pd
from pathlib import Path
from typing import List
from git import Repo, GitCommandError

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool, initialize_agent, AgentType

from docx import Document as DocxDocument

# --- Utility: Determine low-value files ---
def is_low_value_file(filepath):
    low_value_exts = ['.css', '.min.js', '.json', '.svg', '.csv', '.xlsx', '.xls']
    filename = os.path.basename(filepath).lower()
    return any(filename.endswith(ext) for ext in low_value_exts) or 'mock' in filename

# --- Step 1: Clone the repo ---
def clone_repo(repo_url, clone_path="tmp_repo"):
    if os.path.exists(clone_path):
        print("🧹 Removing existing repo...")
        shutil.rmtree(clone_path, ignore_errors=True)
    print(f"🔄 Cloning {repo_url} ...")
    Repo.clone_from(repo_url, clone_path)
    print("✅ Repo cloned.")

# --- Step 2: Extract chunks using AST or direct code ---
def extract_ast_chunks(repo_path: str) -> List[Document]:
    chunks = []
    for filepath in Path(repo_path).rglob("*.*"):
        if is_low_value_file(filepath):
            continue
        try:
            code = filepath.read_text(encoding="utf-8")
            if filepath.suffix == ".py":
                tree = ast.parse(code)
                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                        content = ast.get_source_segment(code, node)
                        if content and 50 < len(content) < 5000:
                            meta = {"source": str(filepath), "type": type(node).__name__, "name": node.name}
                            chunks.append(Document(page_content=content, metadata=meta))
            else:
                if 50 < len(code) < 5000:
                    meta = {"source": str(filepath), "type": filepath.suffix, "name": os.path.basename(filepath)}
                    chunks.append(Document(page_content=code, metadata=meta))
        except Exception as e:
            print(f"⚠️ Skipping {filepath}: {e}")
    return chunks

# --- Step 3: Create FAISS vector DB ---
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    print(f"✅ FAISS DB saved to `{db_path}`")
    return vectordb

# --- Setup LLM and Tool for LangChain Agent ---
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

def _summarize_code(code: str) -> str:
    prompt = (
        "You are a senior software engineer. Generate concise, business-focused documentation for the following code snippet:\n\n"
        f"{code}\n---\nDocumentation:\n"
    )
    return llm.invoke(prompt).content.strip()

summarization_tool = Tool(
    name="summarize_code",
    func=_summarize_code,
    description="Generates documentation for a code snippet"
)

agent = initialize_agent(
    tools=[summarization_tool],
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=False
)

# --- Step 4: Generate documentation using LangChain Agent ---
def generate_doc_for_code_chunk(chunk: Document) -> str:
    # Directly call the summarization tool function to avoid parsing errors
    return summarization_tool.func(chunk.page_content)

# --- Step 5: Generate DOCX with summaries ---
def generate_technical_doc(chunks: List[Document], output_path="technical_documentation.docx"):
    doc = DocxDocument()
    doc.add_heading("📘 Technical Documentation Summary", 0)

    for chunk in chunks:
        if chunk.metadata.get("type") in ['FunctionDef', 'ClassDef']:
            summary = generate_doc_for_code_chunk(chunk)
            doc.add_heading(chunk.metadata.get("name", "Unnamed"), level=2)
            doc.add_paragraph(f"📄 Source: {chunk.metadata.get('source')}")
            doc.add_paragraph(summary)
            doc.add_paragraph("—" * 30)

    doc.save(output_path)
    print(f"✅ Saved technical documentation to `{output_path}`")

# --- Step 6: Main runner ---
def main(repo_url: str):
    repo_path = "tmp_repo"
    clone_repo(repo_url, repo_path)
    chunks = extract_ast_chunks(repo_path)
    print(f"🧩 Extracted {len(chunks)} chunks")

    import pandas as pd
    df_summary = pd.DataFrame([{
        "name": d.metadata.get("name"),
        "type": d.metadata.get("type"),
        "source": d.metadata.get("source"),
        "length": len(d.page_content)
    } for d in chunks])
    df_summary.to_csv("ast_chunk_summary.csv", index=False)
    print("📄 Saved summary to `ast_chunk_summary.csv`")

    build_faiss_from_ast_chunks(chunks, db_path="faiss_ast_index")
    generate_technical_doc(chunks, output_path="technical_documentation_2.docx")

# 🔁 Replace URL as needed
if __name__ == "__main__":
    repo_url = "https://github.com/adarshlearnngrow/StepUpYourCareer.AI"
    main(repo_url)


🔄 Cloning https://github.com/adarshlearnngrow/StepUpYourCareer.AI ...
✅ Repo cloned.
⚠️ Skipping tmp_repo\.devcontainer: [Errno 13] Permission denied: 'tmp_repo\\.devcontainer'
⚠️ Skipping tmp_repo\.git: [Errno 13] Permission denied: 'tmp_repo\\.git'
⚠️ Skipping tmp_repo\Presentation - StepUpYourCareer.ai Elevate Your Future.pdf: 'utf-8' codec can't decode byte 0xf6 in position 10: invalid start byte
⚠️ Skipping tmp_repo\StepUpAI\models\fitted_vectorizer.pkl: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
⚠️ Skipping tmp_repo\StepUpAI\models\mentor_clustering_model.pkl: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
⚠️ Skipping tmp_repo\.git\objects\pack\pack-a5b318ad826ebceb70d739baaee5679d9a54860a.idx: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
⚠️ Skipping tmp_repo\.git\objects\pack\pack-a5b318ad826ebceb70d739baaee5679d9a54860a.pack: 'utf-8' codec can't decode byte 0x86 in position 11: invalid start byte
⚠️ S

In [7]:
import os
import shutil
import ast
import pandas as pd
from pathlib import Path
from typing import List
from git import Repo, GitCommandError

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI

from docx import Document as DocxDocument

# --- Utility: Determine low-value files ---
def is_low_value_file(filepath):
    low_value_exts = ['.css', '.min.js', '.json', '.svg', '.csv', '.xlsx', '.xls']
    filename = os.path.basename(filepath).lower()
    return any(filename.endswith(ext) for ext in low_value_exts) or 'mock' in filename

# --- Step 1: Clone the repo (latest commit only) ---
def clone_repo(repo_url, clone_path="tmp_repo"):
    if os.path.exists(clone_path):
        shutil.rmtree(clone_path, ignore_errors=True)
    Repo.clone_from(repo_url, clone_path, depth=1)

# --- Step 2: Extract code chunks via AST ---
def extract_ast_chunks(repo_path: str) -> List[Document]:
    chunks = []
    for filepath in Path(repo_path).rglob("*.*"):
        if is_low_value_file(filepath):
            continue
        try:
            code = filepath.read_text(encoding="utf-8")
            if filepath.suffix == ".py":
                tree = ast.parse(code)
                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                        content = ast.get_source_segment(code, node)
                        if content and 50 < len(content) < 5000:
                            meta = {"source": str(filepath), "type": type(node).__name__, "name": node.name}
                            chunks.append(Document(page_content=content, metadata=meta))
            else:
                if 50 < len(code) < 5000:
                    meta = {"source": str(filepath), "type": filepath.suffix, "name": os.path.basename(filepath)}
                    chunks.append(Document(page_content=code, metadata=meta))
        except Exception:
            pass
    return chunks

# --- Step 3: Build FAISS vector store ---
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    return vectordb

# --- LLM client ---
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# --- Section functions ---
def generate_business_overview(repo_path: str) -> str:
    files = [p.relative_to(repo_path).as_posix() for p in Path(repo_path).rglob("*") if p.is_file()]
    context = "\n".join(files)
    prompt = (
        "You are a technical writer for enterprise software. Provide a high-level business overview of this codebase, "
        "including purpose, scope, and value delivered.\n"
        f"Files in project:\n{context}\n---\nOverview:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_technical_specifications(repo_path: str) -> str:
    ext_map = {'.py':'Python','.js':'JavaScript','.ts':'TypeScript','.java':'Java', '.html':'HTML','.css':'CSS'}
    techs = set()
    for p in Path(repo_path).rglob('*'):
        if p.suffix in ext_map:
            techs.add(ext_map[p.suffix])
        if p.name.lower() == 'dockerfile':
            techs.add('Docker')
    tech_list = ', '.join(sorted(techs))
    prompt = (
        "You are a senior software engineer. List the technologies and frameworks used in this project, "
        "with a brief rationale for each.\n"
        f"Technologies: {tech_list}\n---\nSpecifications:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_folder_structure(repo_path: str) -> str:
    lines = []
    for root, dirs, files in os.walk(repo_path):
        level = root.replace(repo_path, '').count(os.sep)
        indent = '  ' * level
        lines.append(f"{indent}{os.path.basename(root)}/")
        for f in files:
            lines.append(f"{indent}  {f}")
    tree = "\n".join(lines)
    prompt = (
        "You are a software architect. Describe the folder structure and modular organization of this project.\n"
        f"Directory tree:\n{tree}\n---\nDescription:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_code_flow(chunks: List[Document]) -> str:
    entries = [f"{d.metadata['type']} {d.metadata['name']}" for d in chunks]
    context = '\n'.join(entries)
    prompt = (
        "You are a senior software engineer. Provide a step-by-step execution flow of this codebase, "
        "referencing functions and classes.\n"
        f"Components:\n{context}\n---\nFlow:\n"
    )
    return llm.invoke(prompt).content.strip()


def summarize_chunk(chunk: Document) -> str:
    prompt = (
        f"You are a senior software engineer. Generate concise documentation for the {chunk.metadata['type']} '{chunk.metadata['name']}'.\n"
        f"Code:\n{chunk.page_content}\n---\nDocumentation:\n"
    )
    return llm.invoke(prompt).content.strip()

# --- Step 4: Generate multi-section DOCX ---
def generate_technical_doc(chunks: List[Document], repo_path: str, output_path="technical_documentation.docx"):
    doc = DocxDocument()

    # Business Overview
    doc.add_heading("Business Overview", level=1)
    doc.add_paragraph(generate_business_overview(repo_path))

    # Technical Specifications
    doc.add_heading("Technical Specifications", level=1)
    doc.add_paragraph(generate_technical_specifications(repo_path))

    # Folder Structure
    doc.add_heading("Folder Structure", level=1)
    doc.add_paragraph(generate_folder_structure(repo_path))

    # Code Flow
    doc.add_heading("Code Flow", level=1)
    doc.add_paragraph(generate_code_flow(chunks))

    # Detailed Function/Class Documentation
    doc.add_heading("Detailed Documentation", level=1)
    for chunk in chunks:
        if chunk.metadata.get("type") in ['FunctionDef', 'ClassDef']:
            doc.add_heading(chunk.metadata.get("name", "Unnamed"), level=2)
            doc.add_paragraph(f"Source: {chunk.metadata.get('source')}")
            doc.add_paragraph(summarize_chunk(chunk))
            doc.add_paragraph("—" * 30)

    doc.save(output_path)
    print(f"✅ Saved technical documentation to `{output_path}`")

# --- Step 5: Main runner ---
def main(repo_url: str):
    repo_path = "tmp_repo"
    clone_repo(repo_url, repo_path)
    chunks = extract_ast_chunks(repo_path)
    pd.DataFrame([{
        "name": d.metadata.get("name"),
        "type": d.metadata.get("type"),
        "source": d.metadata.get("source"),
        "length": len(d.page_content)
    } for d in chunks]).to_csv("ast_chunk_summary.csv", index=False)
    build_faiss_from_ast_chunks(chunks)
    generate_technical_doc(chunks, repo_path)

if __name__ == "__main__":
    main("https://github.com/adarshlearnngrow/StepUpYourCareer.AI")


✅ Saved technical documentation to `technical_documentation.docx`


# Working code


In [2]:
import os
import shutil
import ast
import pandas as pd
from pathlib import Path
from typing import List
from git import Repo, GitCommandError

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI

from docx import Document as DocxDocument

# --- Utility: Determine low-value files ---
def is_low_value_file(filepath):
    low_value_exts = ['.css', '.min.js', '.json', '.svg', '.csv', '.xlsx', '.xls']
    filename = os.path.basename(filepath).lower()
    return any(filename.endswith(ext) for ext in low_value_exts) or 'mock' in filename

# --- Step 1: Clone the repo (latest commit only) ---
def clone_repo(repo_url, clone_path="tmp_repo"):
    if os.path.exists(clone_path):
        shutil.rmtree(clone_path, ignore_errors=True)
    Repo.clone_from(repo_url, clone_path, depth=1)

# --- Step 2: Extract code chunks via AST ---
def extract_ast_chunks(repo_path: str) -> List[Document]:
    chunks = []
    for filepath in Path(repo_path).rglob("*.*"):
        if is_low_value_file(filepath):
            continue
        try:
            code = filepath.read_text(encoding="utf-8")
            if filepath.suffix == ".py":
                tree = ast.parse(code)
                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                        content = ast.get_source_segment(code, node)
                        if content and 50 < len(content) < 5000:
                            meta = {"source": str(filepath), "type": type(node).__name__, "name": node.name}
                            chunks.append(Document(page_content=content, metadata=meta))
            else:
                if 50 < len(code) < 5000:
                    meta = {"source": str(filepath), "type": filepath.suffix, "name": os.path.basename(filepath)}
                    chunks.append(Document(page_content=code, metadata=meta))
        except Exception:
            pass
    return chunks

# --- Step 3: Build FAISS vector store ---
def build_faiss_from_ast_chunks(chunks: List[Document], db_path="faiss_ast_index"):
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embedder)
    vectordb.save_local(db_path)
    return vectordb

# --- LLM client ---
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# --- Section functions ---
def generate_business_overview(repo_path: str) -> str:
    files = [p.relative_to(repo_path).as_posix() for p in Path(repo_path).rglob("*") if p.is_file()]
    context = "\n".join(files)
    print(context)
    prompt = (
        "You are a technical writer for enterprise software. Provide a high-level business overview of this solution, "
        "including purpose, scope, and value delivered.\n"
        f"Files in project:\n{context}\n---\nOverview:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_technical_specifications(repo_path: str) -> str:
    # Deterministically list technologies used without LLM to avoid hallucination
    ext_map = {'.py':'Python','.js':'JavaScript','.ts':'TypeScript','.java':'Java', '.html':'HTML','.css':'CSS', '.go':'Go', '.rs':'Rust'}
    techs = set()
    for p in Path(repo_path).rglob('*'):
        if p.suffix in ext_map:
            techs.add(ext_map[p.suffix])
        if p.name.lower() == 'dockerfile':
            techs.add('Docker')
        if p.name.lower() in ('requirements.txt', 'pyproject.toml'):
            techs.add('Python (dependencies)')
        if p.name.lower() == 'package.json':
            techs.add('Node.js (npm)')
    # Format as bullet list
    lines = [f"- {tech}" for tech in sorted(techs)]
    return "".join(lines)


def generate_folder_structure(repo_path: str) -> str:
    lines = []
    for root, dirs, files in os.walk(repo_path):
        level = root.replace(repo_path, '').count(os.sep)
        indent = '  ' * level
        lines.append(f"{indent}{os.path.basename(root)}/")
        for f in files:
            lines.append(f"{indent}  {f}")
    tree = "\n".join(lines)
    prompt = (
        "You are a software architect. Describe the folder structure and modular organization of this project.\n"
        f"Directory tree:\n{tree}\n---\nDescription:\n"
    )
    return llm.invoke(prompt).content.strip()


def generate_code_flow(chunks: List[Document]) -> str:
    entries = [f"{d.metadata['type']} {d.metadata['name']}" for d in chunks]
    context = '\n'.join(entries)
    prompt = (
        "You are a senior software engineer. Provide a step-by-step execution flow of this codebase, "
        "referencing functions and classes.\n"
        f"Components:\n{context}\n---\nFlow:\n"
    )
    return llm.invoke(prompt).content.strip()


def summarize_chunk(chunk: Document) -> str:
    prompt = (
        f"You are a senior software engineer. Generate concise documentation for the {chunk.metadata['type']} '{chunk.metadata['name']}'.\n"
        f"Code:\n{chunk.page_content}\n---\nDocumentation:\n"
    )
    return llm.invoke(prompt).content.strip()

# --- Step 4: Generate multi-section DOCX ---
def generate_technical_doc(chunks: List[Document], repo_path: str, output_path="technical_documentation.docx"):
    doc = DocxDocument()

    # Business Overview
    doc.add_heading("Business Overview", level=1)
    doc.add_paragraph(generate_business_overview(repo_path))
    '''
    # Technical Specifications
    doc.add_heading("Technical Specifications", level=1)
    doc.add_paragraph(generate_technical_specifications(repo_path))

    # Folder Structure
    doc.add_heading("Folder Structure", level=1)
    doc.add_paragraph(generate_folder_structure(repo_path))

    # Code Flow
    doc.add_heading("Code Flow", level=1)
    doc.add_paragraph(generate_code_flow(chunks))

    # Detailed Function/Class Documentation
    doc.add_heading("Detailed Documentation", level=1)
    for chunk in chunks:
        if chunk.metadata.get("type") in ['FunctionDef', 'ClassDef']:
            doc.add_heading(chunk.metadata.get("name", "Unnamed"), level=2)
            doc.add_paragraph(f"Source: {chunk.metadata.get('source')}")
            doc.add_paragraph(summarize_chunk(chunk))
            doc.add_paragraph("—" * 30)
    '''

    doc.save(output_path)
    print(f"✅ Saved technical documentation to `{output_path}`")

# --- Step 5: Main runner ---
def main(repo_url: str):
    repo_path = "tmp_repo"
    #clone_repo(repo_url, repo_path)
    chunks = extract_ast_chunks(repo_path)
    pd.DataFrame([{
        "name": d.metadata.get("name"),
        "type": d.metadata.get("type"),
        "source": d.metadata.get("source"),
        "length": len(d.page_content)
    } for d in chunks]).to_csv("ast_chunk_summary.csv", index=False)
    build_faiss_from_ast_chunks(chunks)
    generate_technical_doc(chunks, repo_path)

if __name__ == "__main__":
    main("https://github.com/adarshlearnngrow/StepUpYourCareer.AI")


ClusteringMentorModelTraining.ipynb
Job_Description_JD_Manupulation.ipynb
Presentation - StepUpYourCareer.ai Elevate Your Future.pdf
README.md
Skill_Gap_Analysis_and_Action_Plan_Generation.ipynb
.devcontainer/devcontainer.json
.git/config
.git/description
.git/HEAD
.git/index
.git/packed-refs
.git/shallow
data/all_roles_student_resumes.json
data/req_job_desc.csv
data/role_skills.csv
data/role_skills.json
data/sim_resume.json
data/skill_gap_analysis.csv
data/skill_gap_analysis.json
StepUpAI/app.py
StepUpAI/generated_mentors.json
StepUpAI/mentors_final_data.json
StepUpAI/requirements.txt
StepUpAI/role_skills.json
StepUpAI/skill_gap_analysis.json
StepUpAI/skill_resource_mapping.json
StepUpAI/models/fitted_vectorizer.pkl
StepUpAI/models/mentor_clustering_model.pkl
.git/hooks/applypatch-msg.sample
.git/hooks/commit-msg.sample
.git/hooks/fsmonitor-watchman.sample
.git/hooks/post-update.sample
.git/hooks/pre-applypatch.sample
.git/hooks/pre-commit.sample
.git/hooks/pre-merge-commit.sample
.gi