In [1]:
from textwrap import dedent
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

In [2]:
from langchain_ollama import OllamaLLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain import PromptTemplate

In [3]:
import os
from pathlib import Path
from textwrap import dedent
import transformers


In [137]:
llm = OllamaLLM(model='qwen2.5-coder', temperature=0.1)

In [None]:
llm.invoke("hi, Strictly Return a json object, no text before or after")

In [2]:
import ast

code_string = "def greet(name):\n    return f'Hello, {name}'"
tree = ast.parse(code_string)
tree

<ast.Module at 0x232a84a3fd0>

In [4]:
for node in ast.walk(tree):
    if isinstance(node, ast.FunctionDef)
    print(node.name)
    

AttributeError: 'Module' object has no attribute 'name'

In [52]:
CODEBASE_ANALYSE_PROMPT = dedent("""
You are a senior software architect and security expert with 15+ years of experience specializing in code analysis, security auditing, and technical documentation.

YOUR TASK :
COMPREHENSIVELY ANALYSE FILE BY FILE OF THE GIVEN CODE FILES(FRONTEND, BACKEND, OR FULL STACK) COVERING FIVE CRITICAL AREAS - CODE SMELLS/SECURITY VULNERABILITIES (SONARQUBE GUIDLINES), PAGE SUMMARY,
AUTHENTICATION FLOWS, ERROR HANDLING, AND PERFORMANCE FLAGS

CRITICAL INSTRUCTIONS:
1. Analyze only the project
2. Never hallucinate missing information
3. If information is incomplete or unclear, Return null
4. Follow specific guidelines provided below for separate CRITICAL AREAS: CODE QUALITY/SECURITY VULNERABILITIES, PAGE SUMMARY,
    AUTHENTICATION FLOWS, ERROR HANDLING, AND PERFORMANCE FLAGS
5. Use clear, professional language suitable for technical documentation
6. Return ONLY valid JSON. No explanation, no commentary, no markdown, no extra text, no long natural language text before or after the JSON.
7. STRICTLY format your response as JSON object with page_summary, code_smells, auth_flow, error_handling and performance as keys

PAGE SUMMARY GUIDELINES:
1. Analyze and understand the following concepts -> Overview, Core Features, Workflow, Role in project, Tech Stack, Architecture, Unique aspects/ Strengths and Possible improvements
2. Return a factual, short, and concise covering:
        - summary: 30-50 words describing the file's primary purpose, core features, and technology used
        - functions_classes: Object with "name" and "description" (10-20 words) for the main component/function/class
        - role_in_project: 15-30 words explaining how this file integrates with the broader system
3. If any of these cannot be determined, return null for that subfield.

CODE SMELL GUIDELINES:
1. Analyze and identify code smells and provide suggested refactors
2. Mandatory Analysis Areas -> 
    OWASP Top 10 vulnerabilities
    SONARQUBE Guildelines
    Dependency Vulnerabilities related to the File
3. Assign severity: CRITICAL | HIGH | MEDIUM
4. Provide ACTIONABLE mitigation steps
5. FORMAT: Return as a numbered dictionary ('1', '2', '3', ...) where each entry has:
        - code_smell: Clear problem description with severity (e.g., "prop drilling through 4 components")
        - severity: Severity levels(CRITICAL, HIGH, MEDIUM)
        - suggested_refactors: Specific, actionable solution
6. If no issues found in a category, return null

AUTHENTICATION FLOWS GUIDELINES:
1. Analyze and identify all token handling, authentication-related logic, and flows
2. Identify ->
        where authentication tokens (e.g., JWT, session tokens) are received from API endpoints (e.g., /api/auth/login).
        Where and how tokens are stored (e.g., cookies, localStorage, sessionStorage).
        How tokens are used in API requests (e.g., adding Authorization: Bearer <token> headers).
        Any related mechanisms for login, signup, logout, token refresh, or session management.
3. Summarize the authentication behavior observed        
4. FORMAT: Single concise string (20-80 words) like:
        "Token received from /api/auth/login and stored in httpOnly cookies. Included in requests via the Authorization header. Implements refresh token rotation."
5. If can't identify any authentication/authorization, return null

ERROR HANDLING GUIDELINES:
1. Analyze and identify all error-handling mechanisms
2. Identify -> 
        Use of error boundaries or fallback UIs.
        Error handling in async operations (e.g., try/catch, .catch()).
        Display of form-level or global error messages.
        Any additional strategies for managing or displaying errors gracefully.
3. Summarize the error handling strategy
4. FORMAT: Single concise string (20-80 words) like:
        "Uses try/catch for async operations. Displays form-level error messages on validation failure. No global error boundary detected."
5. If can't identify any error handling strategy, return null


PERFORMANCE FLAGS GUIDELINES:
1. Analyze and identify any performance-related issues or inefficiencies
2. Identify ->
        Large bundle sizes due to unused dependencies or heavy imports.
        Unnecessary component re-renders.
        Inefficient state management or expensive computations.
        Usage of heavy libraries or other slow code patterns.
3. Summarize performance observations
4. FORMAT: Single concise string (20-80 words) like:
        "Component re-renders unnecessarily due to non-memoized callback props. Consider wrapping with React.memo and using useCallback for handlers."
5. If can't identify any performance flags, return null

INPUT DATA:
You will receive a JSON object with one to five files.
{input}

Each file contains:
- file_name: string
- content: string (full file contents)

Each File Example:
{{'index.html':'<!doctype html>\n<html lang="en">\n  <head>\n    <meta charset="UTF-8" />\n    <link rel="icon" type="image/svg+xml" href="/vite.svg" />\n    <meta name="viewport" content="width=device-width, initial-scale=1.0" />\n    <title>ExpenSo</title>\n    <script type="module" crossorigin src="/assets/index-BwHWdS85.js"></script>\n    <link rel="stylesheet" crossorigin href="/assets/index-B4rkgUJa.css">\n  </head>\n  <body>\n    <div id="root"></div>\n  </body>\n</html>'}}

OUTPUT SCHEMA:
Return a Dictionary/JSON of EXACTLY one JSON object per file.
Return your analysis strictly as a JSON object for each file that matches this schema:
{format_instructions}

IMPORTANT NOTES:
- No preamble, no trailing text, no natural language text. Only the exact JSON output.
- Use null (not "null" string) for absent/not-applicable fields
- For code_smells, use String keys ('1', '2', '3'...)
- Include severity level in code_smell descriptions
- Keep descriptions concise and actionable

BEGIN COMPREHENSIVE ANALYSIS NOW!!!!
""")

In [None]:
CODEBASE_ANALYSE_PROMPT_v2 = dedent("""
You are a senior software architect and security expert with 15+ years of experience in code analysis, security auditing, and technical documentation.

TASK:
Perform a precise, file-by-file analysis of the provided code files (frontend, backend, or full-stack) covering five mandated areas: CODE_SMELLS/SECURITY_VULNERABILITIES (follow SONARQUBE guidelines and OWASP Top 10), PAGE_SUMMARY, AUTHENTICATION_FLOWS, ERROR_HANDLING, and PERFORMANCE_FLAGS.

REQUIREMENTS (READ CAREFULLY):
1. Analyze only the provided files. Do NOT assume or invent missing files or behavior.
2. Do NOT hallucinate. If required information is missing or cannot be determined from the files, return null for that field.
3. Return EXACTLY one JSON object per file.
4. Return ONLY valid JSON. No explanation, no commentary, no markdown, no extra text before or after the JSON.
5. Use JSON `null` (not the string "null") for absent or not-applicable fields.
6. code_smells MUST be a numbered dictionary with integer keys (1, 2, 3...). Each entry must include a severity label: CRITICAL | HIGH | MEDIUM.
7. Rank code_smells by severity descending (CRITICAL first, then HIGH, then MEDIUM).
8. Keep all descriptions concise, factual, and actionable.
9. Follow the exact output schema provided in {format_instructions}.
10. Use only the input data. Do not call external services or reference external knowledge unless it is standard (OWASP, SONARQUBE) and directly relevant to the file content.

PAGE_SUMMARY (output requirements):
- summary: 30–50 words, factual, stating primary purpose, key features, and technologies.
- functions_classes: object mapping main exported functions/classes to {{"name","description"}} (10–20 words each).
- role_in_project: 15–30 words describing integration with the system.
If any of these cannot be determined, return null for that subfield.

CODE_SMELLS (output requirements):
- Mandatory analysis areas: OWASP Top 10, SONARQUBE guidelines, dependency vulnerabilities relevant to the file.
- Each numbered entry must contain:
    - code_smell: succinct problem text including severity (e.g., "Improper input validation — CRITICAL").
    - suggested_refactors: clear, actionable remediation steps.
- If no issues found in a mandatory area, return null for code_smells.

AUTH_FLOW (output requirements):
- Single concise string (20–80 words) summarizing observed authentication/token logic:
  where tokens originate, how/where stored, how included in requests, refresh/logout behavior.
- If no auth behavior is present in the files, return null.

ERROR_HANDLING (output requirements):
- Single concise string (20–80 words) describing error strategies:
  presence of boundaries/fallbacks, try/catch usage, global/form-level handling.
- If no error handling is present, return null.

PERFORMANCE (output requirements):
- Single concise string (20–80 words) identifying performance flags:
  bundle issues, re-render problems, heavy libraries, expensive computations.
- If none present, return null.

INPUT:
You will receive a JSON object with 1–3 file objects in {input}. Each file object contains at minimum "file_name" and "content".

OUTPUT:
Return a single JSON object (or an array of JSON objects if multiple files are supplied) that conforms exactly to the schema provided in {format_instructions}. The response MUST be parseable by a strict JSON parser without additional cleanup.

IMPORTANT FINAL NOTES:
- No preamble, no trailing text, no long natural language text. Only the exact JSON output.
- Use integer keys for code_smells (1,2,3...).
- Use null for unavailable fields.
- Use precise, professional, and technical tone. Short sentences only.

BEGIN ANALYSIS NOW.
""")

In [21]:
class FunctionClass(BaseModel):
    """Represents a function or class in the codebase"""
    name: str = Field(description="Name of the function or class")
    description: str = Field(description="Brief description (10-20 words)")


class PageSummary(BaseModel):
    """Summary of the analyzed file(s)"""
    summary: str = Field(description="Concise overview of the file's purpose (30-50 words)")
    functions_classes: FunctionClass = Field(description="Key functions or classes with descriptions")
    role_in_project: str = Field(description="How this file integrates with the project (15-30 words)")


class CodeSmell(BaseModel):
    """Represents a code smell or security vulnerability"""
    code_smell: str = Field(description="Description of the issue")
    severity: str = Field(description="Severity level (Critical, high, medium)")
    suggested_refactors: str = Field(description="Actionable mitigation steps")


class CodebaseAnalysis(BaseModel):
    """Complete codebase analysis output"""
    page_summary: PageSummary = Field(description="Summary of the file's purpose and structure")
    code_smells: Optional[Dict[str, CodeSmell]] = Field(
        default=None,
        description="Dictionary of code smells with STRING keys ('1', '2', '3'), or null if no issues found"
    )
    auth_flow: Optional[str] = Field(
        default=None,
        description="Description of authentication flows, or null if not applicable"
    )
    error_handling: Optional[str] = Field(
        default=None,
        description="Description of error handling strategies, or null if not present"
    )
    performance_flags: Optional[str] = Field(
        default=None,
        description="Description of performance concerns, or null if no issues detected"
    )

class MultiFileAnalysis(BaseModel):
    """Analysis results for multiple files"""
    analyses: Dict[str, CodebaseAnalysis] = Field(description="Dictionary mapping filename to analysis results")

In [138]:
parser = JsonOutputParser(pydantic_object=MultiFileAnalysis)

In [139]:
format_instructions = parser.get_format_instructions()

In [140]:
prompt =  PromptTemplate(
        input_variables=['input'],
    partial_variables={'format_instructions':format_instructions},
    template=CODEBASE_ANALYSE_PROMPT
)

In [131]:
format_instructions

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"CodeSmell": {"description": "Represents a code smell or security vulnerability", "properties": {"code_smell": {"description": "Description of the issue", "title": "Code Smell", "type": "string"}, "severity": {"description": "Severity level (Critical, high, medium)", "title": "Severity", "type": "string"}, "suggested_refactors": {"description": "Actionable mitigation steps", "title": "Suggested Refactors", "type": "string"}}, "required": ["code_smell", "severity", "suggested_refactors"], "title": "CodeSmell", "type": "object

In [141]:
chain = prompt | llm | parser

In [57]:
CHUNK_SIZE = 1200     # Optimal for Ollama models (LLaMA / Mistral)
CHUNK_OVERLAP = 150
separators = [
    "\nclass ",       # Python / Java / C++ class
    "\ndef ",         # Python function
    "\nfunction ",    # JS function
    "\nconst ",       # JS constant
    "\nlet ",         # JS variable
    "\nvar ",         # JS variable
    "\nimport ",      # Python/JS import
    "\nfrom ",        # Python import
    "\npackage ",     # Java package
    "\ninterface ",   # Java interface
    "\npublic ",      # Java public method
    "\nprivate ",     # Java private method
    "\nprotected ",   # Java protected method
    "\n<script>",     # HTML script tag
    "\n<style>",      # HTML style tag
    "</div>",         # HTML div closing
    "</section>",     # HTML section closing
    "\n\n",           # Double newlines
    "\n"              # Fallback single newline
]

In [58]:
FRONTEND_EXT = {".html", ".js", ".jsx", ".ts", ".tsx", ".vue"}
BACKEND_EXT = {".py", ".java", ".php", ".go", ".rb", ".ts", ".sql", ".yml", ".yaml"}
DOC_EXT = {".md", ".txt"}

In [59]:
file_path=r"C:\Users\Asus\Desktop\WebTech\LoginDjango\ReactFrontEnd\loginEx"

In [83]:
def collect_project_files(base_path: Path):
    """Categorize files into frontend, backend, and documentation."""
    frontend, backend, docs = [], [], []
    for root, _, files in os.walk(base_path):
        # print(root)
        if 'node_modules' in root:
            continue
        for f in files:
            # print(f)
            path = Path(root) / f
            if not path.is_file() or path.stat().st_size > 500_000:
                continue
            try:
                content = path.read_text(errors="ignore")
            except Exception:
                continue
            ext = path.suffix.lower()
            rel_path = str(path.relative_to(base_path))
            doc = Document(page_content=content, metadata={"file_name": rel_path})
            if ext in FRONTEND_EXT:
                print(f)
                frontend.append(doc)
            elif ext in BACKEND_EXT:
                backend.append(doc)
            elif ext in DOC_EXT:
                docs.append(doc)
    return frontend, backend, docs

In [84]:
def doc_list(docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=separators
    )
    split_docs = text_splitter.split_documents(docs)
    return split_docs

In [110]:
frontend_docs, backend_docs, doc_docs = collect_project_files(file_path)
frontend_summary = doc_list(frontend_docs)

eslint.config.js
index.html
vite.config.js
App.jsx
Login.jsx
main.jsx
Register.jsx


In [111]:
len(frontend_docs)

7

In [109]:
for i in frontend_summary:
    print(i.metadata['file_name'])

eslint.config.js
index.html
vite.config.js
src\App.jsx
src\Login.jsx
src\Login.jsx
src\Login.jsx
src\Login.jsx
src\main.jsx
src\Register.jsx
src\Register.jsx
src\Register.jsx
src\Register.jsx


In [87]:
def batch(doc, batch_size):
    batches=[]
    for i in range(0,len(doc),batch_size):
        batch = doc[i:i+batch_size]
        batches.append(batch)
    return batches

In [120]:
def file_data(docs):
    llm_input = []
    
    for i in docs:
        file_data = dict()
        print(len(i))
        # print(i, end='\n\n')
        for j in i:
            
            file_name = j.metadata['file_name']
            # print(file_name)
            if '/' in file_name or '\\' in file_name:
                file_name = file_name.split('/')[-1].split('\\')[-1]
            file_data[file_name]=j.page_content
        llm_input.append(file_data)
    return llm_input

In [114]:
def full_file_data(docs):
    llm_input = []
    
    file_data = dict()
    for j in docs:
        file_name = j.metadata['file_name']
        if '/' in file_name or '\\' in file_name:
            file_name = file_name.split('/')[-1].split('\\')[-1]
        file_data[file_name]=j.page_content
        llm_input.append(file_data)
    # llm_input.append(file_data)
    return llm_input

In [121]:
docs = batch(frontend_docs, 10)
file_data = file_data(docs)

7


In [123]:
len(docs[0])

7

In [37]:
full_data = full_file_data(frontend_summary)

In [38]:
file_data[0]

{'eslint.config.js': "import js from '@eslint/js'\nimport globals from 'globals'\nimport reactHooks from 'eslint-plugin-react-hooks'\nimport reactRefresh from 'eslint-plugin-react-refresh'\nimport { defineConfig, globalIgnores } from 'eslint/config'\n\nexport default defineConfig([\n  globalIgnores(['dist']),\n  {\n    files: ['**/*.{js,jsx}'],\n    extends: [\n      js.configs.recommended,\n      reactHooks.configs['recommended-latest'],\n      reactRefresh.configs.vite,\n    ],\n    languageOptions: {\n      ecmaVersion: 2020,\n      globals: globals.browser,\n      parserOptions: {\n        ecmaVersion: 'latest',\n        ecmaFeatures: { jsx: true },\n        sourceType: 'module',\n      },\n    },\n    rules: {\n      'no-unused-vars': ['error', { varsIgnorePattern: '^[A-Z_]' }],\n    },\n  },\n])",
 'index.html': '<!doctype html>\n<html lang="en">\n  <head>\n    <meta charset="UTF-8" />\n    <link rel="icon" type="image/svg+xml" href="/vite.svg" />\n    <meta name="viewport" conte

In [39]:
len(full_data)

29

In [40]:
len(frontend_summary)

29

In [70]:
response = chain.invoke({'input':full_data})

In [71]:
response

{'analyses': {'index.html': {'page_summary': {'summary': 'HTML file for the ExpenSo application',
    'functions_classes': None,
    'role_in_project': 'Root HTML template'},
   'code_smells': None,
   'auth_flow': None,
   'error_handling': None,
   'performance_flags': None},
  'App.jsx': {'page_summary': {'summary': 'Main application component',
    'functions_classes': [{'name': 'App',
      'description': 'Renders the main application structure'}],
    'role_in_project': 'Top-level component'},
   'code_smells': None,
   'auth_flow': None,
   'error_handling': None,
   'performance_flags': None},
  'Expenes.jsx': {'page_summary': {'summary': 'Component to display and manage expenses',
    'functions_classes': [{'name': 'Expenses',
      'description': 'Handles rendering and managing expenses'}],
    'role_in_project': 'Expense management component'},
   'code_smells': None,
   'auth_flow': None,
   'error_handling': None,
   'performance_flags': None},
  'main.jsx': {'page_summary

In [None]:
file_data[0]

In [142]:
res = dict()

for i in file_data:
    response = chain.invoke({'input':i})
    # parsed = parser.parse(response)
    res.update(response['analyses'])
    print(response['analyses'])
    print(i.keys())
    print(type(response))
    print(len(response))
    print(response)
print(res)

{'eslint.config.js': {'page_summary': {'summary': 'Configuration file for ESLint with rules for JavaScript, React hooks, and React refresh.', 'functions_classes': None, 'role_in_project': 'Ensures code quality and consistency across the project.'}, 'code_smells': None, 'auth_flow': None, 'error_handling': None, 'performance_flags': None}, 'index.html': {'page_summary': {'summary': 'HTML template for a React application using Vite.', 'functions_classes': None, 'role_in_project': 'Provides the basic structure and entry point for the React app.'}, 'code_smells': None, 'auth_flow': None, 'error_handling': None, 'performance_flags': None}, 'vite.config.js': {'page_summary': {'summary': 'Configuration file for Vite, a build tool for modern web development.', 'functions_classes': None, 'role_in_project': 'Configures the build process and development server.'}, 'code_smells': None, 'auth_flow': None, 'error_handling': None, 'performance_flags': None}, 'main.jsx': {'page_summary': {'summary': '

In [134]:
for i in file_data:
    # print(i,end='\n\n\n')
    for j,_ in i.items():
        print(j)

eslint.config.js
index.html
vite.config.js
App.jsx
Login.jsx
main.jsx
Register.jsx


In [144]:
for i in res.keys():
    print(i)

eslint.config.js
index.html
vite.config.js
main.jsx
App.jsx
Login.jsx
Register.jsx


In [143]:
res

{'eslint.config.js': {'page_summary': {'summary': 'Configuration file for ESLint with rules for JavaScript, React hooks, and React refresh.',
   'functions_classes': None,
   'role_in_project': 'Ensures code quality and consistency across the project.'},
  'code_smells': None,
  'auth_flow': None,
  'error_handling': None,
  'performance_flags': None},
 'index.html': {'page_summary': {'summary': 'HTML template for a React application using Vite.',
   'functions_classes': None,
   'role_in_project': 'Provides the basic structure and entry point for the React app.'},
  'code_smells': None,
  'auth_flow': None,
  'error_handling': None,
  'performance_flags': None},
 'vite.config.js': {'page_summary': {'summary': 'Configuration file for Vite, a build tool for modern web development.',
   'functions_classes': None,
   'role_in_project': 'Configures the build process and development server.'},
  'code_smells': None,
  'auth_flow': None,
  'error_handling': None,
  'performance_flags': None}

In [None]:
from textwrap import dedent
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser


# ============================================================================
# PYDANTIC MODELS
# ============================================================================

class FunctionClass(BaseModel):
    """Represents a function or class in the codebase"""
    name: str = Field(description="Name of the function or class")
    description: str = Field(description="Brief description (10-20 words)")


class PageSummary(BaseModel):
    """Summary of the analyzed file(s)"""
    summary: str = Field(description="Concise overview of the file's purpose (30-50 words)")
    functions_classes: FunctionClass = Field(description="Key functions or classes with descriptions")
    role_in_project: str = Field(description="How this file integrates with the project (15-30 words)")


class CodeSmell(BaseModel):
    """Represents a code smell or security vulnerability"""
    code_smell: str = Field(description="Description of the issue including severity level")
    suggested_refactors: str = Field(description="Actionable mitigation steps")


class CodebaseAnalysis(BaseModel):
    """Complete codebase analysis output"""
    file_name: str = Field(description="Name of the analyzed file")
    page_summary: PageSummary = Field(description="Summary of the file's purpose and structure")
    code_smells: Optional[Dict[int, CodeSmell]] = Field(
        default=None,
        description="Dictionary of code smells indexed by number, or None if no issues found"
    )
    auth_flow: Optional[str] = Field(
        default=None,
        description="Description of authentication flows, or None if not applicable"
    )
    error_handling: Optional[str] = Field(
        default=None,
        description="Description of error handling strategies, or None if not present"
    )
    performance_flags: Optional[str] = Field(
        default=None,
        description="Description of performance concerns, or None if no issues detected"
    )


# ============================================================================
# LANGCHAIN JSON OUTPUT PARSER
# ============================================================================

parser = JsonOutputParser(pydantic_object=CodebaseAnalysis)


# ============================================================================
# REFINED PROMPT
# ============================================================================

CODEBASE_ANALYSE_PROMPT = dedent("""
You are a senior software architect and security expert with 15+ years of experience specializing in code analysis, security auditing, and technical documentation.

YOUR TASK:
Perform a comprehensive analysis of the provided code file across five critical dimensions:
1. Page Summary
2. Code Smells & Security Vulnerabilities  
3. Authentication Flows
4. Error Handling
5. Performance Flags

CORE PRINCIPLES:
- Analyze ONLY the provided code; never infer or hallucinate missing information
- If information is incomplete, ambiguous, or absent, return null for that field
- Maintain objectivity and precision in all assessments
- Use clear, professional technical language
- Return EXACTLY ONE analysis object per file

═══════════════════════════════════════════════════════════════════════════════
SECTION 1: PAGE SUMMARY (REQUIRED)
═══════════════════════════════════════════════════════════════════════════════

Provide a concise analysis covering:
- summary: 30-50 words describing the file's primary purpose, key features, and technology used
- functions_classes: Object with "name" and "description" (10-20 words) for the main component/function/class
- role_in_project: 15-30 words explaining how this file integrates with the broader system

═══════════════════════════════════════════════════════════════════════════════
SECTION 2: CODE SMELLS & SECURITY VULNERABILITIES (OPTIONAL)
═══════════════════════════════════════════════════════════════════════════════

Analyze for:
- OWASP Top 10 vulnerabilities
- SonarQube quality guidelines violations
- Dependency vulnerabilities
- Security misconfigurations
- Maintainability issues (prop drilling, deep nesting, tight coupling)

SEVERITY LEVELS: Include severity in the code_smell description
- CRITICAL: Security vulnerabilities or system-breaking issues
- HIGH: Significant security/quality concerns
- MEDIUM: Moderate maintainability or minor security issues

FORMAT: Return as numbered dictionary (1, 2, 3, ...) where each entry has:
- code_smell: Clear problem description with severity (e.g., "prop drilling through 4 components - high")
- suggested_refactors: Specific, actionable solution

If NO issues found, return null for code_smells.

═══════════════════════════════════════════════════════════════════════════════
SECTION 3: AUTHENTICATION FLOWS (OPTIONAL)
═══════════════════════════════════════════════════════════════════════════════

Identify and document authentication mechanisms:
- Token reception endpoints (e.g., /api/auth/login)
- Storage mechanisms (cookies, localStorage, sessionStorage)
- Token usage in API requests
- Login/logout/refresh flows

FORMAT: Single concise string (20-80 words) like:
"Token received from /api/auth/login and stored in httpOnly cookies. Included in requests via Authorization header. Implements refresh token rotation."

If NO authentication logic found, return null.

═══════════════════════════════════════════════════════════════════════════════
SECTION 4: ERROR HANDLING (OPTIONAL)
═══════════════════════════════════════════════════════════════════════════════

Evaluate error management:
- Error boundaries or fallback UI
- Try/catch blocks and async error handling
- Form validation and error display
- Global error handlers
- User-facing error messages

FORMAT: Single concise string (20-80 words) like:
"Uses try/catch for async operations. Displays form-level error messages on validation failure. No global error boundary detected."

If NO error handling found, return null.

═══════════════════════════════════════════════════════════════════════════════
SECTION 5: PERFORMANCE FLAGS (OPTIONAL)
═══════════════════════════════════════════════════════════════════════════════

Identify performance concerns:
- Unnecessary re-renders (missing React.memo, useMemo, useCallback)
- Large bundle sizes or unused dependencies
- Inefficient state management
- Expensive computations
- Missing lazy loading or virtualization

FORMAT: Single concise string (20-80 words) like:
"Component re-renders unnecessarily due to non-memoized callback props. Consider wrapping with React.memo and using useCallback for handlers."

If NO performance issues detected, return null.

═══════════════════════════════════════════════════════════════════════════════
INPUT DATA
═══════════════════════════════════════════════════════════════════════════════

You will receive a JSON object with one to three files:
{input}

═══════════════════════════════════════════════════════════════════════════════
OUTPUT FORMAT
═══════════════════════════════════════════════════════════════════════════════

{format_instructions}

CRITICAL REQUIREMENTS:
- Return valid JSON only
- Use null (not "null" string) for absent/not-applicable fields
- For code_smells, use integer keys (1, 2, 3...) NOT strings
- Include severity level in code_smell descriptions
- Keep descriptions concise and actionable

BEGIN ANALYSIS.
""")


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

def analyze_codebase(file_data: Dict[str, str], llm) -> CodebaseAnalysis:
    """
    Analyze codebase files using the LLM.
    
    Args:
        file_data: Dictionary with file names as keys and content as values
                   e.g., {"TaskForm.jsx": "import React...", ...}
        llm: Your LangChain LLM instance (e.g., ChatOpenAI, ChatAnthropic)
    
    Returns:
        CodebaseAnalysis: Parsed analysis result
    """
    # Get format instructions from parser
    format_instructions = parser.get_format_instructions()
    
    # Create the prompt
    prompt = CODEBASE_ANALYSE_PROMPT.format(
        input=file_data,
        format_instructions=format_instructions
    )
    
    # Get LLM response
    response = llm.invoke(prompt)
    
    # Parse the response
    parsed_result = parser.parse(response.content)
    
    # Return as Pydantic model
    return CodebaseAnalysis(**parsed_result)


# ============================================================================
# BATCH PROCESSING DOCUMENTS
# ============================================================================

def batch_documents(documents: List, batch_size: int = 3) -> List[List]:
    """
    Split documents into batches of specified size.
    
    Args:
        documents: List of LangChain Document objects
        batch_size: Number of documents per batch (default: 3)
    
    Returns:
        List of batches, where each batch is a list of documents
    """
    batches = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        batches.append(batch)
    return batches


def prepare_file_data_from_documents(documents: List) -> Dict[str, str]:
    """
    Convert a batch of LangChain documents into the required dictionary format.
    
    Args:
        documents: List of LangChain Document objects (max 3)
                   Each document should have metadata with 'source' or 'file_name'
    
    Returns:
        Dictionary with file names as keys and content as values
        e.g., {"TaskForm.jsx": "import React...", "App.js": "function App()..."}
    """
    file_data = {}
    
    for doc in documents:
        # Try to get file name from metadata
        # Adjust these keys based on your metadata structure
        file_name = (
            doc.metadata.get('source') or 
            doc.metadata.get('file_name') or 
            doc.metadata.get('filename') or
            f"file_{len(file_data) + 1}.js"  # Fallback name
        )
        
        # If source is a full path, extract just the filename
        if '/' in file_name or '\\' in file_name:
            file_name = file_name.split('/')[-1].split('\\')[-1]
        
        # Add to dictionary
        file_data[file_name] = doc.page_content
    
    return file_data


def analyze_documents_in_batches(documents: List, llm, batch_size: int = 3) -> List[CodebaseAnalysis]:
    """
    Analyze documents in batches and return all results.
    
    Args:
        documents: List of LangChain Document objects
        llm: Your LangChain LLM instance
        batch_size: Number of documents to analyze together (default: 3)
    
    Returns:
        List of CodebaseAnalysis objects
    """
    all_results = []
    
    # Split into batches
    batches = batch_documents(documents, batch_size)
    
    print(f"Processing {len(documents)} documents in {len(batches)} batches...")
    
    # Process each batch
    for batch_idx, batch in enumerate(batches, 1):
        print(f"\nProcessing batch {batch_idx}/{len(batches)} ({len(batch)} files)...")
        
        # Prepare file data
        file_data = prepare_file_data_from_documents(batch)
        
        # Analyze the batch
        try:
            result = analyze_codebase(file_data, llm)
            all_results.append(result)
            print(f"✓ Batch {batch_idx} analyzed successfully")
        except Exception as e:
            print(f"✗ Error analyzing batch {batch_idx}: {str(e)}")
            continue
    
    return all_results


# ============================================================================
# EXAMPLE USAGE
# ============================================================================

"""
from langchain_openai import ChatOpenAI
from langchain.schema import Document

# Initialize LLM
llm = ChatOpenAI(model="gpt-4", temperature=0)

# Your LangChain documents (example)
documents = [
    Document(
        page_content="import React from 'react';\\nfunction App() { return <div>Hello</div>; }",
        metadata={"source": "src/App.js", "type": "javascript"}
    ),
    Document(
        page_content="export const API_URL = 'https://api.example.com';",
        metadata={"source": "src/config.js", "type": "javascript"}
    ),
    Document(
        page_content="import axios from 'axios';\\nexport const fetchData = () => axios.get('/api/data');",
        metadata={"source": "src/api/client.js", "type": "javascript"}
    ),
    Document(
        page_content="const express = require('express');\\nconst app = express();",
        metadata={"source": "server/index.js", "type": "javascript"}
    ),
    # ... more documents
]

# METHOD 1: Analyze all documents in batches automatically
results = analyze_documents_in_batches(documents, llm, batch_size=3)

# Access results
for result in results:
    print(f"\n{'='*60}")
    print(f"File: {result.file_name}")
    print(f"Summary: {result.page_summary.summary}")
    if result.code_smells:
        print(f"Issues found: {len(result.code_smells)}")

# METHOD 2: Manual batch processing with more control
batches = batch_documents(documents, batch_size=3)

for batch_idx, batch in enumerate(batches, 1):
    print(f"\n--- Batch {batch_idx} ---")
    
    # Prepare file data for this batch
    file_data = prepare_file_data_from_documents(batch)
    
    print(f"Files in this batch: {list(file_data.keys())}")
    
    # Analyze
    result = analyze_codebase(file_data, llm)
    
    # Do something with result
    print(f"Analysis complete for {result.file_name}")

# METHOD 3: Process specific subset
first_three_docs = documents[:3]
file_data = prepare_file_data_from_documents(first_three_docs)
result = analyze_codebase(file_data, llm)
"""