# Java Antipattern Scanner - Refactored Demo

This notebook demonstrates a refactored Java antipattern scanner, reorganized into a modular codebase.

## Key improvements:

- **Modular structure**: Clear separation of agents, workflow, and data layers
- **English documentation**: All comments and docstrings translated to English
- **Enhanced workflow**: Multi-step analysis pipeline with LLM integration
- **Better error handling**: Comprehensive exception handling and fallbacks
- **Standardized configuration**: Centralized settings management

## Project Structure

This demo will create the following modular structure:

```
java_antipatterns_scanner/
├── __init__.py                    # Package initialization
├── config.py                      # Configuration management
├── requirements.txt               # Dependency management
├── main.py                        # Main entry script
├── db/                           # Database module
│   ├── __init__.py
│   └── vector_db.py              # Vector database management
├── analysis/                     # Analysis module
│   ├── __init__.py
│   ├── retriever.py             # Retrieval tools
│   └── analyzer.py              # Code analyzer
└── workflow/                     # Workflow module
    ├── __init__.py
    └── langgraph_workflow.py     # LangGraph workflow
```

## Limitations (To be addressed in future versions)

- Query methods: Semantic queries are still built directly from raw snippets of the submitted code
- Agent structure: The current version still uses a single-agent architecture
- Model support: Only local Ollama models are supported
- Usage: The code to analyze must still be provided manually

In [None]:
# Step 1: create the project directory skeleton.
import os
from pathlib import Path

# Root of the generated package, relative to the current working directory.
project_root = Path("java_antipatterns_scanner")

# The package root comes first, followed by one directory per sub-package.
subpackage_names = ("db", "analysis", "workflow")
directories = [project_root, *(project_root / name for name in subpackage_names)]

print("🚀 创建项目目录结构...")
for directory in directories:
    directory.mkdir(parents=True, exist_ok=True)
    print(f"✅ 创建目录: {directory}")

# Every directory is a Python package, so each one gets an __init__.py.
init_files = [package_dir / "__init__.py" for package_dir in directories]

for init_file in init_files:
    init_file.touch()
    print(f"✅ 创建文件: {init_file}")

# Two headline messages, each preceded by a blank line.
print("\n📁 项目结构创建完成!", "\n目录结构:", sep="\n")
def print_tree(directory, prefix="", is_last=True):
    """Recursively print a directory as a box-drawing tree.

    Sub-directories are listed first (sorted), then regular files (sorted).
    Entries that are neither directories nor regular files are skipped.

    Args:
        directory: ``pathlib.Path`` of the directory to render.
        prefix: Indentation accumulated from the ancestors of ``directory``.
        is_last: Whether ``directory`` is the last entry of its parent; this
            selects the connector drawn in front of its name and whether a
            vertical guide line continues below it.
    """
    print(f"{prefix}{'└── ' if is_last else '├── '}{directory.name}/")

    # Scan the directory once and split the entries by kind.
    entries = list(directory.iterdir())
    subdirs = sorted(entry for entry in entries if entry.is_dir())
    files = sorted(entry for entry in entries if entry.is_file())

    # Everything inside this directory is indented one level deeper. The
    # guide line is only continued when more siblings of ``directory`` follow.
    child_prefix = prefix + ("    " if is_last else "│   ")

    for index, subdir in enumerate(subdirs):
        # A sub-directory is the last entry only if no files follow it.
        last_subdir = index == len(subdirs) - 1 and not files
        print_tree(subdir, child_prefix, last_subdir)

    for index, file in enumerate(files):
        connector = "└── " if index == len(files) - 1 else "├── "
        # Files use the child indentation, not the parent's connector.
        print(f"{child_prefix}{connector}{file.name}")

# Render the generated project layout, starting at the package root.
print_tree(project_root)

In [None]:
# Step 2: generate the configuration module (config.py).
# NOTE: the literal below is written verbatim to config.py, so everything
# inside it (including its comments) is content of the generated file.
# The model names and storage paths honour environment-variable overrides so
# that the "configurable via environment variables" summary printed by this
# notebook is actually true; the defaults are unchanged when nothing is set.
config_content = '''"""
配置管理模块 - 统一管理项目配置参数
"""

import os
from pathlib import Path

class Config:
    """项目配置类"""
    
    # 基础路径配置
    PROJECT_ROOT = Path(__file__).parent
    DATA_DIR = PROJECT_ROOT.parent / "static"
    
    # LLM模型配置
    # Overridable via ANTIPATTERN_LLM_MODEL / ANTIPATTERN_EMBEDDING_MODEL;
    # unset or empty variables fall back to the defaults below.
    LLM_MODEL = os.getenv("ANTIPATTERN_LLM_MODEL") or "granite3.3:8b"
    EMBEDDING_MODEL = os.getenv("ANTIPATTERN_EMBEDDING_MODEL") or "nomic-embed-text:v1.5"
    
    # 向量数据库配置
    # Overridable via ANTIPATTERN_VECTOR_DB_PATH / ANTIPATTERN_FILE.
    VECTOR_DB_PATH = os.path.expanduser(
        os.getenv("ANTIPATTERN_VECTOR_DB_PATH") or "~/antipattern_vectordb"
    )
    ANTIPATTERN_FILE = Path(os.getenv("ANTIPATTERN_FILE") or DATA_DIR / "ap.txt")
    
    # 文档分割配置
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200
    
    # 检索配置
    RETRIEVAL_K = 4  # 检索相关文档数量
    
    @classmethod
    def get_antipattern_file_path(cls):
        """获取反模式文件路径，支持多种可能位置"""
        possible_paths = [
            cls.ANTIPATTERN_FILE,
            cls.PROJECT_ROOT.parent / "static" / "ap.txt",
            Path("static/ap.txt"),
            Path("ap.txt")
        ]
        
        for path in possible_paths:
            if path.exists():
                return str(path)
        
        # 如果都不存在，返回默认路径（用于错误提示）
        return str(cls.ANTIPATTERN_FILE)

# 全局配置实例
config = Config()
'''

# Write the configuration module into the package root.
config_file = project_root / "config.py"
with open(config_file, 'w', encoding='utf-8') as f:
    f.write(config_content)

print("✅ 配置管理模块 (config.py) 创建完成")

# Generate requirements.txt; the literal below is written verbatim.
requirements_content = '''# LangChain相关
langchain>=0.1.0
langchain-community>=0.0.20
langchain-ollama>=0.1.0
langgraph>=0.0.30

# 向量数据库
chromadb>=0.4.0

# 其他依赖
typing-extensions>=4.0.0
'''

requirements_file = project_root / "requirements.txt"
requirements_file.write_text(requirements_content, encoding='utf-8')

# Confirmation line followed by a short summary of the configuration step.
for message in (
    "✅ 依赖配置文件 (requirements.txt) 创建完成",
    "\n📋 配置要点:",
    "- 支持多种反模式文件位置自动检测",
    "- 统一的模型和路径参数管理",
    "- 可通过环境变量覆盖配置",
    "- 模块化的配置结构",
):
    print(message)

In [None]:
# Step 3: generate the vector database module (db/vector_db.py).
# NOTE: the literal below is written verbatim to db/vector_db.py.
# - config.py lives in the package root while this module lives in the db/
#   sub-package, so the generated code must import it with the
#   parent-relative form `..config`.
# - With force_recreate=True the persisted store is deleted before it is
#   rebuilt; otherwise the documents would be appended to the existing
#   collection and every chunk would be stored (and retrieved) twice.
vector_db_content = '''"""
向量数据库管理模块 - 负责文档加载、分割和向量化存储
"""

import os
import shutil
from pathlib import Path
from typing import Optional

try:
    from langchain_community.document_loaders import TextLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.vectorstores import Chroma
    from langchain_ollama import OllamaEmbeddings
except ImportError:
    # 向后兼容导入
    from langchain.document_loaders import TextLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.vectorstores import Chroma
    from langchain.embeddings import OllamaEmbeddings

from ..config import config


class VectorDBManager:
    """向量数据库管理器"""
    
    def __init__(self, persist_directory: Optional[str] = None):
        self.persist_directory = persist_directory or config.VECTOR_DB_PATH
        self.vectordb = None
        self.is_initialized = False
    
    def init_vector_db(self, file_path: Optional[str] = None) -> bool:
        """
        初始化向量数据库
        
        Args:
            file_path: 反模式文件路径，如果为None则使用配置中的路径
            
        Returns:
            bool: 初始化是否成功
        """
        try:
            # 确定文件路径
            if file_path is None:
                file_path = config.get_antipattern_file_path()
            
            if not os.path.exists(file_path):
                print(f"❌ 文件不存在: {file_path}")
                return False
            
            print(f"📁 加载反模式文件: {file_path}")
            
            # 加载文档
            loader = TextLoader(file_path, encoding="utf-8")
            docs = loader.load()
            
            # 分割文档
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=config.CHUNK_SIZE,
                chunk_overlap=config.CHUNK_OVERLAP,
                length_function=len
            )
            split_docs = text_splitter.split_documents(docs)
            print(f"📄 文档分割为 {len(split_docs)} 个块")
            
            # 创建持久化目录
            os.makedirs(self.persist_directory, exist_ok=True)
            
            # 初始化嵌入模型
            print(f"🤖 初始化嵌入模型: {config.EMBEDDING_MODEL}")
            embedding = OllamaEmbeddings(model=config.EMBEDDING_MODEL)
            
            # 创建向量数据库
            self.vectordb = Chroma(
                embedding_function=embedding,
                persist_directory=self.persist_directory
            )
            
            # 添加文档
            print("💾 添加文档到向量数据库...")
            self.vectordb.add_documents(split_docs)
            
            # 持久化
            if hasattr(self.vectordb, 'persist'):
                self.vectordb.persist()
            
            self.is_initialized = True
            chunk_count = self.get_chunk_count()
            print(f"✅ 向量数据库初始化成功! 存储了 {chunk_count} 个文档块")
            
            return True
            
        except Exception as e:
            print(f"❌ 向量数据库初始化失败: {e}")
            return False
    
    def load_existing_db(self) -> bool:
        """加载已存在的向量数据库"""
        try:
            if not os.path.exists(self.persist_directory):
                print(f"⚠️ 向量数据库目录不存在: {self.persist_directory}")
                return False
            
            # 初始化嵌入模型
            embedding = OllamaEmbeddings(model=config.EMBEDDING_MODEL)
            
            # 加载现有数据库
            self.vectordb = Chroma(
                embedding_function=embedding,
                persist_directory=self.persist_directory
            )
            
            chunk_count = self.get_chunk_count()
            if chunk_count > 0:
                self.is_initialized = True
                print(f"✅ 成功加载现有向量数据库，包含 {chunk_count} 个文档块")
                return True
            else:
                print("⚠️ 向量数据库为空")
                return False
                
        except Exception as e:
            print(f"❌ 加载向量数据库失败: {e}")
            return False
    
    def get_retriever(self):
        """获取检索器"""
        if not self.is_initialized or self.vectordb is None:
            raise ValueError("向量数据库未初始化，请先调用 init_vector_db() 或 load_existing_db()")
        
        return self.vectordb.as_retriever(search_kwargs={"k": config.RETRIEVAL_K})
    
    def get_chunk_count(self) -> int:
        """获取文档块数量"""
        try:
            if self.vectordb and hasattr(self.vectordb, '_collection'):
                return self.vectordb._collection.count()
            return 0
        except Exception:
            # Best-effort count: any backend failure is reported as 0, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return 0


def init_vector_db(file_path: Optional[str] = None, force_recreate: bool = False) -> VectorDBManager:
    """
    便捷函数：初始化向量数据库
    
    Args:
        file_path: 反模式文件路径
        force_recreate: 是否强制重新创建
        
    Returns:
        VectorDBManager: 数据库管理器实例
    """
    db_manager = VectorDBManager()
    
    if force_recreate:
        # Remove the persisted store so the rebuild starts from an empty
        # collection instead of appending duplicate chunks.
        if os.path.isdir(db_manager.persist_directory):
            shutil.rmtree(db_manager.persist_directory)
    elif db_manager.load_existing_db():
        # 如果不强制重新创建，先尝试加载现有数据库
        return db_manager
    
    # 创建新的数据库
    if db_manager.init_vector_db(file_path):
        return db_manager
    else:
        raise RuntimeError("向量数据库初始化失败")
'''

# Write the vector database module into the db/ sub-package.
vector_db_file = project_root / "db" / "vector_db.py"
with open(vector_db_file, 'w', encoding='utf-8') as f:
    f.write(vector_db_content)

print("✅ 向量数据库模块 (db/vector_db.py) 创建完成")

# Replace the empty db/__init__.py with one that re-exports the public API
# of the vector database module; the literal is written verbatim.
db_init_content = '''"""
数据库模块 - 向量数据库管理
"""

from .vector_db import VectorDBManager, init_vector_db

__all__ = ["VectorDBManager", "init_vector_db"]
'''

db_init_file = project_root / "db" / "__init__.py"
db_init_file.write_text(db_init_content, encoding='utf-8')

# Confirmation line followed by a short summary of the module's features.
for message in (
    "✅ 数据库包初始化文件更新完成",
    "\n📋 向量数据库模块特点:",
    "- 支持自动检测反模式文件位置",
    "- 可重用现有向量数据库",
    "- 统一的错误处理和日志输出",
    "- 灵活的配置参数支持",
):
    print(message)

In [None]:
# Step 4: generate the retrieval and analysis modules.

# analysis/retriever.py - retrieval tool module.
# NOTE: the literal below is written verbatim to analysis/retriever.py.
# config.py lives in the package root while this module lives in the
# analysis/ sub-package, so the generated code must import it with the
# parent-relative form `..config`; `.config` would look for a non-existent
# analysis/config.py and fail at import time.
retriever_content = '''"""
检索工具模块 - 封装检索相关功能
"""

try:
    from langchain.tools.retriever import create_retriever_tool
    from langchain_community.chat_models import ChatOllama
except ImportError:
    from langchain.tools.retriever import create_retriever_tool
    from langchain.chat_models import ChatOllama

from ..config import config


class RetrieverManager:
    """检索管理器"""
    
    def __init__(self, vectordb_manager):
        self.vectordb_manager = vectordb_manager
        self.retriever_tool = None
        self.llm = None
        self._initialize_components()
    
    def _initialize_components(self):
        """初始化检索组件"""
        # 创建检索器
        retriever = self.vectordb_manager.get_retriever()
        
        # 创建检索工具
        self.retriever_tool = create_retriever_tool(
            retriever,
            name="retrieve_Java_antipatterns",
            description="Search for Java anti-patterns in the codebase",
        )
        
        # 初始化LLM
        self.llm = ChatOllama(model=config.LLM_MODEL)
        print(f"🤖 LLM模型初始化: {self.llm.model}")
    
    def get_retriever_tool(self):
        """获取检索工具"""
        return self.retriever_tool
    
    def get_llm(self):
        """获取LLM实例"""
        return self.llm


def create_retriever_components(vectordb_manager):
    """
    便捷函数：创建检索组件
    
    Args:
        vectordb_manager: 向量数据库管理器
        
    Returns:
        RetrieverManager: 检索管理器实例
    """
    return RetrieverManager(vectordb_manager)
'''

# Write the retrieval module into the analysis/ sub-package.
retriever_file = project_root / "analysis" / "retriever.py"
with open(retriever_file, 'w', encoding='utf-8') as f:
    f.write(retriever_content)

print("✅ 检索模块 (analysis/retriever.py) 创建完成")

# analysis/analyzer.py - code analysis module.
# NOTE: the literal below is written verbatim to analysis/analyzer.py
# (backslashes are doubled so that the generated file contains `\n` escapes).
# The "success" flag returned by full_analysis() is derived from whether the
# LLM call raised, not from searching the result text for "Error": the
# failure message is Chinese and normally lacks that word, while a perfectly
# valid review may mention it.
analyzer_content = '''"""
代码分析模块 - 实现Java代码反模式分析逻辑
"""

from typing import Any, Dict, Optional, Tuple


class JavaCodeAnalyzer:
    """Java代码分析器"""
    
    # 分析提示模板 (与原Demo相同)
    ANALYSIS_PROMPT = (
        "You are a senior Java code reviewer with deep experience in detecting software design antipatterns. "
        "Below is the code to analyze:\\n"
        "{code}\\n\\n"
        "Here is additional context from the codebase:\\n"
        "{context}\\n\\n"
        "Your task is to:\\n"
        "- Carefully analyze the code.\\n"
        "- Identify any Java antipatterns or design smells present.\\n"
        "- For each antipattern you find, include:\\n"
        "  - [Name of the antipattern] (e.g., God Object, Long Method)\\n"
        "  - [File or class/method name involved] (if detectable)\\n"
        "  - [Brief description] of the issue\\n"
        "  - [Why it\\'s a problem]\\n"
        "  - [Suggested refactor]\\n"
        "Be thorough but concise. If no antipatterns are found, say so."
    )
    
    def __init__(self, retriever_manager):
        self.retriever_manager = retriever_manager
        self.retriever_tool = retriever_manager.get_retriever_tool()
        self.llm = retriever_manager.get_llm()
    
    def retrieve_context(self, code: str) -> str:
        """
        检索相关上下文信息
        
        Args:
            code: Java代码
            
        Returns:
            str: 检索到的上下文信息
        """
        try:
            print("🔍 检索相关反模式上下文...")
            
            # 基于代码片段创建搜索查询
            search_query = f"Java antipatterns code analysis: {code[:200]}"
            
            # 使用检索工具获取相关上下文
            context = self.retriever_tool.invoke({"query": search_query})
            
            print("   ✅ 成功检索到相关上下文")
            return context
            
        except Exception as e:
            print(f"   ❌ 上下文检索错误: {e}")
            return "No additional context available due to retrieval error."
    
    def _analyze(self, code: str, context: Optional[str] = None) -> Tuple[str, bool]:
        """
        Run the LLM analysis and report whether it succeeded.
        
        Args:
            code: Java code to analyze
            context: optional context; retrieved automatically when None
            
        Returns:
            Tuple[str, bool]: the analysis text (or the error message) and a
            flag that is False only when an exception was raised
        """
        try:
            print("🔍 分析代码中的反模式...")
            
            # 如果没有提供上下文，则自动检索
            if context is None:
                context = self.retrieve_context(code)
            
            # 格式化分析提示
            prompt = self.ANALYSIS_PROMPT.format(
                code=code,
                context=context
            )
            
            # 使用LLM进行分析
            response = self.llm.invoke(prompt)
            result = response.content if hasattr(response, 'content') else str(response)
            
            print("   ✅ 代码分析完成")
            return result, True
            
        except Exception as e:
            error_msg = f"代码分析过程中发生错误: {e}"
            print(f"   ❌ {error_msg}")
            return error_msg, False
    
    def analyze_code(self, code: str, context: Optional[str] = None) -> str:
        """
        分析Java代码中的反模式
        
        Args:
            code: 待分析的Java代码
            context: 可选的上下文信息
            
        Returns:
            str: 分析结果
        """
        result, _ = self._analyze(code, context)
        return result
    
    def full_analysis(self, code: str) -> Dict[str, Any]:
        """
        完整的代码分析流程
        
        Args:
            code: 待分析的Java代码
            
        Returns:
            Dict: 包含所有分析结果的字典
        """
        print("🚀 开始完整的Java反模式分析...")
        print("=" * 50)
        
        # 检索上下文
        context = self.retrieve_context(code)
        
        # 执行分析
        analysis_result, succeeded = self._analyze(code, context)
        
        # 返回结构化结果
        result = {
            "code": code,
            "context": context,
            "analysis": analysis_result,
            "success": succeeded
        }
        
        print("🎉 分析完成!")
        return result


def create_analyzer(retriever_manager):
    """
    便捷函数：创建代码分析器
    
    Args:
        retriever_manager: 检索管理器
        
    Returns:
        JavaCodeAnalyzer: 代码分析器实例
    """
    return JavaCodeAnalyzer(retriever_manager)
'''

# Write the analyzer module into the analysis/ sub-package.
analyzer_file = project_root / "analysis" / "analyzer.py"
with open(analyzer_file, 'w', encoding='utf-8') as f:
    f.write(analyzer_content)

print("✅ 分析模块 (analysis/analyzer.py) 创建完成")

# Replace the empty analysis/__init__.py with one that re-exports the public
# API of the retriever and analyzer modules; the literal is written verbatim.
analysis_init_content = '''"""
分析模块 - 代码检索和反模式分析
"""

from .retriever import RetrieverManager, create_retriever_components
from .analyzer import JavaCodeAnalyzer, create_analyzer

__all__ = [
    "RetrieverManager", 
    "create_retriever_components",
    "JavaCodeAnalyzer", 
    "create_analyzer"
]
'''

analysis_init_file = project_root / "analysis" / "__init__.py"
analysis_init_file.write_text(analysis_init_content, encoding='utf-8')

# Confirmation line followed by a short summary of the module's features.
for message in (
    "✅ 分析包初始化文件更新完成",
    "\n📋 分析模块特点:",
    "- 分离检索和分析逻辑",
    "- 支持独立的上下文检索",
    "- 结构化的分析结果返回",
    "- 统一的错误处理机制",
):
    print(message)