In [2]:
import importlib
import inspect
import json
import logging
import pkgutil
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set

import networkx as nx
import numpy as np
import spacy
from gensim.models.doc2vec import Doc2Vec
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from textblob import TextBlob

# Configure logging: set up the root logger at INFO level, then create the
# module-level logger that every function below reports through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class MethodInfo:
    """Introspection record for a single function or method.

    Instances compare equal when all descriptive fields match; the embedding
    vector is deliberately left out of the comparison (see the field comment).
    """

    # Text form of the callable's signature.
    signature: str
    # Docstring of the callable, or None when it has none.
    doc: Optional[str]
    # Maps each parameter name to the text form of its annotation.
    parameters: Dict[str, str]
    # Text form of the return annotation.
    return_type: Optional[str] = None
    # Coarse functional category label.
    category: str = "uncategorized"
    # The next five fields are placeholders: nothing in this file fills them.
    complexity: str = "unknown"
    semantic_group: str = "unknown"
    input_output_pattern: str = "unknown"
    related_methods: List[str] = field(default_factory=list)
    usage_examples: List[str] = field(default_factory=list)
    # Embedding vector for the name + docstring.  compare=False keeps it out
    # of the generated __eq__: comparing two ndarrays yields an array, and
    # taking its truth value raises ValueError for vectors longer than one.
    semantic_embedding: Optional[np.ndarray] = field(default=None, compare=False)

@dataclass
class ClassInfo:
    """Introspection record for a single class.

    Instances compare equal when all descriptive fields match; the embedding
    vector is deliberately left out of the comparison (see the field comment).
    """

    # Public methods of the class, keyed by method name.
    methods: Dict[str, MethodInfo]
    # Class attributes, keyed by attribute name.
    attributes: Dict[str, Any]
    # Docstring of the class, or None when it has none.
    doc: Optional[str]
    # Text identifiers of the direct base classes.
    base_classes: List[str]
    # Coarse functional category label.
    category: str = "uncategorized"
    # The next four fields are placeholders: nothing in this file fills them.
    semantic_group: str = "unknown"
    responsibility_cluster: str = "unknown"
    design_pattern: str = "unknown"
    related_classes: List[str] = field(default_factory=list)
    # Embedding vector for the name + docstring.  compare=False keeps it out
    # of the generated __eq__: comparing two ndarrays yields an array, and
    # taking its truth value raises ValueError for vectors longer than one.
    semantic_embedding: Optional[np.ndarray] = field(default=None, compare=False)

@dataclass
class ModuleAnalysis:
    """Accumulated result of analysing one package.

    Every field defaults to an empty container, so a fresh instance can be
    returned as-is when nothing could be analysed.
    """

    # Discovered classes, keyed by "<module name>.<class name>".
    classes: Dict[str, ClassInfo] = field(default_factory=dict)
    # Discovered module-level functions, keyed by "<module name>.<function name>".
    functions: Dict[str, MethodInfo] = field(default_factory=dict)
    # Placeholder: not populated by the code in this file.
    constants: Dict[str, Any] = field(default_factory=dict)
    # Names of the submodules that were imported successfully.
    modules: Set[str] = field(default_factory=set)
    # Category label -> qualified names of the classes/functions in it.
    # Backed by a defaultdict so new categories can be appended to directly.
    categories: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))
    # Cluster label -> names of the functions grouped under that label.
    semantic_clusters: Dict[str, List[str]] = field(default_factory=dict)
    # Directed graph with one edge from each base class to the class deriving from it.
    dependency_graph: nx.DiGraph = field(default_factory=nx.DiGraph)
    # Placeholders: not populated by the code in this file.
    api_groups: Dict[str, List[str]] = field(default_factory=dict)
    functionality_domains: Dict[str, List[str]] = field(default_factory=dict)

class SemanticAnalyzer:
    """Bundle of the NLP models used to embed, cluster and mine text.

    Constructing an instance loads the ``all-MiniLM-L6-v2`` sentence
    transformer and the ``en_core_web_trf`` spaCy pipeline, so instances are
    expensive to create and worth reusing.
    """

    def __init__(self):
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.nlp = spacy.load('en_core_web_trf')
        # Created without a corpus; none of the methods below use it.
        self.doc2vec = Doc2Vec(vector_size=100, min_count=2, epochs=30)

    def compute_embedding(self, text: str) -> np.ndarray:
        """Return the sentence-transformer encoding of *text*."""
        return self.sentence_model.encode(text)

    def cluster_embeddings(self, embeddings: List[np.ndarray], eps=0.5) -> np.ndarray:
        """Cluster *embeddings* with DBSCAN and return one label per input.

        Args:
            embeddings: Vectors to cluster, one per item.
            eps: DBSCAN neighbourhood radius.

        Returns:
            An integer array aligned with *embeddings*; ``-1`` marks noise
            points.  Empty input yields an empty array.
        """
        # DBSCAN.fit rejects an input with zero samples, so answer the empty
        # case directly.  len() is used instead of truthiness so that a 2-D
        # ndarray argument is handled as well as a list.
        if len(embeddings) == 0:
            return np.empty(0, dtype=np.intp)
        clustering = DBSCAN(eps=eps, min_samples=2).fit(embeddings)
        return clustering.labels_

    def extract_concepts(self, text: str) -> List[str]:
        """Return the text of every noun chunk spaCy finds in *text*."""
        doc = self.nlp(text)
        return [chunk.text for chunk in doc.noun_chunks]

def categorize_item(name: str, doc: Optional[str], signature: Optional[str] = None) -> str:
    """Assign a coarse functional category based on keyword matches.

    Each category scores 2.0 when any of its keywords occurs in *name* plus
    0.5 for every keyword that occurs in *doc*.  Matching is case-insensitive
    substring matching.

    Args:
        name: Identifier of the item being categorised.
        doc: Its docstring, or None.
        signature: Accepted for call compatibility; it does not take part in
            the scoring.

    Returns:
        The highest-scoring category (the first one listed wins a tie), or
        ``"other"`` when no keyword matches at all.
    """
    categories = {
        "io": ["load", "save", "read", "write", "import", "export", "file", "stream"],
        "math": ["calculate", "compute", "sum", "average", "mean", "numeric", "statistical"],
        "text": ["tokenize", "parse", "text", "string", "word", "nlp", "linguistic"],
        "data": ["process", "transform", "convert", "format", "structure", "collection"],
        "utils": ["util", "helper", "tool", "auxiliary", "support"],
        "visualization": ["plot", "draw", "display", "show", "render", "graph"],
        "ml": ["train", "predict", "model", "classify", "cluster", "learn"],
        "api": ["request", "response", "endpoint", "service", "client"],
        "validation": ["check", "verify", "validate", "assert", "test"],
        "security": ["encrypt", "decrypt", "auth", "secure", "permission"]
    }

    # Lower-case once instead of once per keyword.
    lowered_name = name.lower()
    lowered_doc = (doc or "").lower()

    # NOTE: an earlier revision added a sentiment-based bonus here.  The same
    # bonus went to every category, so it could never change the winner; it
    # only hid the "nothing matched" case.  It has been removed.
    best_category, best_score = "other", 0.0
    for category, keywords in categories.items():
        score = 0.0
        if any(keyword in lowered_name for keyword in keywords):
            score += 2.0
        score += 0.5 * sum(1 for keyword in keywords if keyword in lowered_doc)

        # Strict comparison: an earlier category keeps the lead on a tie, and
        # a zero score never displaces the "other" fallback.
        if score > best_score:
            best_category, best_score = category, score

    return best_category

def cluster_by_similarity(items: Dict[str, MethodInfo], analyzer: SemanticAnalyzer) -> Dict[str, List[str]]:
    """Group items by embedding similarity and by input/output pattern.

    Args:
        items: Records to cluster, keyed by name.
        analyzer: Supplies ``compute_embedding`` and ``cluster_embeddings``.

    Returns:
        A dict with two families of keys:

        * ``io_pattern_<i>`` - names whose parameter annotations and return
          type are identical;
        * ``semantic_cluster_<label>`` - names that the analyzer placed in the
          same embedding cluster (noise points are left out).

        Empty *items* yields an empty dict.
    """
    # Clustering zero embeddings is an error in the underlying clusterer, and
    # there is nothing to group anyway.
    if not items:
        return {}

    # Embed "<name> <doc> <signature>" for every item, keeping names aligned.
    names = list(items)
    embeddings = [
        analyzer.compute_embedding(f"{name} {info.doc or ''} {info.signature}")
        for name, info in items.items()
    ]

    # Cluster using embeddings
    clusters = defaultdict(list)
    for name, label in zip(names, analyzer.cluster_embeddings(embeddings)):
        if label >= 0:  # Ignore noise points labeled as -1
            clusters[f"semantic_cluster_{label}"].append(name)

    # Additional clustering by input/output patterns
    io_patterns = defaultdict(list)
    for name, info in items.items():
        pattern = (tuple(info.parameters.values()), info.return_type)
        io_patterns[pattern].append(name)

    # Merge results: I/O-pattern groups first, then the semantic clusters.
    final_clusters = {
        f"io_pattern_{index}": members
        for index, members in enumerate(io_patterns.values())
    }
    final_clusters.update(clusters)

    return final_clusters

def _build_callable_info(name: str, func: Any, analyzer: SemanticAnalyzer) -> Optional[MethodInfo]:
    """Describe one function or method; return None if its signature is unavailable."""
    try:
        sig = inspect.signature(func)
    except (TypeError, ValueError) as exc:
        # Skip only this callable rather than abandoning the rest of its module.
        logger.warning("Skipping %s: signature unavailable (%s)", name, exc)
        return None

    doc = inspect.getdoc(func)
    return MethodInfo(
        signature=str(sig),
        doc=doc,
        parameters={
            param_name: str(param.annotation)
            for param_name, param in sig.parameters.items()
        },
        return_type=str(sig.return_annotation),
        category=categorize_item(name, doc, str(sig)),
        semantic_embedding=analyzer.compute_embedding(f"{name} {doc or ''}"),
    )


def _build_class_info(name: str, cls: type, analyzer: SemanticAnalyzer) -> ClassInfo:
    """Describe one class together with its public plain-function methods."""
    methods = {}
    # getmembers already returns the pairs sorted by name.
    for method_name, method in inspect.getmembers(cls, inspect.isfunction):
        if method_name.startswith('_'):
            continue
        method_info = _build_callable_info(method_name, method, analyzer)
        if method_info is not None:
            methods[method_name] = method_info

    class_doc = inspect.getdoc(cls)
    return ClassInfo(
        methods=methods,
        attributes={},
        doc=class_doc,
        # Dotted "module.QualName" form, the same shape as the keys used for
        # analysed classes, so dependency-graph edges can meet their nodes.
        base_classes=[f"{base.__module__}.{base.__qualname__}" for base in cls.__bases__],
        category=categorize_item(name, class_doc),
        semantic_embedding=analyzer.compute_embedding(f"{name} {class_doc or ''}"),
    )


def _analyze_module(module_name: str, analysis: ModuleAnalysis, analyzer: SemanticAnalyzer) -> None:
    """Import *module_name* and record its public classes and functions in *analysis*."""
    module = importlib.import_module(module_name)
    analysis.modules.add(module_name)

    for item_name, item in inspect.getmembers(module):
        if item_name.startswith('_'):
            continue

        qualified_name = f"{module_name}.{item_name}"
        if inspect.isclass(item):
            class_info = _build_class_info(item_name, item, analyzer)
            analysis.classes[qualified_name] = class_info
            analysis.categories[class_info.category].append(qualified_name)
        elif inspect.isfunction(item):
            func_info = _build_callable_info(item_name, item, analyzer)
            if func_info is None:
                continue
            analysis.functions[qualified_name] = func_info
            analysis.categories[func_info.category].append(qualified_name)


def analyze_package(package_name: str, analyzer: Optional[SemanticAnalyzer] = None) -> ModuleAnalysis:
    """Import a package, walk its submodules and catalogue their public API.

    Only submodules found under the package's ``__path__`` are scanned; a
    plain module without ``__path__`` produces an empty analysis.

    Args:
        package_name: Importable name of the package.
        analyzer: Optional analyzer to reuse.  A new one is created when
            omitted; pass one in to avoid reloading its models per package.

    Returns:
        The collected analysis.  If the package itself cannot be imported the
        error is logged and an empty analysis is returned.
    """
    analysis = ModuleAnalysis()
    if analyzer is None:
        analyzer = SemanticAnalyzer()

    try:
        package = importlib.import_module(package_name)
    except ImportError as e:
        logger.error("Could not import %s: %s", package_name, e)
        return analysis

    logger.info("Analyzing package: %s", package_name)

    if hasattr(package, '__path__'):
        def _on_walk_error(failed_name: str) -> None:
            # Without an onerror hook, any non-ImportError raised while
            # walk_packages imports a subpackage would end the whole walk.
            logger.warning("Error importing %s while walking %s", failed_name, package_name)

        # Sort by name only: the ModuleInfo tuples also hold finder objects,
        # which do not support ordering.
        module_names = sorted(
            module_info.name
            for module_info in pkgutil.walk_packages(
                package.__path__, package.__name__ + '.', onerror=_on_walk_error
            )
        )

        for name in module_names:
            try:
                _analyze_module(name, analysis, analyzer)
            except Exception as e:
                # Best effort: importing arbitrary third-party modules can
                # fail in many ways, and one bad module must not stop the run.
                logger.warning("Error analyzing module %s: %s", name, e)

    # Build dependency graph
    for class_name, class_info in analysis.classes.items():
        for base in class_info.base_classes:
            analysis.dependency_graph.add_edge(base, class_name)

    # Cluster functions; skipped when none were found because clustering an
    # empty set is an error.
    if analysis.functions:
        analysis.semantic_clusters = cluster_by_similarity(analysis.functions, analyzer)

    return analysis

# List of packages to analyze
packages = sorted([
    'gensim',
    'spacy',
    'nltk',
    'transformers',
    'textblob',
    'sentence_transformers'
])


def _to_jsonable(obj):
    """json.dump fallback for values the encoder cannot handle natively.

    Embeddings become lists of numbers and nested info records become dicts,
    instead of being flattened into their repr strings; anything else still
    falls back to str().
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, (MethodInfo, ClassInfo)):
        return vars(obj)
    if isinstance(obj, (set, frozenset)):
        return sorted(obj, key=str)
    return str(obj)


def _group_by_category(items):
    """Return {category: {name: fields}} with categories and names sorted."""
    grouped = defaultdict(dict)
    # Single pass over the sorted items instead of one scan per category.
    for key, info in sorted(items.items()):
        grouped[info.category][key] = vars(info)
    return {category: grouped[category] for category in sorted(grouped)}


# Analyze each package
analysis_results = {}
for package in packages:
    try:
        analysis = analyze_package(package)

        # Organize results hierarchically
        analysis_results[package] = {
            "overview": {
                "total_classes": len(analysis.classes),
                "total_functions": len(analysis.functions),
                "total_modules": len(analysis.modules),
                "total_categories": len(analysis.categories)
            },
            "classes": _group_by_category(analysis.classes),
            "functions": _group_by_category(analysis.functions),
            "modules": sorted(analysis.modules),
            "semantic_clusters": {k: sorted(v) for k, v in sorted(analysis.semantic_clusters.items())},
            "dependency_graph": nx.node_link_data(analysis.dependency_graph)
        }

        logger.info(f"Successfully analyzed {package}")
        logger.info(f"{package} analysis summary:")
        for key, value in analysis_results[package]["overview"].items():
            logger.info(f"- {key}: {value}")

    except Exception as e:
        # Top-level boundary: report the failure and carry on with the next package.
        logger.error(f"Failed to analyze {package}: {str(e)}")

# Save results with detailed formatting
with open('nlp_packages_analysis.json', 'w', encoding='utf-8') as f:
    json.dump(analysis_results, f, indent=2, sort_keys=True, default=_to_jsonable)

logger.info("Analysis complete. Results saved to nlp_packages_analysis.json")



INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

loading configuration file config.json from cache at C:\Users\ace19\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2\snapshots\fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9\config.json
Model config BertConfig {
  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at C:\Users\ace19\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2\snapshots\fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9\model.safetensors
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

loading file vocab.txt from cache at C:\Users\ace19\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2\snapshots\fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9\vocab.txt
loading file tokenizer.json from cache at C:\Users\ace19\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2\snapshots\fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\ace19\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2\snapshots\fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\ace19\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2\snapshots\fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9\tokenizer_config.json
loading file chat_template.jinja from cache at None


1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ERROR:__main__:Failed to analyze gensim: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
loading configuration file config.json from cache at C:\Users\ace19\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2\snapshots\fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9\config.json
Model config BertConfig {
  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att