# .gitignore Audit & Optimization Notebook

This notebook analyzes the repository `.gitignore` for consistency, redundancy, and opportunities to consolidate patterns without changing intent.

Outline implemented across the next cells:

1. Load file
2. Segment sections
3. Normalize patterns
4. Classify patterns
5. Detect duplicates & shadowed
6. Glob expansion (current ignored files)
7. Redundant artifact patterns
8. Ineffective negations
9. Large tracked artifacts not ignored
10. Interactive matcher
11. Consolidation suggestions
12. Draft generation
13. Draft validation
14. Export reports

Execution order matters; run cells sequentially.


In [None]:
# 1. Load .gitignore File
from pathlib import Path
import json, re, os, itertools

# Location of the ignore file under audit, relative to the working directory.
GITIGNORE_PATH = Path(".gitignore")

# Keep every physical line (comments and blanks included); later cells index into it.
gitignore_text = GITIGNORE_PATH.read_text(encoding="utf-8")
raw_lines = gitignore_text.splitlines()
line_total = len(raw_lines)
print(f"Loaded {line_total} lines from {GITIGNORE_PATH}")

# Echo the first 20 lines as a quick sanity check of what was read.
raw_preview = "\n".join(raw_lines[:20])
print(raw_preview)

: 

In [None]:
# 2. Segment Sections By Comment Headers
from collections import defaultdict

# Maps a section title (taken from a comment line) to the lines that follow it.
section_map = defaultdict(list)
current_section = "UNLABELED"
# Decorative separators such as "# -----" carry no title and are dropped.
header_pattern = re.compile(r"^#\s*-{2,}\s*$")

for line in raw_lines:
    is_comment = line.startswith("#")
    if is_comment and header_pattern.match(line):
        continue
    # Any comment with text left after stripping the "#" markers opens a new section.
    title = line.lstrip("#").strip() if is_comment else ""
    if title:
        current_section = title
        section_map.setdefault(current_section, [])  # register even if it stays empty
        continue
    # Patterns, blank lines and bare "#" lines belong to the section currently open.
    section_map[current_section].append(line)

section_names = list(section_map)
print(f"Detected {len(section_map)} sections")
print(section_names[:10])

In [None]:
# 3. Normalize & Canonicalize Patterns


def normalize_pattern(p: str) -> str:
    """Return a canonical form of one ``.gitignore`` line without changing its meaning.

    Surrounding whitespace is removed, Windows-style backslash separators become
    ``/`` and runs of slashes collapse to a single ``/``. Blank lines and comment
    lines are returned stripped but otherwise untouched.

    In gitignore syntax a backslash is an escape character, so a backslash that
    sits directly before ``#``, ``!``, a space, ``*``, ``?``, ``[`` or ``]`` is
    kept as-is; rewriting it to ``/`` would turn ``\\#file`` into a different
    pattern. A backslash-escaped trailing space is preserved for the same reason.

    Args:
        p: One raw line from the ignore file.

    Returns:
        The normalized line.
    """
    text = p.strip()
    if not text or text.startswith("#"):
        return text
    # strip() also removed a trailing space that was escaped with a backslash;
    # git keeps that space as part of the pattern, so put it back.
    if text.endswith("\\") and p.lstrip().startswith(text + " "):
        text += " "
    # Only backslashes that are not escaping a gitignore metacharacter are
    # treated as path separators.
    text = re.sub(r"\\(?![#! *?\[\]])", "/", text)
    return re.sub(r"/+", "/", text)


# One record per physical line: its position, the raw text, the canonical form,
# and whether canonicalization altered it.
normalized = [
    {"index": idx, "original": line, "normalized": cleaned, "changed": line != cleaned}
    for idx, line in enumerate(raw_lines)
    for cleaned in (normalize_pattern(line),)
]

print("Sample normalized entries:")
# Only rows among the first 15 lines that were actually rewritten are shown.
for row in (candidate for candidate in normalized[:15] if candidate["changed"]):
    print(row)

In [None]:
# 4. Classify Patterns (Language / Purpose)

from typing import List

# Tag name -> substrings ("needles") whose presence in a pattern assigns the tag.
TAGS = {
    "python": [
        ".pyc",
        "__pycache__",
        ".pytest_cache",
        "venv",
        ".venv",
        "pdm",
        "poetry.lock",
        "pipfile",
    ],
    "dotnet": [".sln", "bin/", "obj/", ".csproj", ".vs/"],
    "node": ["node_modules", "package-lock.json", "playwright"],
    "logs": [".log", "logs/", "log/"],
    "env": [".env", "env/"],
    "build": ["dist/", "build/", "coverage", "BenchmarkDotNet.Artifacts"],
    "cert": [".pem", ".crt", ".key", ".pfx"],
}


def classify(p: str) -> List[str]:
    """Return the tags whose needles occur in pattern ``p``.

    Matching is a case-insensitive substring test, so the lowercase needle
    ``pipfile`` tags ``Pipfile.lock`` and ``BenchmarkDotNet.Artifacts`` tags any
    casing of that directory name.

    Args:
        p: A normalized ignore pattern (may be empty or a comment).

    Returns:
        Tag names in ``TAGS`` declaration order; empty for blank or comment lines.
    """
    if not p or p.startswith("#"):
        return []
    haystack = p.lower()
    return [
        tag
        for tag, needles in TAGS.items()
        if any(needle.lower() in haystack for needle in needles)
    ]


from collections import Counter

# Attach the tag list to every record in place; later cells read row["tags"].
for entry in normalized:
    entry["tags"] = classify(entry["normalized"])

print("Tagged counts:")
# A pattern carrying several tags is counted once under each of them.
c = Counter(tag for entry in normalized for tag in entry["tags"])
print(c)

In [None]:
# 5. Detect Duplicates & Shadowed Patterns

exact_duplicates = []
# gitignore precedence: later rules override earlier ones; this cell looks for an
# earlier directory rule that already covers a more specific later rule.
shadowed = []
# Pattern text -> index of the first record in which it appeared.
seen = {}

for position, entry in enumerate(normalized):
    rule = entry["normalized"]
    if not rule or rule.startswith("#"):
        continue
    first_index = seen.setdefault(rule, position)
    if first_index != position:
        exact_duplicates.append(
            {"pattern": rule, "first_index": first_index, "dup_index": position}
        )

# Active patterns only (blank and comment lines removed), in file order.
patterns = [
    r["normalized"]
    for r in normalized
    if r["normalized"] and not r["normalized"].startswith("#")
]

# Heuristic: a directory pattern ("name/") that is a string prefix of a later pattern.
for i, broad in enumerate(patterns):
    if not broad.endswith("/"):
        continue
    shadowed.extend(
        {"broad": broad, "specific": later}
        for later in patterns[i + 1 :]
        if later != broad and later.startswith(broad)
    )

print(
    f"Exact duplicates: {len(exact_duplicates)} | Potential shadowed: {len(shadowed)}"
)

In [None]:
# 6. Glob Expansion: Enumerate Currently Ignored Files
import subprocess, shlex

# Collect candidate file paths (limit for performance)
all_paths = []
for root, dirs, files in os.walk(".", topdown=True):
    # Prune the repository metadata directory by exact name. A prefix test on
    # "./.git" would also drop ".github", ".gitlab", ... and would never match
    # on platforms where os.walk joins with a backslash.
    dirs[:] = [d for d in dirs if d != ".git"]
    for f in files:
        all_paths.append(os.path.join(root, f))
    if len(all_paths) > 5000:
        break

# Ask git which of those paths are ignored. "-z" switches both input and output
# to NUL-separated records, so paths come back exactly as they were sent and
# can be looked up in the set without worrying about git's output quoting.
proc = subprocess.run(
    ["git", "check-ignore", "-z", "--stdin"],
    input="\0".join(all_paths),
    text=True,
    capture_output=True,
)
# Exit status 0 = at least one path ignored, 1 = none ignored; anything else is an error.
if proc.returncode in (0, 1):
    ignored_set = {p for p in proc.stdout.split("\0") if p}
else:
    ignored_set = set()
    print(f"git check-ignore failed (exit {proc.returncode}): {proc.stderr.strip()}")
print(f"Scanned {len(all_paths)} files, ignored {len(ignored_set)}")

ignored_samples = list(itertools.islice((p for p in all_paths if p in ignored_set), 20))
print("Ignored sample:")
for s in ignored_samples:
    print("  ", s)

In [None]:
# 7. Potential Redundant Artifact Patterns
# Heuristic: a directory pattern ("name/") that is a string prefix of another
# pattern anywhere in the file makes that other pattern a redundancy candidate.
active_patterns = [
    row["normalized"]
    for row in normalized
    if row["normalized"] and not row["normalized"].startswith("#")
]

redundant = [
    {"dir": parent, "child": child}
    for parent in active_patterns
    if parent != "/" and parent.endswith("/")  # a bare "/" would prefix every anchored rule
    for child in active_patterns
    if child != parent and child.startswith(parent)
]

print(f"Potential redundant pairs: {len(redundant)} (first 15 shown)")
for pair in redundant[:15]:
    print(pair)

In [None]:
# 8. Check Unignored (Negated) Patterns Integrity
negations = [r["normalized"] for r in normalized if r["normalized"].startswith("!")]
ineffective = []

# ignored_set holds git's final verdict, i.e. negations have already been
# applied. A negation that works therefore leaves its files OUT of ignored_set.
# It is reported as ineffective when no scanned file it names ends up
# un-ignored: either nothing matches it, or every match is still ignored
# (typically because a parent directory is excluded).
for neg in negations:
    needle = neg[1:].strip("/")
    # naive match: substring for quick heuristic; an empty needle names nothing
    candidates = [p for p in all_paths if needle in p] if needle else []
    rescued = [p for p in candidates if p not in ignored_set]
    if not rescued:
        ineffective.append(neg)

print(f"Negations found: {len(negations)} | Ineffective: {len(ineffective)}")
print(ineffective[:10])

In [None]:
# 9. Scan Repo For Large Tracked Artifacts Not Ignored
LARGE_THRESHOLD = 5 * 1024 * 1024  # 5MB
large_files = []
for root, dirs, files in os.walk(".", topdown=True):
    # Skip only the repository metadata directory. Matching the "./.git" prefix
    # would also hide ".github", ".gitlab", ... from the size scan.
    dirs[:] = [d for d in dirs if d != ".git"]
    for f in files:
        path = os.path.join(root, f)
        try:
            sz = os.path.getsize(path)
        except OSError:
            # Broken symlink or file removed mid-scan: nothing to measure.
            continue
        # NOTE: this only checks membership in ignored_set; whether the file is
        # actually tracked by git is not verified here.
        if sz >= LARGE_THRESHOLD and path not in ignored_set:
            large_files.append({"path": path, "size_mb": round(sz / 1024 / 1024, 2)})

print(f"Large tracked files (>=5MB) not ignored: {len(large_files)} (first 10)")
for lf in large_files[:10]:
    print(lf)

In [None]:
# 10. Simulate Ignore Matching For Sample Paths
import fnmatch

# Active rules in file order: normalized lines that are neither blank nor comments.
rules = []
for record in normalized:
    candidate = record["normalized"]
    if candidate and not candidate.startswith("#"):
        rules.append(candidate)


def _rule_matches(path: str, pattern: str) -> bool:
    """Approximate gitignore matching of one pattern (without ``!``) against a path.

    Handled: trailing ``/`` (directory-only), leading/inner ``/`` (anchored to the
    root), leading ``**/`` (any depth) and slash-free patterns (any depth).
    Matching uses ``fnmatch``, whose ``*`` may cross ``/``, so this is still an
    approximation of git's rules rather than a full implementation.
    """
    dir_only = pattern.endswith("/")
    core = pattern.rstrip("/")
    if not core:
        # A rule that is only slashes names nothing; never let it match everything.
        return False
    floating = False
    while core.startswith("**/"):
        core = core[3:]
        floating = True
    if not core:
        return False
    # A slash at the start or in the middle ties the pattern to the root.
    anchored = "/" in core and not floating
    core = core.lstrip("/")

    is_dir = path.endswith("/")
    clean = path[2:] if path.startswith("./") else path
    parts = [segment for segment in clean.split("/") if segment]
    if not parts:
        return False
    # A directory-only pattern may not match the final component unless the
    # caller marked the path itself as a directory with a trailing slash.
    last = len(parts) if (is_dir or not dir_only) else len(parts) - 1
    starts = (0,) if anchored else range(len(parts))
    return any(
        fnmatch.fnmatchcase("/".join(parts[start:end]), core)
        for start in starts
        for end in range(start + 1, last + 1)
    )


def match_path(path: str, rule_list=None):
    """Simulate which ignore rule decides the fate of ``path``.

    Rules are applied in order and the last matching rule wins; a rule starting
    with ``!`` un-ignores the path. Matching is done per path component, so
    ``bin/`` does not match ``binary.txt`` and ``/dist`` matches ``dist/...``.
    Git's restriction that a file cannot be re-included when its parent
    directory is excluded is not modelled.

    Args:
        path: Slash-separated path relative to the repository root; a trailing
            ``/`` marks it as a directory.
        rule_list: Optional rules to use instead of the notebook-level ``rules``.

    Returns:
        ``{"path": ..., "ignored": bool, "rule": deciding rule or None}``.
    """
    active_rules = rules if rule_list is None else rule_list
    matched_rule = None
    is_ignored = False
    for rule in active_rules:
        neg = rule.startswith("!")
        pattern = rule[1:] if neg else rule
        if _rule_matches(path, pattern):
            is_ignored = not neg
            matched_rule = rule
    return {"path": path, "ignored": is_ignored, "rule": matched_rule}


# Demo: run the simulated matcher over a few representative paths.
demo_paths = ("dist/output.bin", "node_modules/pkg/index.js", "docs/README.md")
for demo in demo_paths:
    demo_result = match_path(demo)
    print(demo_result)

In [None]:
# 11. Suggest Consolidated Pattern Groups

from collections import defaultdict

# Bucket patterns by their exact (sorted) tag combination; untagged rows are skipped.
by_tagset = defaultdict(list)
for entry in normalized:
    tag_key = tuple(sorted(entry.get("tags", [])))
    if not tag_key:
        continue
    by_tagset[tag_key].append(entry["normalized"])

# A bucket holding more than three patterns is offered as a consolidation candidate.
suggestions = [
    {"tags": tag_key, "count": len(members), "sample": members[:5]}
    for tag_key, members in by_tagset.items()
    if len(members) > 3
]

print(f"Consolidation candidate groups: {len(suggestions)}")
for s in suggestions[:10]:
    print(s)

In [None]:
# 12. Generate Cleaned .gitignore Draft


def build_draft(rows=None):
    """Build the text of a regrouped, de-duplicated ``.gitignore`` draft.

    Patterns are grouped by their sorted tag tuple (``("misc",)`` when untagged);
    groups are emitted largest first, ties broken by tag tuple. Within a group
    the first occurrence of each pattern is kept and original order preserved.

    Because the last matching gitignore rule wins, ``!`` negations are not
    regrouped: they are written in their original relative order in a final
    section, so regrouping can never place a negation before the rule it is
    meant to override. The result still needs human review for files whose
    meaning depends on finer interleaving of rules.

    Args:
        rows: Optional iterable of records with a ``"normalized"`` string and an
            optional ``"tags"`` list. Defaults to the notebook-level ``normalized``.

    Returns:
        The draft file content, terminated by a newline.
    """
    source_rows = normalized if rows is None else rows
    lines = ["# Auto-generated draft (DO NOT COMMIT without review)"]
    grouped = {}  # tag tuple -> insertion-ordered unique patterns
    negated = {}  # insertion-ordered unique negation patterns
    for r in source_rows:
        pat = r["normalized"]
        if not pat or pat.startswith("#"):
            continue
        if pat.startswith("!"):
            negated.setdefault(pat, None)
            continue
        key = tuple(sorted(r.get("tags", []))) or ("misc",)
        grouped.setdefault(key, {}).setdefault(pat, None)

    for key, pats in sorted(grouped.items(), key=lambda kv: (-len(kv[1]), kv[0])):
        lines.append("")
        # The count reflects the unique patterns actually written below.
        lines.append(
            f"# Group: {', '.join([k for k in key if k])} ({len(pats)} patterns)"
        )
        lines.extend(pats)

    if negated:
        lines.append("")
        lines.append(f"# Group: negations ({len(negated)} patterns)")
        lines.extend(negated)
    return "\n".join(lines) + "\n"


# Build the draft once; later cells validate and export this exact string.
draft = build_draft()
# Preview only the first 40 lines to keep the cell output short.
print(*draft.splitlines()[:40], sep="\n")

In [None]:
# 13. Validate Draft Against Current Git Status (Dry Run)
from tempfile import TemporaryDirectory

validation_report = {}
sample_paths = all_paths[:200]


def _check_ignore_verbose(paths, cwd=None):
    """Run ``git check-ignore --no-index -v -n --stdin`` and return its output lines.

    ``-n`` (show non-matching paths) is only accepted by git together with
    ``-v``. ``--no-index`` makes the verdict depend on ignore rules alone.
    """
    proc = subprocess.run(
        ["git", "check-ignore", "--no-index", "-v", "-n", "--stdin"],
        input="\n".join(paths),
        text=True,
        capture_output=True,
        cwd=cwd,
    )
    # 0 = something ignored, 1 = nothing ignored; anything else is a git error.
    if proc.returncode not in (0, 1):
        print(f"git check-ignore failed (exit {proc.returncode}): {proc.stderr.strip()}")
        return []
    return proc.stdout.splitlines()


def _ignored_paths(verbose_lines):
    """Extract the ignored paths from ``<source>:<line>:<pattern>\\t<path>`` records."""
    ignored = set()
    for record in verbose_lines:
        info, sep, path = record.partition("\t")
        if not sep or info == "::":  # "::" marks a path no rule matched
            continue
        parsed = re.match(r"(.*?):(\d+):(.*)", info)
        pattern = parsed.group(3) if parsed else ""
        # With -v the deciding rule may be a negation, which means "not ignored".
        if not pattern.startswith("!"):
            ignored.add(path)
    return ignored


with TemporaryDirectory() as tmp:
    # Evaluate the draft inside a throwaway repository so that the draft, not
    # the working tree's current .gitignore, is what git consults.
    subprocess.run(["git", "init", "--quiet", tmp], check=True, capture_output=True)
    draft_path = Path(tmp) / ".gitignore"
    draft_path.write_text(draft, encoding="utf-8")
    draft_lines = _check_ignore_verbose(sample_paths, cwd=tmp)

# Same sample, same flags, against the rules currently in effect here.
current_lines = _check_ignore_verbose(sample_paths)
draft_ignored = _ignored_paths(draft_lines)
current_ignored = _ignored_paths(current_lines)

validation_report["sample_output"] = draft_lines[:40]
validation_report["newly_ignored"] = sorted(draft_ignored - current_ignored)
validation_report["no_longer_ignored"] = sorted(current_ignored - draft_ignored)

print("Validation sample lines:")
for line in validation_report["sample_output"]:
    print(line)
print(
    f"Newly ignored by draft: {len(validation_report['newly_ignored'])} | "
    f"No longer ignored: {len(validation_report['no_longer_ignored'])}"
)

In [None]:
# 14. Export Reports (JSON / Markdown)
import json

REPORT_DIR = Path("gitignore_audit_reports")
REPORT_DIR.mkdir(exist_ok=True)

# File name -> JSON-serializable findings produced by the earlier cells.
json_reports = {
    "duplicates.json": exact_duplicates,
    "shadowed.json": shadowed,
    "ineffective_negations.json": ineffective,
    "large_unignored.json": large_files,
    "redundant_pairs.json": redundant[:200],  # capped to keep the artifact small
}
# Everything is written as UTF-8, matching how .gitignore was read, instead of
# the platform default encoding, which can fail on non-ASCII patterns.
for report_name, report_data in json_reports.items():
    (REPORT_DIR / report_name).write_text(
        json.dumps(report_data, indent=2), encoding="utf-8"
    )
(REPORT_DIR / "proposed.gitignore").write_text(draft, encoding="utf-8")

summary_md = f"""# .gitignore Audit Summary\n\n* Total lines: {len(raw_lines)}\n* Exact duplicates: {len(exact_duplicates)}\n* Potential shadowed: {len(shadowed)}\n* Redundant pairs (dir->child): {len(redundant)}\n* Ineffective negations: {len(ineffective)}\n* Large tracked (>=5MB) not ignored: {len(large_files)}\n\nSee JSON artifacts for details.\n"""
(REPORT_DIR / "summary.md").write_text(summary_md, encoding="utf-8")

print("Artifacts written to", REPORT_DIR)

In [None]:
# Demo: Send a chat completion request to local AI server
import requests
import json

# Endpoint of the local chat-completions server; override with LOCAL_AI_URL
# rather than editing the notebook when the server lives elsewhere.
url = os.environ.get("LOCAL_AI_URL", "http://192.168.0.154:1234/v1/chat/completions")
headers = {"Content-Type": "application/json"}
payload = {
    "model": "openai/gpt-oss-20b",
    "messages": [{"role": "user", "content": "Hello, local model!"}],
}

try:
    # Without a timeout an unreachable host would block the cell indefinitely.
    response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=30)
except requests.RequestException as exc:
    # Best-effort demo: report the failure instead of aborting a full run.
    print(f"Request to {url} failed: {exc}")
else:
    if response.ok:
        result = response.json()
        print(result["choices"][0]["message"]["content"])
    else:
        print(f"Error: {response.status_code}", response.text)