# Building Commit Data Objects from Repository

This notebook demonstrates how to extract commit information from the current repository and build structured `Commit` data objects using our data models. We'll process the 5 most recent commits, display them with `rich` tables, trees, and panels, and export the resulting objects to JSON.

In [1]:
# Import required libraries
import json
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import git
from rich.console import Console
from rich.markup import escape
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from rich.tree import Tree

from auto_release_note_generation.data_models.commit import Commit
from auto_release_note_generation.data_models.shared import (
    Diff,
    FileModification,
    GitActor,
    GitMetadata,
)
from auto_release_note_generation.data_models.utils import GitSHA

In [2]:
# Make the project's src/ directory importable.
# NOTE(review): the package imports in the first cell execute before this path
# tweak, so on a fresh kernel they presumably rely on the package already being
# importable (e.g. an editable install) - confirm, or move this setup above
# those imports.
project_root = Path.cwd().parent  # assumes the kernel's cwd is one level below the repo root

src_dir = str(project_root / "src")
if src_dir not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.insert(0, src_dir)

# Shared rich console used for all formatted output in the cells below
console = Console()

In [3]:
# Open the git repository rooted at the project directory
repo = git.Repo(project_root)

# `repo.active_branch` raises TypeError when HEAD is detached (tag/SHA checkout,
# typical CI clones), so show a readable placeholder instead of crashing.
if repo.head.is_detached:
    current_branch = f"(detached HEAD at {repo.head.commit.hexsha[:8]})"
else:
    current_branch = repo.active_branch.name

# Walks every commit reachable from HEAD once just to count them
total_commits = sum(1 for _ in repo.iter_commits())

# Display repository information
info_panel = Panel.fit(
    f"[bold blue]Repository:[/bold blue] {repo.working_dir}\n"
    f"[bold blue]Current branch:[/bold blue] {current_branch}\n"
    f"[bold blue]Total commits:[/bold blue] {total_commits}",
    title="Repository Information",
    border_style="blue",
)
console.print(info_panel)

In [4]:
# How many of the most recent commits to process; tune here.
NUM_RECENT_COMMITS = 5
# Summaries longer than this are truncated in the overview table.
SUMMARY_MAX_CHARS = 50

# Get the most recent commits reachable from HEAD
commits_to_process = list(repo.iter_commits(max_count=NUM_RECENT_COMMITS))

# Display commit overview
table = Table(
    title="Recent Commits Overview", show_header=True, header_style="bold magenta"
)
table.add_column("SHA", style="cyan", width=12)
table.add_column("Author", style="green")
table.add_column("Date", style="yellow")
table.add_column("Summary", style="white")

for git_commit in commits_to_process:
    summary = git_commit.summary
    if len(summary) > SUMMARY_MAX_CHARS:
        summary = summary[:SUMMARY_MAX_CHARS] + "..."

    table.add_row(
        git_commit.hexsha[:8],
        # Escape free-form text: rich would otherwise parse e.g. "[ci skip]"
        # in a summary as a markup tag and silently drop it.
        escape(str(git_commit.author.name)),
        # Rendered in the local timezone of the machine running the notebook
        datetime.fromtimestamp(git_commit.authored_date).strftime("%Y-%m-%d %H:%M"),
        escape(summary),  # truncate first, then escape, so no escape sequence is cut
    )

console.print(table)

## Helper Functions

Define all the helper functions needed to build commit objects.

In [5]:
# Helper function to create GitActor from GitPython actor


def create_git_actor(actor: git.Actor, timestamp: int) -> GitActor:
    """Build a ``GitActor`` model from a GitPython actor and an epoch timestamp.

    Args:
        actor: GitPython actor whose ``name`` and ``email`` are copied over.
        timestamp: Seconds since the epoch (the call sites pass
            ``authored_date`` / ``committed_date``).

    Returns:
        A ``GitActor`` whose ``timestamp`` is the naive local-time ``datetime``
        produced by ``datetime.fromtimestamp``.
    """
    actor_time = datetime.fromtimestamp(timestamp)
    return GitActor(name=actor.name, email=actor.email, timestamp=actor_time)


# Helper function to extract file modifications
def extract_file_modifications(commit: git.Commit) -> list[FileModification]:
    """Describe every file touched by ``commit`` as a ``FileModification``.

    The commit is diffed against its first parent only; a commit without
    parents is diffed against the empty tree.

    Args:
        commit: GitPython commit to inspect.

    Returns:
        One ``FileModification`` per diff entry, in diff order. The
        ``modification_type`` is one of ``A``, ``D``, ``M``, ``T``, ``R``,
        ``C``, or ``X`` for anything unrecognised. ``path_before`` is ``None``
        for additions and ``path_after`` is ``None`` for deletions.
    """
    # Diff against the first parent, or against the empty tree for a root commit
    if commit.parents:
        diff_items = commit.parents[0].diff(commit)
    else:
        diff_items = commit.diff(git.NULL_TREE)

    # Per-file line counts, looked up by path below
    per_file_stats = commit.stats.files

    exact_types = ("A", "D", "M", "T")
    results = []

    for item in diff_items:
        raw_type = item.change_type

        # Normalise the change type to a single letter; rename/copy codes may
        # carry a similarity score suffix (R90, C100, ...), hence the prefix test.
        if raw_type in exact_types:
            mod_type = raw_type
        elif raw_type.startswith("R"):
            mod_type = "R"
        elif raw_type.startswith("C"):
            mod_type = "C"
        else:
            mod_type = "X"

        # An added file has no "before" path; a deleted file has no "after" path
        path_before = None if mod_type == "A" else item.a_path
        path_after = None if mod_type == "D" else item.b_path

        # Prefer the new path for the stats lookup; counts stay 0 when the path
        # is absent from the stats mapping.
        # NOTE(review): if renamed files are keyed differently in the stats
        # mapping than by their new path, their counts will be 0 - verify.
        lookup_path = path_after or path_before
        if lookup_path and lookup_path in per_file_stats:
            file_stats = per_file_stats[lookup_path]
        else:
            file_stats = {}

        results.append(
            FileModification(
                path_before=path_before,
                path_after=path_after,
                modification_type=mod_type,
                insertions=file_stats.get("insertions", 0),
                deletions=file_stats.get("deletions", 0),
            )
        )

    return results


# Extract branches containing this commit
def get_branches_containing_commit(repo: git.Repo, commit_sha: str) -> list[str]:
    """Return the names of all local branches whose history contains ``commit_sha``.

    Each branch is checked with ``repo.is_ancestor`` (one ancestry query per
    branch) instead of materialising the full commit list of every branch in
    Python, which scaled with branches x history length.

    Args:
        repo: Repository whose ``branches`` are inspected.
        commit_sha: Full hex SHA of the commit to look for.

    Returns:
        Branch names in the order ``repo.branches`` yields them. A branch whose
        tip is ``commit_sha`` itself is included.

    Raises:
        git.GitCommandError: Propagated from ``repo.is_ancestor`` when git
            cannot resolve ``commit_sha``.
    """
    return [
        branch.name
        for branch in repo.branches
        # Use the tip SHA rather than the branch name so a tag with the same
        # name cannot make the revision ambiguous.
        if repo.is_ancestor(commit_sha, branch.commit.hexsha)
    ]


# Extract tags pointing to this commit
def get_tags_for_commit(repo: git.Repo, commit_sha: str) -> list[str]:
    """Return the names of the tags whose commit is exactly ``commit_sha``.

    Args:
        repo: Repository whose ``tags`` are scanned.
        commit_sha: Full hex SHA to compare against each tag's ``commit.hexsha``.

    Returns:
        Matching tag names, in the order ``repo.tags`` yields them; empty when
        no tag points at the commit.
    """
    return [ref.name for ref in repo.tags if ref.commit.hexsha == commit_sha]


# Function to build a Commit object from a GitPython commit
def build_commit_object(git_commit: git.Commit, repo: git.Repo) -> Commit:
    """Assemble a ``Commit`` model from a GitPython commit.

    Args:
        git_commit: Source commit providing SHA, actors, parents, message and
            (when present) the GPG signature.
        repo: Repository used to look up the branches containing the commit and
            the tags pointing at it.

    Returns:
        A ``Commit`` carrying the git metadata, the first message line as
        ``summary``, the stripped full message, branch and tag names, and a
        ``Diff`` built from the per-file modifications. ``ai_summary`` is left
        as ``None``.
    """
    # --- Git metadata -------------------------------------------------------
    # A missing or empty signature is normalised to None
    signature = getattr(git_commit, "gpgsig", None) or None
    author = create_git_actor(git_commit.author, git_commit.authored_date)
    committer = create_git_actor(git_commit.committer, git_commit.committed_date)
    parent_shas = [GitSHA(parent.hexsha) for parent in git_commit.parents]

    git_metadata = GitMetadata(
        sha=GitSHA(git_commit.hexsha),
        author=author,
        committer=committer,
        parents=parent_shas,
        gpg_signature=signature,
    )

    # --- Diff ---------------------------------------------------------------
    file_mods = extract_file_modifications(git_commit)
    total_insertions = sum(fm.insertions for fm in file_mods)
    total_deletions = sum(fm.deletions for fm in file_mods)

    diff = Diff(
        modifications=file_mods,
        files_changed_count=len(file_mods),
        insertions_count=total_insertions,
        deletions_count=total_deletions,
        # (before, after) path pair for every modification
        affected_paths=[(fm.path_before, fm.path_after) for fm in file_mods],
    )

    # --- Message --------------------------------------------------------------
    full_message = git_commit.message.strip()
    # Everything up to the first newline is the summary line
    summary = full_message.partition("\n")[0]

    # --- Refs -----------------------------------------------------------------
    containing_branches = get_branches_containing_commit(repo, git_commit.hexsha)
    pointing_tags = get_tags_for_commit(repo, git_commit.hexsha)

    return Commit(
        metadata=git_metadata,
        summary=summary,
        message=full_message,
        branches=containing_branches,
        tags=pointing_tags,
        diff=diff,
        ai_summary=None,
    )

## Build Commit Objects

Process each of the selected commits and build structured data objects, collecting any build failures instead of stopping at the first one.

In [6]:
# Build a Commit object for each selected commit, collecting failures
commit_objects = []
build_errors = []  # (short SHA, error text) for every commit that failed to build

console.print("\n[bold cyan]Building Commit Objects...[/bold cyan]\n")

for position, git_commit in enumerate(commits_to_process, start=1):
    try:
        commit_obj = build_commit_object(git_commit, repo)
        commit_objects.append(commit_obj)
        console.print(
            f"✅ Successfully built commit {position}: {commit_obj.get_short_sha()}"
        )
    except Exception as e:  # deliberately broad: report the failure and keep going
        build_errors.append((git_commit.hexsha[:8], str(e)))
        # Escape the error text: bracketed fragments in an exception message
        # would otherwise be parsed as rich markup and be dropped or raise
        # MarkupError from inside this handler.
        console.print(
            f"❌ Failed to build commit {position}: {git_commit.hexsha[:8]} - "
            f"{escape(str(e))}",
            style="red",
        )

num_commits = len(commits_to_process)
num_success = len(commit_objects)
console.print(
    f"\n[bold green]Successfully built {num_success} out of "
    f"{num_commits} commits[/bold green]"
)

if build_errors:
    console.print("\n[bold red]Build Errors:[/bold red]")
    for sha, error in build_errors:
        console.print(f"  {sha}: {escape(error)}")

## Display Commit Information

Show detailed information for each commit in a structured format.

In [7]:
# Display detailed information for each commit
MAX_FILES_SHOWN = 5  # cap on per-commit file rows so large commits stay readable

# Free-form git text (names, messages, paths, refs) is passed through `escape`
# so bracketed fragments such as "[ci skip]" are shown literally instead of
# being parsed as rich markup.
for i, commit_obj in enumerate(commit_objects):
    # A message with at least one newline has a body worth showing in full
    is_multiline = "\n" in commit_obj.message

    # Create a tree structure for commit details
    tree = Tree(
        f"[bold magenta]Commit {i + 1}: {commit_obj.get_short_sha()}[/bold magenta]"
    )

    # Metadata branch
    metadata_branch = tree.add("[bold blue]Metadata[/bold blue]")
    metadata_branch.add(f"Full SHA: {commit_obj.metadata.sha}")
    author_name = escape(str(commit_obj.metadata.author.name))
    author_email = escape(str(commit_obj.metadata.author.email))
    metadata_branch.add(f"Author: {author_name} <{author_email}>")
    metadata_branch.add(f"Date: {commit_obj.metadata.author.timestamp}")
    parent_shas = ", ".join([p[:8] for p in commit_obj.metadata.parents])
    metadata_branch.add(f"Parents: {len(commit_obj.metadata.parents)} ({parent_shas})")
    metadata_branch.add(
        f"Signed: {'Yes' if commit_obj.metadata.gpg_signature else 'No'}"
    )

    # Message branch
    message_branch = tree.add("[bold green]Message[/bold green]")
    message_branch.add(f"Summary: {escape(commit_obj.summary)}")
    if is_multiline:
        message_branch.add("Full message: [See below]")

    # Repository info branch
    repo_branch = tree.add("[bold yellow]Repository Info[/bold yellow]")
    branch_names = escape(", ".join(commit_obj.branches)) if commit_obj.branches else "None"
    tag_names = escape(", ".join(commit_obj.tags)) if commit_obj.tags else "None"
    repo_branch.add(f"Branches: {branch_names}")
    repo_branch.add(f"Tags: {tag_names}")

    # Diff statistics branch
    diff_branch = tree.add("[bold red]Changes[/bold red]")
    diff_branch.add(f"Files changed: {commit_obj.diff.files_changed_count}")
    diff_branch.add(f"Insertions: +{commit_obj.diff.insertions_count}")
    diff_branch.add(f"Deletions: -{commit_obj.diff.deletions_count}")
    diff_branch.add(f"Total changes: {commit_obj.get_total_changes()}")

    # File modifications
    modifications = commit_obj.diff.modifications
    if modifications:
        files_branch = diff_branch.add("Modified files:")
        for mod in modifications[:MAX_FILES_SHOWN]:
            file_info = f"{mod.modification_type}: "
            if mod.modification_type == "R":
                # Renames show both sides of the move
                file_info += (
                    f"{escape(str(mod.path_before))} → {escape(str(mod.path_after))}"
                )
            else:
                # Deleted files only have a "before" path
                file_info += escape(mod.path_after or mod.path_before or "")
            file_info += f" (+{mod.insertions}/-{mod.deletions})"
            files_branch.add(file_info)
        hidden_count = len(modifications) - MAX_FILES_SHOWN
        if hidden_count > 0:
            files_branch.add(f"... and {hidden_count} more files")

    console.print(tree)

    # Display full commit message if it's multi-line
    if is_multiline:
        console.print(
            Panel(
                escape(commit_obj.message),
                title="Full Commit Message",
                border_style="green",
            )
        )

    console.print("")  # Add spacing between commits

## Aggregate Statistics

Show summary statistics across all processed commits.

In [8]:
# Summarise the processed commits: overall totals plus a per-type file count
if commit_objects:
    console.print("[bold cyan]Aggregate Statistics[/bold cyan]\n")

    # Totals across every built commit
    total_files_changed = sum(c.diff.files_changed_count for c in commit_objects)
    total_insertions = sum(c.diff.insertions_count for c in commit_objects)
    total_deletions = sum(c.diff.deletions_count for c in commit_objects)
    total_changes = sum(c.get_total_changes() for c in commit_objects)

    # How many file modifications of each type were seen
    mod_type_counts = defaultdict(int)
    all_modifications = (m for c in commit_objects for m in c.diff.modifications)
    for modification in all_modifications:
        mod_type_counts[modification.modification_type] += 1

    # Overall summary table, filled from (metric, value) pairs
    stats_table = Table(
        title="Summary Statistics", show_header=True, header_style="bold cyan"
    )
    stats_table.add_column("Metric", style="white")
    stats_table.add_column("Value", style="yellow")

    summary_rows = [
        ("Total Commits", str(len(commit_objects))),
        ("Total Files Changed", str(total_files_changed)),
        ("Total Insertions", f"+{total_insertions}"),
        ("Total Deletions", f"-{total_deletions}"),
        ("Total Line Changes", str(total_changes)),
        ("Average Changes per Commit", f"{total_changes / len(commit_objects):.1f}"),
    ]
    for metric, value in summary_rows:
        stats_table.add_row(metric, value)

    console.print(stats_table)

    # Per-type breakdown, most frequent type first
    if mod_type_counts:
        mod_type_descriptions = {
            "A": "Added",
            "M": "Modified",
            "D": "Deleted",
            "R": "Renamed",
            "C": "Copied",
            "T": "Type changed",
            "U": "Unmerged",
            "X": "Unknown",
            "B": "Broken pairing",
        }

        mod_table = Table(
            title="File Modification Types", show_header=True, header_style="bold cyan"
        )
        mod_table.add_column("Type", style="white")
        mod_table.add_column("Description", style="green")
        mod_table.add_column("Count", style="yellow")

        ranked_types = sorted(
            mod_type_counts.items(), key=lambda pair: pair[1], reverse=True
        )
        for type_code, occurrences in ranked_types:
            description = mod_type_descriptions.get(type_code, "Unknown")
            mod_table.add_row(type_code, description, str(occurrences))

        console.print("\n")
        console.print(mod_table)

## Export to JSON

Export the commit objects to JSON format for further processing or storage.

In [9]:
# Export commits to JSON for inspection
if commit_objects:
    console.print("\n[bold cyan]JSON Export[/bold cyan]\n")

    preview_chars = 800  # keep the on-screen preview short; the file holds everything

    # Export the first commit as formatted JSON
    first_commit_json = commit_objects[0].model_dump_json(indent=2)

    # Show a preview of the JSON structure (truncated, so it may not be valid JSON)
    json_preview = (
        first_commit_json[:preview_chars] + "..."
        if len(first_commit_json) > preview_chars
        else first_commit_json
    )

    syntax = Syntax(json_preview, "json", theme="monokai", line_numbers=True)
    console.print(
        Panel(
            syntax,
            title=f"JSON Export Preview: {commit_objects[0].get_short_sha()}",
            border_style="cyan",
        )
    )

    # Save all commits to a JSON file
    all_commits_data = [commit.model_dump() for commit in commit_objects]
    output_file = project_root / "notebooks" / "commit_objects_export.json"

    # The target directory is derived from the kernel's cwd, so make sure it
    # exists rather than failing with FileNotFoundError on the write.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default;
    # default=str stringifies values json cannot serialise natively
    # (e.g. the datetime timestamps).
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_commits_data, f, indent=2, default=str)

    num_commits = len(commit_objects)
    console.print(
        f"\n[green]✅ Exported all {num_commits} commits to:[/green] {output_file}"
    )