# Análise de Diffs de Commits Terraform
Este notebook percorre todo o diretório `dataset/`, carrega os commits de cada repositório a partir dos arquivos `.jsonl`, estrutura as mudanças de cada patch em um DataFrame e salva relatórios (Markdown, CSV e HTML) em `reports/`.


## 1. Importação de bibliotecas

In [21]:
import os
import json
import pandas as pd
from IPython.display import Markdown, display

## 2. Funções utilitárias

In [22]:


def load_all_commit_jsonl(dataset_dir: str) -> list[dict]:
    """
    Load every commit record found in the `.jsonl` files under `dataset_dir`.

    The directory tree is walked recursively. Each non-blank line of each
    `.jsonl` file is parsed as one JSON object, and a 'repo' key is set on it
    with the name of the directory that contains the file.

    Args:
        dataset_dir: Root directory to search.

    Returns:
        A list with one dict per JSON line, in a deterministic order
        (directories and file names are visited in sorted order).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    commits = []
    for root, dirs, files in os.walk(dataset_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # commit numbering used by the reports does not vary between runs.
        dirs.sort()
        # normpath drops a trailing separator; without it basename('data/')
        # would yield an empty repo name.
        repo = os.path.basename(os.path.normpath(root))
        for fname in sorted(files):
            if not fname.endswith('.jsonl'):
                continue
            path = os.path.join(root, fname)
            with open(path, encoding='utf-8') as f:
                for line in f:
                    # Blank lines (e.g. a trailing newline at the end of the
                    # file) are not JSON documents; skip them.
                    if not line.strip():
                        continue
                    data = json.loads(line)
                    data['repo'] = repo
                    commits.append(data)
    return commits


def _parse_hunk_lengths(header: str) -> tuple[int, int]:
    """Return (old_count, new_count) from an '@@ -a,b +c,d @@' hunk header.

    A range without an explicit count (e.g. '-3') has length 1. If the header
    cannot be parsed, (0, 0) is returned so the caller falls back to
    prefix-only classification.
    """
    parts = header.split()
    if len(parts) < 3:
        return 0, 0
    lengths = []
    for token, sign in zip(parts[1:3], '-+'):
        if not token.startswith(sign):
            return 0, 0
        _, sep, count = token[1:].partition(',')
        try:
            lengths.append(int(count) if sep else 1)
        except ValueError:
            return 0, 0
    return lengths[0], lengths[1]


def parse_patch_to_dataframe(patch: str) -> pd.DataFrame:
    """
    Convert a unified-diff patch into a DataFrame of classified lines.

    Each patch line becomes one row with two columns:
      * change:  'added', 'removed', 'context' or 'meta'
      * content: the line text; for added/removed/context lines the leading
                 one-character diff marker ('+', '-' or ' ') is stripped,
                 meta lines are kept verbatim.

    Hunk headers ('@@ ... @@'), file headers ('---' / '+++' outside a hunk)
    and '\\ No newline at end of file' markers are reported as 'meta'.

    Args:
        patch: Patch text. An empty string or None yields an empty frame.

    Returns:
        A DataFrame that always has the columns ['change', 'content'].
    """
    rows = []
    # Lines still expected in the current hunk, taken from its '@@' header.
    # While either is positive we are inside a hunk body, where a line such as
    # '--- foo' is a removed line whose text starts with '--', not a header.
    old_left = new_left = 0
    for line in (patch or '').splitlines():
        in_hunk = old_left > 0 or new_left > 0
        if line.startswith('@@'):
            change, content = 'meta', line
            old_left, new_left = _parse_hunk_lengths(line)
        elif line.startswith('\\'):
            # '\ No newline at end of file' marker; not part of either side.
            change, content = 'meta', line
        elif in_hunk:
            if line.startswith('+'):
                change, content = 'added', line[1:]
                new_left -= 1
            elif line.startswith('-'):
                change, content = 'removed', line[1:]
                old_left -= 1
            else:
                # Context lines count towards both sides of the hunk.
                change = 'context'
                content = line[1:] if line.startswith(' ') else line
                old_left -= 1
                new_left -= 1
        elif line.startswith(('+++', '---')):
            change, content = 'meta', line
        elif line.startswith('+'):
            change, content = 'added', line[1:]
        elif line.startswith('-'):
            change, content = 'removed', line[1:]
        else:
            change = 'context'
            content = line[1:] if line.startswith(' ') else line
        rows.append({'change': change, 'content': content})
    return pd.DataFrame(rows, columns=['change', 'content'])

## 3. Carregando e analisando commits

In [None]:
# Root folder scanned for .jsonl commit files.
dataset_dir = 'dataset'
# Load every commit record found under the dataset folder.
commits = load_all_commit_jsonl(dataset_dir)
total_commits = len(commits)
print(f"Total de commits carregados: {total_commits}")

## 4. Loop principal e exibição

In [None]:

# Per-commit reports: one Markdown file holding the commit metadata and the
# raw patch, plus one CSV with the parsed diff rows when the patch is not empty.
output_dir = 'reports/commits'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): output names use only repo + short hash. If the dataset holds
# several records for the same commit (e.g. one per changed file — TODO
# confirm), later records overwrite the files written by earlier ones.
for idx, commit in enumerate(commits, 1):
    repo = commit.get('repo')
    hash_ = commit.get('commit_hash')
    # A record without a hash would crash on slicing; use a placeholder.
    safe_hash = str(hash_)[:7] if hash_ else 'unknown'
    author = commit.get('author')
    date = commit.get('date')
    message = commit.get('message')
    file = commit.get('file')
    # `or ''` also covers an explicit null patch, not just a missing key.
    patch = commit.get('patch') or ''

    md_header = [
        f"# Commit {idx} — {repo}",
        f"**Hash:** {hash_}",
        f"**Autor:** {author}",
        f"**Data:** {date}",
        f"**Mensagem:** {message}",
        f"**Arquivo:** {file}",
    ]
    md_diff = ["```diff", *patch.splitlines(), "```"]
    # Blank lines between metadata entries make each one its own Markdown
    # paragraph; the fenced diff must keep one patch line per text line.
    md_text = "\n\n".join(md_header) + "\n\n" + "\n".join(md_diff) + "\n"
    md_path = os.path.join(output_dir, f"commit_{repo}_{safe_hash}.md")
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(md_text)

    df = parse_patch_to_dataframe(patch)
    if not df.empty:
        csv_path = os.path.join(output_dir, f"diff_{repo}_{safe_hash}.csv")
        df.to_csv(csv_path, index=False, encoding='utf-8')
        print(f"Relatório salvo: {md_path}, {csv_path}")
    else:
        print(f"Nenhuma mudança em commit {idx}: {repo} {safe_hash}")

## 5. Exportar relatórios

In [None]:
import html
import os

# Export cell: writes, under `reports/`, one CSV and one HTML page per commit
# with a non-empty patch, then a consolidated CSV and HTML for all of them.
output_dir = 'reports'
os.makedirs(output_dir, exist_ok=True)

all_diffs = []

for idx, commit in enumerate(commits, 1):
    repo = commit.get('repo')
    hash_ = commit.get('commit_hash')
    # A record without a hash would crash on slicing; use a placeholder.
    safe_hash = str(hash_)[:7] if hash_ else 'unknown'
    # `or ''` also covers an explicit null patch, not just a missing key.
    patch = commit.get('patch') or ''

    df = parse_patch_to_dataframe(patch)
    if df.empty:
        continue

    df.insert(0, 'repo', repo)
    df.insert(1, 'commit', safe_hash)
    df.insert(2, 'file', commit.get('file', ''))
    all_diffs.append(df)

    csv_path = os.path.join(output_dir, f'diff_{repo}_{safe_hash}.csv')
    df.to_csv(csv_path, index=False, encoding='utf-8')

    html_lines = [
        '<html><body>',
        f'<h2>Commit {idx} — {html.escape(str(repo))} ({html.escape(safe_hash)})</h2>',
        '<pre>',
    ]
    # Patch text is arbitrary file content: escape '&', '<' and '>' so it is
    # shown literally instead of being interpreted as markup.
    html_lines += [html.escape(line) for line in patch.splitlines()]
    html_lines.append('</pre></body></html>')
    # Newlines are significant inside <pre>; joining with '' would collapse
    # the whole patch onto a single line.
    html_content = '\n'.join(html_lines)
    html_path = os.path.join(output_dir, f'patch_{repo}_{safe_hash}.html')
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"Exportado: {csv_path} e {html_path}")

if all_diffs:
    df_all = pd.concat(all_diffs, ignore_index=True)
    full_csv = os.path.join(output_dir, 'all_diffs.csv')
    df_all.to_csv(full_csv, index=False, encoding='utf-8')
    print(f"CSV consolidado em: {full_csv}")

    html_full = ['<html><body>', '<h1>Relatório Consolidado de Diffs</h1>']
    for diff_df in all_diffs:
        # Every row of a per-commit frame carries the same repo/commit values.
        first_row = diff_df.iloc[0]
        title = f"{html.escape(str(first_row['repo']))} — {html.escape(str(first_row['commit']))}"
        html_full.append(f'<h2>{title}</h2>')
        # Leave pandas' default escaping on: cell values are raw patch lines.
        html_full.append(diff_df.to_html(index=False))
    html_full.append('</body></html>')
    full_html = os.path.join(output_dir, 'all_diffs.html')
    with open(full_html, 'w', encoding='utf-8') as f:
        f.write('\n'.join(html_full))
    print(f"HTML consolidado em: {full_html}")