In [1]:
import os
import sys
import pandas as pd
from datetime import datetime
import pytz
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from pathlib import Path
import random
from typing import List, Dict, Any
import logging

# Make the parent of the notebook's working directory importable and make it
# the current directory, so later cells can import the project packages and
# open paths relative to it.
#
# The starting directory is captured only the first time this cell runs in a
# kernel: after the os.chdir() below, os.getcwd() already points at the parent,
# so recomputing it on a re-run would climb one directory further each time.
if "notebook_dir" not in globals():
    notebook_dir = os.getcwd()
parent_dir = str(Path(notebook_dir).parent)

if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    print(f"Added parent directory '{Path(parent_dir).name}' to sys.path")

os.chdir(parent_dir)

Added parent directory 'finer' to sys.path


In [2]:
from models.repository import Repository
from models.commit import Commit
from models.file import File
from models.cf import CommitFile, MetadataHelper
from models.hunk import Hunk
from utils.worker import get_optimal_max_workers

In [3]:
def create_commmit_link(repo, sha):
    """Return the GitHub URL of commit ``sha`` in the repository ``repo``.

    Args:
        repo: Object exposing ``org_name`` and ``repo_name`` attributes.
        sha: Commit identifier appended after ``/commit/``.

    Returns:
        The commit URL as a string.
    """
    repo_url = f"https://github.com/{repo.org_name}/{repo.repo_name}"
    return f"{repo_url}/commit/{sha}"

In [4]:
def process_metadata(mt, com, name, file_content):
    """Build the CommitFile / Hunk pair for one metadata entry of a file.

    Args:
        mt: Metadata entry; read for ``change_type``, ``file_mode``,
            ``index_info``, ``old_start``, ``old_length``, ``new_start``,
            ``new_length``, ``lines``, ``old_name`` and ``new_name``.
        com: Commit-like object; read for ``repo_name``, ``org_name``, ``sha``.
        name: File name the metadata entry belongs to.
        file_content: Content of that file, stored on the CommitFile.

    Returns:
        Tuple ``(commit_file, hunk)``.
    """
    repo_name = com.repo_name
    org_name = com.org_name
    sha = com.sha

    commit_file = CommitFile(
        repo_name,
        org_name,
        name,
        sha,
        file_content,
        mt.change_type,
        mt.file_mode,
        mt.index_info,
    )
    # First positional argument of Hunk is passed as None, as in the caller's
    # convention here (presumably an id assigned later - TODO confirm).
    hunk_record = Hunk(
        None,
        name,
        repo_name,
        org_name,
        sha,
        mt.old_start,
        mt.old_length,
        mt.new_start,
        mt.new_length,
        mt.lines,
        mt.old_name,
        mt.new_name,
    )
    return commit_file, hunk_record


In [5]:
def get_com_cfs_and_hunks(repo_path, com, file_names, workers):
    """Collect the CommitFile and Hunk objects for every file of a commit.

    For each file name the file content is fetched with
    ``File.get_file_content`` and its metadata entries with
    ``CommitFile.get_metadata``; each metadata entry is then turned into a
    ``(CommitFile, Hunk)`` pair by ``process_metadata`` on a thread pool.

    Args:
        repo_path: Path of the repository, forwarded to ``File.get_file_content``.
        com: Commit-like object; read for ``sha``, ``org_name``, ``repo_name``.
        file_names: Iterable of file names touched by the commit.
        workers: ``max_workers`` value for the thread pool.

    Returns:
        Tuple ``(cfs, hunks)`` of two lists, in file order and, within a file,
        in metadata order. Entries whose processing raised are skipped after
        printing the error.

    Raises:
        Whatever ``File.get_file_content`` or ``CommitFile.get_metadata``
        raise; those calls are not guarded here.
    """
    cfs = []
    hunks = []

    file_names = list(file_names)
    if not file_names:
        # No work: do not construct a pool at all.
        return cfs, hunks

    # One pool for the whole commit instead of creating and tearing down a
    # pool for every file.
    with ThreadPoolExecutor(max_workers=workers) as executor:
        for name in file_names:
            file_content, _ = File.get_file_content(repo_path, com.sha, name)
            metadata_list = CommitFile.get_metadata(com.org_name, com.repo_name, com.sha, name, True)

            futures = [executor.submit(process_metadata, mt, com, name, file_content) for mt in metadata_list]
            # Iterate in submission order (not completion order) so the
            # output order follows metadata_list.
            for future in futures:
                try:
                    cf, hunk = future.result()
                    cfs.append(cf)
                    hunks.append(hunk)
                except Exception as e:
                    print(f"Error processing metadata for file {name}: {e}")

    return cfs, hunks

In [6]:
def _collect_commit_files(cfs, hunks, sha, errors: List[str]) -> List[Dict[str, Any]]:
    """Merge file objects and hunk objects of one commit into per-file JSON entries."""
    file_content_map = {}

    for cf in cfs or []:
        try:
            # Objects without the expected attributes are skipped silently.
            if not hasattr(cf, 'file_name') or not hasattr(cf, 'content'):
                continue

            file_content_map[cf.file_name] = {
                'name': cf.file_name,
                'content': {
                    'current': cf.content,
                    'diffs': []
                }
            }
        except Exception as e:
            errors.append(
                f"Error processing file content for {cf.file_name} in commit {sha}: {str(e)}"
            )

    for hunk in hunks or []:
        try:
            # Only hunks whose file produced an entry above are attached.
            if (hasattr(hunk, 'file_name') and
                    hasattr(hunk, 'lines') and
                    hunk.file_name in file_content_map):
                file_content_map[hunk.file_name]['content']['diffs'].append(hunk.lines)
        except Exception as e:
            errors.append(
                f"Error processing hunk for file {hunk.file_name} in commit {sha}: {str(e)}"
            )

    return list(file_content_map.values())


def _export_commit(org_name, repo_name, repo_path, commit_info, workers, errors: List[str]):
    """Build the JSON entry of one commit, or return None after recording why it was skipped."""
    sha = commit_info[0]

    try:
        file_names = Commit.get_file_names_from_commit(repo_path, sha)
    except Exception as e:
        errors.append(
            f"Error getting files for commit {sha} in {org_name}/{repo_name}: {str(e)}"
        )
        file_names = []

    if not file_names:
        errors.append(
            f"No files found for commit {sha} in {org_name}/{repo_name}"
        )
        return None

    try:
        cfs, hunks = get_com_cfs_and_hunks(
            repo_path,
            # The commit date is not part of commit_info; the current UTC time
            # is passed in its place.
            Commit(sha, repo_name, org_name, datetime.now(pytz.utc), commit_info[1]),
            file_names,
            workers
        )
    except Exception as e:
        errors.append(
            f"Error processing content for commit {sha} in {org_name}/{repo_name}: {str(e)}"
        )
        return None

    commit_entry = {
        'message': commit_info[1],
        'sha': sha,
        'link': f"https://github.com/{org_name}/{repo_name}/commit/{sha}",
        'files': [],
        'what': commit_info[2],
        'why': commit_info[3],
        'files_changed': commit_info[4]
    }
    commit_entry['files'] = _collect_commit_files(cfs, hunks, sha, errors)

    if not commit_entry['files']:
        errors.append(
            f"No valid files found for commit {sha} in {org_name}/{repo_name}"
        )
        return None

    return commit_entry


def export_to_json(coms_data: Dict[str, Dict[str, Dict[str, List[str]]]], num_commits: int, workers: int) -> Dict[str, Any]:
    """
    Export repository data to a JSON-serializable structure.

    Per-repository and per-commit failures are recorded in the returned
    ``'errors'`` list instead of being raised, and are also emitted through
    ``logging.warning`` at the end.

    Args:
        coms_data: Dictionary with structure {"org_name": {"repo_name": {"commits": [...]}}}.
            Each commit is a sequence indexed as
            (sha, message, what, why, files_changed).
        num_commits: Must be positive. It is only used for validation and to
            report when a repository has fewer commits than requested.
            NOTE(review): every commit in ``coms_data`` is exported; no cap or
            sampling is applied - confirm whether one was intended.
        workers: Number of parallel workers, forwarded to get_com_cfs_and_hunks.

    Returns:
        Dictionary ``{'errors': [...], 'repos': [...]}``; each repo entry holds
        ``repo_name``, ``org_name`` and the list of exported ``commits``.
    """
    result = {
        'errors': [],
        'repos': []
    }
    errors = result['errors']

    if not coms_data:
        errors.append("No repository data provided")
        return result

    if num_commits <= 0:
        errors.append(f"Invalid num_commits value: {num_commits}")
        return result

    for org_name, repos in coms_data.items():
        for repo_name, repo_data in repos.items():
            try:
                if not repo_data or 'commits' not in repo_data:
                    errors.append(f"Skipping invalid repository data for {org_name}/{repo_name}")
                    continue

                commits = repo_data['commits']

                if not commits:
                    errors.append(f"No commits found for repository: {org_name}/{repo_name}")
                    continue

                actual_num_commits = min(num_commits, len(commits))
                if actual_num_commits < num_commits:
                    errors.append(
                        f"Requested {num_commits} commits but only {len(commits)} available for {org_name}/{repo_name}"
                    )

                # Built once per repository with the platform's separator; on
                # Windows this is the same string as the former hard-coded
                # backslash path.
                repo_path = os.path.join("download", "orgs", str(org_name), str(repo_name))

                repo_entry = {
                    'repo_name': repo_name,
                    'org_name': org_name,
                    'commits': []
                }

                for commit_info in commits:
                    try:
                        commit_entry = _export_commit(
                            org_name, repo_name, repo_path, commit_info, workers, errors
                        )
                    except Exception as e:
                        errors.append(
                            f"Unexpected error processing commit {commit_info[0]} in {org_name}/{repo_name}: {str(e)}"
                        )
                        continue

                    if commit_entry is not None:
                        repo_entry['commits'].append(commit_entry)

                if repo_entry['commits']:
                    result['repos'].append(repo_entry)
                else:
                    errors.append(
                        f"No valid commits found for repository {org_name}/{repo_name}"
                    )

            except Exception as e:
                errors.append(
                    f"Unexpected error processing repository {org_name}/{repo_name}: {str(e)}"
                )
                continue

    if errors:
        logging.warning(f"Encountered {len(errors)} errors during export:")
        for error in errors:
            logging.warning(f" - {error}")

    return result

In [7]:
def save_json_in_file(data: Dict[str, Any], file_path: str):
    """
    Save the structured JSON data to a file.

    Failures are reported with ``print`` and not raised. The data is
    serialized before the file is opened, so a serialization error leaves an
    already existing file at ``file_path`` untouched.

    Args:
        data: Dictionary containing the structured JSON data
        file_path: Path to the output JSON file
    """
    # Kept local so this cell does not depend on the notebook's import cell.
    import json

    try:
        # open(..., 'w') truncates immediately; serialize first so a
        # non-serializable value cannot leave a truncated file behind.
        payload = json.dumps(data, indent=4)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"Data successfully saved to {file_path}")
    except Exception as e:
        print(f"Error saving data to {file_path}: {e}")

In [8]:
# Worker count later passed to export_to_json as `workers`; the value is
# whatever the project helper utils.worker.get_optimal_max_workers returns.
max_workers = get_optimal_max_workers()

In [9]:
# Read the commits listed in data/chosen_commits.csv and group them as
# {org_name: {repo_name: {"commits": [(sha, message, what, why, files_changed), ...]}}},
# which is the nested shape export_to_json expects.
df_commits = pd.read_csv('data/chosen_commits.csv')
coms_data = {}

for _, row in df_commits.iterrows():
    org, repo = row['org_name'], row['repo_name']
    commit_record = (
        row['sha'],
        row['message'],
        row['what'],
        row['why'],
        row['files_changed'],
    )

    # Create the org / repo levels on first sight, then append in CSV order.
    repo_bucket = coms_data.setdefault(org, {}).setdefault(repo, { "commits": [] })
    repo_bucket["commits"].append(commit_record)

In [10]:
# Build the export structure from the grouped commits and write it to disk.
output_file = 'evaluate_set.json'
json_data = export_to_json(coms_data=coms_data, num_commits=10, workers=max_workers)
save_json_in_file(data=json_data, file_path=output_file)

Data successfully saved to evaluate_set.json
