In [1]:
import json
import logging
import os
import random
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
import pytz
from tqdm import tqdm

# Resolve the notebook's directory exactly once per kernel session.
# os.chdir() at the bottom of this cell changes what os.getcwd() returns, so
# recomputing it on a re-run would climb one more directory level each time.
if "notebook_dir" not in globals():
    notebook_dir = os.getcwd()
parent_dir = str(Path(notebook_dir).parent)

# Make the parent directory importable (the later cells import `models` and
# `utils` from it).
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    print(f"Added parent directory '{Path(parent_dir).name}' to sys.path")

# Relative paths used further down are resolved against the parent directory.
os.chdir(parent_dir)

Added parent directory 'finer' to sys.path


In [2]:
from models.repository import Repository
from models.commit import Commit
from models.file import File
from models.cf import CommitFile, MetadataHelper
from models.hunk import Hunk
from utils.worker import get_optimal_max_workers

In [3]:
def create_commmit_link(repo, sha):
    """Return the GitHub URL of commit ``sha``.

    Args:
        repo: Object exposing ``org_name`` and ``repo_name`` attributes.
        sha: Commit identifier placed at the end of the URL.

    Returns:
        ``https://github.com/<org_name>/<repo_name>/commit/<sha>`` as a string.
    """
    owner, project = repo.org_name, repo.repo_name
    return "https://github.com/{}/{}/commit/{}".format(owner, project, sha)

In [4]:
def process_repository(row):
    """Fetch the commit data of the repository described by ``row``.

    Args:
        row: Record handed to ``Repository.csv_row_to_Repository``.

    Returns:
        A dict with keys ``'repo'`` (the Repository object) and ``'commits'``
        (whatever ``Commit.get_commit_data`` returned), or ``None`` when
        fetching the commits raised; the error is printed in that case.
    """
    repo = Repository.csv_row_to_Repository(row)
    repo_path = repo.get_repo_path()

    try:
        print(f"Processing repository: {repo_path}")
        now_utc = datetime.now(pytz.timezone("UTC"))
        # NOTE(review): the meaning of the trailing True flag is defined by
        # Commit.get_commit_data, which is not visible here.
        commits = Commit.get_commit_data(repo_path, now_utc, True)
    except Exception as e:
        print(f"Error processing {repo_path}: {e}")
        return None

    return {'repo': repo, 'commits': commits}


In [5]:
def process_metadata(mt, com, name, file_content):
    """Build the CommitFile / Hunk pair for one metadata record of a file.

    Args:
        mt: Metadata record; its change/mode/index fields feed the CommitFile
            and its range, lines and old/new name fields feed the Hunk.
        com: Commit object providing ``repo_name``, ``org_name`` and ``sha``.
        name: File name the record belongs to.
        file_content: Content of the file, stored on the CommitFile.

    Returns:
        Tuple ``(commit_file, hunk)``.
    """
    commit_file = CommitFile(
        com.repo_name,
        com.org_name,
        name,
        com.sha,
        file_content,
        mt.change_type,
        mt.file_mode,
        mt.index_info,
    )
    # NOTE(review): the leading None is Hunk's first positional argument; its
    # meaning is defined in models.hunk and is not visible here.
    diff_hunk = Hunk(
        None,
        name,
        com.repo_name,
        com.org_name,
        com.sha,
        mt.old_start,
        mt.old_length,
        mt.new_start,
        mt.new_length,
        mt.lines,
        mt.old_name,
        mt.new_name,
    )
    return commit_file, diff_hunk


In [6]:
def get_com_cfs_and_hunks(repo, com, file_names, workers):
    """Collect CommitFile and Hunk objects for the given files of a commit.

    For every file name the file content and the list of metadata records are
    fetched, then one (CommitFile, Hunk) pair is built per metadata record via
    ``process_metadata``.

    Args:
        repo: Repository object; only ``get_repo_path()`` is used.
        com: Commit object providing ``sha``, ``org_name`` and ``repo_name``.
        file_names: Names of the files to process.
        workers: ``max_workers`` for the thread pool.

    Returns:
        Tuple ``(cfs, hunks)`` of two parallel lists, ordered by file and then
        by metadata record. Records whose processing raised are skipped and
        the error is printed.
    """
    cfs = []
    hunks = []

    # Nothing to do: avoid resolving the path and creating a pool for no work.
    if not file_names:
        return cfs, hunks

    # Loop-invariant, so resolve it once.
    # NOTE(review): assumes get_repo_path() returns the same value on every call.
    repo_path = repo.get_repo_path()

    # One pool for the whole commit instead of creating and tearing down a
    # fresh pool for every file.
    with ThreadPoolExecutor(max_workers=workers) as executor:
        for name in file_names:
            # The second element of the returned pair is not needed here.
            file_content, _ = File.get_file_content(repo_path, com.sha, name)
            metadata_list = CommitFile.get_metadata(com.org_name, com.repo_name, com.sha, name, True)

            futures = [executor.submit(process_metadata, mt, com, name, file_content) for mt in metadata_list]
            # Collect in submission order so cfs/hunks stay aligned and ordered.
            for future in futures:
                try:
                    cf, hunk = future.result()
                    cfs.append(cf)
                    hunks.append(hunk)
                except Exception as e:
                    print(f"Error processing metadata for file {name}: {e}")

    return cfs, hunks

In [7]:
def _group_files_with_diffs(cfs, hunks, commit_sha, errors):
    """Fold CommitFile-like and Hunk-like objects into one entry per file name.

    Args:
        cfs: Objects with ``file_name`` and ``content``; others are ignored.
        hunks: Objects with ``file_name`` and ``lines``; the lines are attached
            to the entry of the same file name, hunks of unknown files are dropped.
        commit_sha: Commit identifier, used only in error messages.
        errors: List that error messages are appended to (mutated in place).

    Returns:
        List of ``{'name': ..., 'content': {'current': ..., 'diffs': [...]}}``.
    """
    file_content_map = {}

    # Several objects may share a file name; the last one wins. That is safe
    # because diffs are only attached in the second pass below.
    for cf in cfs or []:
        try:
            if not hasattr(cf, 'file_name') or not hasattr(cf, 'content'):
                continue

            file_content_map[cf.file_name] = {
                'name': cf.file_name,
                'content': {
                    'current': cf.content,
                    'diffs': []
                }
            }
        except Exception as e:
            errors.append(
                f"Error processing file content for {cf.file_name} in commit {commit_sha}: {str(e)}"
            )

    for hunk in hunks or []:
        try:
            if (hasattr(hunk, 'file_name') and
                    hasattr(hunk, 'lines') and
                    hunk.file_name in file_content_map):
                file_content_map[hunk.file_name]['content']['diffs'].append(hunk.lines)
        except Exception as e:
            errors.append(
                f"Error processing hunk for file {hunk.file_name} in commit {commit_sha}: {str(e)}"
            )

    return list(file_content_map.values())


def _build_commit_entry(repo, commit, workers, errors):
    """Build the JSON-ready entry of one commit, or return None to skip it.

    Every reason for skipping is appended to ``errors``. Unexpected exceptions
    propagate to the caller, which records them per commit.
    """
    # Get file names for this commit
    try:
        file_names = Commit.get_file_names_from_commit(
            repo.get_repo_path(),
            commit.sha
        ) if hasattr(repo, 'get_repo_path') else []
    except Exception as e:
        errors.append(
            f"Error getting files for commit {commit.sha} in {repo.repo_name}: {str(e)}"
        )
        file_names = []

    if not file_names:
        errors.append(
            f"No files found for commit {commit.sha} in {repo.repo_name}"
        )
        return None

    # Get content and hunks
    try:
        cfs, hunks = get_com_cfs_and_hunks(repo, commit, file_names, workers)
    except Exception as e:
        errors.append(
            f"Error processing content for commit {commit.sha} in {repo.repo_name}: {str(e)}"
        )
        return None

    # Dict values are evaluated top to bottom, so the link is built (and may
    # raise) before any file grouping happens.
    commit_entry = {
        'message': getattr(commit, 'message', ''),
        'sha': getattr(commit, 'sha', ''),
        'link': f"https://github.com/{repo.org_name}/{repo.repo_name}/commit/{commit.sha}",
        'files': _group_files_with_diffs(cfs, hunks, commit.sha, errors)
    }

    # Only keep commits that ended up with at least one file
    if not commit_entry['files']:
        errors.append(
            f"No valid files found for commit {commit.sha} in {repo.repo_name}"
        )
        return None

    return commit_entry


def export_to_json(repos_data: List[Dict[str, Any]], num_commits: int, workers: int) -> Dict[str, Any]:
    """
    Export repository data to JSON format with comprehensive error handling.

    For each repository a random sample of commits is drawn with the
    module-level ``random`` generator (seed it beforehand for reproducible
    output). Every sampled commit is expanded into its files, each with the
    current content and the list of diffs.

    Args:
        repos_data: List of repository data dictionaries with keys 'repo' and 'commits'
        num_commits: Number of commits to sample per repository (must be > 0)
        workers: Number of parallel workers for processing

    Returns:
        Dictionary with two keys: 'repos' (list of
        ``{'repo_name', 'org_name', 'commits'}`` entries) and 'errors' (list of
        human-readable messages for everything that was skipped or failed).
        The function does not raise for per-repository or per-commit failures.
    """
    # Initialize result structure
    result = {
        'errors': [],  # Track any errors that occurred during processing
        'repos': []
    }
    errors = result['errors']

    # Validate input
    if not repos_data:
        errors.append("No repository data provided")
        return result

    if num_commits <= 0:
        errors.append(f"Invalid num_commits value: {num_commits}")
        return result

    for data in repos_data:
        # Reset per iteration: the handler at the bottom reads `repo`, which
        # would otherwise be unbound (first iteration) or left over from the
        # previous repository when the failure happens before the assignment.
        repo = None
        try:
            if not data or 'repo' not in data or 'commits' not in data:
                errors.append(f"Skipping invalid repository data: {data}")
                continue

            repo = data['repo']
            commits = data['commits']

            # Skip if no commits available
            if not commits:
                errors.append(f"No commits found for repository: {repo.repo_name}")
                continue

            # Handle case where requested commits exceed available commits
            actual_num_commits = min(num_commits, len(commits))
            if actual_num_commits < num_commits:
                errors.append(
                    f"Requested {num_commits} commits but only {len(commits)} available for {repo.repo_name}"
                )

            try:
                selected_commits = random.sample(commits, actual_num_commits)
            except ValueError as e:
                errors.append(
                    f"Error sampling commits for {repo.repo_name}: {str(e)}"
                )
                continue

            repo_entry = {
                'repo_name': repo.repo_name if hasattr(repo, 'repo_name') else '',
                'org_name': repo.org_name if hasattr(repo, 'org_name') else '',
                'commits': []
            }

            for commit in selected_commits:
                try:
                    commit_entry = _build_commit_entry(repo, commit, workers, errors)
                except Exception as e:
                    errors.append(
                        f"Unexpected error processing commit {getattr(commit, 'sha', 'unknown')} in {repo.repo_name}: {str(e)}"
                    )
                    continue

                if commit_entry is not None:
                    repo_entry['commits'].append(commit_entry)

            # Only add repo if we have commits
            if repo_entry['commits']:
                result['repos'].append(repo_entry)
            else:
                errors.append(
                    f"No valid commits found for repository {repo.repo_name}"
                )

        except Exception as e:
            errors.append(
                f"Unexpected error processing repository {getattr(repo, 'repo_name', 'unknown')}: {str(e)}"
            )
            continue

    # Log all errors at the end
    if errors:
        logging.warning(f"Encountered {len(errors)} errors during export:")
        for error in errors:
            logging.warning(f" - {error}")

    return result

In [8]:
def save_json_in_file(data: Dict[str, Any], file_path: str):
    """
    Save the structured JSON data to a file.

    The data is serialized in memory before the file is opened, so a
    serialization failure cannot truncate or half-write an existing file.

    Args:
        data: Dictionary containing the structured JSON data
        file_path: Path to the output JSON file

    Returns:
        None. Failures are reported with a printed message instead of being
        raised, so the caller is never interrupted.
    """
    try:
        # open(..., 'w') truncates the target immediately; dumping straight
        # into the handle would leave a partial file behind whenever `data`
        # contains something json cannot encode.
        payload = json.dumps(data, indent=4)
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"Data successfully saved to {file_path}")
    except Exception as e:
        print(f"Error saving data to {file_path}: {e}")

In [9]:
# Load the repository table (the first line of the CSV file is skipped) and
# drop every row that has no 'html_url' value.
repos = pd.read_csv('./code_samples.csv', skiprows=1).dropna(subset=['html_url'])

# Worker count shared by the thread pools in the cells below.
max_workers = get_optimal_max_workers()

In [10]:
# Repositories to export, matched against the 'name' column of `repos`.
# NOTE(review): the recorded run below processed only 4 repositories and none
# of them is 'eign-eureka' -- possibly a misspelled name; confirm against the CSV.
selected_repos = [
    'aws-marketplace-serverless-saas-integration',
    'azure-search-python-samples',
    'android-custom-lint-rules',
    'eign-eureka',
    'tut-spring-boot-kotlin',
]

# Keep the matching rows (as Series, in table order) for process_repository.
is_selected = repos['name'].isin(selected_repos)
selected_rows = [row for _, row in repos[is_selected].iterrows()]

In [11]:
# Accumulator for the per-repository results ({'repo': ..., 'commits': ...}).
repos_data = list()

In [12]:
# Start from an empty list so that re-running this cell rebuilds the results
# instead of appending every repository a second time.
repos_data = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Map each future to its repository name so failures can be attributed.
    futures = {executor.submit(process_repository, row): row['name'] for row in selected_rows}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing repositories"):
        try:
            result = future.result()
        except Exception as e:
            # process_repository only guards the commit fetch; anything raised
            # before that (e.g. while converting the row) surfaces here and
            # must not abort the remaining repositories.
            print(f"Error processing {futures[future]}: {e}")
            continue
        # process_repository returns None when it already reported an error.
        if result:
            repos_data.append(result)

Processing repository: download\orgs\googlesamples\android-custom-lint-rules
Processing repository: download\orgs\aws-samples\aws-marketplace-serverless-saas-integration
Processing repository: download\orgs\Azure-Samples\azure-search-python-samples
Processing repository: download\orgs\spring-guides\tut-spring-boot-kotlin


Processing repositories:   0%|          | 0/4 [00:00<?, ?it/s]

Processing repositories: 100%|██████████| 4/4 [00:00<00:00,  4.20it/s]


In [13]:
# Build the listing outside the f-string: a backslash inside an f-string
# replacement field is a SyntaxError before Python 3.12.
repo_commit_lines = [
    f"{data['repo'].repo_name} ({len(data['commits'])})" for data in repos_data
]
repo_commit_listing = ", \n".join(repo_commit_lines)

print(f"\nCollected data from {len(repos_data)} repositories.")
# One "<repo name> (<number of commits>)" line per repository.
print(f"Repo commits:\n      \n{repo_commit_listing}")


Collected data from 4 repositories.
Repo commits:
      
tut-spring-boot-kotlin (108), 
android-custom-lint-rules (132), 
aws-marketplace-serverless-saas-integration (162), 
azure-search-python-samples (324)


In [14]:
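# NOTE: disabled scratch cell -- a manual single-commit run of the same logic as get_com_cfs_and_hunks, kept for reference only.
# file_names = Commit.get_file_names_from_commit(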
#     f"download\\orgs\\googlesamples\\android-custom-lint-rules", 
#     "ed5fc41bf6f040e56c4b4c729c9e575f89c2b3ff"
# )
# print(f"File names in commit: {file_names}")

# commit = Commit("ed5fc41bf6f040e56c4b4c729c9e575f89c2b3ff", "android-custom-lint-rules", "googlesamples", datetime.now(pytz.timezone("UTC")), f"""Merge pull request #85 from googlesamples/kts
# Update build files to KTS and version catalogs (and latest AGP)""")

# def test(repo_path, com, file_names, workers):
#     cfs = []
#     hunks = []

#     for name in file_names:
#         file_content, _ = File.get_file_content(repo_path, com.sha, name)
#         metadata_list = CommitFile.get_metadata(com.org_name, com.repo_name, com.sha, name, True)
#         print(f"Processing file: {name} with metadata: {metadata_list}")

#         with ThreadPoolExecutor(max_workers=workers) as executor:
#             futures = [executor.submit(process_metadata, mt, com, name, file_content) for mt in metadata_list]
#             for future in futures:
#                 try:
#                     cf, hunk = future.result()
#                     cfs.append(cf)
#                     hunks.append(hunk)
#                 except Exception as e:
#                     print(f"Error processing metadata for file {name}: {e}")

#     return cfs, hunks

# cfs, hunks = test(f"download\\orgs\\googlesamples\\android-custom-lint-rules", commit, file_names, 30)

In [15]:
# Tunables for the export; change them here rather than inline.
RANDOM_SEED = 42
NUM_COMMITS_PER_REPO = 10

# export_to_json samples commits with the module-level `random` generator, so
# seed it to get the same sample on every run.
random.seed(RANDOM_SEED)

# repos_data is filled in thread-completion order; sort it so the sampling
# sequence (and the order of repositories in the output) is stable.
# NOTE(review): full reproducibility also assumes Commit.get_commit_data
# returns commits in a stable order -- confirm.
repos_data_sorted = sorted(repos_data, key=lambda entry: entry['repo'].repo_name)

json_data = export_to_json(repos_data_sorted, NUM_COMMITS_PER_REPO, max_workers)
output_file = 'exported_data.json'
save_json_in_file(json_data, output_file)



Data successfully saved to exported_data.json
