In [2]:
import subprocess
from pathlib import Path
import logging
from datetime import datetime

# Directory whose immediate sub-directories are treated as the repositories to analyse.
CLONED_REPOS_PATH = Path("E:\\Repositories\\Cloned_Reposit")
# Base directory under which one output directory per repository is created.
OUTPUT_BASE_PATH = Path("E:\\Repositories\\Summary_Analysis")
# Location of the DesigniteJava jar that is launched with `java -jar`.
DESIGNITE_JAR_PATH = Path("E:\\DesigniteJava\\DesigniteJava.jar") 

# Log file in the current working directory; the timestamp in the name is taken when this line runs.
LOG_FILE = Path.cwd() / f"process_repositories_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

# Worker count: main() uses the parallel code path only when this is greater than 1.
PARALLEL_PROCESSES = 1 

# Route every record at INFO level or above to both the log file and a stream handler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)


def verify_paths():
    """
    Verify that essential paths exist.

    Checks that the cloned repositories path is a directory and that the
    DesigniteJava jar is a regular file, then makes sure the output base
    directory is available, creating it (including parents) when missing.

    Returns:
        bool: True when all three paths are usable, False otherwise. Every
        failure is logged at ERROR level before returning.
    """
    # is_dir()/is_file() already return False for missing paths, so a separate
    # exists() check is unnecessary.
    if not CLONED_REPOS_PATH.is_dir():
        logging.error(f"Cloned repositories path does not exist or is not a directory: {CLONED_REPOS_PATH}")
        return False

    if not DESIGNITE_JAR_PATH.is_file():
        logging.error(f"DesigniteJava.jar not found at path: {DESIGNITE_JAR_PATH}")
        return False

    if OUTPUT_BASE_PATH.is_dir():
        return True

    # Something that is not a directory occupies the output path; per-repository
    # directories could never be created beneath it, so fail up front.
    if OUTPUT_BASE_PATH.exists():
        logging.error(f"Output base path exists but is not a directory: {OUTPUT_BASE_PATH}")
        return False

    # Create output base directory if it doesn't exist
    try:
        OUTPUT_BASE_PATH.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logging.error(f"Failed to create output base directory: {e}")
        return False

    logging.info(f"Created output base directory: {OUTPUT_BASE_PATH}")
    return True

def get_repositories():
    """
    List the repository directories found directly under the cloned repositories path.

    Only immediate children that are directories are returned; plain files
    are ignored. The number of directories found is logged at INFO level.

    Returns:
        list: Path objects, one per repository directory, in the order
        produced by iterdir().
    """
    children = CLONED_REPOS_PATH.iterdir()
    repo_dirs = list(filter(Path.is_dir, children))
    logging.info(f"Found {len(repo_dirs)} repositories in {CLONED_REPOS_PATH}")
    return repo_dirs

def create_output_directory(repo_name):
    """
    Create an output directory for the given repository.

    The directory is OUTPUT_BASE_PATH / repo_name. Missing parents are
    created as well.

    Args:
        repo_name: Name of the repository; used as the directory name.

    Returns:
        The output directory path when it exists as a directory after the
        call, or None when it could not be created (including the case where
        a non-directory already occupies that path). Failures are logged at
        ERROR level.
    """
    output_dir = OUTPUT_BASE_PATH / repo_name

    if output_dir.is_dir():
        logging.info(f"Output directory already exists: {output_dir}")
        return output_dir

    # A file (or other non-directory) with the same name would otherwise be
    # handed back as if it were a usable output directory.
    if output_dir.exists():
        logging.error(
            f"Failed to create output directory for {repo_name}: "
            f"{output_dir} exists and is not a directory"
        )
        return None

    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logging.error(f"Failed to create output directory for {repo_name}: {e}")
        return None

    logging.info(f"Created output directory: {output_dir}")
    return output_dir

def run_designite(repo_path, output_path):
    """
    Run the DesigniteJava.jar command for the given repository.

    Launches `java -jar <jar> -i <repo_path> -o <output_path>` and waits for
    it to finish. The tool's stdout and stderr are captured rather than
    echoed; stderr is logged only when the tool exits with a non-zero code.

    Args:
        repo_path: Path of the repository to analyse; its `name` is used in
            log messages.
        output_path: Directory passed to the tool via `-o`.

    Returns:
        bool: True when the command exited with code 0, False when it exited
        with a non-zero code or could not be run at all. No exception is
        propagated to the caller; failures are logged at ERROR level.
    """
    command = [
        "java",
        "-jar",
        str(DESIGNITE_JAR_PATH),
        "-i",
        str(repo_path),
        "-o",
        str(output_path),
    ]
    repo_name = repo_path.name
    logging.info(f"Processing repository: {repo_name}")
    logging.info(f"Executing command: {' '.join(command)}")

    try:
        subprocess.run(
            command,
            capture_output=True,
            text=True,
            # Undecodable bytes in the tool's output must not turn a
            # successful run into a reported failure.
            errors="replace",
            check=True,
        )
    except subprocess.CalledProcessError as e:
        logging.error(f"DesigniteJava returned a non-zero exit code for {repo_name}: {e.returncode}")
        logging.error(f"Error output: {e.stderr}")
        return False
    except Exception as e:
        # Best-effort boundary: e.g. the `java` executable is missing. Log it
        # and let the caller continue with the next repository.
        logging.error(f"Failed to process {repo_name}: {e}")
        return False

    logging.info(f"Successfully processed: {repo_name}")
    return True

def process_repositories_sequentially(repos):
    """
    Analyse each repository in turn, in the order given.

    For every repository an output directory is prepared first; when that
    step yields no directory the repository is skipped, otherwise
    DesigniteJava is run against it.

    Args:
        repos: Iterable of repository paths (objects exposing a `name`).
    """
    for repository in repos:
        target_dir = create_output_directory(repository.name)
        if not target_dir:
            # No usable output directory: skip this repository.
            continue
        run_designite(repository, target_dir)

def process_repositories_in_parallel(repos, max_workers=4):
    """
    Analyse repositories concurrently using a pool of worker threads.

    Each repository is submitted as one task that prepares its output
    directory and, when that succeeds, runs DesigniteJava against it. An
    exception raised inside a task is logged at ERROR level and does not
    stop the remaining tasks. The call returns once every task has finished.

    Args:
        repos: Iterable of repository paths (objects exposing a `name`).
        max_workers: Maximum number of worker threads in the pool.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def _analyse(repository):
        target_dir = create_output_directory(repository.name)
        if not target_dir:
            return
        run_designite(repository, target_dir)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which repository each future belongs to for error reporting.
        pending = {}
        for repository in repos:
            pending[pool.submit(_analyse, repository)] = repository

        for finished in as_completed(pending):
            owner = pending[finished]
            try:
                finished.result()
            except Exception as exc:
                logging.error(f"Repository {owner.name} generated an exception: {exc}")


def main():
    """
    Run the whole pipeline: verify paths, discover repositories, analyse them.

    Stops early (after logging the reason) when path verification fails or
    when no repositories are found. Otherwise the repositories are processed
    in parallel when PARALLEL_PROCESSES is greater than 1, and one by one
    otherwise.
    """
    logging.info("=== Starting Processing of Java Repositories ===")

    paths_ok = verify_paths()
    if not paths_ok:
        logging.error("Verification of paths failed. Exiting.")
        return

    repositories = get_repositories()
    if not repositories:
        logging.info("No repositories found to process. Exiting.")
        return

    # The sequential path is the fallback for any worker count of 1 or less.
    if not PARALLEL_PROCESSES > 1:
        logging.info("Processing repositories sequentially.")
        process_repositories_sequentially(repositories)
    else:
        logging.info(f"Processing repositories in parallel with {PARALLEL_PROCESSES} workers.")
        process_repositories_in_parallel(repositories, max_workers=PARALLEL_PROCESSES)

    logging.info("=== All repositories have been processed. ===")


# Run the pipeline immediately when this code is executed.
main()

2024-09-20 15:12:45,877 - INFO - === Starting Processing of Java Repositories ===
2024-09-20 15:12:45,901 - INFO - Created output base directory: E:\Repositories\Summary_Analysis
2024-09-20 15:12:46,115 - INFO - Found 100 repositories in E:\Repositories\Cloned_Reposit
2024-09-20 15:12:46,117 - INFO - Processing repositories sequentially.
2024-09-20 15:12:46,120 - INFO - Created output directory: E:\Repositories\Summary_Analysis\abtin_bigdata
2024-09-20 15:12:46,125 - INFO - Processing repository: abtin_bigdata
2024-09-20 15:12:46,130 - INFO - Executing command: java -jar E:\DesigniteJava\DesigniteJava.jar -i E:\Repositories\Cloned_Reposit\abtin_bigdata -o E:\Repositories\Summary_Analysis\abtin_bigdata
2024-09-20 15:12:58,557 - INFO - Successfully processed: abtin_bigdata
2024-09-20 15:12:58,560 - INFO - Created output directory: E:\Repositories\Summary_Analysis\acl9es_GhostHunter
2024-09-20 15:12:58,562 - INFO - Processing repository: acl9es_GhostHunter
2024-09-20 15:12:58,565 - INFO -