In [1]:
from concurrent.futures import ThreadPoolExecutor
import zipfile
import json
import os
import requests
import subprocess
import tempfile


def analyze_class_files(class_files_batch):
    try:
        # Create a temporary file to store javap output
        with tempfile.NamedTemporaryFile(mode='w+', delete=False) as tmpfile:
            # Use javap to get public class details for a batch of class files
            subprocess.run(['javap', '-public'] + class_files_batch, stdout=tmpfile, text=True, check=True)
            tmpfile.seek(0)
            class_info = tmpfile.read()

        # Process the output to extract non-abstract, non-interface classes
        selected_fqns = []
        for line in class_info.splitlines():
            if line.startswith('public ') and 'class' in line and 'abstract' not in line and 'interface' not in line:
                parts = line.split()
                if 'class' in parts:
                    idx = parts.index('class')
                    fqn = parts[idx + 1].replace('/', '.')
                    selected_fqns.append(fqn)
        return selected_fqns
    except subprocess.CalledProcessError:
        return []

def extract_fqns_from_jar(jar_path):
    all_fqns = []
    selected_fqns = []
    batch_size = 10000  # Can be adjustable based on the system capabilities

    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(jar_path, 'r') as jar:
            jar.extractall(temp_dir)
            class_files = [os.path.join(root, name)
                           for root, dirs, files in os.walk(temp_dir)
                           for name in files if name.endswith(".class") and "$" not in name]

        # Create batches of class files
        batches = [class_files[i:i + batch_size] for i in range(0, len(class_files), batch_size)]

        # Extract selected FQNs using ThreadPoolExecutor to process batches in parallel
        with ThreadPoolExecutor() as executor:
            results = executor.map(analyze_class_files, batches)

        # Flatten the list of results
        for result in results:
            selected_fqns.extend(result)

        # Extract all FQNs
        for file_name in jar.namelist():
            if file_name.endswith(".class") and "$" not in file_name:
                fqn = file_name.replace("/", ".").rstrip(".class")
                all_fqns.append(fqn)

    return all_fqns, selected_fqns


def extract_maven_metadata_from_jar(jar_name):
    # Extract artifactId and version from the jar name
    parts = jar_name.split('-')
    version = parts[-1].replace('.jar', '')  # Version is after the last "-"
    artifact_id = '-'.join(parts[:-1])  # Rejoin the remaining parts as the artifactId

    # Construct the search URL for the Maven Central Repository API
    search_url = f'https://search.maven.org/solrsearch/select'
    params = {
        'q': f'a:"{artifact_id}" AND v:"{version}"',
        'rows': '1',  # Looking for a specific version
        'wt': 'json',  # Response format (json)
    }

    try:
        # Make the HTTP GET request
        response = requests.get(search_url, params=params)
        response.raise_for_status()  # Raises an HTTPError if the response was an error
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error for {jar_name}:", e)
        return None
    
    # Parse the search results
    search_results = response.json()
    docs = search_results['response']['docs']

    # Check if there are any results
    if not docs:
        print(f"Dependency not found for {jar_name}.")
        return None

    # Extract the Maven dependency snippet
    doc = docs[0]  # Get the first (and likely only) result
    group_id = doc['g']
    artifact_id = doc['a']
    version = doc['v']
    dependency_snippet = f'<dependency>\n\t<groupId>{group_id}</groupId>\n\t<artifactId>{artifact_id}</artifactId>\n\t<version>{version}</version>\n</dependency>'
    
    return dependency_snippet


def extract_latest_modification_date(jar_path):
    with zipfile.ZipFile(jar_path, 'r') as jar:
        for info in jar.infolist():
            year, month, day, hour, minute, second = info.date_time

    if year and month and day:
        # Format the date and time separately
        date_info = {
            "year": year,
            "month": month,
            "day": day,
            "time": f"{hour}:{minute}:{second}"
        }
        return date_info
    else:
        return None


def process_jar_files_in_folder(folder_path, json_output_path_all_type, json_output_path_selected_type):
    # Read existing data if the file exists
    data_for_all_type = {}
    data_for_selected_type = {}

    if os.path.exists(json_output_path_all_type):
        with open(json_output_path_all_type, 'r') as json_file:
            data_for_all_type = json.load(json_file)

    if os.path.exists(json_output_path_selected_type):
        with open(json_output_path_selected_type, 'r') as json_file:
            data_for_selected_type = json.load(json_file)

    # Update with new data
    for file in os.listdir(folder_path):
        if file.endswith(".jar"):
            jar_path = os.path.join(folder_path, file)
            maven_metadata = extract_maven_metadata_from_jar(file)
            all_fqns, selected_fqns = extract_fqns_from_jar(jar_path)
            latest_modification_date = extract_latest_modification_date(jar_path)
            # Update if new or replace if already exists
            data_for_all_type[file] = {
                "latest_modification_date": latest_modification_date,
                "meta_data": maven_metadata,
                "fqns": all_fqns,
            }
            data_for_selected_type[file] = {
                "latest_modification_date": latest_modification_date,
                "meta_data": maven_metadata,
                "fqns": selected_fqns,
            }

    # Write updated data back to the file
    with open(json_output_path_all_type, 'w') as json_file:
        json.dump(data_for_all_type, json_file, indent=4)

    with open(json_output_path_selected_type, 'w') as json_file:
        json.dump(data_for_selected_type, json_file, indent=4)


In [2]:
process_jar_files_in_folder('maven_jars', '../knowledge_db_all_type.json', '../knowledge_db_selected_type.json')

Dependency not found for jackson-datatype-jdk8-2.13.3-javadoc.jar.
Dependency not found for grDevices-3.5-beta76.jar.
Dependency not found for jackson-datatype-jsr310-2.16.1-javadoc.jar.
Dependency not found for material-1.11.0.jar.
Dependency not found for collections-28.0.0.jar.
Dependency not found for utils-3.5-beta76.jar.
Dependency not found for snakeyaml-2.2-javadoc.jar.
Dependency not found for appcompat-v7-28.0.0.jar.
Dependency not found for testthat-1.0.2-renjin-17.jar.
Dependency not found for config-1.4.3-javadoc.jar.
Dependency not found for jackson-module-kotlin-2.13.3-javadoc.jar.
Dependency not found for glide-5.0.0-rc01-javadoc.jar.
Dependency not found for HikariCP-5.0.1-javadoc.jar.
Dependency not found for kotlin-stdlib-1.6.20-RC.jar.
Dependency not found for json-20240205-javadoc.jar.
Dependency not found for stats-3.5-beta75.jar.
Dependency not found for jakarta.enterprise.cdi-api-4.1.0-M1.jar.
Dependency not found for tools-3.5-beta76.jar.
Dependency not found f