In [1]:
import pandas as pd
import os
import subprocess

csv_path = r"E:\\Repositories\\dataset.csv\\dataset.csv"


df = pd.read_csv(csv_path)
java_repos = df[df['language'] == 'Java']
repository_list = java_repos['repository'].tolist()
total_java_repos = len(repository_list)
print(f"Total Java repositories found: {total_java_repos}")

# Folder where I clone the repositories
output_dir = r"E:\\Repositories\\Cloned_Reposit"


os.makedirs(output_dir, exist_ok=True)

success_count = 0
target_count = 100  

# Iterate over each repository and clone until target_count is reached
for repo_full_name in repository_list:
    if success_count >= target_count:
        break  

    repo_url = f"https://github.com/{repo_full_name}.git"

    # Extract the repository name for the local folder
    local_repo_name = repo_full_name.replace('/', '_')
    clone_path = os.path.join(output_dir, local_repo_name)

    # Check if repository is already cloned
    if os.path.exists(clone_path):
        print(f"[{success_count + 1}/{target_count}] Repository '{repo_full_name}' already cloned at '{clone_path}'. Skipping.")
        success_count += 1
        continue  

    # Attempt to clone the repository
    try:
        print(f"[{success_count + 1}/{target_count}] Cloning '{repo_full_name}' into '{clone_path}'...")
        subprocess.run(['git', 'clone', repo_url, clone_path], check=True)
        print(f"✅ Successfully cloned '{repo_full_name}'.")
        success_count += 1  
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to clone '{repo_full_name}': {e}. Moving to the next repository.")
    except Exception as e:
        print(f"❌ An unexpected error occurred while cloning '{repo_full_name}': {e}. Moving to the next repository.")

# Final summary
if success_count >= target_count:
    print(f"\n🎉 Successfully cloned {success_count} Java repositories.")
else:
    print(f"\n⚠️ Only {success_count} Java repositories were successfully cloned out of {total_java_repos} available.")

Total Java repositories found: 462173
[1/100] Cloning 'NCIP/c3pr-docs' into 'E:\\Repositories\\Cloned_Reposit\NCIP_c3pr-docs'...
✅ Successfully cloned 'NCIP/c3pr-docs'.
[2/100] Cloning 'bigloupe/SoS-JobScheduler' into 'E:\\Repositories\\Cloned_Reposit\bigloupe_SoS-JobScheduler'...
❌ Failed to clone 'bigloupe/SoS-JobScheduler': Command '['git', 'clone', 'https://github.com/bigloupe/SoS-JobScheduler.git', 'E:\\\\Repositories\\\\Cloned_Reposit\\bigloupe_SoS-JobScheduler']' returned non-zero exit status 128.. Moving to the next repository.
[2/100] Cloning 'pszabolcs/canvasandroid' into 'E:\\Repositories\\Cloned_Reposit\pszabolcs_canvasandroid'...
✅ Successfully cloned 'pszabolcs/canvasandroid'.
[3/100] Cloning 'mk12/mycraft' into 'E:\\Repositories\\Cloned_Reposit\mk12_mycraft'...
✅ Successfully cloned 'mk12/mycraft'.
[4/100] Cloning 'Jarcionek/MTG-Deck-Builder' into 'E:\\Repositories\\Cloned_Reposit\Jarcionek_MTG-Deck-Builder'...
✅ Successfully cloned 'Jarcionek/MTG-Deck-Builder'.
[5/100] 