In [1]:
import gitlab
import os
from dotenv import load_dotenv

# Load environment variables from the .env file so that PRIVATE_TOKEN can be
# read through os.getenv below.
load_dotenv()

# Read the GitLab access token from the environment and fail fast when it is
# missing or empty, before any API call is attempted.
PRIVATE_TOKEN = os.getenv("PRIVATE_TOKEN")
if not PRIVATE_TOKEN:
    raise ValueError("PRIVATE_TOKEN is not set. Please set it in the .env file.")

# Base URL of the GitLab instance to talk to.
GITLAB_URL = "https://gitlab.aisingapore.net"

# Client object used for all API calls, authenticated with the private token.
gl = gitlab.Gitlab(GITLAB_URL, private_token=PRIVATE_TOKEN)

# Look up the project by its numeric ID; `aiap_project` is the handle that
# scrape_branch() uses for tree listing and file downloads.
project_id = 2529  # Hard-coded; replace with your actual project ID
aiap_project = gl.projects.get(project_id)

# Branch whose files will be downloaded.
branch_name = "soh_sze_han"  # Hard-coded; replace with the branch you want to scrape

# Local directory (relative to the current working directory) that receives the
# downloaded files; created up front so later writes have a root to land in.
DOWNLOAD_DIR = f"gitlab_branch_{branch_name}_files"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)


# Function to scrape all files in a specific branch
def scrape_branch(branch_name):
    """Download every file of ``branch_name`` into ``DOWNLOAD_DIR``.

    The repository tree of the module-level ``aiap_project`` is listed for the
    given branch, the directory layout is recreated under ``DOWNLOAD_DIR`` and
    each file ("blob") is fetched and written to the matching local path.
    Progress is reported with ``print``.

    File content is always written as raw bytes. Decoding to text and writing
    in text mode would (a) raise ``UnicodeDecodeError`` for any file that has a
    "text" extension but is not valid UTF-8, aborting the whole scrape, and
    (b) let the platform translate newlines, so the local copy would no longer
    match the repository byte for byte.

    Args:
        branch_name: Name of the branch (passed as ``ref``) to download.

    Returns:
        None.

    Raises:
        OSError: If a local directory or file cannot be created or written.

    A ``gitlab.exceptions.GitlabGetError`` raised while fetching a single file
    is reported and that file is skipped; it does not stop the scrape.
    """
    # recursive=True asks for the whole tree, not only the top-level entries;
    # get_all=True asks python-gitlab to return all pages of the listing.
    tree = aiap_project.repository_tree(ref=branch_name, recursive=True, get_all=True)

    for item in tree:
        # Local destination mirroring the path inside the repository.
        item_path = os.path.join(DOWNLOAD_DIR, item["path"])

        if item["type"] == "blob":  # A file
            # The parent directory may not have been created yet.
            os.makedirs(os.path.dirname(item_path), exist_ok=True)

            # Keep the try body limited to the API call, which is the only
            # statement expected to raise GitlabGetError.
            try:
                file_data = aiap_project.files.get(
                    file_path=item["path"], ref=branch_name
                )
            except gitlab.exceptions.GitlabGetError as e:
                print(f"Failed to download file: {item['path']}. Error: {e}")
                continue

            # decode() yields the file content as bytes; write it unchanged.
            file_content = file_data.decode()
            with open(item_path, "wb") as f:
                f.write(file_content)

            # The extension check only selects the wording of the log line.
            file_kind = "text" if is_text_file(item["path"]) else "binary"
            print(
                f"Downloaded {file_kind} file: {item['path']} from branch: {branch_name}"
            )

        elif item["type"] == "tree":  # A directory
            os.makedirs(item_path, exist_ok=True)
            print(f"Directory found: {item['path']}")


# Function to determine if a file is likely a text file based on its extension
def is_text_file(file_path):
    """Return True when the extension of ``file_path`` is a known text type.

    The decision is based on the file name alone (case-insensitive extension
    match); the file content is never inspected. Names without an extension,
    including dot-files such as ``.txt``, are reported as not text.
    """
    known_text_suffixes = (
        # Plain text and documentation
        ".txt",
        ".md",
        # Source code and scripts
        ".py",
        ".java",
        ".sh",
        # Structured data and markup
        ".json",
        ".yaml",
        ".yml",
        ".csv",
        ".html",
        ".xml",
    )
    suffix = os.path.splitext(file_path)[1]
    return suffix.lower() in known_text_suffixes


# Script entry point: download the configured branch into DOWNLOAD_DIR and
# report the start and end of the run on stdout.
print(f"Scraping branch: {branch_name}")
scrape_branch(branch_name)
print("Scraping complete.")


Scraping branch: soh_sze_han
Directory found: assignment0
Directory found: assignment1
Directory found: assignment1/census_data
Directory found: assignment1/src
Directory found: assignment1/tests
Directory found: assignment2
Directory found: assignment2/A2P1_docker_workshop
Directory found: assignment2/A2P1_docker_workshop/assets
Directory found: assignment2/imgs
Directory found: assignment3
Directory found: assignment4
Directory found: assignment4/conf
Directory found: assignment4/docker
Directory found: assignment4/src
Directory found: assignment4/tests
Directory found: assignment5
Directory found: assignment5/data
Directory found: assignment5/data/image
Directory found: assignment5/docker
Directory found: assignment5/images
Directory found: assignment5/robustness_requirements
Directory found: assignment5/src
Directory found: assignment5/tests
Directory found: assignment6
Directory found: assignment6/images
Directory found: assignment6/src
Directory found: assignment6/tests
Directory