In [1]:
import gitlab
import os
import time
from dotenv import load_dotenv

# Load variables defined in a .env file into the process environment so the
# token lookup below can find them.
load_dotenv()

# Read the GitLab private token from the environment and stop immediately with
# a clear message when it is missing or empty.
PRIVATE_TOKEN = os.getenv("PRIVATE_TOKEN")
if not PRIVATE_TOKEN:
    raise ValueError("PRIVATE_TOKEN is not set. Please set it in the .env file.")

# Base URL of the GitLab instance to connect to
GITLAB_URL = "https://gitlab.aisingapore.net"

# Create the GitLab client, authenticating with the private token
gl = gitlab.Gitlab(GITLAB_URL, private_token=PRIVATE_TOKEN)

# Fetch the project handle by its numeric ID; every repository call in the
# functions below goes through `aiap_project`.
project_id = 2529  # Replace with your actual project ID
aiap_project = gl.projects.get(project_id)

# Root directory for downloaded files; each branch gets its own sub-directory
# underneath it (see scrape_branch).
BASE_DOWNLOAD_DIR = "aiap17-gitlab-data"
os.makedirs(BASE_DOWNLOAD_DIR, exist_ok=True)

# File extensions considered suitable for the RAG LLM. Entries are lower-case
# and include the leading dot, because is_rag_file() compares them against the
# lower-cased result of os.path.splitext().
RAG_FILE_EXTENSIONS = [
    ".md",
    ".txt",
    ".csv",
    ".json",
    ".yaml",
    ".yml",
    ".html",  # Textual formats
    ".py",
    ".ipynb",  # Python scripts and notebooks
    ".pdf",  # PDF files
]

# Branches that scrape_all_branches() skips; matching is an exact comparison
# against the branch name.
EXCLUDED_BRANCHES = [
    "15-research-open-source-models-or-cheaper-api-that-has-better-or-equal-performance-to-gpt-4o-mini",
    "4-review-literature-on-vision-transformer-models",
    "master",
    "main",
    "new_joey",
    "feature-add-new-test-cases",
    "pei_shan_n",
]


def is_rag_file(file_path):
    """Return True when *file_path* has an extension listed in RAG_FILE_EXTENSIONS.

    The extension is lower-cased before the lookup, so the check is
    case-insensitive (``README.MD`` matches ``.md``).
    """
    extension = os.path.splitext(file_path)[1].lower()
    return extension in RAG_FILE_EXTENSIONS


# Extensions reported as "text" downloads in the log output; every other file
# accepted by is_rag_file() (e.g. notebooks, PDFs) is reported as "binary-like".
_TEXT_FILE_EXTENSIONS = (
    ".py",
    ".md",
    ".txt",
    ".yaml",
    ".yml",
    ".html",
    ".csv",
    ".json",
)


def _download_file(branch_name, repo_path, local_path, max_retries=3, retry_delay=5):
    """Fetch one repository file and write it to *local_path*.

    The fetch is retried when the GitLab API raises ``GitlabGetError``,
    sleeping *retry_delay* seconds between attempts.

    Args:
        branch_name: Branch (ref) to read the file from.
        repo_path: Path of the file inside the repository.
        local_path: Destination path on disk; its directory must already exist.
        max_retries: Maximum number of fetch attempts.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        True when the file was written, False when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            # Get file content from the GitLab API
            file_data = aiap_project.files.get(file_path=repo_path, ref=branch_name)
            file_content = file_data.decode()
        except gitlab.exceptions.GitlabGetError as e:
            print(
                f"Attempt {attempt + 1} to download file {repo_path} failed. Error: {e}"
            )
            if attempt < max_retries - 1:
                time.sleep(retry_delay)  # Wait before retrying
            else:
                print(
                    f"Failed to download file {repo_path} after {max_retries} attempts."
                )
            continue

        # Write the decoded bytes verbatim. Decoding as UTF-8 and re-encoding
        # is a no-op for valid UTF-8, but raises UnicodeDecodeError on files in
        # another encoding (which would abort the whole scrape), and text-mode
        # writing can translate newlines on some platforms.
        with open(local_path, "wb") as f:
            f.write(file_content)
        return True

    return False


def scrape_branch(branch_name):
    """Download every RAG-suitable file of *branch_name* below BASE_DOWNLOAD_DIR.

    The repository tree of the branch is listed recursively. Directories are
    mirrored locally under ``BASE_DOWNLOAD_DIR/<branch_name>/``; blobs accepted
    by ``is_rag_file`` are downloaded to the matching local path. Files that
    still fail after the retries are reported and skipped.

    Args:
        branch_name: Name of the branch to scrape.
    """
    # Full recursive listing of files and directories in the branch
    tree = aiap_project.repository_tree(ref=branch_name, recursive=True, get_all=True)

    for item in tree:
        repo_path = item["path"]
        # Local destination inside the base download directory
        item_path = os.path.join(BASE_DOWNLOAD_DIR, branch_name, repo_path)

        if item["type"] == "tree":  # Directory: mirror it locally
            os.makedirs(item_path, exist_ok=True)
            print(f"Directory found: {repo_path}")
            continue

        if item["type"] != "blob" or not is_rag_file(repo_path):
            continue

        # Create parent directories as needed
        os.makedirs(os.path.dirname(item_path), exist_ok=True)

        if not _download_file(branch_name, repo_path, item_path):
            continue

        ext = os.path.splitext(repo_path)[1].lower()
        if ext in _TEXT_FILE_EXTENSIONS:
            print(f"Downloaded text file: {repo_path} from branch: {branch_name}")
        else:
            print(
                f"Downloaded binary-like file: {repo_path} from branch: {branch_name}"
            )


def scrape_all_branches():
    """Scrape every branch of the project except those in EXCLUDED_BRANCHES.

    Each branch name is compared against EXCLUDED_BRANCHES; excluded branches
    are reported and skipped, all others are passed to ``scrape_branch``.
    """
    # get_all=True asks python-gitlab to follow pagination and return every
    # branch; it is the same keyword already used for the repository tree
    # listing, replacing the older `all=True` spelling.
    branches = aiap_project.branches.list(get_all=True)

    for branch in branches:
        branch_name = branch.name
        if branch_name in EXCLUDED_BRANCHES:
            print(f"Skipping branch: {branch_name} (excluded)")
            continue

        print(f"Scraping branch: {branch_name}")
        scrape_branch(branch_name)
        print(f"Scraping complete for branch: {branch_name}")


# Entry point: scrape every non-excluded branch of the project, then report
# completion once all branches have been processed.
scrape_all_branches()
print("Scraping complete for all branches.")

Skipping branch: 15-research-open-source-models-or-cheaper-api-that-has-better-or-equal-performance-to-gpt-4o-mini (excluded)
Skipping branch: 4-review-literature-on-vision-transformer-models (excluded)
Scraping branch: Onn_Yun_Hui
Directory found: assignment0
Directory found: assignment1
Directory found: assignment1/src
Directory found: assignment1/tests
Directory found: assignment2
Directory found: assignment2/A2P1_docker_workshop
Directory found: assignment2/A2P1_docker_workshop/assets
Directory found: assignment2/imgs
Directory found: assignment3
Directory found: assignment4
Directory found: assignment4/src
Directory found: assignment4/tests
Directory found: assignment5
Directory found: assignment5/data
Directory found: assignment5/data/image
Directory found: assignment5/images
Directory found: assignment5/robustness_requirements
Directory found: assignment5/src
Directory found: assignment5/tests
Directory found: assignment6
Directory found: assignment6/images
Directory found: assi