# Proof of concept: GitLab metadata extraction
---

In [1]:
# Environment check: import requests and print the installed version.
import requests
print(requests.__version__)

# Simple arithmetic check; prints 10.
print(2+3+5)


2.32.5
10


In [None]:
import requests
from urllib.parse import urlparse, quote
from pybtex.database import parse_string


def build_gitlab_api(repo_url):
    """Derive the GitLab REST API base URL and the URL-encoded project path.

    Args:
        repo_url: Web or clone URL of a GitLab project, for example
            ``https://gitlab.com/remram44/taguette.git``. A trailing slash
            and a web sub-route such as ``/-/tree/master`` are tolerated.

    Returns:
        A tuple ``(api_base, encoded_path)``, for example
        ``("https://gitlab.com/api/v4", "remram44%2Ftaguette")``.
    """
    parsed = urlparse(repo_url.strip())
    gitlab_host = f"{parsed.scheme}://{parsed.netloc}"  # e.g. https://gitlab.com

    # Strip slashes on BOTH sides: a trailing "/" would otherwise end up
    # encoded as "%2F" at the end of the project id and break every request.
    project_path = parsed.path.strip("/")
    # GitLab separates the project path from its sub-routes with "/-/"
    # (e.g. ".../taguette/-/tree/master"); keep only the project part.
    project_path = project_path.split("/-/", 1)[0]
    project_path = project_path.removesuffix(".git")  # e.g. remram44/taguette

    # The API expects the namespaced path as a single URL-encoded segment.
    encoded_path = quote(project_path, safe="")  # e.g. remram44%2Ftaguette
    api_base = f"{gitlab_host}/api/v4"  # e.g. https://gitlab.com/api/v4

    return api_base, encoded_path


def fetch_json(api_base, encoded_path, endpoint, headers):
    """GET a GitLab API endpoint and return its decoded JSON body.

    Args:
        api_base: API root, e.g. ``https://gitlab.com/api/v4``.
        encoded_path: URL-encoded project path substituted into ``endpoint``.
        endpoint: Path template containing an ``{encoded_path}`` placeholder,
            e.g. ``/projects/{encoded_path}/repository/tags``.
        headers: HTTP headers to send with the request.

    Returns:
        The decoded JSON (a dict or a list, depending on the endpoint), or an
        empty dict when the response status is not OK or the body is not
        valid JSON.

    Exceptions raised by ``requests.get`` itself (connection errors,
    timeouts) are not caught here and propagate to the caller.
    """
    url = f"{api_base}{endpoint.format(encoded_path=encoded_path)}"
    resp = requests.get(url, headers=headers, timeout=10)
    if not resp.ok:
        return {}
    try:
        return resp.json()
    except ValueError:
        # A 2xx response with a non-JSON body (e.g. an HTML login or proxy
        # page) would otherwise raise; treat it like any other failed fetch.
        return {}


def extract_citation(api_base, encoded_path, headers, ref="master"):
    """Find a citation file in the repository root and parse it.

    The first root-level entry whose path contains "citation"
    (case-insensitive) and ends in ``.cff``, ``.md`` or ``.txt`` is
    downloaded. ``.cff`` files are parsed as YAML; any other file is parsed
    as BibTeX.

    Args:
        api_base: API root, e.g. ``https://gitlab.com/api/v4``.
        encoded_path: URL-encoded project path.
        headers: HTTP headers to send with each request.
        ref: Branch, tag or commit to read from. Defaults to ``"master"``.

    Returns:
        ``None`` when no citation file is found or it cannot be downloaded;
        the parsed YAML for ``.cff`` files; a dict of BibTeX fields for a
        parsable BibTeX entry; otherwise ``{"raw": <file content>}``.
    """
    encoded_ref = quote(ref, safe="")

    # per_page=100: the tree listing is paginated and the API default page
    # size is small, so a citation file in a crowded root could be missed.
    tree = fetch_json(
        api_base,
        encoded_path,
        "/projects/{encoded_path}/repository/tree?ref=" + encoded_ref + "&per_page=100",
        headers,
    )

    # fetch_json returns {} on failure; anything but a list has no entries.
    if not isinstance(tree, list):
        return None

    citation_files = [
        f["path"]
        for f in tree
        if "citation" in f["path"].lower()
        and f["path"].split(".")[-1].lower() in ("cff", "md", "txt")
    ]

    if not citation_files:
        return None

    filename = citation_files[0]

    raw_url = (
        f"{api_base}/projects/{encoded_path}/repository/files/"
        f"{quote(filename, safe='')}/raw?ref={encoded_ref}"
    )

    resp = requests.get(raw_url, headers=headers, timeout=10)
    if not resp.ok:
        return None

    content = resp.text

    if filename.lower().endswith(".cff"):
        try:
            # Imported lazily so a missing PyYAML only degrades this branch.
            import yaml
            return yaml.safe_load(content)
        except Exception:
            return {"raw": content}

    # Deliberately best-effort: .md/.txt files are often not BibTeX at all,
    # so any parsing problem falls back to returning the raw text.
    try:
        bib_data = parse_string(content, bib_format="bibtex")
        entries = list(bib_data.entries.values())
        if not entries:
            return {"raw": content}
        entry = entries[0]

        # A missing or non-numeric year must not discard the whole entry:
        # convert when possible, otherwise keep the original value (or None).
        year = entry.fields.get("year")
        try:
            year = int(year)
        except (TypeError, ValueError):
            pass

        return {
            "type": entry.type,
            "title": entry.fields.get("title"),
            # Entries without authors yield an empty list instead of failing.
            "authors": [str(a) for a in entry.persons.get("author", [])],
            "year": year,
            "journal": entry.fields.get("journal"),
            "volume": entry.fields.get("volume"),
            "number": entry.fields.get("number"),
            "pages": entry.fields.get("pages"),
            "doi": entry.fields.get("doi"),
            "url": entry.fields.get("url"),
        }
    except Exception:
        return {"raw": content}


def get_archived_links(original_url):
    """Check whether a URL is archived on the Wayback Machine and Software Heritage.

    Args:
        original_url: The URL to look up. When empty or ``None`` no request
            is made and every field keeps its "not available" default.

    Returns:
        A dict with the keys ``wayback_available``, ``wayback_url``,
        ``software_heritage_available`` and ``software_heritage_url``.

    Both lookups are best-effort: a failure is printed and leaves the
    corresponding fields at their defaults; it is never raised.
    """
    results = {
        "wayback_available": False,
        "wayback_url": None,
        "software_heritage_available": False,
        "software_heritage_url": None,
    }

    # Nothing to look up (e.g. the project fetch failed upstream): skip the
    # two network round-trips instead of querying the archives for "".
    if not original_url:
        return results

    # Encode characters that would otherwise be read as part of the archive
    # service's own query string ("&", "#", "?", spaces). ":" and "/" are
    # legal inside a query, so ordinary repository URLs stay unchanged.
    encoded_url = quote(original_url, safe=":/")

    wayback_api = f"https://archive.org/wayback/available?url={encoded_url}"
    try:
        wb_resp = requests.get(wayback_api, timeout=10)
        if wb_resp.status_code == 200:
            wb_data = wb_resp.json()
            snapshot = wb_data.get("archived_snapshots", {}).get("closest")
            if snapshot and snapshot.get("available"):
                archived_url = snapshot.get("url")
                # Only report availability together with a usable link.
                if archived_url:
                    results["wayback_available"] = True
                    results["wayback_url"] = archived_url
    except Exception as e:
        print("Wayback check failed:", e)

    swh_url = (
        "https://archive.softwareheritage.org/browse/origin/directory/"
        f"?origin_url={encoded_url}"
    )
    try:
        swh_resp = requests.get(swh_url, timeout=10)
        if swh_resp.status_code == 200:
            results["software_heritage_available"] = True
            results["software_heritage_url"] = swh_url
    except Exception as e:
        print("Software Heritage check failed:", e)

    return results

def get_gitlab_metadata(repo_url, token=None):
    """Collect metadata about a GitLab project from several API endpoints.

    Args:
        repo_url: Web or clone URL of the project.
        token: Optional personal access token, sent as ``PRIVATE-TOKEN``.

    Returns:
        A dict with three sub-dicts, ``"required"``, ``"recommended"`` and
        ``"optional"``, mapping human-readable labels to values. A value is
        ``None`` when the corresponding data could not be retrieved.
    """
    api_base, encoded_path = build_gitlab_api(repo_url)
    headers = {"PRIVATE-TOKEN": token} if token else {}

    project = fetch_json(api_base, encoded_path, "/projects/{encoded_path}?license=true", headers)
    languages = fetch_json(api_base, encoded_path, "/projects/{encoded_path}/languages", headers)
    tags = fetch_json(api_base, encoded_path, "/projects/{encoded_path}/repository/tags", headers)
    commits = fetch_json(api_base, encoded_path, "/projects/{encoded_path}/repository/commits", headers)
    branches = fetch_json(api_base, encoded_path, "/projects/{encoded_path}/repository/branches", headers)
    contributors = fetch_json(api_base, encoded_path, "/projects/{encoded_path}/repository/contributors", headers)
    citation = extract_citation(api_base, encoded_path, headers)
    archive_info = get_archived_links(project.get("web_url", ""))

    # "topics" replaces the deprecated "tag_list"; accept either one.
    keywords = project.get("topics") or project.get("tag_list") or []

    # The API reports `"license": null` for projects without a detected
    # license, so `.get("license", {})` would hand back None and crash on the
    # chained `.get`. `or {}` covers both the missing and the null case.
    license_info = project.get("license") or {}
    namespace = project.get("namespace") or {}

    required = {
        "Name": project.get("name"),
        "Description": project.get("description"),
        "URL": project.get("web_url"),
        "Clone URL": project.get("http_url_to_repo"),
        "Languages": ", ".join(languages) if languages else None,
        # First tag in the order returned by the API.
        "Version": tags[0]["name"] if isinstance(tags, list) and tags else None,
    }

    recommended = {
        "Latest Commit": commits[0]["id"] if isinstance(commits, list) and commits else None,
        "Branches Count": len(branches) if isinstance(branches, list) else None,
        "README URL": project.get("readme_url"),
        "License": license_info.get("name"),
        "Citation": citation,
        "Authors": namespace.get("name"),
        "Archive": archive_info,
        "Keywords": ", ".join(keywords) if keywords else None,
    }

    optional = {
        "Last Activity": project.get("last_activity_at"),
        "Visibility": project.get("visibility"),
        # fetch_json yields {} on failure; only a list carries contributors.
        "Contributors": ", ".join([c["name"] for c in contributors])
        if isinstance(contributors, list)
        else None,
    }

    return {
        "required": required,
        "recommended": recommended,
        "optional": optional,
    }


if __name__ == "__main__":
    repos = ["https://gitlab.com/remram44/taguette.git"]

    # (header line, key in the metadata dict), in the order they are printed.
    sections = (
        ("\n### REQUIRED ###", "required"),
        ("\n### RECOMMENDED ###", "recommended"),
        ("\n### OPTIONAL ###", "optional"),
    )

    for repo in repos:
        print(f"\nFetching metadata for: {repo}")
        try:
            meta = get_gitlab_metadata(repo)
            for header, section_key in sections:
                print(header)
                for label, value in meta[section_key].items():
                    print(label, ":", value)
        except Exception as e:
            # Report the failure and carry on with the next repository.
            print("Error:", e)


# TODO: Issue the endpoint requests asynchronously to reduce the time taken to retrieve data from multiple endpoints.
# import asyncio  # Import asyncio for asynchronous operations


Fetching metadata for: https://gitlab.com/remram44/taguette.git
https://gitlab.com

### REQUIRED ###
Name : taguette
Description : Free and open source qualitative research tool
URL : https://gitlab.com/remram44/taguette
Clone URL : https://gitlab.com/remram44/taguette.git
Languages : Python, JavaScript, HTML, CSS, Dockerfile
Version : v1.5.1

### RECOMMENDED ###
Latest Commit : 53124d11654efae11d98be5d2ffd548156dc5f29
Branches Count : 20
README URL : https://gitlab.com/remram44/taguette/-/blob/master/README.rst
License : BSD 3-Clause "New" or "Revised" License
Citation : {'type': 'article', 'title': 'Taguette: open-source qualitative data analysis', 'authors': ['Rampin, Rémi', 'Rampin, Vicky'], 'year': 2021, 'journal': 'Journal of Open Source Software', 'volume': '6', 'number': '68', 'pages': '3522', 'doi': '10.21105/joss.03522', 'url': 'https://doi.org/10.21105/joss.03522'}
Authors : Remi Rampin
Archive : {'wayback_available': True, 'wayback_url': 'http://web.archive.org/web/2025090