# Proof of concept: GitLab metadata extraction
---

In [1]:
# Sanity check: show which version of `requests` this kernel has installed.
import requests
print(requests.__version__)



2.31.0


In [85]:
# REQUIRED PROP. META DATA FROM GITLAB REPO

import requests
from urllib.parse import urlparse, quote
from pybtex.database import parse_string

def get_gitlab_metadata(repo_url, token=None):
    """Collect metadata for one GitLab project through the REST API (v4).

    The project is addressed by its URL-encoded path on the host named in
    ``repo_url``.  Besides the project record itself, the languages, tags,
    commits, branches, contributors and a root-level CITATION file are
    queried.

    Args:
        repo_url: Web or clone URL of the repository, e.g.
            ``https://gitlab.com/group/project.git``.
        token: Optional access token, sent as the ``PRIVATE-TOKEN`` header;
            only needed for private projects.

    Returns:
        A dict with the keys ``"required"``, ``"recommended"`` and
        ``"optional"``, each mapping a label to the extracted value.  Values
        that could not be obtained are reported as ``"N/A"`` (the license and
        author names are ``None`` when missing).

    Note:
        Exceptions raised by ``requests.get`` itself (e.g. timeouts) are not
        caught here and propagate to the caller.
    """
    parsed = urlparse(repo_url)
    gitlab_host = f"{parsed.scheme}://{parsed.netloc}"  # just the base URL
    # strip("/") also tolerates a trailing slash such as ".../project.git/".
    project_path = parsed.path.strip("/").removesuffix(".git")
    # The API wants the whole path as ONE segment, so "/" must be encoded too.
    encoded_path = quote(project_path, safe="")
    api_base = f"{gitlab_host}/api/v4"
    headers = {"PRIVATE-TOKEN": token} if token else {}

    def fetch_json(endpoint):
        """Return the decoded JSON of ``endpoint``, or ``{}`` on any failure."""
        resp = requests.get(f"{api_base}{endpoint}", headers=headers, timeout=10)
        if not resp.ok:
            return {}
        try:
            return resp.json()
        except ValueError:
            # A 2xx answer whose body is not JSON (e.g. an HTML login page).
            return {}

    def show_cff(text):
        """Pretty-print a CFF (YAML) citation file; fall back to the raw text."""
        try:
            import yaml
        except ImportError:
            print("PyYAML is not installed. Please install it to parse CFF files.")
            return
        try:
            print(yaml.dump(yaml.safe_load(text), sort_keys=False))
        except yaml.YAMLError:
            print(text)

    def parse_bibtex(text):
        """Return a citation dict for the first BibTeX entry in ``text``, else None."""
        from pybtex.exceptions import PybtexError

        try:
            bib_data = parse_string(text, bib_format="bibtex")
        except PybtexError:
            return None
        entries = list(bib_data.entries.values())
        if not entries:
            return None
        entry = entries[0]

        raw_year = entry.fields.get("year")
        try:
            year = int(raw_year)
        except (TypeError, ValueError):
            # Missing or non-numeric year: report it as found instead of failing.
            year = raw_year

        return {
            "type": entry.type,
            "title": entry.fields.get("title"),
            "authors": [str(a) for a in entry.persons.get("author", [])],
            "year": year,
            "journal": entry.fields.get("journal"),
            "volume": entry.fields.get("volume"),
            "number": entry.fields.get("number"),
            "pages": entry.fields.get("pages"),
            "doi": entry.fields.get("doi"),
            "url": entry.fields.get("url"),
        }

    def find_citation(ref):
        """Return the first BibTeX citation found in a root-level CITATION file."""
        # per_page=100 so a CITATION file is not lost behind the default page size.
        tree = fetch_json(
            f"/projects/{encoded_path}/repository/tree"
            f"?ref={quote(ref, safe='')}&per_page=100"
        )
        if not isinstance(tree, list):
            return None

        citation_files = []
        for item in tree:
            file_name = item["path"]
            # Name must contain "citation" (case-insensitive) ...
            if "citation" not in file_name.lower():
                continue
            # ... and carry one of the supported extensions.
            if file_name.split('.')[-1].lower() in ("cff", "txt", "md"):
                citation_files.append(file_name)

        for filename in citation_files:
            file_content_resp = requests.get(
                f"{api_base}/projects/{encoded_path}/repository/files/"
                f"{quote(filename, safe='')}/raw?ref={quote(ref, safe='')}",
                headers=headers,
                timeout=10,
            )
            if not file_content_resp.ok:
                print(f"Failed to fetch {filename}: {file_content_resp.status_code}")
                continue
            content = file_content_resp.text
            print(f"\n--- Content of {filename} ---\n")
            if filename.lower().endswith('.cff'):
                # CFF is YAML, not BibTeX: display it but do not feed it to pybtex.
                show_cff(content)
                continue
            citation = parse_bibtex(content)
            if citation is not None:
                return citation
        return None

    project = fetch_json(f"/projects/{encoded_path}?license=true")

    # Debug aid of this proof of concept: dump every field of the project record.
    for p in project:
        print(f'{p}: {project[p]}\n')

    # Not every repository calls its main branch "master"; prefer what the
    # project record reports and keep "master" only as a fallback.
    default_ref = project.get("default_branch") or "master"

    languages = fetch_json(f"/projects/{encoded_path}/languages")
    tags = fetch_json(f"/projects/{encoded_path}/repository/tags")
    commit = fetch_json(f"/projects/{encoded_path}/repository/commits")
    branches = fetch_json(f"/projects/{encoded_path}/repository/branches")
    contributors = fetch_json(f"/projects/{encoded_path}/repository/contributors")
    license_info = project.get("license") or {}
    license_name = license_info.get("name")
    authors_info = project.get("namespace") or {}
    authors_name = authors_info.get("name")
    citation_dict_auto = find_citation(default_ref)

    required = {
        "Name": project.get("name", "N/A"),
        "Description": project.get("description", "N/A"),
        "URL": project.get("web_url", "N/A"),
        "Clone URL": project.get("http_url_to_repo", "N/A"),
        "Languages": ", ".join(languages.keys()) if languages else "N/A",
        "Version": tags[0]["name"] if isinstance(tags, list) and tags else "No version tag",
    }

    recommended = {
        "Latest Commit": commit[0]["id"] if isinstance(commit, list) and commit else "N/A",
        "Branches": len(branches) if isinstance(branches, list) else "N/A",
        "README_url": project.get("readme_url", "N/A"),
        "License": license_name,
        "Citation": citation_dict_auto if citation_dict_auto is not None else "N/A",
        "Authors": authors_name,
    }

    optional = {
        "Last Activity": project.get("last_activity_at", "N/A"),
        "Visibility": project.get("visibility", "N/A"),
        "Contributors": ", ".join([c["name"] for c in contributors]) if isinstance(contributors, list) else "N/A",
    }

    return {"required": required, "recommended": recommended, "optional": optional}

# Script entry point: fetch and print the metadata of each listed repository.
if __name__ == "__main__":
    repos = [
        "https://gitlab.com/remram44/taguette.git"
    ]

    # (heading, key in the result dict); only the first heading has no
    # leading blank line.
    sections = [
        ("### Required Properties ###\n", "required"),
        ("\n### Recommended Properties ###\n", "recommended"),
        ("\n### Optional Properties ###\n", "optional"),
    ]

    for r in repos:
        print(f"\nFetching metadata for: {r}")
        try:
            meta = get_gitlab_metadata(r)
            for heading, section in sections:
                print(heading)
                for keys, values in meta[section].items():
                    print(f"{keys}: {values}")
        except Exception as e:
            # Top-level boundary: report the failure and move on to the next repo.
            print(f"Error: {e}")



Fetching metadata for: https://gitlab.com/remram44/taguette.git
id: 8339211

description: Free and open source qualitative research tool

name: taguette

name_with_namespace: Remi Rampin / taguette

path: taguette

path_with_namespace: remram44/taguette

created_at: 2018-09-12T02:00:49.402Z

default_branch: master

tag_list: ['document', 'hacktoberfest', 'highlights', 'notes', 'qual', 'qualitative', 'research', 'tagging', 'tags', 'text']

topics: ['document', 'hacktoberfest', 'highlights', 'notes', 'qual', 'qualitative', 'research', 'tagging', 'tags', 'text']

ssh_url_to_repo: git@gitlab.com:remram44/taguette.git

http_url_to_repo: https://gitlab.com/remram44/taguette.git

web_url: https://gitlab.com/remram44/taguette

readme_url: https://gitlab.com/remram44/taguette/-/blob/master/README.rst

forks_count: 35

license_url: https://gitlab.com/remram44/taguette/-/blob/master/LICENSE.txt

license: {'key': 'bsd-3-clause', 'name': 'BSD 3-Clause "New" or "Revised" License', 'nickname': None,

In [84]:
# Structuring the code better

import requests
from urllib.parse import urlparse, quote
from pybtex.database import parse_string

# Module-level API context shared by the functions defined below in this cell.
# NOTE(review): `r` is not defined anywhere in this cell; presumably it is the
# loop variable left behind by a previously executed cell - confirm, because a
# fresh kernel would raise NameError on the urlparse(r) line.
t = token = None  # If needed for private repos
parsed = urlparse(r)
# print(f'### -------------- ### \nparsed: {parsed}')

# Scheme + host only, e.g. "https://gitlab.com".
gitlab_host = f"{parsed.scheme}://{parsed.netloc}" # just the base URL (gitlab)
# print(f'gitlab_host: {gitlab_host}')

# Drop the leading "/" and a trailing ".git" to get "namespace/project".
project_path = parsed.path.lstrip("/").removesuffix(".git") # making it api friendly
# print(f'project_path: {project_path}')

# safe="" makes quote() percent-encode "/" as well.
encoded_path = quote(project_path, safe="") # w/o slashes for API
# print(f'encoded_path: {encoded_path}')

# Root of the v4 REST API on that host.
api_base = f"{gitlab_host}/api/v4"
# print(f'api_base: {api_base}\n### -------------- ###\n')

# Empty dict (no auth header) while `token` is None.
headers = {"PRIVATE-TOKEN": token} if token else {} # tokens/pass for private repos [Only if needed]


def fetch_json(endpoint, base_url=None, request_headers=None):
    """GET ``endpoint`` from a GitLab API and return the decoded JSON body.

    Args:
        endpoint: Path below the API root, starting with ``/``.
        base_url: API root such as ``https://gitlab.com/api/v4``.  Defaults to
            the module-level ``api_base``.
        request_headers: Headers to send.  Defaults to the module-level
            ``headers``.

    Returns:
        The decoded JSON (dict or list), or ``{}`` when the response status is
        not OK or the body is not valid JSON.
    """
    # Fall back to the module-level context so one-argument calls keep working.
    if base_url is None:
        base_url = api_base
    if request_headers is None:
        request_headers = headers

    resp = requests.get(f"{base_url}{endpoint}", headers=request_headers, timeout=10)
    if not resp.ok:
        return {}
    try:
        return resp.json()
    except ValueError:
        # A 2xx answer whose body is not JSON (e.g. an HTML login page).
        return {}


def _citation_from_bibtex(text):
    """Return a citation dict for the first BibTeX entry in ``text``, else None."""
    from pybtex.exceptions import PybtexError

    try:
        bib_data = parse_string(text, bib_format="bibtex")
    except PybtexError:
        return None
    entries = list(bib_data.entries.values())
    if not entries:
        return None
    entry = entries[0]

    raw_year = entry.fields.get("year")
    try:
        year = int(raw_year)
    except (TypeError, ValueError):
        # Missing or non-numeric year: report it as found instead of failing.
        year = raw_year

    return {
        "type": entry.type,
        "title": entry.fields.get("title"),
        "authors": [str(a) for a in entry.persons.get("author", [])],
        "year": year,
        "journal": entry.fields.get("journal"),
        "volume": entry.fields.get("volume"),
        "number": entry.fields.get("number"),
        "pages": entry.fields.get("pages"),
        "doi": entry.fields.get("doi"),
        "url": entry.fields.get("url"),
    }


def _print_cff(text):
    """Pretty-print a CFF (YAML) citation file; fall back to the raw text."""
    try:
        import yaml
    except ImportError:
        print("PyYAML is not installed. Please install it to parse CFF files.")
        return
    try:
        print(yaml.dump(yaml.safe_load(text), sort_keys=False))
    except yaml.YAMLError:
        print(text)


def citation_extraction(encoded_path, ref="master", base_url=None, request_headers=None):
    """Extract citation data from a CITATION file in the repository root.

    Root-level files whose name contains "citation" (case-insensitive) and
    whose extension is ``cff``, ``txt`` or ``md`` are downloaded.  ``.cff``
    files are only printed; the other files are parsed as BibTeX and the first
    one that yields an entry is used.

    Args:
        encoded_path: URL-encoded ``namespace/project`` path of the project.
        ref: Branch, tag or commit to read from.
        base_url: API root; defaults to the module-level ``api_base``.
        request_headers: Request headers; defaults to the module-level
            ``headers``.

    Returns:
        A dict with the keys ``type``, ``title``, ``authors``, ``year``,
        ``journal``, ``volume``, ``number``, ``pages``, ``doi`` and ``url``,
        or ``None`` when no usable citation was found.
    """
    if base_url is None:
        base_url = api_base
    if request_headers is None:
        request_headers = headers
    ref_query = quote(ref, safe="")

    # per_page=100 so a CITATION file is not lost behind the default page size.
    tree = fetch_json(
        f"/projects/{encoded_path}/repository/tree?ref={ref_query}&per_page=100",
        base_url,
        request_headers,
    )
    if not isinstance(tree, list):
        return None

    citation_files = []
    for item in tree:
        file_name = item["path"]
        if "citation" not in file_name.lower():
            continue
        if file_name.split('.')[-1].lower() in ("cff", "txt", "md"):
            citation_files.append(file_name)

    for filename in citation_files:
        file_content_resp = requests.get(
            f"{base_url}/projects/{encoded_path}/repository/files/"
            f"{quote(filename, safe='')}/raw?ref={ref_query}",
            headers=request_headers,
            timeout=10,
        )
        if not file_content_resp.ok:
            print(f"Failed to fetch {filename}: {file_content_resp.status_code}")
            continue
        content = file_content_resp.text
        if filename.lower().endswith('.cff'):
            # CFF is YAML, not BibTeX: display it but do not feed it to pybtex.
            _print_cff(content)
            continue
        citation = _citation_from_bibtex(content)
        if citation is not None:
            return citation
    return None


def get_gitlab_metadata(repo_url, token=None):
    """Collect metadata for one GitLab project through the REST API (v4).

    The host, project path and authentication header are derived from the
    arguments, so every call queries the repository it was given.

    Args:
        repo_url: Web or clone URL of the repository, e.g.
            ``https://gitlab.com/group/project.git``.
        token: Optional access token, sent as the ``PRIVATE-TOKEN`` header;
            only needed for private projects.

    Returns:
        A dict with the keys ``"required"``, ``"recommended"`` and
        ``"optional"``, each mapping a label to the extracted value.  Values
        that could not be obtained are reported as ``"N/A"`` (the license and
        author names are ``None`` when missing).
    """
    parsed = urlparse(repo_url)
    base_url = f"{parsed.scheme}://{parsed.netloc}/api/v4"
    # The API wants the whole path as ONE segment, so "/" must be encoded too;
    # strip("/") also tolerates a trailing slash in the URL.
    project_id = quote(parsed.path.strip("/").removesuffix(".git"), safe="")
    request_headers = {"PRIVATE-TOKEN": token} if token else {}

    def project_json(suffix):
        """Fetch ``/projects/<id><suffix>`` with this call's host and headers."""
        return fetch_json(f"/projects/{project_id}{suffix}", base_url, request_headers)

    project = project_json("?license=true")

    # Debug aid of this proof of concept: dump every field of the project record.
    for p in project:
        print(f'{p}: {project[p]}\n')

    # Not every repository calls its main branch "master"; prefer what the
    # project record reports and keep "master" only as a fallback.
    default_ref = project.get("default_branch") or "master"

    languages = project_json("/languages")
    tags = project_json("/repository/tags")
    commit = project_json("/repository/commits")
    branches = project_json("/repository/branches")
    contributors = project_json("/repository/contributors")
    license_info = project.get("license") or {}
    license_name = license_info.get("name")
    citation_dict_auto = citation_extraction(project_id, default_ref, base_url, request_headers)
    authors_info = project.get("namespace") or {}
    authors_name = authors_info.get("name")

    required = {
        "Name": project.get("name", "N/A"),
        "Description": project.get("description", "N/A"),
        "URL": project.get("web_url", "N/A"),
        "Clone URL": project.get("http_url_to_repo", "N/A"),
        "Languages": ", ".join(languages.keys()) if languages else "N/A",
        "Version": tags[0]["name"] if isinstance(tags, list) and tags else "No version tag",
    }

    recommended = {
        "Latest Commit": commit[0]["id"] if isinstance(commit, list) and commit else "N/A",
        "Branches": len(branches) if isinstance(branches, list) else "N/A",
        "README_url": project.get("readme_url", "N/A"),
        "License": license_name,
        "Citation": citation_dict_auto if citation_dict_auto is not None else "N/A",
        "Authors": authors_name,
    }

    optional = {
        "Last Activity": project.get("last_activity_at", "N/A"),
        "Visibility": project.get("visibility", "N/A"),
        "Contributors": ", ".join([c["name"] for c in contributors]) if isinstance(contributors, list) else "N/A",
    }

    return {"required": required, "recommended": recommended, "optional": optional}

# Script entry point: fetch and print the metadata of each listed repository.
if __name__ == "__main__":
    repos = [
        "https://gitlab.com/remram44/taguette.git"
    ]

    # (heading, key in the result dict); only the first heading has no
    # leading blank line.
    sections = [
        ("### Required Properties ###\n", "required"),
        ("\n### Recommended Properties ###\n", "recommended"),
        ("\n### Optional Properties ###\n", "optional"),
    ]

    for r in repos:
        print(f"\nFetching metadata for: {r}")
        try:
            meta = get_gitlab_metadata(r)
            for heading, section in sections:
                print(heading)
                for keys, values in meta[section].items():
                    print(f"{keys}: {values}")
        except Exception as e:
            # Top-level boundary: report the failure and move on to the next repo.
            print(f"Error: {e}")



Fetching metadata for: https://gitlab.com/remram44/taguette.git
id: 8339211

description: Free and open source qualitative research tool

name: taguette

name_with_namespace: Remi Rampin / taguette

path: taguette

path_with_namespace: remram44/taguette

created_at: 2018-09-12T02:00:49.402Z

default_branch: master

tag_list: ['document', 'hacktoberfest', 'highlights', 'notes', 'qual', 'qualitative', 'research', 'tagging', 'tags', 'text']

topics: ['document', 'hacktoberfest', 'highlights', 'notes', 'qual', 'qualitative', 'research', 'tagging', 'tags', 'text']

ssh_url_to_repo: git@gitlab.com:remram44/taguette.git

http_url_to_repo: https://gitlab.com/remram44/taguette.git

web_url: https://gitlab.com/remram44/taguette

readme_url: https://gitlab.com/remram44/taguette/-/blob/master/README.rst

forks_count: 35

license_url: https://gitlab.com/remram44/taguette/-/blob/master/LICENSE.txt

license: {'key': 'bsd-3-clause', 'name': 'BSD 3-Clause "New" or "Revised" License', 'nickname': None,