In [18]:
import pandas as pd
import os
import requests

### Extract repo elements

In [19]:
def extract_owner_and_repo(github_url):
    """Extract the owner and repository name from a GitHub repository URL.

    Accepts ``http://`` or ``https://`` URLs (or no scheme at all), an optional
    ``www.`` prefix, trailing slashes, a trailing ``.git`` suffix, and extra
    path segments after the repository (e.g. ``/tree/main/models``).

    Args:
        github_url: URL string such as ``https://github.com/owner/repo``.

    Returns:
        ``(owner, repo)`` on success. On any failure (wrong host, missing
        owner/repo segment, non-string input) an error message is printed and
        ``(None, None)`` is returned.
    """
    try:
        remainder = github_url.strip()

        # Drop the scheme, if present (compared case-insensitively).
        lowered = remainder.lower()
        for scheme in ("https://", "http://"):
            if lowered.startswith(scheme):
                remainder = remainder[len(scheme):]
                break
        if remainder.lower().startswith("www."):
            remainder = remainder[len("www."):]

        # The first segment must be the GitHub host; anything else is rejected
        # instead of being silently mis-read as the owner.
        host, _, path = remainder.partition("/")
        if host.lower() != "github.com":
            raise ValueError("Invalid GitHub URL structure.")

        # Ignore query string / fragment and skip empty segments produced by
        # doubled or trailing slashes.
        path = path.split("?", 1)[0].split("#", 1)[0]
        parts = [segment for segment in path.split("/") if segment]
        if len(parts) < 2:
            raise ValueError("Invalid GitHub URL structure.")

        owner, repo = parts[0], parts[1]
        # Clone URLs end in ".git"; that suffix is not part of the repo name.
        if repo.endswith(".git"):
            repo = repo[:-len(".git")]
        if not repo:
            raise ValueError("Invalid GitHub URL structure.")
        return owner, repo
    except Exception as e:
        print(f"Error: {e}")
        return None, None

def list_local_repo_structure(repo_path):
    """List every directory and file below ``repo_path`` as relative paths.

    Paths always use ``/`` as the separator, regardless of the host OS, so the
    result has the same shape as the listing built for online repositories and
    prefix checks such as ``startswith('models/')`` work on every platform.

    Args:
        repo_path: Root directory to walk.

    Returns:
        A list of strings relative to ``repo_path``. Directories end with a
        trailing ``/``; files do not. The root directory itself is not listed.
        Nothing is yielded by ``os.walk`` for a path that cannot be walked, so
        the result is then an empty list.
    """
    paths = []
    for root, _dirs, files in os.walk(repo_path):
        rel_dir = os.path.relpath(root, repo_path)
        # relpath reports the root itself as '.', and uses os.sep (a backslash
        # on Windows) between components; normalise both.
        rel_dir = '' if rel_dir == '.' else rel_dir.replace(os.sep, '/')
        if rel_dir:
            paths.append(rel_dir + '/')
        prefix = f"{rel_dir}/" if rel_dir else ''
        paths.extend(prefix + file_name for file_name in files)
    return paths

def list_online_repo_structure(owner, repo):
    """List every directory and file of a GitHub repository via the contents API.

    Starts at ``https://api.github.com/repos/{owner}/{repo}/contents/`` and
    follows the ``url`` of every item whose ``type`` is ``'dir'`` (iteratively,
    using an explicit stack), issuing one HTTP request per directory.

    Args:
        owner: Repository owner used to build the API URL.
        repo: Repository name used to build the API URL.

    Returns:
        A list of path strings relative to the repository root. Directories end
        with a trailing ``/``; other items do not. Directories whose listing
        request does not return HTTP 200 are reported with a printed warning
        and skipped, so the result may be partial.

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            transport level (including exceeding the timeout).
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/"
    stack = [(url, '')]
    paths = []
    while stack:
        current_url, current_path = stack.pop()
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(current_url, timeout=30)
        if response.status_code != 200:
            # Surface failures (e.g. an unknown repository or an exhausted API
            # quota) instead of silently returning an incomplete listing.
            print(f"Warning: could not list {current_url} (HTTP {response.status_code})")
            continue
        for item in response.json():
            item_path = current_path + item['name']
            if item['type'] == 'dir':
                paths.append(item_path + '/')
                stack.append((item['url'], item_path + '/'))
            else:
                paths.append(item_path)
    return paths

In [20]:
def is_online_repo(path):
    """Return True when ``path`` begins with ``http://`` or ``https://``."""
    return path.startswith(("http://", "https://"))

local_dbt_repo = ''
online_dbt_repo = 'https://github.com/dbt-labs/jaffle-shop'

# Set to True to scan `local_dbt_repo` instead of the GitHub repository.
use_local_repo = False
repo_path = local_dbt_repo if use_local_repo else online_dbt_repo

# Always work from the selected `repo_path`, so the source that is tested
# below is the same one that actually gets listed.
if is_online_repo(repo_path):
    owner, repo = extract_owner_and_repo(repo_path)
    if owner is None or repo is None:
        # Fail here rather than querying the API for a repository named "None".
        raise ValueError(f"Could not extract owner/repo from {repo_path!r}")
    repo_elements = list_online_repo_structure(owner, repo)
else:
    repo_elements = list_local_repo_structure(repo_path)

print(repo_elements)

['.github/', '.gitignore', '.pre-commit-config.yaml', '.sqlfluff', '.sqlfluffignore', 'README.md', 'Taskfile.yml', 'analyses/', 'data-tests/', 'dbt_project.yml', 'jaffle-data/', 'macros/', 'models/', 'package-lock.yml', 'packages.yml', 'requirements.in', 'requirements.txt', 'seeds/', 'seeds/.gitkeep', 'models/marts/', 'models/staging/', 'models/staging/__sources.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_locations.sql', 'models/staging/stg_locations.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_orders.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.sql', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_supplies.yml', 'models/marts/customers.sql', 'models/marts/customers.yml', 'models/marts/locations.sql', 'models/marts/locations.yml', 'models/marts/metricflow_time_spine.sql', 'models/marts/order_items.sql', 'models/m

In [24]:
# File-name suffixes used to decide which repository entries are kept.
dbt_extensions = ['.sql', '.yml', '.yaml', '.csv']

def select_dbt_elements_by_extension(dbt_extensions, repo_elements):
    """Return the entries of ``repo_elements`` ending with any of ``dbt_extensions``.

    The relative order of ``repo_elements`` is preserved in the result.
    """
    suffixes = tuple(dbt_extensions)
    selected = []
    for candidate in repo_elements:
        # str.endswith accepts a tuple and matches if any suffix matches.
        if candidate.endswith(suffixes):
            selected.append(candidate)
    return selected

# Keep only the repository entries whose name ends with one of `dbt_extensions`.
repo_dbt_elements = select_dbt_elements_by_extension(dbt_extensions, repo_elements)
print(repo_dbt_elements)

def select_dbt_models(dbt_extensions, repo_dbt_elements, model_paths=("models",)):
    """Return the entries located under a model directory with a matching extension.

    Args:
        dbt_extensions: Iterable of file-name suffixes to accept (e.g. ``'.sql'``).
        repo_dbt_elements: Iterable of ``/``-separated relative paths.
        model_paths: Directory (string) or directories (iterable of strings),
            relative to the repository root, that contain models. Leading and
            trailing slashes are ignored. Defaults to ``("models",)``, i.e. the
            ``'models/'`` prefix.

    Returns:
        A list with the matching entries, in their original order.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(model_paths, str):
        model_paths = (model_paths,)
    # Always compare against "<dir>/" so that e.g. "models_old/..." is not
    # mistaken for something inside "models".
    prefixes = tuple(directory.strip("/") + "/" for directory in model_paths)
    suffixes = tuple(dbt_extensions)
    return [
        element for element in repo_dbt_elements
        if element.startswith(prefixes) and element.endswith(suffixes)
    ]

# Narrow the extension-filtered entries down to those selected by select_dbt_models.
repo_dbt_models = select_dbt_models(dbt_extensions, repo_dbt_elements)
print(repo_dbt_models)

['.pre-commit-config.yaml', 'Taskfile.yml', 'dbt_project.yml', 'package-lock.yml', 'packages.yml', 'models/staging/__sources.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_locations.sql', 'models/staging/stg_locations.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_orders.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.sql', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_supplies.yml', 'models/marts/customers.sql', 'models/marts/customers.yml', 'models/marts/locations.sql', 'models/marts/locations.yml', 'models/marts/metricflow_time_spine.sql', 'models/marts/order_items.sql', 'models/marts/order_items.yml', 'models/marts/orders.sql', 'models/marts/orders.yml', 'models/marts/products.sql', 'models/marts/products.yml', 'models/marts/supplies.sql', 'models/marts/supplies.yml', 'macros/cents_to_dollars.sql', 'macros/generate

In [26]:
def generate_dbt_models_df(repo_dbt_models):
    """Build a DataFrame describing each model path.

    Args:
        repo_dbt_models: Iterable of path strings.

    Returns:
        A ``pandas.DataFrame`` with one row per input path and the columns
        ``path`` (the input string), ``name`` (its base name) and ``extension``
        (the base name's extension, including the leading dot). The three
        columns are present even when the input is empty.
    """
    columns = ['path', 'name', 'extension']
    records = []
    for path in repo_dbt_models:
        name = os.path.basename(path)
        records.append({
            'path': path,
            'name': name,
            'extension': os.path.splitext(name)[1],
        })
    # Passing `columns` pins the schema; without it an empty input would yield
    # a frame with no columns and break any later column access.
    return pd.DataFrame(records, columns=columns)

# Tabulate the model paths (path, file name, extension) and render the table.
dbt_models_df = generate_dbt_models_df(repo_dbt_models)
display(dbt_models_df)

Unnamed: 0,path,name,extension
0,models/staging/__sources.yml,__sources.yml,.yml
1,models/staging/stg_customers.sql,stg_customers.sql,.sql
2,models/staging/stg_customers.yml,stg_customers.yml,.yml
3,models/staging/stg_locations.sql,stg_locations.sql,.sql
4,models/staging/stg_locations.yml,stg_locations.yml,.yml
5,models/staging/stg_order_items.sql,stg_order_items.sql,.sql
6,models/staging/stg_order_items.yml,stg_order_items.yml,.yml
7,models/staging/stg_orders.sql,stg_orders.sql,.sql
8,models/staging/stg_orders.yml,stg_orders.yml,.yml
9,models/staging/stg_products.sql,stg_products.sql,.sql


In [23]:
# File names of dbt configuration files.
# NOTE(review): not referenced by any code visible here — presumably consumed by a later cell; confirm.
dbt_config_elements = ['packages.yml', 'dbt_project.yml']