In [1]:
import pandas as pd
import re

import os
import requests

#### Extract repo elements

In [2]:
def extract_owner_and_repo(github_url):
    """Split a GitHub repository reference into its ``(owner, repo)`` pair.

    Accepts ``http``/``https`` GitHub URLs (with or without ``www.``), a bare
    ``owner/repo`` string, a trailing slash, a ``.git`` suffix and extra path
    segments such as ``/tree/main/models``.

    Args:
        github_url: Repository URL, e.g. ``https://github.com/dbt-labs/jaffle-shop``.

    Returns:
        ``(owner, repo)`` on success. ``(None, None)`` when the value cannot be
        parsed; the reason is printed rather than raised.
    """
    try:
        # Drop the GitHub host prefix if it is there.
        remainder = re.sub(
            r"^(?:https?://)?(?:www\.)?github\.com/", "", github_url.strip()
        )
        # Anything still carrying a scheme points at some other host.
        if "://" in remainder:
            raise ValueError("Invalid GitHub URL structure.")
        # Query strings and fragments are not part of the repository path.
        remainder = re.split(r"[?#]", remainder, maxsplit=1)[0]

        parts = remainder.split("/")
        owner = parts[0]
        repo = parts[1] if len(parts) >= 2 else ""
        if repo.endswith(".git"):
            repo = repo[: -len(".git")]

        # Empty segments show up for inputs such as ".../owner/" or "...//repo".
        if not owner or not repo:
            raise ValueError("Invalid GitHub URL structure.")
        return owner, repo
    except Exception as e:
        print(f"Error: {e}")
        return None, None

def list_local_repo_structure(repo_path):
    """List every directory and file below ``repo_path`` as relative paths.

    Paths are always ``/``-separated, whatever the operating system, and
    directories carry a trailing ``/``. Entries are visited in sorted order so
    repeated runs return the same list.

    Args:
        repo_path: Root directory to walk.

    Returns:
        A list of relative paths; empty when ``repo_path`` yields nothing to walk.
    """
    paths = []
    for root, dirs, files in os.walk(repo_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()

        rel_dir = os.path.relpath(root, repo_path)
        # The root itself is reported as '.', which must not become a prefix.
        rel_dir = '' if rel_dir == '.' else rel_dir.replace(os.sep, '/')

        if rel_dir:
            paths.append(rel_dir + '/')
        for f in sorted(files):
            paths.append(f"{rel_dir}/{f}" if rel_dir else f)
    return paths

def list_online_repo_structure(owner, repo):
    """List every directory and file of a GitHub repository via the contents API.

    Directories are walked iteratively with an explicit stack, one request per
    directory. Directory entries carry a trailing ``/``.

    Args:
        owner: GitHub account or organisation name.
        repo: Repository name.

    Returns:
        A list of repository-relative paths. Directories whose listing request
        does not return HTTP 200 are reported on stdout and left out, so the
        result may be partial.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/"
    stack = [(url, '')]
    paths = []
    while stack:
        current_url, current_path = stack.pop()
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(current_url, timeout=30)
        if response.status_code != 200:
            # Report the failure instead of silently returning a partial listing.
            print(
                f"Warning: could not list '{current_path or '/'}': "
                f"{response.status_code} {response.reason}"
            )
            continue
        for item in response.json():
            entry_path = current_path + item['name']
            if item['type'] == 'dir':
                paths.append(entry_path + '/')
                # Queue the directory's own listing URL for a later iteration.
                stack.append((item['url'], entry_path + '/'))
            else:
                paths.append(entry_path)
    return paths

In [78]:
def is_online_repo(path):
    """Return True when ``path`` starts with an ``http://`` or ``https://`` scheme."""
    url_schemes = ("http://", "https://")
    return path.startswith(url_schemes)

local_dbt_repo = ''
online_dbt_repo = 'https://github.com/dbt-labs/jaffle-shop'

# Set to True to analyse the local checkout instead of the GitHub repository.
use_local_repo = False
repo_path = local_dbt_repo if use_local_repo else online_dbt_repo

# Both branches work from repo_path, so the source selected above is the one
# that actually gets listed.
is_online = is_online_repo(repo_path)
if is_online:
    owner, repo = extract_owner_and_repo(repo_path)
    repo_elements = list_online_repo_structure(owner, repo)
else:
    repo_elements = list_local_repo_structure(repo_path)

print(repo_elements)

['.github/', '.gitignore', '.pre-commit-config.yaml', '.sqlfluff', '.sqlfluffignore', 'README.md', 'Taskfile.yml', 'analyses/', 'data-tests/', 'dbt_project.yml', 'jaffle-data/', 'macros/', 'models/', 'package-lock.yml', 'packages.yml', 'requirements.in', 'requirements.txt', 'seeds/', 'seeds/.gitkeep', 'models/marts/', 'models/staging/', 'models/staging/__sources.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_locations.sql', 'models/staging/stg_locations.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_orders.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.sql', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_supplies.yml', 'models/marts/customers.sql', 'models/marts/customers.yml', 'models/marts/locations.sql', 'models/marts/locations.yml', 'models/marts/metricflow_time_spine.sql', 'models/marts/order_items.sql', 'models/m

#### Select dbt elements

In [79]:
# File suffixes used to pick dbt-related entries out of the repository listing.
dbt_extensions = ['.sql', '.yml', '.yaml', '.csv']

def select_dbt_elements_by_extension(dbt_extensions, repo_elements):
    """Return the entries of ``repo_elements`` ending with any of ``dbt_extensions``.

    Input order is preserved.
    """
    suffixes = tuple(dbt_extensions)
    selected = []
    for element in repo_elements:
        if element.endswith(suffixes):
            selected.append(element)
    return selected

# Keep only the repository entries whose suffix is listed in dbt_extensions.
repo_dbt_elements = select_dbt_elements_by_extension(dbt_extensions, repo_elements)
print(repo_dbt_elements)

def select_dbt_models(dbt_extensions, repo_dbt_elements):
    """Return the entries located under ``models/`` that end with one of ``dbt_extensions``.

    Input order is preserved.
    """
    suffixes = tuple(dbt_extensions)

    def _is_model_file(element):
        # Must sit below the models/ folder and carry an accepted suffix.
        return element.startswith('models/') and element.endswith(suffixes)

    return list(filter(_is_model_file, repo_dbt_elements))

# Narrow the dbt elements down to the files that live under models/.
repo_dbt_models = select_dbt_models(dbt_extensions, repo_dbt_elements)
print(repo_dbt_models)

['.pre-commit-config.yaml', 'Taskfile.yml', 'dbt_project.yml', 'package-lock.yml', 'packages.yml', 'models/staging/__sources.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_locations.sql', 'models/staging/stg_locations.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_orders.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.sql', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_supplies.yml', 'models/marts/customers.sql', 'models/marts/customers.yml', 'models/marts/locations.sql', 'models/marts/locations.yml', 'models/marts/metricflow_time_spine.sql', 'models/marts/order_items.sql', 'models/marts/order_items.yml', 'models/marts/orders.sql', 'models/marts/orders.yml', 'models/marts/products.sql', 'models/marts/products.yml', 'models/marts/supplies.sql', 'models/marts/supplies.yml', 'macros/cents_to_dollars.sql', 'macros/generate

In [80]:
# File names of project-level dbt configuration files.
# NOTE(review): not referenced by any other cell visible here — confirm it is used later.
dbt_config_elements = ['packages.yml', 'dbt_project.yml']

In [81]:
def generate_dbt_models_df(repo_dbt_models):
    """Build a DataFrame with one row per model file path.

    Args:
        repo_dbt_models: Iterable of repository-relative file paths.

    Returns:
        A DataFrame with the columns ``path``, ``name`` (file name) and
        ``extension`` (suffix including the dot). The columns are present even
        when ``repo_dbt_models`` is empty, so later column lookups keep working.
    """
    columns = ['path', 'name', 'extension']
    records = []
    for path in repo_dbt_models:
        name = os.path.basename(path)
        records.append((path, name, os.path.splitext(name)[1]))
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=columns)

# One row per file under models/, with its path, file name and extension.
dbt_models_df = generate_dbt_models_df(repo_dbt_models)
display(dbt_models_df)

Unnamed: 0,path,name,extension
0,models/staging/__sources.yml,__sources.yml,.yml
1,models/staging/stg_customers.sql,stg_customers.sql,.sql
2,models/staging/stg_customers.yml,stg_customers.yml,.yml
3,models/staging/stg_locations.sql,stg_locations.sql,.sql
4,models/staging/stg_locations.yml,stg_locations.yml,.yml
5,models/staging/stg_order_items.sql,stg_order_items.sql,.sql
6,models/staging/stg_order_items.yml,stg_order_items.yml,.yml
7,models/staging/stg_orders.sql,stg_orders.sql,.sql
8,models/staging/stg_orders.yml,stg_orders.yml,.yml
9,models/staging/stg_products.sql,stg_products.sql,.sql


#### Add code

In [82]:
def get_base_url(repo_url, branch="main"):
    """Build the ``raw.githubusercontent.com`` base URL for a GitHub repository.

    Args:
        repo_url: Repository URL starting with ``https://github.com/``. A
            ``.git`` suffix, a trailing slash or extra path segments are tolerated.
        branch: Branch (or other ref) whose files will be fetched. Defaults to
            ``"main"``.

    Returns:
        ``https://raw.githubusercontent.com/<owner>/<repo>/<branch>``.

    Raises:
        ValueError: If ``repo_url`` is not a GitHub URL or lacks an owner or a
            repository name.
    """
    prefix = "https://github.com/"
    # Checking the prefix including the slash rejects look-alike hosts.
    if not repo_url.startswith(prefix):
        raise ValueError("URL not valid.")

    parts = repo_url[len(prefix):].split("/")
    owner = parts[0]
    repo = parts[1] if len(parts) > 1 else ""
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]

    if not owner or not repo:
        raise ValueError("URL not valid.")
    return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}"

def extract_file_content(path, is_online = False, repo_base_url = None):
    """Return the text content of a file, fetched over HTTP or read from disk.

    Args:
        path: File path. Relative to ``repo_base_url`` when that is given.
        is_online: When True the file is downloaded, otherwise it is read locally.
        repo_base_url: Optional prefix. Online it is joined to ``path`` with
            ``/``; locally it is used as the directory ``path`` is resolved
            against. When empty or None, ``path`` is used as is.

    Returns:
        The file content. On any failure, a string starting with ``"Error: "``
        is returned instead of raising.
    """
    try:
        if is_online:
            # Build complete URL
            file_url = f"{repo_base_url}/{path}" if repo_base_url else path
            # The timeout keeps one stalled download from blocking the whole run;
            # a timeout surfaces through the except clause below.
            response = requests.get(file_url, timeout=30)
            if response.status_code == 200:
                return response.text
            return f"Error: {response.status_code} {response.reason}"

        # Resolve repo-relative paths against the repository root so reading
        # does not depend on the current working directory.
        local_path = os.path.join(repo_base_url, path) if repo_base_url else path
        with open(local_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        return f"Error: {e}"

def add_code_column(df, is_online = False, repo_url = None):
    """Add a ``code`` column holding the content of each file in ``df['path']``.

    Args:
        df: DataFrame with a ``path`` column. It is modified in place.
        is_online: When True, files are fetched from the raw-content URL
            derived from ``repo_url``.
        repo_url: GitHub repository URL in online mode. In local mode, an
            existing directory given here is forwarded to
            ``extract_file_content`` as ``repo_base_url``; any other value is
            ignored.

    Returns:
        The same ``df`` object, with the ``code`` column added.
    """
    if is_online:
        repo_base_url = get_base_url(repo_url)
    elif repo_url and os.path.isdir(repo_url):
        # Only a real directory is forwarded, so a URL passed in local mode
        # can never end up as a path prefix.
        repo_base_url = repo_url
    else:
        repo_base_url = ''

    df['code'] = df['path'].apply(lambda path: extract_file_content(path, is_online, repo_base_url))
    return df

# Pass the repository that was actually selected (repo_path), not the online
# constant, so the argument stays meaningful when the local repo is in use.
dbt_models_df = add_code_column(dbt_models_df, is_online, repo_path)
dbt_models_df.head(3)


Unnamed: 0,path,name,extension,code
0,models/staging/__sources.yml,__sources.yml,.yml,version: 2\n\nsources:\n - name: ecom\n sc...
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,with\n\nsource as (\n\n select * from {{ so...
2,models/staging/stg_customers.yml,stg_customers.yml,.yml,models:\n - name: stg_customers\n descript...


#### Add config block

In [83]:
def extract_config_block(sql_code):
    """Return the first ``{{ config(...) }}`` block found in ``sql_code``.

    The block may span several lines and may use Jinja whitespace-control
    delimiters (``{{-`` / ``-}}``).

    Args:
        sql_code: Model source text. Non-string values (``None``, ``NaN``) are
            treated as having no config block.

    Returns:
        The full matched block including its delimiters, or ``None``.
    """
    if not isinstance(sql_code, str):
        return None
    # DOTALL lets '.' cross newlines; the lazy '.*?' stops at the first
    # ')' that is followed by the closing delimiter.
    pattern = r"{{-?\s*config\((.*?)\)\s*-?}}"
    match = re.search(pattern, sql_code, re.DOTALL)
    return match.group(0) if match else None

def add_config_column(df):
    """Add a ``config`` column with the config block of every ``.sql`` row.

    Rows whose ``extension`` is not ``.sql`` get ``None``.

    Args:
        df: DataFrame with ``code`` and ``extension`` columns. It is modified
            in place.

    Returns:
        The same ``df`` object, with the ``config`` column added.
    """
    # Built as a plain list instead of a row-wise apply: on an empty frame
    # apply(axis=1) returns a DataFrame, which cannot be assigned to one column.
    df['config'] = [
        extract_config_block(code) if extension == '.sql' else None
        for extension, code in zip(df['extension'], df['code'])
    ]
    return df

# Attach the config block found in each .sql file; other rows get None.
dbt_models_df = add_config_column(dbt_models_df)
dbt_models_df.head(3)

Unnamed: 0,path,name,extension,code,config
0,models/staging/__sources.yml,__sources.yml,.yml,version: 2\n\nsources:\n - name: ecom\n sc...,
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,
2,models/staging/stg_customers.yml,stg_customers.yml,.yml,models:\n - name: stg_customers\n descript...,


In [84]:
# Show the full text stored in the code column for the first row.
dbt_models_df.iloc[0].code

'version: 2\n\nsources:\n  - name: ecom\n    schema: raw\n    description: E-commerce data for the Jaffle Shop\n    freshness:\n      warn_after:\n        count: 24\n        period: hour\n    tables:\n      - name: raw_customers\n        description: One record per person who has purchased one or more items\n      - name: raw_orders\n        description: One record per order (consisting of one or more order items)\n        loaded_at_field: ordered_at\n      - name: raw_items\n        description: Items included in an order\n      - name: raw_stores\n        loaded_at_field: opened_at\n      - name: raw_products\n        description: One record per SKU for items sold in stores\n      - name: raw_supplies\n        description: One record per supply per SKU of items sold in stores\n'

In [85]:
# Hand-written config block used to exercise the config parsing helpers.
# NOTE(review): the assignment below overwrites the 'config' value of the row
# labelled 0, so every later cell sees this sample text instead of what was
# extracted from the repository — looks like a debugging leftover; confirm
# before relying on the results.
test = """
{{
    config(
        materialized="table"
    )
}}
"""

dbt_models_df.at[0, 'config'] = test

In [89]:
def extract_materialized_value(config_text):
    """Pull the quoted value of ``materialized=...`` out of a config block.

    Returns ``None`` when ``config_text`` is empty/None or carries no
    ``materialized`` setting.
    """
    if not config_text:
        return None
    found = re.search(r"materialized\s*=\s*[\"']([^\"']+)[\"']", config_text)
    if found is None:
        return None
    return found.group(1)

def check_is_snapshot(config_text):
    """Tell whether a config block sets a ``strategy`` option.

    Only a standalone ``strategy`` key counts, written as a keyword argument
    (``strategy='timestamp'``) or as a dictionary key (``'strategy': ...``).
    Options that merely contain the word, such as ``incremental_strategy``,
    do not.

    Args:
        config_text: Config block text, or an empty value.

    Returns:
        True when a ``strategy`` option is present, False otherwise.
    """
    if not config_text:
        return False
    # '\b' rejects 'incremental_strategy' because '_' is a word character.
    return re.search(r"\bstrategy\b[\"']?\s*[=:]", config_text) is not None

# Derive the materialization and the snapshot flag from each row's config text.
dbt_models_df['materialized'] = dbt_models_df['config'].apply(extract_materialized_value)
dbt_models_df['is_snapshot'] = dbt_models_df['config'].apply(check_is_snapshot)
# Rows flagged as snapshots are labelled 'snapshot', overriding any materialized value.
dbt_models_df['materialized'] = dbt_models_df.apply(lambda row: 'snapshot' if row['is_snapshot'] else row['materialized'] ,1)

def contains_jinja_code(code_text):
    """Return True when ``code_text`` contains a ``{%`` or ``{#`` opening delimiter.

    Empty or None input yields False.
    """
    if not code_text:
        return False
    return re.search(r"{%|{#", code_text) is not None

# Flag rows whose file content contains a {% or {# delimiter.
dbt_models_df['has_jinja_code'] = dbt_models_df['code'].apply(contains_jinja_code)


def categorize_model(name):
    """Map a file name to a category based on the prefix it starts with.

    Prefixes are checked in order; a name matching none of them is ``"other"``.
    """
    prefix_to_category = (
        ("base", "base"),
        ("stg", "stg"),
        ("int", "int"),
        ("test", "test"),
        ("snap", "snap"),
        ("__sources", "sources"),
    )
    for prefix, category in prefix_to_category:
        if name.startswith(prefix):
            return category
    return "other"

# Categorise each row by the prefix of its file name.
dbt_models_df['model_category'] = dbt_models_df['name'].apply(categorize_model)

def get_vertical(name, model_category):
    """Derive the "vertical" label of a model from its file name and category.

    * ``sources`` rows are labelled ``'sources'``.
    * ``stg`` / ``int`` rows yield the part after ``<category>_`` and before
      the first ``__``, ``.`` or the end of the extension-less name.
    * Every other category, and any name that does not fit the pattern,
      yields the file name without its extension.
    """
    # File name without its last extension.
    stem = re.sub(r'\.[^.]+$', '', name)

    if model_category == 'sources':
        return 'sources'

    if model_category in ('stg', 'int'):
        # For stg or int, capture the vertical that precedes "__" or ".".
        pattern = rf'^{re.escape(model_category)}_([a-z0-9_]+?)(?:__|\.|$)'
        found = re.search(pattern, stem)
        if found:
            return found.group(1)

    # Unknown categories and non-matching names fall back to the bare stem.
    return stem

# Row-wise because the vertical depends on both the file name and its category.
dbt_models_df['vertical'] = dbt_models_df.apply(lambda row: get_vertical(row['name'], row['model_category']), axis=1)

In [90]:
# Display the model table with the columns derived so far.
dbt_models_df

Unnamed: 0,path,name,extension,code,config,yml_code,materialized,is_snapshot,has_jinja_code,model_category,vertical
0,models/staging/__sources.yml,__sources.yml,.yml,version: 2\n\nsources:\n - name: ecom\n sc...,"\n{{\n config(\n materialized=""table...",,table,False,False,sources,sources
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_customers\n descript...,,False,False,stg,customers
2,models/staging/stg_locations.sql,stg_locations.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_locations\n descript...,,False,False,stg,locations
3,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_order_items\n descri...,,False,False,stg,order_items
4,models/staging/stg_orders.sql,stg_orders.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_orders\n description...,,False,False,stg,orders
5,models/staging/stg_products.sql,stg_products.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_products\n descripti...,,False,False,stg,products
6,models/staging/stg_supplies.sql,stg_supplies.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_supplies\n descripti...,,False,False,stg,supplies
7,models/marts/customers.sql,customers.sql,.sql,with\n\ncustomers as (\n\n select * from {{...,,models:\n - name: customers\n description:...,,False,False,other,customers
8,models/marts/locations.sql,locations.sql,.sql,with\n\nlocations as (\n\n select * from {{...,,semantic_models:\n - name: locations\n des...,,False,False,other,locations
9,models/marts/metricflow_time_spine.sql,metricflow_time_spine.sql,.sql,-- metricflow_time_spine.sql\nwith\n\ndays as ...,,,,False,False,other,metricflow_time_spine


In [87]:
def assign_yml_rows_to_each_model(dbt_models_df):
    """Fold each YAML row into the ``.sql`` row that shares its base file name.

    For every row whose extension is ``.yml`` or ``.yaml``, the row named
    ``<base>.sql`` is looked up. When it exists, the YAML content is stored in
    that row's ``yml_code`` and the YAML row is removed. YAML rows without a
    matching ``.sql`` row are kept, with ``yml_code`` left as ``None``.

    Args:
        dbt_models_df: DataFrame with ``name``, ``extension`` and ``code``
            columns. It is modified in place: ``yml_code`` is (re)created,
            merged rows are dropped and the index is reset.

    Returns:
        The same DataFrame object.
    """
    yml_extensions = ('.yml', '.yaml')
    dbt_models_df['yml_code'] = None

    merged_rows = []
    yml_rows = dbt_models_df[dbt_models_df['extension'].isin(yml_extensions)]
    for idx, row in yml_rows.iterrows():
        base_name = row['name'].rsplit('.', 1)[0]
        sql_mask = dbt_models_df['name'] == base_name + '.sql'

        if sql_mask.any():
            dbt_models_df.loc[sql_mask, 'yml_code'] = row['code']
            merged_rows.append(idx)

    # Drop the merged YAML rows once, after the loop, instead of shrinking the
    # frame while it is being iterated.
    dbt_models_df.drop(index=merged_rows, inplace=True)
    dbt_models_df.reset_index(drop=True, inplace=True)

    return dbt_models_df

# Merge YAML rows into their matching .sql rows.
# NOTE(review): this cell's execution count (87) is lower than that of the
# cells placed above it (89, 90), so the notebook was not run top to bottom —
# confirm the intended cell order before a Restart & Run All.
dbt_models_df = assign_yml_rows_to_each_model(dbt_models_df)
display(dbt_models_df)

Unnamed: 0,path,name,extension,code,config,yml_code
0,models/staging/__sources.yml,__sources.yml,.yml,version: 2\n\nsources:\n - name: ecom\n sc...,"\n{{\n config(\n materialized=""table...",
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_customers\n descript...
2,models/staging/stg_locations.sql,stg_locations.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_locations\n descript...
3,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_order_items\n descri...
4,models/staging/stg_orders.sql,stg_orders.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_orders\n description...
5,models/staging/stg_products.sql,stg_products.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_products\n descripti...
6,models/staging/stg_supplies.sql,stg_supplies.sql,.sql,with\n\nsource as (\n\n select * from {{ so...,,models:\n - name: stg_supplies\n descripti...
7,models/marts/customers.sql,customers.sql,.sql,with\n\ncustomers as (\n\n select * from {{...,,models:\n - name: customers\n description:...
8,models/marts/locations.sql,locations.sql,.sql,with\n\nlocations as (\n\n select * from {{...,,semantic_models:\n - name: locations\n des...
9,models/marts/metricflow_time_spine.sql,metricflow_time_spine.sql,.sql,-- metricflow_time_spine.sql\nwith\n\ndays as ...,,
