In [88]:
import pandas as pd
import re
import yaml
import sqlparse
import os
import pandas as pd
import requests


#### Extract repo elements

In [89]:
def extract_owner_and_repo(github_url):
    try:
        # Remove the base URL and split the rest
        parts = github_url.replace("https://github.com/", "").split("/")
        # Validate structure
        if len(parts) >= 2:
            owner = parts[0]
            repo = parts[1]
            return owner, repo
        else:
            raise ValueError("Invalid GitHub URL structure.")
    except Exception as e:
        print(f"Error: {e}")
        return None, None

def list_local_repo_structure(repo_path):
    paths = []
    for root, dirs, files in os.walk(repo_path):
        rel_dir = os.path.relpath(root, repo_path)
        if rel_dir == '.':
            rel_dir = ''
        if rel_dir:
            paths.append(rel_dir + '/')
        for f in files:
            file_path = f"{rel_dir}/{f}" if rel_dir else f
            paths.append(file_path)
    return paths

def list_online_repo_structure(owner, repo):
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/"
    stack = [(url, '')]
    paths = []
    while stack:
        current_url, current_path = stack.pop()
        response = requests.get(current_url)
        if response.status_code == 200:
            items = response.json()
            for item in items:
                if item['type'] == 'dir':
                    paths.append(current_path + item['name'] + '/')
                    stack.append((item['url'], current_path + item['name'] + '/'))
                else:
                    paths.append(current_path + item['name'])
    return paths

In [92]:
def is_online_repo(path):
    return path.startswith("http://") or path.startswith("https://")

local_dbt_repo = ''
online_dbt_repo = 'https://github.com/dbt-labs/jaffle-shop'

# Use local repo?
if False:
    repo_path = local_dbt_repo
else:
    repo_path = online_dbt_repo

is_online = is_online_repo(repo_path)
if is_online:
    owner, repo = extract_owner_and_repo(online_dbt_repo)
    repo_elements = list_online_repo_structure(owner,repo)
else:
    repo_elements = list_local_repo_structure(local_dbt_repo)

print(repo_elements)

['.github/', '.gitignore', '.pre-commit-config.yaml', '.sqlfluff', '.sqlfluffignore', 'README.md', 'Taskfile.yml', 'analyses/', 'data-tests/', 'dbt_project.yml', 'jaffle-data/', 'macros/', 'models/', 'package-lock.yml', 'packages.yml', 'requirements.in', 'requirements.txt', 'seeds/', 'seeds/.gitkeep', 'models/marts/', 'models/staging/', 'models/staging/__sources.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_locations.sql', 'models/staging/stg_locations.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_orders.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.sql', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_supplies.yml', 'models/marts/customers.sql', 'models/marts/customers.yml', 'models/marts/locations.sql', 'models/marts/locations.yml', 'models/marts/metricflow_time_spine.sql', 'models/marts/order_items.sql', 'models/m

#### Select dbt elements

In [93]:
dbt_extensions = ['.sql', '.yml', '.yaml', '.csv']

def select_dbt_elements_by_extension(dbt_extensions, repo_elements):
    # Filter elements with relevant extensions
    return [element for element in repo_elements if any(element.endswith(ext) for ext in dbt_extensions)]

repo_dbt_elements = select_dbt_elements_by_extension(dbt_extensions, repo_elements)
print(repo_dbt_elements)

def select_dbt_models(dbt_extensions, repo_dbt_elements):
    return [
        element for element in repo_dbt_elements
        if element.startswith('models/') and any(element.endswith(ext) for ext in dbt_extensions)
    ]

repo_dbt_models = select_dbt_models(dbt_extensions, repo_dbt_elements)
print(repo_dbt_models)

['.pre-commit-config.yaml', 'Taskfile.yml', 'dbt_project.yml', 'package-lock.yml', 'packages.yml', 'models/staging/__sources.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_locations.sql', 'models/staging/stg_locations.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_orders.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.sql', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_supplies.yml', 'models/marts/customers.sql', 'models/marts/customers.yml', 'models/marts/locations.sql', 'models/marts/locations.yml', 'models/marts/metricflow_time_spine.sql', 'models/marts/order_items.sql', 'models/marts/order_items.yml', 'models/marts/orders.sql', 'models/marts/orders.yml', 'models/marts/products.sql', 'models/marts/products.yml', 'models/marts/supplies.sql', 'models/marts/supplies.yml', 'macros/cents_to_dollars.sql', 'macros/generate

In [94]:
dbt_config_elements = ['packages.yml', 'dbt_project.yml']

In [95]:
def generate_dbt_models_df(repo_dbt_models):
    data = []
    for path in repo_dbt_models:
        name = os.path.basename(path)
        extension = os.path.splitext(name)[1]
        data.append({'path': path, 'name': name, 'extension': extension})
    return pd.DataFrame(data)

dbt_models_df = generate_dbt_models_df(repo_dbt_models)
display(dbt_models_df)

Unnamed: 0,path,name,extension
0,models/staging/__sources.yml,__sources.yml,.yml
1,models/staging/stg_customers.sql,stg_customers.sql,.sql
2,models/staging/stg_customers.yml,stg_customers.yml,.yml
3,models/staging/stg_locations.sql,stg_locations.sql,.sql
4,models/staging/stg_locations.yml,stg_locations.yml,.yml
5,models/staging/stg_order_items.sql,stg_order_items.sql,.sql
6,models/staging/stg_order_items.yml,stg_order_items.yml,.yml
7,models/staging/stg_orders.sql,stg_orders.sql,.sql
8,models/staging/stg_orders.yml,stg_orders.yml,.yml
9,models/staging/stg_products.sql,stg_products.sql,.sql


#### Add code

In [96]:
if False:
    def get_base_url(repo_url):
        if repo_url.startswith("https://github.com"):
            parts = repo_url.replace("https://github.com/", "").split("/")
            owner, repo = parts[0], parts[1]
            return f"https://raw.githubusercontent.com/{owner}/{repo}/main"
        else:
            raise ValueError("URL not valid.")

    def extract_file_content(path, is_online = False, repo_base_url = None):
        try:
            if is_online:
                # Build complete URL
                file_url = f"{repo_base_url}/{path}" if repo_base_url else path
                response = requests.get(file_url)
                if response.status_code == 200:
                    return response.text
                else:
                    return f"Error: {response.status_code} {response.reason}"
            else:
                # Read content
                with open(path, 'r', encoding='utf-8') as file:
                    return file.read()
        except Exception as e:
            return f"Error: {e}"

    def add_code_column(df, is_online = False, repo_url = None):
        if is_online:
            repo_base_url = get_base_url(repo_url)
        else:
            repo_base_url = ''

        df['code'] = df['path'].apply(lambda path: extract_file_content(path, is_online, repo_base_url))
        return df

    dbt_models_df = add_code_column(dbt_models_df, is_online, online_dbt_repo)
    dbt_models_df.head(3)


In [97]:


def get_base_url(repo_url):
    if repo_url.startswith("https://github.com"):
        parts = repo_url.replace("https://github.com/", "").split("/")
        owner, repo = parts[0], parts[1]
        return f"https://raw.githubusercontent.com/{owner}/{repo}/main"
    else:
        raise ValueError("URL not valid.")

def extract_file_content(path, is_online=False, repo_base_url=None):
    try:
        if is_online:
            # Build complete URL
            file_url = f"{repo_base_url}/{path}" if repo_base_url else path
            response = requests.get(file_url)
            if response.status_code == 200:
                content = response.text
            else:
                return f"Error: {response.status_code} {response.reason}"
        else:
            # Read content locally
            with open(path, 'r', encoding='utf-8') as file:
                content = file.read()

        # Process content based on file type
        if path.endswith(('.yml', '.yaml')):
            try:
                return yaml.safe_load(content)  # Parse YAML and return as dictionary
            except yaml.YAMLError as e:
                return f"Error parsing YAML: {e}"
        elif path.endswith('.sql'):
            try:
                return sqlparse.format(content, reindent=True, keyword_case='upper')  # Format SQL
            except Exception as e:
                return f"Error parsing SQL: {e}"
        else:
            return content  # Return plain text for other types

    except Exception as e:
        return f"Error: {e}"

def add_code_column(df, is_online=False, repo_url=None):
    if is_online:
        repo_base_url = get_base_url(repo_url)
    else:
        repo_base_url = ''

    # Extract content for each file and process it based on type
    df['code'] = df['path'].apply(lambda path: extract_file_content(path, is_online, repo_base_url))
    return df

dbt_models_df = add_code_column(dbt_models_df, is_online, online_dbt_repo)
dbt_models_df.head(3)


Unnamed: 0,path,name,extension,code
0,models/staging/__sources.yml,__sources.yml,.yml,"{'version': 2, 'sources': [{'name': 'ecom', 's..."
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...
2,models/staging/stg_customers.yml,stg_customers.yml,.yml,"{'models': [{'name': 'stg_customers', 'descrip..."


In [98]:
dbt_models_df

Unnamed: 0,path,name,extension,code
0,models/staging/__sources.yml,__sources.yml,.yml,"{'version': 2, 'sources': [{'name': 'ecom', 's..."
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...
2,models/staging/stg_customers.yml,stg_customers.yml,.yml,"{'models': [{'name': 'stg_customers', 'descrip..."
3,models/staging/stg_locations.sql,stg_locations.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...
4,models/staging/stg_locations.yml,stg_locations.yml,.yml,"{'models': [{'name': 'stg_locations', 'descrip..."
5,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...
6,models/staging/stg_order_items.yml,stg_order_items.yml,.yml,"{'models': [{'name': 'stg_order_items', 'descr..."
7,models/staging/stg_orders.sql,stg_orders.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...
8,models/staging/stg_orders.yml,stg_orders.yml,.yml,"{'models': [{'name': 'stg_orders', 'descriptio..."
9,models/staging/stg_products.sql,stg_products.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...


In [99]:
dbt_models_df.iloc[1].code

"WITH SOURCE AS\n  (SELECT *\n   FROM {{ source('ecom', 'raw_customers') }}), renamed AS\n  (SELECT ----------  ids\n id AS customer_id, ---------- text\n name AS customer_name\n   FROM SOURCE)\nSELECT *\nFROM renamed"

#### Add config block

In [100]:
def extract_config_block(sql_code):
    pattern = r"{{\s*config\((.*?)\)\s*}}"
    match = re.search(pattern, sql_code, re.DOTALL)
    return match.group(0) if match else None

def add_config_column(df):
    df['config'] = df.apply(
        lambda row: extract_config_block(row['code']) if row['extension'] == '.sql' else None,
        axis=1
    )
    return df

dbt_models_df = add_config_column(dbt_models_df)
dbt_models_df.head(3)

Unnamed: 0,path,name,extension,code,config
0,models/staging/__sources.yml,__sources.yml,.yml,"{'version': 2, 'sources': [{'name': 'ecom', 's...",
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,
2,models/staging/stg_customers.yml,stg_customers.yml,.yml,"{'models': [{'name': 'stg_customers', 'descrip...",


In [101]:
dbt_models_df.iloc[0].code

{'version': 2,
 'sources': [{'name': 'ecom',
   'schema': 'raw',
   'description': 'E-commerce data for the Jaffle Shop',
   'freshness': {'warn_after': {'count': 24, 'period': 'hour'}},
   'tables': [{'name': 'raw_customers',
     'description': 'One record per person who has purchased one or more items'},
    {'name': 'raw_orders',
     'description': 'One record per order (consisting of one or more order items)',
     'loaded_at_field': 'ordered_at'},
    {'name': 'raw_items', 'description': 'Items included in an order'},
    {'name': 'raw_stores', 'loaded_at_field': 'opened_at'},
    {'name': 'raw_products',
     'description': 'One record per SKU for items sold in stores'},
    {'name': 'raw_supplies',
     'description': 'One record per supply per SKU of items sold in stores'}]}]}

In [102]:
test = """
{{
    config(
        materialized="table"
    )
}}
"""

dbt_models_df.at[0, 'config'] = test

In [103]:
def extract_materialized_value(config_text):
    if config_text:
        match = re.search(r"materialized\s*=\s*[\"']([^\"']+)[\"']", config_text)
        return match.group(1) if match else None
    return None

def check_is_snapshot(config_text):
    if config_text:
        return 'strategy' in config_text
    return False

dbt_models_df['materialized'] = dbt_models_df['config'].apply(extract_materialized_value)
dbt_models_df['is_snapshot'] = dbt_models_df['config'].apply(check_is_snapshot)
dbt_models_df['materialized'] = dbt_models_df.apply(lambda row: 'snapshot' if row['is_snapshot'] else row['materialized'] ,1)

def contains_jinja_code(code_text):
    if isinstance(code_text, str):
        return bool(re.search(r"{%|{#", code_text))
    return False

dbt_models_df['has_jinja_code'] = dbt_models_df['code'].apply(contains_jinja_code)


def categorize_model(name):
    if name.startswith("base"):
        return "base"
    elif name.startswith("stg"):
        return "stg"
    elif name.startswith("int"):
        return "int"
    elif name.startswith("test"):
        return "test"
    elif name.startswith("snap"):
        return "snap"
    elif name.startswith("__sources"):
        return "sources"
    else:
        return "other"

dbt_models_df['model_category'] = dbt_models_df['name'].apply(categorize_model)

def get_vertical(name, model_category):
    base_name = re.sub(r'\.[^.]+$', '', name)
    
    if model_category == 'sources':
        return 'sources'
    
    known_categories = ['stg', 'int']
    if model_category not in known_categories:
        # Para model_category = other u otras no conocidas, devolver base_name sin extensión
        return base_name
    
    # Para stg o int, extraer vertical antes de "__" o "."
    pattern = rf'^{re.escape(model_category)}_([a-z0-9_]+?)(?:__|\.|$)'
    match = re.search(pattern, base_name)
    return match.group(1) if match else base_name

dbt_models_df['vertical'] = dbt_models_df.apply(lambda row: get_vertical(row['name'], row['model_category']), axis=1)

In [104]:
dbt_models_df

Unnamed: 0,path,name,extension,code,config,materialized,is_snapshot,has_jinja_code,model_category,vertical
0,models/staging/__sources.yml,__sources.yml,.yml,"{'version': 2, 'sources': [{'name': 'ecom', 's...","\n{{\n config(\n materialized=""table...",table,False,False,sources,sources
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,customers
2,models/staging/stg_customers.yml,stg_customers.yml,.yml,"{'models': [{'name': 'stg_customers', 'descrip...",,,False,False,stg,customers
3,models/staging/stg_locations.sql,stg_locations.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,locations
4,models/staging/stg_locations.yml,stg_locations.yml,.yml,"{'models': [{'name': 'stg_locations', 'descrip...",,,False,False,stg,locations
5,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,order_items
6,models/staging/stg_order_items.yml,stg_order_items.yml,.yml,"{'models': [{'name': 'stg_order_items', 'descr...",,,False,False,stg,order_items
7,models/staging/stg_orders.sql,stg_orders.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,orders
8,models/staging/stg_orders.yml,stg_orders.yml,.yml,"{'models': [{'name': 'stg_orders', 'descriptio...",,,False,False,stg,orders
9,models/staging/stg_products.sql,stg_products.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,products


In [105]:
def assign_yml_rows_to_each_model(dbt_models_df):
    dbt_models_df['yml_code'] = None

    yml_df = dbt_models_df[dbt_models_df['extension'] == '.yml'].copy()
    yml_df['delete'] = False

    for idx, row in yml_df.iterrows():
        base_name = row['name'].rsplit('.', 1)[0]

        sql_match = dbt_models_df[(dbt_models_df['name'] == base_name + '.sql')]

        if not sql_match.empty:
            dbt_models_df.at[sql_match.index[0], 'yml_code'] = row['code']
            yml_df.at[idx, 'delete'] = True
        else:
            yml_df.at[idx, 'yml_code'] = row['code']
            yml_df.at[idx, 'code'] = None

    yml_df = yml_df[~yml_df['delete']]

    dbt_models_df = dbt_models_df[dbt_models_df['extension'] != '.yml']

    yml_df = yml_df.drop(columns=['delete'])
    dbt_models_df = pd.concat([dbt_models_df, yml_df], ignore_index=True)

    return dbt_models_df

dbt_models_df = assign_yml_rows_to_each_model(dbt_models_df)


In [106]:
dbt_models_df

Unnamed: 0,path,name,extension,code,config,materialized,is_snapshot,has_jinja_code,model_category,vertical,yml_code
0,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,customers,"{'models': [{'name': 'stg_customers', 'descrip..."
1,models/staging/stg_locations.sql,stg_locations.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,locations,"{'models': [{'name': 'stg_locations', 'descrip..."
2,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,order_items,"{'models': [{'name': 'stg_order_items', 'descr..."
3,models/staging/stg_orders.sql,stg_orders.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,orders,"{'models': [{'name': 'stg_orders', 'descriptio..."
4,models/staging/stg_products.sql,stg_products.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,products,"{'models': [{'name': 'stg_products', 'descript..."
5,models/staging/stg_supplies.sql,stg_supplies.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,supplies,"{'models': [{'name': 'stg_supplies', 'descript..."
6,models/marts/customers.sql,customers.sql,.sql,WITH customers AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,customers,"{'models': [{'name': 'customers', 'description..."
7,models/marts/locations.sql,locations.sql,.sql,WITH locations AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,locations,"{'semantic_models': [{'name': 'locations', 'de..."
8,models/marts/metricflow_time_spine.sql,metricflow_time_spine.sql,.sql,-- metricflow_time_spine.sql\nWITH days AS (--...,,,False,False,other,metricflow_time_spine,
9,models/marts/order_items.sql,order_items.sql,.sql,WITH order_items AS\n (SELECT *\n FROM {{ r...,,,False,False,other,order_items,"{'models': [{'name': 'order_items', 'columns':..."


In [109]:
def has_tests(yml_code):
    if isinstance(yml_code, dict):
        # Check if any column contains 'tests' or 'data_tests'
        return any('tests' in column or 'data_tests' in column for column in yml_code.get('columns', []))
    return False

def extract_tests(yml_code):
    if not yml_code or not isinstance(yml_code, dict):
        return None

    tests_dict = {}
    # Check for columns and extract tests
    for column in yml_code.get('columns', []):
        column_name = column.get('name')
        if column_name:
            tests = column.get('tests', []) + column.get('data_tests', [])
            if tests:
                tests_dict[column_name] = tests
    return tests_dict if tests_dict else None

dbt_models_df['has_tests'] = dbt_models_df['yml_code'].apply(has_tests)
dbt_models_df['tests'] = dbt_models_df['yml_code'].apply(extract_tests)


In [110]:
dbt_models_df

Unnamed: 0,path,name,extension,code,config,materialized,is_snapshot,has_jinja_code,model_category,vertical,yml_code,has_tests,tests
0,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,customers,"{'models': [{'name': 'stg_customers', 'descrip...",False,
1,models/staging/stg_locations.sql,stg_locations.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,locations,"{'models': [{'name': 'stg_locations', 'descrip...",False,
2,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,order_items,"{'models': [{'name': 'stg_order_items', 'descr...",False,
3,models/staging/stg_orders.sql,stg_orders.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,orders,"{'models': [{'name': 'stg_orders', 'descriptio...",False,
4,models/staging/stg_products.sql,stg_products.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,products,"{'models': [{'name': 'stg_products', 'descript...",False,
5,models/staging/stg_supplies.sql,stg_supplies.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,supplies,"{'models': [{'name': 'stg_supplies', 'descript...",False,
6,models/marts/customers.sql,customers.sql,.sql,WITH customers AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,customers,"{'models': [{'name': 'customers', 'description...",False,
7,models/marts/locations.sql,locations.sql,.sql,WITH locations AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,locations,"{'semantic_models': [{'name': 'locations', 'de...",False,
8,models/marts/metricflow_time_spine.sql,metricflow_time_spine.sql,.sql,-- metricflow_time_spine.sql\nWITH days AS (--...,,,False,False,other,metricflow_time_spine,,False,
9,models/marts/order_items.sql,order_items.sql,.sql,WITH order_items AS\n (SELECT *\n FROM {{ r...,,,False,False,other,order_items,"{'models': [{'name': 'order_items', 'columns':...",False,
