In [1]:
import pandas as pd
import re
import yaml
import sqlparse
import os
import pandas as pd
import requests

In [2]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 30)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', 10) 

#### Extract repo elements

In [3]:
def extract_owner_and_repo(github_url):
    try:
        # Remove the base URL and split the rest
        parts = github_url.replace("https://github.com/", "").split("/")
        # Validate structure
        if len(parts) >= 2:
            owner = parts[0]
            repo = parts[1]
            return owner, repo
        else:
            raise ValueError("Invalid GitHub URL structure.")
    except Exception as e:
        print(f"Error: {e}")
        return None, None

def list_local_repo_structure(repo_path):
    paths = []
    for root, dirs, files in os.walk(repo_path):
        rel_dir = os.path.relpath(root, repo_path)
        if rel_dir == '.':
            rel_dir = ''
        if rel_dir:
            paths.append(rel_dir + '/')
        for f in files:
            file_path = f"{rel_dir}/{f}" if rel_dir else f
            paths.append(file_path)
    return paths

def list_online_repo_structure(owner, repo):
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/"
    stack = [(url, '')]
    paths = []
    while stack:
        current_url, current_path = stack.pop()
        response = requests.get(current_url)
        if response.status_code == 200:
            items = response.json()
            for item in items:
                if item['type'] == 'dir':
                    paths.append(current_path + item['name'] + '/')
                    stack.append((item['url'], current_path + item['name'] + '/'))
                else:
                    paths.append(current_path + item['name'])
    return paths

In [4]:
def is_online_repo(path):
    return path.startswith("http://") or path.startswith("https://")

local_dbt_repo = ''
online_dbt_repo = 'https://github.com/dbt-labs/jaffle-shop'

# Use local repo?
if False:
    repo_path = local_dbt_repo
else:
    repo_path = online_dbt_repo

is_online = is_online_repo(repo_path)
if is_online:
    owner, repo = extract_owner_and_repo(online_dbt_repo)
    repo_elements = list_online_repo_structure(owner,repo)
else:
    repo_elements = list_local_repo_structure(local_dbt_repo)

print(repo_elements)

['.github/', '.gitignore', '.pre-commit-config.yaml', '.sqlfluff', '.sqlfluffignore', 'README.md', 'Taskfile.yml', 'analyses/', 'data-tests/', 'dbt_project.yml', 'jaffle-data/', 'macros/', 'models/', 'package-lock.yml', 'packages.yml', 'requirements.in', 'requirements.txt', 'seeds/', 'seeds/.gitkeep', 'models/marts/', 'models/staging/', 'models/staging/__sources.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_locations.sql', 'models/staging/stg_locations.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_orders.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.sql', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_supplies.yml', 'models/marts/customers.sql', 'models/marts/customers.yml', 'models/marts/locations.sql', 'models/marts/locations.yml', 'models/marts/metricflow_time_spine.sql', 'models/marts/order_items.sql', 'models/m

### dbt models knowledge db

#### Select dbt elements

In [5]:
dbt_extensions = ['.sql', '.yml', '.yaml', '.csv']

def select_dbt_elements_by_extension(dbt_extensions, repo_elements):
    # Filter elements with relevant extensions
    return [element for element in repo_elements if any(element.endswith(ext) for ext in dbt_extensions)]

repo_dbt_elements = select_dbt_elements_by_extension(dbt_extensions, repo_elements)
print(repo_dbt_elements)

def select_dbt_models(dbt_extensions, repo_dbt_elements):
    return [
        element for element in repo_dbt_elements
        if element.startswith('models/') and any(element.endswith(ext) for ext in dbt_extensions)
    ]

repo_dbt_models = select_dbt_models(dbt_extensions, repo_dbt_elements)
print(repo_dbt_models)

['.pre-commit-config.yaml', 'Taskfile.yml', 'dbt_project.yml', 'package-lock.yml', 'packages.yml', 'models/staging/__sources.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_locations.sql', 'models/staging/stg_locations.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_orders.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.sql', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_supplies.yml', 'models/marts/customers.sql', 'models/marts/customers.yml', 'models/marts/locations.sql', 'models/marts/locations.yml', 'models/marts/metricflow_time_spine.sql', 'models/marts/order_items.sql', 'models/marts/order_items.yml', 'models/marts/orders.sql', 'models/marts/orders.yml', 'models/marts/products.sql', 'models/marts/products.yml', 'models/marts/supplies.sql', 'models/marts/supplies.yml', 'macros/cents_to_dollars.sql', 'macros/generate

In [6]:
dbt_config_elements = ['packages.yml', 'dbt_project.yml']

In [7]:
def generate_dbt_models_df(repo_dbt_models):
    data = []
    for path in repo_dbt_models:
        name = os.path.basename(path)
        extension = os.path.splitext(name)[1]
        data.append({'path': path, 'name': name, 'extension': extension})
    return pd.DataFrame(data)

dbt_models_df = generate_dbt_models_df(repo_dbt_models)
display(dbt_models_df)

Unnamed: 0,path,name,extension
0,models/staging/__sources.yml,__sources.yml,.yml
1,models/staging/stg_customers.sql,stg_customers.sql,.sql
2,models/staging/stg_customers.yml,stg_customers.yml,.yml
3,models/staging/stg_locations.sql,stg_locations.sql,.sql
4,models/staging/stg_locations.yml,stg_locations.yml,.yml
...,...,...,...
21,models/marts/orders.yml,orders.yml,.yml
22,models/marts/products.sql,products.sql,.sql
23,models/marts/products.yml,products.yml,.yml
24,models/marts/supplies.sql,supplies.sql,.sql


#### Add sql code

In [8]:
if False:
    def get_base_url(repo_url):
        if repo_url.startswith("https://github.com"):
            parts = repo_url.replace("https://github.com/", "").split("/")
            owner, repo = parts[0], parts[1]
            return f"https://raw.githubusercontent.com/{owner}/{repo}/main"
        else:
            raise ValueError("URL not valid.")

    def extract_file_content(path, is_online = False, repo_base_url = None):
        try:
            if is_online:
                # Build complete URL
                file_url = f"{repo_base_url}/{path}" if repo_base_url else path
                response = requests.get(file_url)
                if response.status_code == 200:
                    return response.text
                else:
                    return f"Error: {response.status_code} {response.reason}"
            else:
                # Read content
                with open(path, 'r', encoding='utf-8') as file:
                    return file.read()
        except Exception as e:
            return f"Error: {e}"

    def add_code_column(df, is_online = False, repo_url = None):
        if is_online:
            repo_base_url = get_base_url(repo_url)
        else:
            repo_base_url = ''

        df['sql_code'] = df['path'].apply(lambda path: extract_file_content(path, is_online, repo_base_url))
        return df

    dbt_models_df = add_code_column(dbt_models_df, is_online, online_dbt_repo)
    dbt_models_df.head(3)


In [9]:
def get_base_url(repo_url):
    if repo_url.startswith("https://github.com"):
        parts = repo_url.replace("https://github.com/", "").split("/")
        owner, repo = parts[0], parts[1]
        return f"https://raw.githubusercontent.com/{owner}/{repo}/main"
    else:
        raise ValueError("URL not valid.")

def extract_file_content(path, is_online=False, repo_base_url=None):
    try:
        if is_online:
            # Build complete URL
            file_url = f"{repo_base_url}/{path}" if repo_base_url else path
            response = requests.get(file_url)
            if response.status_code == 200:
                content = response.text
            else:
                return f"Error: {response.status_code} {response.reason}"
        else:
            # Read content locally
            with open(path, 'r', encoding='utf-8') as file:
                content = file.read()

        # Process content based on file type
        if path.endswith(('.yml', '.yaml')):
            try:
                return yaml.safe_load(content)  # Parse YAML and return as dictionary
            except yaml.YAMLError as e:
                return f"Error parsing YAML: {e}"
        elif path.endswith('.sql'):
            try:
                return sqlparse.format(content, reindent=True, keyword_case='upper')  # Format SQL
            except Exception as e:
                return f"Error parsing SQL: {e}"
        else:
            return content  # Return plain text for other types

    except Exception as e:
        return f"Error: {e}"

def add_code_column(df, is_online=False, repo_url=None):
    if is_online:
        repo_base_url = get_base_url(repo_url)
    else:
        repo_base_url = ''

    # Extract content for each file and process it based on type
    df['sql_code'] = df['path'].apply(lambda path: extract_file_content(path, is_online, repo_base_url))
    return df

dbt_models_df = add_code_column(dbt_models_df, is_online, online_dbt_repo)
dbt_models_df.head(3)


Unnamed: 0,path,name,extension,sql_code
0,models/staging/__sources.yml,__sources.yml,.yml,"{'version': 2, 'sources': [{'name': 'ecom', 's..."
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...
2,models/staging/stg_customers.yml,stg_customers.yml,.yml,"{'models': [{'name': 'stg_customers', 'descrip..."


#### Add config block

In [10]:
def extract_config_block(sql_code):
    pattern = r"{{\s*config\((.*?)\)\s*}}"
    match = re.search(pattern, sql_code, re.DOTALL)
    return match.group(0) if match else None

def add_config_column(df):
    df['config'] = df.apply(
        lambda row: extract_config_block(row['sql_code']) if row['extension'] == '.sql' else None,
        axis=1
    )
    return df

dbt_models_df = add_config_column(dbt_models_df)
dbt_models_df.head(3)

Unnamed: 0,path,name,extension,sql_code,config
0,models/staging/__sources.yml,__sources.yml,.yml,"{'version': 2, 'sources': [{'name': 'ecom', 's...",
1,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,
2,models/staging/stg_customers.yml,stg_customers.yml,.yml,"{'models': [{'name': 'stg_customers', 'descrip...",


In [11]:
test = """
{{
    config(
        materialized="table"
    )
}}
"""

dbt_models_df.at[0, 'config'] = test

#### Add model metadata

In [12]:
def extract_materialized_value(config_text):
    if config_text:
        match = re.search(r"materialized\s*=\s*[\"']([^\"']+)[\"']", config_text)
        return match.group(1) if match else None
    return None

def check_is_snapshot(config_text):
    if config_text:
        return 'strategy' in config_text
    return False

dbt_models_df['materialized'] = dbt_models_df['config'].apply(extract_materialized_value)
dbt_models_df['is_snapshot'] = dbt_models_df['config'].apply(check_is_snapshot)
dbt_models_df['materialized'] = dbt_models_df.apply(lambda row: 'snapshot' if row['is_snapshot'] else row['materialized'] ,1)

def contains_jinja_code(code_text):
    if isinstance(code_text, str):
        return bool(re.search(r"{%|{#", code_text))
    return False

dbt_models_df['has_jinja_code'] = dbt_models_df['sql_code'].apply(contains_jinja_code)


def categorize_model(name):
    if name.startswith("base"):
        return "base"
    elif name.startswith("stg"):
        return "stg"
    elif name.startswith("int"):
        return "int"
    elif name.startswith("test"):
        return "test"
    elif name.startswith("snap"):
        return "snap"
    elif name.startswith("__sources"):
        return "sources"
    else:
        return "other"

dbt_models_df['model_category'] = dbt_models_df['name'].apply(categorize_model)

def get_vertical(name, model_category):
    base_name = re.sub(r'\.[^.]+$', '', name)
    
    if model_category == 'sources':
        return 'sources'
    
    known_categories = ['stg', 'int']
    if model_category not in known_categories:
        # Para model_category = other u otras no conocidas, devolver base_name sin extensión
        return base_name
    
    # Para stg o int, extraer vertical antes de "__" o "."
    pattern = rf'^{re.escape(model_category)}_([a-z0-9_]+?)(?:__|\.|$)'
    match = re.search(pattern, base_name)
    return match.group(1) if match else base_name

dbt_models_df['vertical'] = dbt_models_df.apply(lambda row: get_vertical(row['name'], row['model_category']), axis=1)

#### Zip the dataframe by models

In [13]:
def assign_yml_rows_to_each_model(dbt_models_df):
    dbt_models_df['yml_code'] = None

    yml_df = dbt_models_df[dbt_models_df['extension'] == '.yml'].copy()
    yml_df['delete'] = False

    for idx, row in yml_df.iterrows():
        base_name = row['name'].rsplit('.', 1)[0]

        sql_match = dbt_models_df[(dbt_models_df['name'] == base_name + '.sql')]

        if not sql_match.empty:
            dbt_models_df.at[sql_match.index[0], 'yml_code'] = row['sql_code']
            yml_df.at[idx, 'delete'] = True
        else:
            yml_df.at[idx, 'yml_code'] = row['sql_code']
            yml_df.at[idx, 'sql_code'] = None

    yml_df = yml_df[~yml_df['delete']]

    dbt_models_df = dbt_models_df[dbt_models_df['extension'] != '.yml']

    yml_df = yml_df.drop(columns=['delete'])
    dbt_models_df = pd.concat([dbt_models_df, yml_df], ignore_index=True)

    return dbt_models_df

dbt_models_df = assign_yml_rows_to_each_model(dbt_models_df)


#### Extract sql code info

In [14]:
def extract_tests(yml_code):
    if not isinstance(yml_code, dict):
        return None

    tests_dict = {'columns': {}, 'unit_tests': []}

    # Extract tests from all models
    for model in yml_code.get('models', []):
        for column in model.get('columns', []):
            column_name = column.get('name')
            if column_name:
                # Combine 'tests' and 'data_tests' if present
                tests = column.get('tests', []) + column.get('data_tests', [])
                if tests:
                    tests_dict['columns'][column_name] = tests

    # Extract unit tests
    if 'unit_tests' in yml_code:
        unit_test_names = [test.get('name') for test in yml_code['unit_tests'] if test.get('name')]
        if unit_test_names:
            tests_dict['unit_tests'] = unit_test_names

    return tests_dict if tests_dict['columns'] or tests_dict['unit_tests'] else None

dbt_models_df['tests'] = dbt_models_df['yml_code'].apply(extract_tests)
dbt_models_df['has_tests'] = dbt_models_df['tests'].apply(lambda x: x is not None)


In [15]:
def extract_ids_from_query(code):
    if not isinstance(code, str):
        return None
    
    # Parse the SQL query
    parsed = sqlparse.parse(code)
    if not parsed:
        return None
    
    # Regular expression to find columns ending in '_id'
    id_pattern = re.compile(r'\b(\w+_id)\b')
    
    cte_ids = set()
    output_ids = set()
    
    for statement in parsed:
        # Flatten tokens to handle nested structures
        token_list = sqlparse.sql.TokenList(statement.tokens).flatten()
        inside_cte = False
        
        for token in token_list:
            # Detect CTE start (with keyword 'WITH')
            if token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'WITH':
                inside_cte = True
            
            # Detect SELECT after a WITH block ends
            if inside_cte and token.ttype is sqlparse.tokens.Keyword and token.value.upper() == 'SELECT':
                inside_cte = False
            
            if token.ttype is sqlparse.tokens.Name or token.ttype is None:
                match = id_pattern.search(token.value)
                if match:
                    if inside_cte:
                        cte_ids.add(match.group(1))
                    else:
                        output_ids.add(match.group(1))
    ids = {
        'cte_ids': list(cte_ids),
        'output_ids': list(output_ids)
    }
    return ids['output_ids'] if ids['output_ids'] != [] else None

dbt_models_df['sql_ids'] = dbt_models_df['sql_code'].apply(extract_ids_from_query)


In [16]:
def has_select_all_in_last_select(code):
    if not isinstance(code, str):
        return False

    parsed = sqlparse.parse(code)
    if not parsed:
        return False

    select_statements = [stmt for stmt in parsed if stmt.get_type() == 'SELECT']
    if not select_statements:
        return False
    last_select = select_statements[-1]

    for token in last_select.tokens:
        if token.ttype is sqlparse.tokens.Wildcard and token.value == '*':
            return True

    return False

dbt_models_df['has_select_all_in_last_select'] = dbt_models_df['sql_code'].apply(has_select_all_in_last_select)


In [17]:
def has_group_by(code):
    if not isinstance(code, str):
        return False

    parsed = sqlparse.parse(code)
    if not parsed:
        return False
    return 'group by' in code.lower()


dbt_models_df['has_group_by'] = dbt_models_df['sql_code'].apply(has_group_by)

In [18]:
def find_primary_key(tests_dict):
    if not isinstance(tests_dict, dict) or 'columns' not in tests_dict:
        return None

    for column, tests in tests_dict.get('columns', {}).items():
        # Check if the column has the required tests for a primary key
        if tests == ['not_null', 'unique'] or 'dbt_constraints.primary_key' in tests:
            return column
    
    return None

dbt_models_df['primary_key'] = dbt_models_df['tests'].apply(find_primary_key)

In [19]:
def extract_sql_filters(sql_query):
    if not isinstance(sql_query, str) or not sql_query.strip():
        return None

    sql_query_clean = ' '.join(sql_query.split()).lower()

    filters_patterns = [
        (r'\bwhere\b\s+(.*?)(?=\bgroup\b|\border\b|\blimit\b|\bhaving\b|;|$)', 'where'),
        (r'\bon\b\s+(.*?)(?=\bleft\b|\bright\b|\binner\b|\bouter\b|\bjoin\b|\bselect\b|\bwhere\b|\bgroup\b|\border\b|\blimit\b|;|$)', 'join'),
        (r'\bhaving\b\s+(.*?)(?=\bgroup\b|\border\b|\blimit\b|;|$)', 'having')
    ]

    filters = []
    joins = []

    for pattern, clause_type in filters_patterns:
        matches = re.findall(pattern, sql_query_clean, re.DOTALL)
        for match in matches:
            sub_conditions = re.split(r'\band\b|\bor\b', match)
            for condition in sub_conditions:
                cleaned = condition.strip().strip('()')
                if cleaned:
                    if clause_type == 'join':
                        joins.append(cleaned)
                    else:
                        filters.append(cleaned)
    all_filters = filters + joins
    return all_filters if all_filters != [] else None

dbt_models_df['filters'] = dbt_models_df['sql_code'].apply(extract_sql_filters)
dbt_models_df['is_filtered'] = dbt_models_df['filters'].apply(lambda x: x is not None)

In [20]:
def extract_dbt_macros(sql_query):

    if not isinstance(sql_query, str) or not sql_query.strip():
        return None
    
    macro_pattern = r"\{\{\s*([\w\.]+)\s*\(.*?\)\s*\}\}"
    matches = re.findall(macro_pattern, sql_query)
    filtered_macros = sorted(set(m for m in matches if m not in ('ref', 'source')))
    
    return filtered_macros if filtered_macros != [] else None

dbt_models_df['macros'] = dbt_models_df['sql_code'].apply(extract_dbt_macros)
dbt_models_df['has_macros'] = dbt_models_df['macros'].apply(lambda x: x is not None)

#### Calculate models structure

In [21]:
def extract_source_details(code, source_pattern):
    if not isinstance(code, str):
        return False, None
    sources = re.findall(source_pattern, code)
    if sources:
        return True, [f"{source[0]}.{source[1]}" for source in sources]
    return False, None

def enrich_dbt_models(dbt_models_df):
    # Helper regex patterns
    source_pattern = r"\{\{\s*source\(['\"](.*?)['\"],\s*['\"](.*?)['\"]\)\s*\}\}"
    ref_pattern = r"\{\{\s*ref\(['\"](.*?)['\"]\)\s*\}\}"
    
    # Add 'parent_models' - extract all models referenced using 'ref'
    dbt_models_df['parent_models'] = dbt_models_df['sql_code'].apply(
        lambda code: re.findall(ref_pattern, code) if isinstance(code, str) else []
    )
    
    dbt_models_df[['is_source_model', 'source']] = dbt_models_df['sql_code'].apply(
        lambda code: pd.Series(extract_source_details(code, source_pattern))
    )
    
    # Build a dictionary to track children relationships
    model_children = {}
    for idx, row in dbt_models_df.iterrows():
        for parent in row['parent_models']:
            model_children.setdefault(parent, []).append(row['name'].replace('.sql', ''))

    # Add 'children_models' - list all models that depend on this model
    dbt_models_df['children_models'] = dbt_models_df['name'].apply(
        lambda name: model_children.get(name.replace('.sql', ''), [])
    )
    
    # Add 'is_end_model' - True if there are no children
    dbt_models_df['is_end_model'] = dbt_models_df['children_models'].apply(lambda children: len(children) == 0)
    
    return dbt_models_df

dbt_models_enriched_df = enrich_dbt_models(dbt_models_df)

In [22]:
display(dbt_models_enriched_df)

Unnamed: 0,path,name,extension,sql_code,config,materialized,is_snapshot,has_jinja_code,model_category,vertical,yml_code,tests,has_tests,sql_ids,has_select_all_in_last_select,has_group_by,primary_key,filters,is_filtered,macros,has_macros,parent_models,is_source_model,source,children_models,is_end_model
0,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,customers,"{'models': [{'name': 'stg_customers', 'descrip...","{'columns': {'customer_id': ['not_null', 'uniq...",True,[customer_id],True,False,customer_id,,False,,False,[],True,[ecom.raw_customers],[customers],False
1,models/staging/stg_locations.sql,stg_locations.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,locations,"{'models': [{'name': 'stg_locations', 'descrip...","{'columns': {'location_id': ['not_null', 'uniq...",True,[location_id],True,False,location_id,,False,[dbt.date_trunc],True,[],True,[ecom.raw_stores],[locations],False
2,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,order_items,"{'models': [{'name': 'stg_order_items', 'descr...","{'columns': {'order_item_id': ['not_null', 'un...",True,"[order_id, product_id, order_item_id]",True,False,order_item_id,,False,,False,[],True,[ecom.raw_items],[order_items],False
3,models/staging/stg_orders.sql,stg_orders.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,orders,"{'models': [{'name': 'stg_orders', 'descriptio...","{'columns': {'order_id': ['not_null', 'unique'...",True,"[order_id, store_id, customer_id, location_id]",True,False,order_id,,False,"[cents_to_dollars, dbt.date_trunc]",True,[],True,[ecom.raw_orders],"[order_items, orders]",False
4,models/staging/stg_products.sql,stg_products.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,products,"{'models': [{'name': 'stg_products', 'descript...","{'columns': {'product_id': ['not_null', 'uniqu...",True,[product_id],True,False,product_id,,False,[cents_to_dollars],True,[],True,[ecom.raw_products],"[order_items, products]",False
5,models/staging/stg_supplies.sql,stg_supplies.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,supplies,"{'models': [{'name': 'stg_supplies', 'descript...","{'columns': {'supply_uuid': ['not_null', 'uniq...",True,"[product_id, supply_id]",True,False,supply_uuid,,False,"[cents_to_dollars, dbt_utils.generate_surrogat...",True,[],True,[ecom.raw_supplies],"[order_items, supplies]",False
6,models/marts/customers.sql,customers.sql,.sql,WITH customers AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,customers,"{'models': [{'name': 'customers', 'description...","{'columns': {'customer_id': ['not_null', 'uniq...",True,"[order_id, customer_id]",True,True,customer_id,[customers.customer_id = customer_orders_summa...,True,,False,"[stg_customers, orders]",False,,[],True
7,models/marts/locations.sql,locations.sql,.sql,WITH locations AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,locations,"{'semantic_models': [{'name': 'locations', 'de...",,False,,True,False,,,False,,False,[stg_locations],False,,[],True
8,models/marts/metricflow_time_spine.sql,metricflow_time_spine.sql,.sql,-- metricflow_time_spine.sql\nWITH days AS (--...,,,False,False,other,metricflow_time_spine,,,False,,True,False,,,False,[dbt_date.get_base_dates],True,[],False,,[],True
9,models/marts/order_items.sql,order_items.sql,.sql,WITH order_items AS\n (SELECT *\n FROM {{ r...,,,False,False,other,order_items,"{'models': [{'name': 'order_items', 'columns':...","{'columns': {'order_item_id': ['not_null', 'un...",True,"[order_id, product_id]",True,True,order_item_id,"[order_items.order_id = orders.order_id, order...",True,,False,"[stg_order_items, stg_orders, stg_products, st...",False,,[orders],False


### Add descriptions using LLM

In [38]:
def add_repo_root_path():
    import os
    import sys
    repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    if repo_root not in sys.path:
        sys.path.append(repo_root)

In [39]:
add_repo_root_path()
import openai_setup

OPENAI_API_KEY = openai_setup.conf['key']
OPENAI_PROJECT = openai_setup.conf['project']
OPENAI_ORGANIZATION = openai_setup.conf['organization']
DEFAULT_LLM_MODEL = "gpt-4o-mini"

In [24]:
from langchain_openai import ChatOpenAI  # Importa el modelo de OpenAI
from langchain.schema import HumanMessage  # Para interactuar con mensajes

llm = ChatOpenAI(model=DEFAULT_LLM_MODEL, temperature=0.1, openai_api_key=OPENAI_API_KEY, openai_organization = OPENAI_ORGANIZATION)

In [25]:
def generate_query_description(llm, query, documentation = None):
    # Context and prompt
    prompt = f"""
        You are an expert data analyst, dbt analytics engineer and technical writer. 
        Your task is to generate a concise, clear, and standardized description of the following SQL query of a dbt model.

        SQL Query of the dbt model:
        {query}

        Additional Documentation in the model yaml file:
        {documentation}

        Guidelines:
        1. Describe the **main purpose** of the query in 2 to 3 sentences.
        2. Include the following details explicitly:
        - Tables or data sources referenced.
        - Filters, conditions, and joins applied.
        - Any aggregations (e.g., SUM, COUNT) or calculations performed.
        3. Avoid technical jargon and vague expressions.
        4. Limit the description to around **50 words** maximum.
        5. Maintain a similar level of detail and length for all responses to ensure consistency.
        6. Don't use "This query" or "This model", all the info must usefull and coherent.
        
        Format Example:
        "Retrieves all customer records from the 'customers' table where the country is 'US'. It joins the 'orders' table on 'customer_id' to calculate the total order amount per customer using SUM(). The result is grouped by 'customer_id'."
    """

    # Interact
    response = llm([HumanMessage(content=prompt)])
    return response.content

In [26]:
example_query = dbt_models_enriched_df.iloc[1].sql_code
print(example_query)
example_doc = dbt_models_enriched_df.iloc[1].yml_code
print(example_doc)

generate_query_description(llm, example_query, example_doc)

WITH SOURCE AS
  (SELECT *
   FROM {{ source('ecom', 'raw_stores') }}), renamed AS
  (SELECT ----------  ids
 id AS location_id, ---------- text
 name AS location_name, ---------- numerics
 tax_rate, ---------- timestamps
 {{ dbt.date_trunc('day', 'opened_at') }} AS opened_date
   FROM SOURCE)
SELECT *
FROM renamed
{'models': [{'name': 'stg_locations', 'description': 'List of open locations with basic cleaning and transformation applied, one row per location.', 'columns': [{'name': 'location_id', 'description': 'The unique key for each location.', 'data_tests': ['not_null', 'unique']}]}], 'unit_tests': [{'name': 'test_does_location_opened_at_trunc_to_date', 'description': 'Check that opened_at timestamp is properly truncated to a date.', 'model': 'stg_locations', 'given': [{'input': "source('ecom', 'raw_stores')", 'rows': [{'id': 1, 'name': 'Vice City', 'tax_rate': 0.2, 'opened_at': '2016-09-01T00:00:00'}, {'id': 2, 'name': 'San Andreas', 'tax_rate': 0.1, 'opened_at': '2079-10-27T23:59

  response = llm([HumanMessage(content=prompt)])


'"Extracts a list of open store locations from the \'raw_stores\' source table, renaming columns for clarity. It includes the unique location identifier, location name, tax rate, and truncates the \'opened_at\' timestamp to a date format. Each row represents a distinct location."'

In [27]:
from tqdm import tqdm
tqdm.pandas()

def generate_model_description(row):
    if pd.notna(row['sql_code']) or pd.notna(row['yml_code']):
        sql_code = row['sql_code'] if pd.notna(row['sql_code']) else ""
        yml_code = row['yml_code'] if pd.notna(row['yml_code']) else ""
        return generate_query_description(llm, sql_code, yml_code)
    return None

dbt_models_enriched_df['model_description'] = dbt_models_enriched_df.progress_apply(generate_model_description, axis=1)

100%|██████████| 14/14 [00:28<00:00,  2.05s/it]


In [33]:
def generate_jinja_code_description(llm, query, documentation = None):
    # Context and prompt
    prompt = f"""
        You are an expert data analyst, dbt analytics engineer, technical writer, and Jinja programmer. 
        Your task is to generate a concise, clear, and standardized description of the Jinja code within the dbt model.

        SQL Query of the dbt model with the Jinja code:
        {query}

        Additional Documentation in the model yaml file:
        {documentation}

        Guidelines:
        1. Focus only on what the Jinja code does, ignoring the logic or dependencies related to refs or source functions.
        2. Clearly explain the **main purpose** of the Jinja code in plain language.
        3. Avoid technical jargon and vague expressions.
        4. Limit the description to around **50 words** maximum.
        5. Ensure all responses are coherent, useful, and follow a consistent format.
        6. Avoid using phrases like "This ..." or "The code ...". Focus on describing the purpose directly.
        7. If has multiple sections, describe each section separetly.

        Examples Format:
        - Calculates the rolling average of sales over the last 30 days for each product.
        - Formats the date column to a standard YYYY-MM-DD format.
        - Dynamically generates filter conditions based on user inputs.

        Provide the description of the Jinja code:
    """

    # Interact
    response = llm([HumanMessage(content=prompt)])
    return response.content

In [36]:
def generate_jinja_description(row):
    if row['has_jinja_code']:
        sql_code = row['sql_code'] if pd.notna(row['sql_code']) else ""
        yml_code = row['yml_code'] if pd.notna(row['yml_code']) else ""
        return generate_jinja_code_description(llm, sql_code, yml_code)
    return None

dbt_models_enriched_df['jinja_description'] = dbt_models_enriched_df.progress_apply(generate_jinja_description, axis=1)

100%|██████████| 14/14 [00:00<00:00, 6278.23it/s]


In [37]:
dbt_models_enriched_df

Unnamed: 0,path,name,extension,sql_code,config,materialized,is_snapshot,has_jinja_code,model_category,vertical,yml_code,tests,has_tests,sql_ids,has_select_all_in_last_select,has_group_by,primary_key,filters,is_filtered,macros,has_macros,parent_models,is_source_model,source,children_models,is_end_model,model_description,jinja_description
0,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,customers,"{'models': [{'name': 'stg_customers', 'descrip...","{'columns': {'customer_id': ['not_null', 'uniq...",True,[customer_id],True,False,customer_id,,False,,False,[],True,[ecom.raw_customers],[customers],False,"""Extracts customer data from the 'raw_customer...",
1,models/staging/stg_locations.sql,stg_locations.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,locations,"{'models': [{'name': 'stg_locations', 'descrip...","{'columns': {'location_id': ['not_null', 'uniq...",True,[location_id],True,False,location_id,,False,[dbt.date_trunc],True,[],True,[ecom.raw_stores],[locations],False,"""Extracts a list of open store locations from ...",
2,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,order_items,"{'models': [{'name': 'stg_order_items', 'descr...","{'columns': {'order_item_id': ['not_null', 'un...",True,"[order_id, product_id, order_item_id]",True,False,order_item_id,,False,,False,[],True,[ecom.raw_items],[order_items],False,"""Extracts individual food and drink items from...",
3,models/staging/stg_orders.sql,stg_orders.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,orders,"{'models': [{'name': 'stg_orders', 'descriptio...","{'columns': {'order_id': ['not_null', 'unique'...",True,"[order_id, store_id, customer_id, location_id]",True,False,order_id,,False,"[cents_to_dollars, dbt.date_trunc]",True,[],True,[ecom.raw_orders],"[order_items, orders]",False,Retrieves order data from the 'raw_orders' sou...,
4,models/staging/stg_products.sql,stg_products.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,products,"{'models': [{'name': 'stg_products', 'descript...","{'columns': {'product_id': ['not_null', 'uniqu...",True,[product_id],True,False,product_id,,False,[cents_to_dollars],True,[],True,[ecom.raw_products],"[order_items, products]",False,Retrieves product data from the 'raw_products'...,
5,models/staging/stg_supplies.sql,stg_supplies.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,supplies,"{'models': [{'name': 'stg_supplies', 'descript...","{'columns': {'supply_uuid': ['not_null', 'uniq...",True,"[product_id, supply_id]",True,False,supply_uuid,,False,"[cents_to_dollars, dbt_utils.generate_surrogat...",True,[],True,[ecom.raw_supplies],"[order_items, supplies]",False,Retrieves supply expense data from the 'raw_su...,
6,models/marts/customers.sql,customers.sql,.sql,WITH customers AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,customers,"{'models': [{'name': 'customers', 'description...","{'columns': {'customer_id': ['not_null', 'uniq...",True,"[order_id, customer_id]",True,True,customer_id,[customers.customer_id = customer_orders_summa...,True,,False,"[stg_customers, orders]",False,,[],True,"""Aggregates customer data from the 'stg_custom...",
7,models/marts/locations.sql,locations.sql,.sql,WITH locations AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,locations,"{'semantic_models': [{'name': 'locations', 'de...",,False,,True,False,,,False,,False,[stg_locations],False,,[],True,"""Retrieves all location records from the 'stg_...",
8,models/marts/metricflow_time_spine.sql,metricflow_time_spine.sql,.sql,-- metricflow_time_spine.sql\nWITH days AS (--...,,,False,False,other,metricflow_time_spine,,,False,,True,False,,,False,[dbt_date.get_base_dates],True,[],False,,[],True,"Generates a date range of 10 years, starting f...",
9,models/marts/order_items.sql,order_items.sql,.sql,WITH order_items AS\n (SELECT *\n FROM {{ r...,,,False,False,other,order_items,"{'models': [{'name': 'order_items', 'columns':...","{'columns': {'order_item_id': ['not_null', 'un...",True,"[order_id, product_id]",True,True,order_item_id,"[order_items.order_id = orders.order_id, order...",True,,False,"[stg_order_items, stg_orders, stg_products, st...",False,,[orders],False,"""Combines data from 'stg_order_items', 'stg_or...",
