# **Cocoon: Semantic Data Lineage**


## **You Need...**

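1. An LLM API key (Claude 3.5 Sonnet is preferred)
2. A compiled dbt project directory (with `manifest.json` and `catalog.json`, by default under `target/`)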



In [None]:
! pip install cocoon_data==0.1.128

In [None]:
from cocoon_data import *

In [None]:
#@title  Download example dbt project (skip this step if you have your own)
import requests
import os
import base64

def url_path_join(*args):
    """Join path components for a URL using forward slashes.

    Leading and trailing slashes are stripped from every component. Components
    that are empty after stripping are skipped, so joining onto an empty prefix
    (e.g. the repository root "") never yields a leading or doubled slash.

    Args:
        *args: Path components as strings.

    Returns:
        The non-empty components joined by single '/' characters; an empty
        string when there is nothing to join.
    """
    stripped = (arg.strip('/') for arg in args)
    return '/'.join(part for part in stripped if part)

def download_github_directory(repo_owner, repo_name, directory_path, local_path):
    """Recursively download a directory of a GitHub repository via the contents API.

    Files are written under ``local_path`` (created if missing); sub-directories
    are mirrored recursively. Failures are reported with ``print`` and skipped
    rather than raised, so one bad entry does not abort the whole download.

    Args:
        repo_owner: Owner (user or organization) of the repository.
        repo_name: Name of the repository.
        directory_path: Path inside the repository to download.
        local_path: Local directory that receives the downloaded files.

    Returns:
        None.
    """
    # Seconds to wait on each HTTP request so a stalled connection cannot hang the cell.
    request_timeout = 30

    # GitHub API endpoint listing the contents of the requested path
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{directory_path}"
    response = requests.get(api_url, timeout=request_timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve directory contents. Status code: {response.status_code}")
        return

    # Create the local directory if it doesn't exist
    os.makedirs(local_path, exist_ok=True)

    contents = response.json()
    # The contents API returns a single object (not a list) when the path is a
    # file; wrap it so the loop below handles both shapes.
    if isinstance(contents, dict):
        contents = [contents]

    for item in contents:
        if item['type'] == 'file':
            file_path = os.path.join(local_path, item['name'])
            file_response = requests.get(item['download_url'], timeout=request_timeout)
            # Do not write an HTTP error body to disk as if it were the file.
            if file_response.status_code != 200:
                print(f"Failed to download: {file_path}. Status code: {file_response.status_code}")
                continue

            with open(file_path, 'wb') as file:
                file.write(file_response.content)
            print(f"Downloaded: {file_path}")
        elif item['type'] == 'dir':
            # Recurse into the sub-directory, mirroring its name locally
            subdir_path = url_path_join(directory_path, item['name'])
            local_subdir_path = os.path.join(local_path, item['name'])
            download_github_directory(repo_owner, repo_name, subdir_path, local_subdir_path)

# Usage: fetch the example dbt project from the Cocoon repository into ./<project_name>
project_name = "dbt_amplitude"
repo_owner, repo_name = "Cocoon-Data-Transformation", "cocoon"
directory_path = "documentation/dbt_projects/" + project_name
dbt_directory = os.path.join(".", project_name)

download_github_directory(
    repo_owner,
    repo_name,
    directory_path,
    dbt_directory,
)

In [None]:
#@title  Read your dbt project
project_name = "dbt_amplitude"
dbt_directory = os.path.join(".", project_name)
# Compile your dbt project and generate its docs first: the relevant JSON
# artifacts are needed. By default manifest.json and catalog.json are read
# from target/. If they are in different paths, provide them explicitly:
# manifest_path = os.path.join(dbt_directory, 'target', 'manifest.json')
# catalog_path = os.path.join(dbt_directory, 'target', 'catalog.json')

# Make sure the following variables are available afterwards:
#   nodes          - list of model names
#   edges          - list of (from_idx, to_idx) pairs for table lineage
#   sql_mapping    - maps model name to its SQL
#   column_mapping - maps model name to its column details
nodes, edges, sql_mapping, column_mapping = build_lineage_graph(dbt_directory)
# nodes, edges, sql_mapping, column_mapping = build_lineage_graph(dbt_directory, manifest_path=manifest_path, catalog_path=catalog_path)

# Render the table-level lineage graph as an inline SVG
image = generate_workflow_image(nodes, edges, format='svg')
display(HTML(wrap_image_in_html(image, format='svg')))

# Summarize each node: its index, its columns, and a 30-character SQL preview
print("Nodes:")
for i, node in enumerate(nodes):
    sql_content = sql_mapping.get(node, "").replace('\n', ' ')
    columns = column_mapping.get(node, [])
    print(f"{i}: {node}")
    print(f"    Columns: {columns}" if columns else "    ⚠️ No columns found")
    print(f"    SQL Content: {sql_content[:30]}..." if sql_content else "    ⚠️ No SQL content found")

print("\nEdges:")
print(edges)

In [None]:
#@title Provide your LLM API (prefer claude 3.5 sonnet)

# if you use Anthropic
cocoon_llm_setting['api_type'] = 'Anthropic'
# Do not hardcode the key in the notebook: reuse ANTHROPIC_API_KEY when it is
# already set in the environment, otherwise prompt for it without echoing.
if not os.environ.get("ANTHROPIC_API_KEY"):
    from getpass import getpass
    os.environ["ANTHROPIC_API_KEY"] = getpass("Anthropic API key (sk-ant-api...): ")
# The model id belongs under the Anthropic model setting, not 'aws_access_key'
# (which is the Bedrock credential key used further below).
# NOTE(review): 'claude_model' is the key Cocoon's documentation uses for the
# Anthropic model — confirm against the installed cocoon_data version.
cocoon_llm_setting['claude_model'] = "claude-3-5-sonnet-20240620"

# if you use Vertex AI
# cocoon_llm_setting['api_type'] = 'AnthropicVertex'
# cocoon_llm_setting['vertex_region'] = "us-east5"
# cocoon_llm_setting['vertex_project_id'] = ""
# cocoon_llm_setting['vertex_model'] = "claude-3-5-sonnet@20240620"

# if you use Bedrock
# cocoon_llm_setting['api_type'] = 'AnthropicBedrock'
# cocoon_llm_setting['aws_access_key'] = "..."
# cocoon_llm_setting['aws_secret_key'] = "..."
# cocoon_llm_setting['aws_region'] = "us-east-1"
# cocoon_llm_setting['aws_model'] = "anthropic.claude-3-5-sonnet-20240620-v1:0"

# if you use Open AI
# cocoon_llm_setting['api_type'] = "openai"
# cocoon_llm_setting['api_key'] = "sk-proj-..."
# cocoon_llm_setting['openai_model'] = "gpt-4-turbo"

# if you use Azure Open AI
# cocoon_llm_setting['api_type'] = "azure"
# # azure openai key and endpoint
# cocoon_llm_setting['api_key'] = "..."
# cocoon_llm_setting['api_base'] = "https://xxx.openai.azure.com/"
# # deployed model in azure openai
# cocoon_llm_setting['api_version'] = "2023-12-01-preview"
# cocoon_llm_setting['azure_engine'] = "xxx"

# test if LLM works: send a one-message chat (cache disabled) and print the reply
test_message = "hello"
messages = [{"role": "user", "content": test_message}]
response = call_llm_chat(messages, temperature=0.1, top_p=0.1, use_cache=False)
print(response['choices'][0]['message']['content'])

Hello! How can I assist you today? Feel free to ask me any questions or let me know if you need help with anything.


In [None]:
#@title Build DBT Lineage by LLMs
dbt_directory = os.path.join(".", project_name)
# Assemble the exploration workflow from the lineage graph variables, then run it.
main_workflow = create_cocoon_dbt_explore_workflow(
    nodes=nodes,
    edges=edges,
    sql_mapping=sql_mapping,
    column_mapping=column_mapping,
    viewer=True,
)
main_workflow.start()

In [None]:
#@title Save the result to disk
# Persist the workflow's 'dbt_lineage' result as cocoon_lineage.db inside the dbt project directory.
main_workflow.para['dbt_lineage'].save_to_disk(db_name=os.path.join(dbt_directory, "cocoon_lineage.db"))

Database saved to ./dbt_amplitude/cocoon_lineage.db


In [None]:
#@title Load the result from disk
# Rebuild the lineage object from cocoon_lineage.db inside the dbt project directory.
dbt_lineage = DbtLineage(db_name=os.path.join(dbt_directory, "cocoon_lineage.db"))

Database imported from ./dbt_amplitude/cocoon_lineage.db


In [None]:
#@title Display the Table Lineage
# Uses the `dbt_lineage` object loaded from disk.
dbt_lineage.interactive_lineage_display()

In [None]:
#@title Display the Column Lineage
# Show the lineage of one column of one model; swap in the commented call
# below to inspect the "event_day" column instead.
# dbt_lineage.display_column_lineage(model_name="model.amplitude.amplitude__daily_performance", column_name="event_day")
dbt_lineage.display_column_lineage(model_name="model.amplitude.amplitude__daily_performance", column_name="number_users")

In [None]:
# Write the whole lineage report to <project_name>.html in the working directory.
project_name = "dbt_amplitude"
html_content = dbt_lineage.to_html(dbt_name=project_name)
# Write as UTF-8 explicitly: the platform default encoding may be unable to
# encode non-ASCII characters in the report.
with open(f'{project_name}.html', 'w', encoding='utf-8') as f:
    f.write(html_content)