# **Cocoon: RAG Data Pipeline with Lineage**

Check out the video below, which walks through this notebook.

  <a href="https://www.youtube.com/watch?v=2w0Z0xQpQ_4" target="_blank">
    <img src="https://raw.githubusercontent.com/Cocoon-Data-Transformation/cocoon/main/images/Thumbnail2.png" width="600" alt="Video walkthrough of the Cocoon RAG data pipeline notebook" style="cursor: pointer;">
  </a>

In [None]:
! pip install cocoon_data==0.1.162

In [4]:
from cocoon_data import *

In [5]:
#@title  Download example dbt project (skip this step if you have your own)
import requests
import os
import base64

def url_path_join(*args):
    """Join path components for a URL using forward slashes.

    Leading and trailing slashes are stripped from every component, and
    components that are empty after stripping (for example "" or "/") are
    dropped so that joining never introduces an empty segment such as a
    leading "/" or a "//" between components.

    Args:
        *args: Path components as strings.

    Returns:
        The remaining components joined by single "/" characters, or "" when
        no non-empty component is given.
    """
    stripped = (arg.strip('/') for arg in args)
    # Skip empty pieces: joining "" with "models" must give "models", not "/models"
    return '/'.join(part for part in stripped if part)

def download_github_directory(repo_owner, repo_name, directory_path, local_path, timeout=30):
    """Recursively download a directory from a GitHub repository.

    Lists `directory_path` through the GitHub contents API, saves every file
    into `local_path`, and recurses into sub-directories. Failures are reported
    with `print` and the affected directory or file is skipped, so one bad
    entry does not abort the rest of the download.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
        directory_path: Path inside the repository, using forward slashes.
        local_path: Local directory that receives the files; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress and failures are printed.

    Raises:
        requests.exceptions.RequestException: If a request cannot be completed
            (connection error, or no response within `timeout`).
    """
    # GitHub API endpoint
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{directory_path}"

    # A timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(api_url, timeout=timeout)

    # Bail out early when the listing itself could not be retrieved
    if response.status_code != 200:
        print(f"Failed to retrieve directory contents. Status code: {response.status_code}")
        return

    # Create the local directory if it doesn't exist
    os.makedirs(local_path, exist_ok=True)

    contents = response.json()
    # The contents API answers with a single object (not a list) when the path
    # is a file; wrap it so the loop below sees one item instead of dict keys.
    if isinstance(contents, dict):
        contents = [contents]

    for item in contents:
        if item['type'] == 'file':
            file_response = requests.get(item['download_url'], timeout=timeout)
            # Do not write an error page to disk and report it as a download
            if file_response.status_code != 200:
                print(f"Failed to download {item['name']}. Status code: {file_response.status_code}")
                continue

            # Save the file locally
            file_path = os.path.join(local_path, item['name'])
            with open(file_path, 'wb') as file:
                file.write(file_response.content)
            print(f"Downloaded: {file_path}")
        elif item['type'] == 'dir':
            # If it's a subdirectory, recursively download its contents
            subdir_path = url_path_join(directory_path, item['name'])
            local_subdir_path = os.path.join(local_path, item['name'])
            download_github_directory(repo_owner, repo_name, subdir_path, local_subdir_path, timeout=timeout)

# Usage
# Where the example dbt project lives inside the Cocoon GitHub repository
repo_owner = "Cocoon-Data-Transformation"
repo_name = "cocoon"
project_name = "dbt_shopify"
directory_path = f"documentation/dbt_projects/{project_name}"

# Local folder that receives the project files
dbt_directory = os.path.join(".", project_name)

download_github_directory(
    repo_owner=repo_owner,
    repo_name=repo_name,
    directory_path=directory_path,
    local_path=dbt_directory,
)

Downloaded: ./dbt_shopify/CHANGELOG.md
Downloaded: ./dbt_shopify/DECISIONLOG.md
Downloaded: ./dbt_shopify/LICENSE
Downloaded: ./dbt_shopify/README.md
Downloaded: ./dbt_shopify/cocoon_lineage.db/column_mapping.csv
Downloaded: ./dbt_shopify/cocoon_lineage.db/edges.csv
Downloaded: ./dbt_shopify/cocoon_lineage.db/load.sql
Downloaded: ./dbt_shopify/cocoon_lineage.db/nodes.csv
Downloaded: ./dbt_shopify/cocoon_lineage.db/schema.sql
Downloaded: ./dbt_shopify/cocoon_lineage.db/sql_mapping.csv
Downloaded: ./dbt_shopify/cocoon_lineage.db/sql_tags.csv
Downloaded: ./dbt_shopify/cocoon_lineage.db/table_lineage.csv
Downloaded: ./dbt_shopify/dbt_project.yml
Downloaded: ./dbt_shopify/models/intermediate/int_shopify__customer_email_rollup.sql
Downloaded: ./dbt_shopify/models/intermediate/int_shopify__daily_abandoned_checkouts.sql
Downloaded: ./dbt_shopify/models/intermediate/int_shopify__daily_fulfillment.sql
Downloaded: ./dbt_shopify/models/intermediate/int_shopify__daily_orders.sql
Downloaded: ./dbt_s

In [None]:
#@title Provide your LLM API (prefer claude 3.5 sonnet; GPT-4 okay)

# Configure which LLM provider Cocoon uses, then send a single test message.
# Only the Anthropic section is active; the other provider sections are left
# commented out as templates.
# `cocoon_llm_setting` and `call_llm_chat` are not defined in this notebook —
# presumably they come from `from cocoon_data import *` (confirm).

# if you use Anthropic
cocoon_llm_setting['api_type'] = 'Anthropic'
# Placeholder value: replace with your own key, and do not save a real key in a shared notebook
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-api..."
# NOTE(review): a model name is stored under 'aws_access_key', the same key the
# Bedrock section below uses for an AWS credential. This looks like a copy-paste
# slip; the intended Anthropic model key is not visible here — confirm against
# the cocoon_data documentation.
cocoon_llm_setting['aws_access_key'] = "claude-3-5-sonnet-20240620"

# if you use Vertex AI
# cocoon_llm_setting['api_type'] = 'AnthropicVertex'
# cocoon_llm_setting['vertex_region'] = "us-east5"
# cocoon_llm_setting['vertex_project_id'] = ""
# cocoon_llm_setting['vertex_model'] = "claude-3-5-sonnet@20240620"

# if you use Bedrock
# cocoon_llm_setting['api_type'] = 'AnthropicBedrock'
# cocoon_llm_setting['aws_access_key'] = "..."
# cocoon_llm_setting['aws_secret_key'] = "..."
# cocoon_llm_setting['aws_region'] = "us-east-1"
# cocoon_llm_setting['aws_model'] = "anthropic.claude-3-5-sonnet-20240620-v1:0"

# if you use Open AI
# cocoon_llm_setting['api_type'] = "openai"
# cocoon_llm_setting['api_key'] = "sk-proj-..."
# cocoon_llm_setting['openai_model'] = "gpt-4-turbo"

# if you use Azure Open AI
# cocoon_llm_setting['api_type'] = "azure"
# # azure openai key and endpoint
# cocoon_llm_setting['api_key'] = "..."
# cocoon_llm_setting['api_base'] = "https://xxx.openai.azure.com/"
# # deployed model in azure openai
# cocoon_llm_setting['api_version'] = "2023-12-01-preview"
# cocoon_llm_setting['azure_engine'] = "xxx"

# test if LLM works
# Sends one user message; use_cache=False presumably bypasses any cached reply
# so the request actually reaches the provider — TODO confirm.
test_message = "hello"
messages = [{"role": "user", "content": test_message}]
response = call_llm_chat(messages, temperature=0.1, top_p=0.1, use_cache=False)
# Print the message content of the first choice in the response
print(response['choices'][0]['message']['content'])

In [7]:
#@title Choose Lineage RAG
# Build the Cocoon workflow for the dbt project directory and start it;
# the first element of the returned pair is not used here.
_, cocoon_workflow = create_cocoon_workflow(
    con=None,
    para={"dbt_directory": dbt_directory},
)
cocoon_workflow.start()

HTML(value='\n<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport"…

VBox(children=(HBox(children=(Text(value='', layout=Layout(width='70%'), placeholder='Enter your question here…