# **Cocoon: RAG for Data Warehouse Tables**


### **You Need...**

1. LLM API
2. Data catalog built by [Cocoon](https://cocoon-data-transformation.github.io/page/model)


In [None]:
! pip install cocoon_data==0.1.165

In [None]:
from cocoon_data import *

In [None]:
#@title  Download example catalog (skip this step if you have your own)
import requests
import os
import base64

def download_github_directory(repo_owner, repo_name, directory_path, local_path):
    """Recursively download a directory of a GitHub repository to disk.

    The directory is listed through the GitHub "contents" REST endpoint.
    Every entry of type ``file`` is fetched from its ``download_url`` and
    written under ``local_path``; every entry of type ``dir`` is downloaded
    recursively into a sub-directory of the same name. Other entry types are
    ignored. Progress and HTTP failures are reported with ``print``.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
        directory_path: Path inside the repository, using forward slashes.
        local_path: Local directory to write into; created if missing.

    Returns:
        None.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    request_timeout = 30  # seconds

    # GitHub API endpoint
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{directory_path}"
    response = requests.get(api_url, timeout=request_timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve directory contents. Status code: {response.status_code}")
        return

    # Create the local directory if it doesn't exist
    os.makedirs(local_path, exist_ok=True)

    contents = response.json()
    # The endpoint answers with a single object instead of a list when the
    # path names a file; normalise so the loop below handles both shapes.
    if isinstance(contents, dict):
        contents = [contents]

    for item in contents:
        if item['type'] == 'file':
            file_response = requests.get(item['download_url'], timeout=request_timeout)
            # Do not save an error page under the file's name.
            if file_response.status_code != 200:
                print(f"Failed to download: {item['name']}. Status code: {file_response.status_code}")
                continue

            file_path = os.path.join(local_path, item['name'])
            with open(file_path, 'wb') as file:
                file.write(file_response.content)
            print(f"Downloaded: {item['name']}")
        elif item['type'] == 'dir':
            # The repository path goes into a URL, so it must always use '/'
            # (os.path.join would insert backslashes on Windows).
            subdir_path = f"{directory_path.rstrip('/')}/{item['name']}"
            local_subdir_path = os.path.join(local_path, item['name'])
            download_github_directory(repo_owner, repo_name, subdir_path, local_subdir_path)

# Usage
# Location of the example catalog inside the GitHub repository
repo_owner, repo_name = "Cocoon-Data-Transformation", "cocoon"
directory_path = "documentation/model_gallery/saas_business"

# Where the catalog is stored locally, and its display name
dbt_directory = "./saas_business"
dbt_name = "SaaS Business"

download_github_directory(
    repo_owner=repo_owner,
    repo_name=repo_name,
    directory_path=directory_path,
    local_path=dbt_directory,
)

Downloaded: cocoon_er.yml
Downloaded: cocoon_join.yml
Downloaded: model.html
Downloaded: snapshot_sf_account_history_data.sql
Downloaded: snapshot_sf_account_history_data.yml
Downloaded: snapshot_sf_contact_history_data.sql
Downloaded: snapshot_sf_contact_history_data.yml
Downloaded: snapshot_sf_lead_data.sql
Downloaded: snapshot_sf_lead_data.yml
Downloaded: snapshot_sf_opportunity_history_data.sql
Downloaded: snapshot_sf_opportunity_history_data.yml
Downloaded: sources.yml
Downloaded: stg_balance_transaction_data.html
Downloaded: stg_balance_transaction_data.sql
Downloaded: stg_balance_transaction_data.yml
Downloaded: stg_card_data.html
Downloaded: stg_card_data.sql
Downloaded: stg_card_data.yml
Downloaded: stg_charge_data.html
Downloaded: stg_charge_data.sql
Downloaded: stg_charge_data.yml
Downloaded: stg_customer_data.html
Downloaded: stg_customer_data.sql
Downloaded: stg_customer_data.yml
Downloaded: stg_dispute_data.html
Downloaded: stg_dispute_data.sql
Downloaded: stg_dispute_data.yml
[output truncated]

In [None]:
#@title Provide your LLM API (prefer claude 3.5 sonnet)

# if you use Anthropic for claude 3.5 sonnet
cocoon_llm_setting['api_type'] = 'Anthropic'
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-api..."
# The model id must not be stored under 'aws_access_key': that key holds the
# AWS credential used by the Bedrock configuration.
# NOTE(review): 'claude_model' is the setting name as recalled from the Cocoon
# docs — confirm against the installed cocoon_data version.
cocoon_llm_setting['claude_model'] = "claude-3-5-sonnet-20240620"

# # if you use Vertex AI for claude 3.5 sonnet
# # if you use google colab
# # from google.colab import auth
# # auth.authenticate_user()
# cocoon_llm_setting['api_type'] = 'AnthropicVertex'
# cocoon_llm_setting['vertex_region'] = "us-east5"
# cocoon_llm_setting['vertex_project_id'] = ""
# cocoon_llm_setting['vertex_model'] = "claude-3-5-sonnet@20240620"

# # if you use Bedrock for claude 3.5 sonnet
# cocoon_llm_setting['api_type'] = 'AnthropicBedrock'
# cocoon_llm_setting['aws_access_key'] = "..."
# cocoon_llm_setting['aws_secret_key'] = "..."
# cocoon_llm_setting['aws_region'] = "us-east-1"
# cocoon_llm_setting['aws_model'] = "anthropic.claude-3-5-sonnet-20240620-v1:0"

# # if you use Vertex AI for llama3
# cocoon_llm_setting['api_type'] = 'Llama3Vertex'
# cocoon_llm_setting['vertex_region'] = "us-east5"
# cocoon_llm_setting['vertex_project_id'] = ""

# # if you use Bedrock for llama3
# cocoon_llm_setting['api_type'] = 'Llama3Bedrock'
# cocoon_llm_setting['aws_access_key'] = "..."
# cocoon_llm_setting['aws_secret_key'] = "..."
# cocoon_llm_setting['aws_region'] = "us-west-2"

# # if you use Open AI
# cocoon_llm_setting['api_type'] = "openai"
# cocoon_llm_setting['api_key'] = "sk-proj-..."
# cocoon_llm_setting['openai_model'] = "gpt-4-turbo"

# # if you use Azure Open AI
# cocoon_llm_setting['api_type'] = "azure"
# # azure openai key and endpoint
# cocoon_llm_setting['api_key'] = "..."
# cocoon_llm_setting['api_base'] = "https://xxx.openai.azure.com/"
# # deployed model in azure openai
# cocoon_llm_setting['api_version'] = "2023-12-01-preview"
# cocoon_llm_setting['azure_engine'] = "xxx"

# Smoke test: send one short prompt (cache disabled) and print the reply,
# which confirms that the LLM settings above are usable.
test_message = "hello"
messages = [
    {"role": "user", "content": test_message},
]
response = call_llm_chat(
    messages,
    temperature=0.1,
    top_p=0.1,
    use_cache=False,
)
first_choice = response['choices'][0]
print(first_choice['message']['content'])

In [None]:
# Configure Claude 3.5 Sonnet through AWS Bedrock.
# Credentials are read from the environment so that no secret is stored in
# the notebook; a missing variable fails fast with a KeyError naming it.
# Any key that was ever committed to a notebook must be rotated.
import os

cocoon_llm_setting['api_type'] = 'AnthropicBedrock'
cocoon_llm_setting['aws_access_key'] = os.environ["AWS_ACCESS_KEY_ID"]
cocoon_llm_setting['aws_secret_key'] = os.environ["AWS_SECRET_ACCESS_KEY"]
cocoon_llm_setting['aws_region'] = "us-east-1"
cocoon_llm_setting['aws_model'] = "anthropic.claude-3-5-sonnet-20240620-v1:0"

In [None]:
#@title RAG your Data Warehouse Tables (Choose Data Warehouse Copilot)
# Build the workflow over the downloaded catalog directory (no database
# connection is passed) and launch its interactive UI.
_, cocoon_workflow = create_cocoon_workflow(
    con=None,
    para={"dbt_directory": dbt_directory},
)
cocoon_workflow.start()

HTML(value='\n<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport"…

VBox(children=(HBox(children=(Text(value='', layout=Layout(width='70%'), placeholder='Enter your question here…