# **Cocoon: Data Catalog for GenAI**


### **You Need...**

1. An LLM API key (cost is typically under 50 cents per table)
2. A dbt project to serve as the data catalog (built by [Cocoon Model](https://cocoon-data-transformation.github.io/page/model))


In [None]:
! pip install cocoon_data -U

In [None]:
from cocoon_data import *

In [None]:
#@title  Download example dbt project (skip this step if you have your own dbt project)
import requests
import os
import base64

def download_github_directory(repo_owner, repo_name, directory_path, local_path):
    """Recursively download a path from a GitHub repository to a local directory.

    Lists ``directory_path`` through the GitHub "contents" API, saves every
    file it contains under ``local_path`` and recurses into sub-directories.
    Progress and failures are reported with ``print``; nothing is returned.

    Args:
        repo_owner: Owner (user or organization) of the repository.
        repo_name: Name of the repository.
        directory_path: Path inside the repository, using "/" separators.
        local_path: Local directory to write into; created if missing.

    Raises:
        requests.exceptions.RequestException: On network failures or when a
            request exceeds the timeout.
    """
    # Repository paths always use "/" no matter which OS runs the notebook,
    # so sub-directory paths are joined with posixpath rather than os.path.
    import posixpath

    # Without a timeout a stalled connection would hang the cell forever.
    timeout_seconds = 30

    # GitHub API endpoint
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{directory_path}"

    # Send a GET request to the GitHub API
    response = requests.get(api_url, timeout=timeout_seconds)

    # Bail out early if the listing could not be retrieved
    if response.status_code != 200:
        print(f"Failed to retrieve directory contents. Status code: {response.status_code}")
        return

    # Create the local directory if it doesn't exist
    os.makedirs(local_path, exist_ok=True)

    # Parse the JSON response
    contents = response.json()

    # The contents API returns a single object (not a list) when the path
    # points at a file; normalize so the loop below handles both shapes.
    if isinstance(contents, dict):
        contents = [contents]

    # Loop through each entry in the directory
    for item in contents:
        if item['type'] == 'file':
            # Get the file content
            file_response = requests.get(item['download_url'], timeout=timeout_seconds)

            # Never write an error body to disk as if it were the file
            if file_response.status_code != 200:
                print(f"Failed to download {item['name']}. Status code: {file_response.status_code}")
                continue

            # Save the file locally
            file_path = os.path.join(local_path, item['name'])
            with open(file_path, 'wb') as file:
                file.write(file_response.content)
            print(f"Downloaded: {item['name']}")
        elif item['type'] == 'dir':
            # If it's a subdirectory, recursively download its contents
            subdir_path = posixpath.join(directory_path, item['name'])
            local_subdir_path = os.path.join(local_path, item['name'])
            download_github_directory(repo_owner, repo_name, subdir_path, local_subdir_path)

# Example: fetch the LinkedIn model gallery from the Cocoon repository
repo_owner = "Cocoon-Data-Transformation"
repo_name = "cocoon"
directory_path = "documentation/model_gallery/linkedin"  # path inside the repository
dbt_directory = "./linkedin"  # local folder that receives the files
dbt_name = "Linkedin"

download_github_directory(
    repo_owner=repo_owner,
    repo_name=repo_name,
    directory_path=directory_path,
    local_path=dbt_directory,
)

Downloaded: cocoon_er.yml
Downloaded: cocoon_join.yml
Downloaded: model.html
Downloaded: snapshot_linkedin_account_history_data.sql
Downloaded: snapshot_linkedin_account_history_data.yml
Downloaded: snapshot_linkedin_campaign_group_history_data.sql
Downloaded: snapshot_linkedin_campaign_group_history_data.yml
Downloaded: snapshot_linkedin_campaign_history_data.sql
Downloaded: snapshot_linkedin_campaign_history_data.yml
Downloaded: snapshot_linkedin_creative_history_data.sql
Downloaded: snapshot_linkedin_creative_history_data.yml
Downloaded: sources.yml
Downloaded: stg_linkedin_account_history_data.html
Downloaded: stg_linkedin_account_history_data.sql
Downloaded: stg_linkedin_account_history_data.yml
Downloaded: stg_linkedin_ad_analytics_by_campaign_data.html
Downloaded: stg_linkedin_ad_analytics_by_campaign_data.sql
Downloaded: stg_linkedin_ad_analytics_by_campaign_data.yml
Downloaded: stg_linkedin_ad_analytics_by_creative_data.html
Downloaded: stg_linkedin_ad_analytics_by_creative_da

In [None]:
#@title Provide your LLM API (prefer claude 3.5 sonnet)

# Choose exactly one provider below. The Anthropic option is active as written;
# to use another provider, comment it out and uncomment the matching section.
# NOTE(review): `openai` and `os` are not imported in this cell — presumably
# they come from `from cocoon_data import *` (or `import os` in the download
# cell); confirm if the download cell is skipped.

# if you use Anthropic
# The API key below is an empty placeholder: fill it in before running.
openai.api_type ='Anthropic'
os.environ["ANTHROPIC_API_KEY"] = ""
cocoon_llm_setting['claude_model'] = "claude-3-5-sonnet-20240620"

# # if you use Vertex AI
# # (fill in the project id)
# openai.api_type = 'AnthropicVertex'
# cocoon_llm_setting['vertex_region'] = "us-east5"
# cocoon_llm_setting['vertex_project_id'] = ""
# cocoon_llm_setting['vertex_model'] = "claude-3-5-sonnet@20240620"

# # if you use Bedrock
# # (fill in the AWS access key and secret key)
# openai.api_type = 'AnthropicBedrock'
# cocoon_llm_setting['aws_access_key'] = ""
# cocoon_llm_setting['aws_secret_key'] = ""
# cocoon_llm_setting['aws_region'] = "us-east-1"
# cocoon_llm_setting['aws_model'] = "anthropic.claude-3-5-sonnet-20240620-v1:0"

# # if you use Open AI
# # (fill in the API key)
# openai.api_key  = ''
# cocoon_llm_setting['openai_model'] = "gpt-4-turbo"

# test if LLM works
# Sends a single "hello" user message with caching disabled and prints the
# content of the first returned choice.
test_message = "hello"
messages = [{"role": "user", "content": test_message}]
response = call_llm_chat(messages, temperature=0.1, top_p=0.1, use_cache=False)
print(response['choices'][0]['message']['content'])

In [None]:
# # if graphviz is not installed locally
# set_image_mode("mermaid")

In [None]:
# State the business question to answer, then create and display the chat UI.
# `dbt_directory` (set in the download cell) is passed to the workflow in the
# next cell and should point to the dbt project built by cocoon; define it
# yourself if you skipped the download cell.
business_question = "Which campaign is the most effective?"
chat = ChatUI()
chat.display()

HTML(value='<iframe srcdoc="\n&lt;style&gt;\n    body, html {\n        margin: 0;\n        padding: 0;\n      …

In [None]:
# Build the GenAI workflow from the dbt project directory, the business
# question and the chat UI created above, then start it.
# `dbt_directory` is assigned in the download cell; if that cell was skipped it
# must be defined before running this one.
# NOTE(review): the meaning of "viewer": True is not visible here — confirm
# against the cocoon_data documentation.
query_widget, main_workflow = create_cocoon_genai_workflow(
                    para={"dbt_directory": dbt_directory,
                            "business_question":business_question,
                            "viewer": True,
                            "chatui": chat,})
main_workflow.start()

IntProgress(value=1, max=2)

Button(button_style='success', description='Finalize Debugged Query', icon='check', style=ButtonStyle())

In [None]:
# Write the GenAI chat result to an HTML file.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the chat do
# not depend on (or fail under) the platform's default text encoding.
with open("cocoon_genai_result.html", "w", encoding="utf-8") as f:
    f.write(chat.get_chat_html())