In [1]:
import pandas as pd
import re
import yaml
import sqlparse
import os
import pandas as pd
import numpy as np
import requests
from IPython.display import display, Markdown

from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

### INIT

In [2]:
def add_repo_root_path():
    """Make the parent of the current working directory importable.

    The parent directory is resolved to an absolute path and appended to
    ``sys.path`` unless it is already listed there, so calling this more
    than once adds the entry at most one time.
    """
    import sys
    from os import getcwd
    from os.path import abspath, join

    parent_dir = abspath(join(getcwd(), ".."))
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)


add_repo_root_path()
from src import generate_knowledge
from src import create_rag_db
from src import llm_chain_tools
from src.enhanced_retriever import EnhancedRetriever

In [3]:
# Raise pandas' display limits so longer and wider frames are shown more fully.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50
# Optional tweaks, currently disabled:
# pd.options.display.width = None
# pd.options.display.max_colwidth = 10

In [4]:
# Run the project's path helper before importing the local `openai_setup`
# module (presumably this is what makes that module importable — confirm).
generate_knowledge.add_repo_root_path()
import os

import openai_setup

# Credentials are read from the local config module rather than hard-coded here.
OPENAI_API_KEY = openai_setup.conf['key']
OPENAI_PROJECT = openai_setup.conf['project']
OPENAI_ORGANIZATION = openai_setup.conf['organization']

# Model and vector-store settings used by the cells below.
DEFAULT_LLM_MODEL = "gpt-4o-mini"
CHROMADB_DIRECTORY = '../chromadb'
COLLECTION_NAME = "my_chromadb"

# Also publish the key and the default model name through environment variables.
os.environ.update(
    OPENAI_API_KEY=OPENAI_API_KEY,
    OPENAI_MODEL_NAME=DEFAULT_LLM_MODEL,
)

In [5]:
# Embedding client, passed to the Chroma store and to the flow inputs below.
langchain_openai_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY,
)

# Chat model configured with the notebook's default model name and a low temperature.
langchain_openai_llm = ChatOpenAI(
    model=DEFAULT_LLM_MODEL,
    temperature=0.1,
    openai_api_key=OPENAI_API_KEY,
    openai_organization=OPENAI_ORGANIZATION,
)

In [24]:
from langchain_openai import ChatOpenAI

# Open the Chroma collection stored under CHROMADB_DIRECTORY, using the
# embedding client created above.
loaded_vectorstore = Chroma(
    embedding_function=langchain_openai_embeddings,
    persist_directory=CHROMADB_DIRECTORY,
    collection_name=COLLECTION_NAME,
)

# Read the two knowledge CSVs whose file names are keyed by the repository
# name, then merge them into a single frame.
_, repo_name = generate_knowledge.extract_owner_and_repo('https://github.com/dbt-labs/jaffle-shop')
dbt_models_df = pd.read_csv(f'../data/dbt_models_{repo_name}.csv')
dbt_project_df = pd.read_csv(f'../data/dbt_project_{repo_name}.csv')
dbt_repo_knowledge_df = create_rag_db.merge_dbt_models_and_project_dfs(
    dbt_models_df,
    dbt_project_df,
)

# Patch asyncio so an event loop can be re-entered from inside the notebook.
import nest_asyncio
nest_asyncio.apply()

# Reload the flow module before importing from it so that edits made to the
# source file are picked up without restarting the kernel.
import importlib
import src.llm_agents_flow
importlib.reload(src.llm_agents_flow)
from src.llm_agents_flow import dbtChatFlow

# Paths of the YAML files handed to dbtChatFlow.
files = dict(
    agents='../config/agents.yml',
    tasks='../config/tasks.yml',
)

## EXECUTE FLOW

#### Use OpenAI LLMs

In [None]:
# Build the flow from the YAML config only (no explicit LLM is passed) and plot it.
flow = dbtChatFlow(files)
flow.plot()

# Free-text change request handed to the flow.
user_input = "I want to add a new column 'overdue' to the model orders that come from raw_orders source, and have it available in customers. the overdue column is directly available in raw_orders, is not necessairy to calcylate it "
result = flow.kickoff(
    inputs={
        "request": user_input,
        "dbt_repo_knowledge_df": dbt_repo_knowledge_df,
        "vectorstore": loaded_vectorstore,
        "embedding_function": langchain_openai_embeddings,
    }
)

# Show the request first, then the flow's raw answer rendered as Markdown.
display(
    Markdown(f"<div style='font-size: 18px;'><b>User input:</b> <i>{user_input}</i></div><hr>"),
    Markdown(result.raw),
)

### Use a local LLM served by LM Studio (server mode)

In [42]:
from crewai import LLM, Agent, Task, Crew

# Name of the locally served model; an alternative is kept for reference:
# local_llm_name = "Llama-3.2-3B-Instruct-4bit"
local_llm_name = "qwen2.5-coder-7b-instruct"

# LLM handle pointing at the server listening on localhost port 1234.
local_llm = LLM(
    base_url="http://127.0.0.1:1234/v1",
    model=f"lm_studio/{local_llm_name}",
)

#### Test local model

In [None]:
# Smoke test for the local model: one agent, one task, one crew.
agent = Agent(
    llm=local_llm,
    role="Data Analyst",
    goal="Analyze eCommerce sales data",
    backstory="Expert in data analytics with years of experience",
)

task = Task(
    agent=agent,
    description="Analyze sales trends from the last quarter and identify key insights.",
    expected_output="A detailed report summarizing sales trends, key insights, and recommendations.",
)

crew = Crew(verbose=True, agents=[agent], tasks=[task])

# Run the crew and print whatever it returns.
result = crew.kickoff()
print(result)

#### Execute flow

In [None]:
# Same flow as the OpenAI run, but constructed with the local LLM handle.
local_flow = dbtChatFlow(files, local_llm)
local_flow.plot()

# Free-text change request handed to the flow.
user_input = "I want to add a new column 'overdue' to the model orders that come from raw_orders source, and have it available in customers. the overdue column is directly available in raw_orders, is not necessairy to calcylate it "
result = local_flow.kickoff(
    inputs={
        "request": user_input,
        "dbt_repo_knowledge_df": dbt_repo_knowledge_df,
        "vectorstore": loaded_vectorstore,
        "embedding_function": langchain_openai_embeddings,
    }
)

# Show the request first, then the flow's raw answer rendered as Markdown.
display(
    Markdown(f"<div style='font-size: 18px;'><b>User input:</b> <i>{user_input}</i></div><hr>"),
    Markdown(result.raw),
)

## Streamlit interface

In [50]:
import streamlit as st

In [None]:
# Builds an enriched dbt-models frame by adding one derived column per step.
# NOTE(review): none of the helper functions called below, nor `repo_elements`
# and `repo_path`, are defined or imported in the visible notebook. They look
# like members of a project module (possibly src.generate_knowledge) — confirm
# and import or qualify them before running this cell on a fresh kernel.
# NOTE(review): this cell rebinds `dbt_models_df` and `dbt_project_df`, which an
# earlier cell loaded from CSV.

# Select the repository elements and split them into models and project files.
repo_dbt_elements = select_dbt_elements_by_extension(repo_elements)
repo_dbt_models = select_dbt_models(repo_dbt_elements)
dbt_project_df = select_dbt_project_files(repo_dbt_elements)
dbt_models_df = generate_dbt_models_df(repo_dbt_models)
# This helper returns both frames, so it may change either of them.
dbt_project_df, dbt_models_df = move_snapshots_to_models(dbt_project_df, dbt_models_df)
dbt_models_df = add_model_code_column(dbt_models_df, is_online = True, online_dbt_repo = repo_path)
dbt_models_df = add_config_column(dbt_models_df)
# Columns derived from 'config'.
dbt_models_df['materialized'] = dbt_models_df['config'].apply(extract_materialized_value)
dbt_models_df['is_snapshot'] = dbt_models_df['config'].apply(check_is_snapshot)
# Rows flagged as snapshots get 'materialized' overridden with the literal 'snapshot'.
dbt_models_df['materialized'] = dbt_models_df.apply(lambda row: 'snapshot' if row['is_snapshot'] else row['materialized'] ,1)
dbt_models_df['has_jinja_code'] = dbt_models_df['sql_code'].apply(contains_jinja_code)
# 'vertical' depends on 'model_category', so the category must be computed first.
dbt_models_df['model_category'] = dbt_models_df['name'].apply(categorize_model)
dbt_models_df['vertical'] = dbt_models_df.apply(lambda row: get_vertical(row['name'], row['model_category']), axis=1)
# 'yml_code' is read right after this call — presumably this helper adds it; verify.
dbt_models_df = assign_yml_rows_to_each_model(dbt_models_df)
dbt_models_df['tests'] = dbt_models_df['yml_code'].apply(extract_tests)
# The has_* / is_* flags below are True for any value other than None.
# NOTE(review): a NaN value would therefore also count as "present".
dbt_models_df['has_tests'] = dbt_models_df['tests'].apply(lambda x: x is not None)
# Columns derived from the model SQL.
dbt_models_df['sql_ids'] = dbt_models_df['sql_code'].apply(extract_ids_from_query)
dbt_models_df['has_select_all_in_last_select'] = dbt_models_df['sql_code'].apply(has_select_all_in_last_select)
dbt_models_df['has_group_by'] = dbt_models_df['sql_code'].apply(has_group_by)
# The primary key is looked up from the extracted tests, not from the SQL.
dbt_models_df['primary_key'] = dbt_models_df['tests'].apply(find_primary_key)
dbt_models_df['filters'] = dbt_models_df['sql_code'].apply(extract_sql_filters)
dbt_models_df['is_filtered'] = dbt_models_df['filters'].apply(lambda x: x is not None)
dbt_models_df['macros'] = dbt_models_df['sql_code'].apply(extract_dbt_macros)
dbt_models_df['has_macros'] = dbt_models_df['macros'].apply(lambda x: x is not None)
# The final enrichment result gets its own name and feeds the description steps.
dbt_models_enriched_df = enrich_dbt_models(dbt_models_df)

from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

# Chat model passed to the description helpers below.
llm = ChatOpenAI(
    model=DEFAULT_LLM_MODEL,
    temperature=0.1,
    openai_api_key=OPENAI_API_KEY,
    openai_organization=OPENAI_ORGANIZATION,
)

# NOTE(review): `progress_apply` is not a built-in pandas method; tqdm adds it
# once `tqdm.pandas()` has been called. No such call is visible in this
# notebook — confirm it happens somewhere before this cell runs.
# One helper call per row for each of the two description columns.
dbt_models_enriched_df['model_description'] = dbt_models_enriched_df.progress_apply(
    lambda model_row: generate_model_description(llm, model_row), axis=1
)
dbt_models_enriched_df['jinja_description'] = dbt_models_enriched_df.progress_apply(
    lambda model_row: generate_jinja_description(llm, model_row), axis=1
)
# Enrich the project-level frame: code column, path-based flags, packages,
# and an LLM-generated description.
#
# Fix: the original call passed the bare names `is_online` and
# `online_dbt_repo`, which are never assigned in the visible notebook. The
# same two arguments are now given the values that the add_model_code_column
# call earlier in this cell uses (True and repo_path), in the same positions.
# NOTE(review): `repo_path` and the helper functions used here are not defined
# in the visible notebook either — confirm where they come from.
dbt_project_df = add_project_code_column(dbt_project_df, True, repo_path)

# Boolean-style flags derived from each row's 'path'.
dbt_project_df['is_seed'] = dbt_project_df['path'].apply(is_seed)
dbt_project_df['is_macro'] = dbt_project_df['path'].apply(is_macro)
dbt_project_df['is_test'] = dbt_project_df['path'].apply(is_test)

# Row-wise extraction (the helper receives the whole row).
dbt_project_df['packages'] = dbt_project_df.apply(extract_packages, axis=1)

# 'description' starts empty and is then reassigned four times in a row.
# NOTE(review): each assignment replaces the entire column, so an earlier
# result survives only if every later helper returns the row's existing
# description for rows it does not handle — verify against the helpers.
# NOTE(review): `progress_apply` requires tqdm's pandas integration
# (`tqdm.pandas()`), which is not set up in the visible notebook — confirm.
dbt_project_df['description'] = None
dbt_project_df['description'] = dbt_project_df.progress_apply(
    lambda row: generate_packages_description(llm, row),
    axis=1
)
dbt_project_df['description'] = dbt_project_df.progress_apply(
    lambda row: generate_macro_description(llm, row),
    axis=1
)
dbt_project_df['description'] = dbt_project_df.progress_apply(
    lambda row: generate_dbt_config_summary(llm, row),
    axis=1
)
dbt_project_df['description'] = dbt_project_df.progress_apply(
    lambda row: generate_tests_description(llm, row),
    axis=1
)
# Derive the repository name that keys the output file names.
# Fix: the helper is called through `generate_knowledge`, exactly as the
# data-loading cell does; the bare name `extract_owner_and_repo` is never
# imported in the visible notebook and would raise NameError.
# NOTE(review): `repo_path` is not defined in the visible notebook — confirm.
_, repo_name = generate_knowledge.extract_owner_and_repo(repo_path)
print(repo_name)

# Write both frames without the index. These are the same path patterns the
# data-loading cell reads from, so existing CSVs for this repository name are
# overwritten.
dbt_models_enriched_df.to_csv(f'../data/dbt_models_{repo_name}.csv', index=False)
dbt_project_df.to_csv(f'../data/dbt_project_{repo_name}.csv', index=False)