In [1]:
import pandas as pd
import re
import yaml
import sqlparse
import os
import pandas as pd
import numpy as np
import requests
from IPython.display import display, Markdown

from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

### INIT

In [2]:
def add_repo_root_path():
    """Make the parent of the current working directory importable.

    Resolves ``<cwd>/..`` to an absolute path and appends it to ``sys.path``
    unless it is already listed, so repeated calls never add duplicates.
    Returns ``None``.
    """
    import sys
    import os

    parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    # Guard clause: nothing to do when the directory is already on the path.
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)


# Must run before the `from src import ...` lines that follow in this cell.
add_repo_root_path()
from src import generate_knowledge
from src import create_rag_db
from src import llm_chain_tools
from src.enhanced_retriever import EnhancedRetriever

In [3]:
# Raise pandas' display caps so long / wide frames render with less truncation.
# The display.width and display.max_colwidth overrides remain disabled.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

In [4]:
# `openai_setup` is imported right after this call, so the call presumably puts
# the module's directory (the repo root) on sys.path -- TODO confirm.
generate_knowledge.add_repo_root_path()
import openai_setup

# Credentials are read from the `openai_setup.conf` mapping, never hard-coded here.
OPENAI_API_KEY, OPENAI_PROJECT, OPENAI_ORGANIZATION = (
    openai_setup.conf[field] for field in ('key', 'project', 'organization')
)

# Notebook-wide defaults for the chat model and the persisted Chroma store.
DEFAULT_LLM_MODEL = "gpt-4o-mini"
CHROMADB_DIRECTORY = '../chromadb'
COLLECTION_NAME = "my_chromadb"

# Export key and model name through the process environment as well.
import os
os.environ.update({
    "OPENAI_API_KEY": OPENAI_API_KEY,
    'OPENAI_MODEL_NAME': DEFAULT_LLM_MODEL,
})

In [5]:
# Embedding client; it is handed to the Chroma vector store later in the notebook.
langchain_openai_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY,
)

# Chat model client configured with the notebook's default model and a low temperature.
langchain_openai_llm = ChatOpenAI(
    model=DEFAULT_LLM_MODEL,
    temperature=0.1,
    openai_api_key=OPENAI_API_KEY,
    openai_organization=OPENAI_ORGANIZATION,
)

In [24]:
from langchain_openai import ChatOpenAI

# Resolve the repository name first: it names both the cached CSV exports and the
# Chroma collection that the vector-store build cell of this notebook persists.
repo_path = 'https://github.com/dbt-labs/jaffle-shop'
_, repo_name = generate_knowledge.extract_owner_and_repo(repo_path)

# Open the persisted collection under `repo_name`, the name the build cell saves it
# with. The config-cell placeholder COLLECTION_NAME ("my_chromadb") only pointed at
# the right collection when the build cell had already rebound it in the same
# kernel, so a top-to-bottom run would open a different collection.
loaded_vectorstore = Chroma(
    collection_name=repo_name,
    persist_directory=CHROMADB_DIRECTORY,
    embedding_function=langchain_openai_embeddings
)

# Load the cached dbt model / project exports and merge them into one knowledge frame.
dbt_models_df = pd.read_csv('../data/dbt_models_' + repo_name + '.csv')
dbt_project_df = pd.read_csv('../data/dbt_project_' + repo_name + '.csv')
dbt_repo_knowledge_df = create_rag_db.merge_dbt_models_and_project_dfs(dbt_models_df, dbt_project_df)

# Jupyter already runs an event loop; nest_asyncio allows nested loop use inside it.
import nest_asyncio
nest_asyncio.apply()

# Reload the flow module so edits to src/llm_agents_flow.py are picked up without
# restarting the kernel, then re-import the (possibly redefined) flow class.
import importlib
import src.llm_agents_flow
importlib.reload(src.llm_agents_flow)
from src.llm_agents_flow import dbtChatFlow

# YAML configuration files passed to dbtChatFlow.
files = {
    'agents': '../config/agents.yml',
    'tasks': '../config/tasks.yml'
}

## EXECUTE FLOW

#### Use OpenAI LLMs

In [None]:
# No LLM is passed explicitly here, so the flow uses whatever default it defines.
flow = dbtChatFlow(files)
flow.plot()

user_input = "I want to add a new column 'overdue' to the model orders that come from raw_orders source, and have it available in customers. the overdue column is directly available in raw_orders, is not necessairy to calcylate it "

# Start the flow with the request plus the knowledge frame and retrieval objects.
result = flow.kickoff(
    inputs={
        "request": user_input,
        "dbt_repo_knowledge_df": dbt_repo_knowledge_df,
        "vectorstore": loaded_vectorstore,
        "embedding_function": langchain_openai_embeddings,
    }
)

# Echo the request as a header, then render the flow's raw answer as Markdown.
display(Markdown(f"<div style='font-size: 18px;'><b>User input:</b> <i>{user_input}</i></div><hr>"))
display(Markdown(result.raw))

### Use a local LLM with LM Studio in server mode

In [42]:
from crewai import LLM, Agent, Task, Crew
# Model served by the local LM Studio instance.
# Alternative kept for reference: "Llama-3.2-3B-Instruct-4bit".
local_llm_name = "qwen2.5-coder-7b-instruct"
local_llm = LLM(
    base_url="http://127.0.0.1:1234/v1",
    model=f"lm_studio/{local_llm_name}",
)

#### Test local model

In [None]:
# Minimal one-agent / one-task crew used only to check that the local model responds.
agent = Agent(
    llm=local_llm,
    role="Data Analyst",
    goal="Analyze eCommerce sales data",
    backstory="Expert in data analytics with years of experience",
)

task = Task(
    agent=agent,
    description="Analyze sales trends from the last quarter and identify key insights.",
    expected_output="A detailed report summarizing sales trends, key insights, and recommendations.",
)

crew = Crew(agents=[agent], tasks=[task], verbose=True)

# Run the crew and print whatever it returns.
result = crew.kickoff()
print(result)

#### Execute flow

In [None]:
# Same flow as the OpenAI run, but built with the LM Studio-backed `local_llm`.
local_flow = dbtChatFlow(files, local_llm)
local_flow.plot()

user_input = "I want to add a new column 'overdue' to the model orders that come from raw_orders source, and have it available in customers. the overdue column is directly available in raw_orders, is not necessairy to calcylate it "

# Start the flow with the request plus the knowledge frame and retrieval objects.
result = local_flow.kickoff(
    inputs={
        "request": user_input,
        "dbt_repo_knowledge_df": dbt_repo_knowledge_df,
        "vectorstore": loaded_vectorstore,
        "embedding_function": langchain_openai_embeddings,
    }
)

# Echo the request as a header, then render the flow's raw answer as Markdown.
display(Markdown(f"<div style='font-size: 18px;'><b>User input:</b> <i>{user_input}</i></div><hr>"))
display(Markdown(result.raw))

## Streamlit interface

In [None]:
import streamlit as st

In [None]:
import requests

def get_available_models(url="http://127.0.0.1:1234/v1/models", timeout=5.0, session=None):
    """Return the model ids listed by a ``/v1/models``-style HTTP endpoint.

    Best-effort helper: every failure (connection error, timeout, non-200
    status, unexpected payload shape) is printed and reported as an empty
    list instead of being raised.

    Args:
        url: Endpoint to query. Defaults to the local server address this
            notebook uses.
        timeout: Seconds to wait for the server. Without a timeout the HTTP
            call can block indefinitely when the server accepts the
            connection but never answers.
        session: Optional object exposing ``get(url, timeout=...)``, for
            example a ``requests.Session``. The ``requests`` module itself is
            used when omitted.

    Returns:
        list: The ``id`` of every entry under the response's ``data`` key, or
        ``[]`` when the request fails or the key is absent.
    """
    try:
        http = requests if session is None else session
        response = http.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return []
        payload = response.json()
        return [model['id'] for model in payload.get('data', [])]
    except Exception as e:
        # Deliberately broad: this is a convenience probe and must never raise.
        print(f"Error fetching models: {e}")
        return []


model_names = get_available_models()
print("Available models:", model_names)

In [6]:

# Work out the repository name; it keys the cached CSV exports and the collection.
repo_path = 'https://github.com/dbt-labs/jaffle-shop'
_, repo_name = generate_knowledge.extract_owner_and_repo(repo_path)

# Load the cached dbt exports and merge them into a single knowledge frame.
dbt_models_df = pd.read_csv(f'../data/dbt_models_{repo_name}.csv')
dbt_project_df = pd.read_csv(f'../data/dbt_project_{repo_name}.csv')
dbt_repo_knowledge_df = create_rag_db.merge_dbt_models_and_project_dfs(dbt_models_df, dbt_project_df)

# NOTE: this rebinds the notebook-level Chroma settings; the collection is named
# after the repository from here on.
CHROMADB_DIRECTORY = '../chromadb'
COLLECTION_NAME = repo_name

# Add a per-row 'contextual_info' column in place, then turn the rows into documents.
dbt_repo_knowledge_df['contextual_info'] = dbt_repo_knowledge_df.apply(
    create_rag_db.combine_contextual_fields,
    axis=1,
)
documents = create_rag_db.create_documents_from_df(dbt_repo_knowledge_df)
langchain_openai_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY,
)

# Clean metadata, chunk the documents, and persist them into the Chroma collection.
documents_cleaned = create_rag_db.clean_metadata(documents)
documents_chunked = create_rag_db.chunk_documents(
    documents_cleaned,
    chunk_overlap=100,
    chunk_size=500,
)
create_rag_db.save_vectorstore_to_chroma(
    documents_chunked,
    langchain_openai_embeddings,
    CHROMADB_DIRECTORY,
    COLLECTION_NAME,
)
print(f"chromadb for {repo_name} successfully created!", CHROMADB_DIRECTORY, COLLECTION_NAME)

Vectorstore saved to ../chromadb
chromadb for jaffle-shop successfully created! ../chromadb jaffle-shop


In [8]:
# Display the knowledge frame without the derived 'contextual_info' column
# (drop returns a copy; the frame itself keeps the column).
dbt_repo_knowledge_df.drop('contextual_info', axis='columns')

Unnamed: 0,knowledge_type,path,name,extension,code,config,materialized,is_snapshot,has_jinja_code,model_category,vertical,yml_code,tests,has_tests,sql_ids,has_select_all_in_last_select,has_group_by,primary_key,filters,is_filtered,macros,has_macros,parent_models,is_source_model,source,children_models,is_end_model,model_description,jinja_description,is_macro,packages,is_seed,is_test,description
0,models,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,customers,"{'models': [{'name': 'stg_customers', 'descrip...","{'columns': {'customer_id': ['not_null', 'uniq...",True,['customer_id'],True,False,customer_id,,False,,False,[],True,['ecom.raw_customers'],['customers'],False,"""Extracts customer data from the 'raw_customer...",,,,,,
1,models,models/staging/stg_locations.sql,stg_locations.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,locations,"{'models': [{'name': 'stg_locations', 'descrip...","{'columns': {'location_id': ['not_null', 'uniq...",True,['location_id'],True,False,location_id,,False,['dbt.date_trunc'],True,[],True,['ecom.raw_stores'],['locations'],False,"""Extracts a list of open locations from the 'r...",,,,,,
2,models,models/staging/stg_order_items.sql,stg_order_items.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,order_items,"{'models': [{'name': 'stg_order_items', 'descr...","{'columns': {'order_item_id': ['not_null', 'un...",True,"['order_item_id', 'order_id', 'product_id']",True,False,order_item_id,,False,,False,[],True,['ecom.raw_items'],['order_items'],False,"""Selects individual food and drink items from ...",,,,,,
3,models,models/staging/stg_orders.sql,stg_orders.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,orders,"{'models': [{'name': 'stg_orders', 'descriptio...","{'columns': {'order_id': ['not_null', 'unique'...",True,"['location_id', 'customer_id', 'order_id', 'st...",True,False,order_id,,False,"['cents_to_dollars', 'dbt.date_trunc']",True,[],True,['ecom.raw_orders'],"['order_items', 'orders']",False,Retrieves order data from the 'raw_orders' sou...,,,,,,
4,models,models/staging/stg_products.sql,stg_products.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,products,"{'models': [{'name': 'stg_products', 'descript...","{'columns': {'product_id': ['not_null', 'uniqu...",True,['product_id'],True,False,product_id,,False,['cents_to_dollars'],True,[],True,['ecom.raw_products'],"['order_items', 'products']",False,Retrieves product data from the 'raw_products'...,,,,,,
5,models,models/staging/stg_supplies.sql,stg_supplies.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...,,,False,False,stg,supplies,"{'models': [{'name': 'stg_supplies', 'descript...","{'columns': {'supply_uuid': ['not_null', 'uniq...",True,"['product_id', 'supply_id']",True,False,supply_uuid,,False,"['cents_to_dollars', 'dbt_utils.generate_surro...",True,[],True,['ecom.raw_supplies'],"['order_items', 'supplies']",False,Retrieves supply expense data from the 'raw_su...,,,,,,
6,models,models/marts/customers.sql,customers.sql,.sql,WITH customers AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,customers,"{'models': [{'name': 'customers', 'description...","{'columns': {'customer_id': ['not_null', 'uniq...",True,"['order_id', 'customer_id']",True,True,customer_id,['customers.customer_id = customer_orders_summ...,True,,False,"['stg_customers', 'orders']",False,,[],True,"""Aggregates customer data from the 'stg_custom...",,,,,,
7,models,models/marts/locations.sql,locations.sql,.sql,WITH locations AS\n (SELECT *\n FROM {{ ref...,,,False,False,other,locations,"{'semantic_models': [{'name': 'locations', 'de...",,False,,True,False,,,False,,False,['stg_locations'],False,,[],True,"""Retrieves all location records from the 'stg_...",,,,,,
8,models,models/marts/metricflow_time_spine.sql,metricflow_time_spine.sql,.sql,-- metricflow_time_spine.sql\nWITH days AS (--...,,,False,False,other,metricflow_time_spine,,,False,,True,False,,,False,['dbt_date.get_base_dates'],True,[],False,,[],True,"""Generates a date range spanning 10 years by c...",,,,,,
9,models,models/marts/order_items.sql,order_items.sql,.sql,WITH order_items AS\n (SELECT *\n FROM {{ r...,,,False,False,other,order_items,"{'models': [{'name': 'order_items', 'columns':...","{'columns': {'order_item_id': ['not_null', 'un...",True,"['order_id', 'product_id']",True,True,order_item_id,"['order_items.order_id = orders.order_id', 'or...",True,,False,"['stg_order_items', 'stg_orders', 'stg_product...",False,,['orders'],False,"""Combines order item details from 'stg_order_i...",,,,,,
