In [1]:
import pandas as pd
import re
import yaml
import sqlparse
import os
import pandas as pd
import numpy as np
import requests
from IPython.display import display, Markdown

from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

### INIT

In [2]:
def add_repo_root_path():
    import os
    import sys
    repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    if repo_root not in sys.path:
        sys.path.append(repo_root)
        
add_repo_root_path()
from src import generate_knowledge
from src import create_rag_db
from src import llm_chain_tools
from src.enhanced_retriever import EnhancedRetriever

In [3]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', 10) 

In [4]:
generate_knowledge.add_repo_root_path()
import openai_setup

OPENAI_API_KEY = openai_setup.conf['key']
OPENAI_PROJECT = openai_setup.conf['project']
OPENAI_ORGANIZATION = openai_setup.conf['organization']
DEFAULT_LLM_MODEL = "gpt-4o-mini"
CHROMADB_DIRECTORY = '../chromadb'
COLLECTION_NAME = "my_chromadb" 

import os
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ['OPENAI_MODEL_NAME'] = DEFAULT_LLM_MODEL

In [5]:
langchain_openai_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model="text-embedding-ada-002")
langchain_openai_llm = ChatOpenAI(model=DEFAULT_LLM_MODEL, temperature=0.1, openai_api_key=OPENAI_API_KEY, openai_organization = OPENAI_ORGANIZATION)

In [24]:
from langchain_openai import ChatOpenAI

loaded_vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=CHROMADB_DIRECTORY,
    embedding_function=langchain_openai_embeddings
)

_, repo_name = generate_knowledge.extract_owner_and_repo('https://github.com/dbt-labs/jaffle-shop')
dbt_models_df = pd.read_csv('../data/dbt_models_' + repo_name + '.csv')
dbt_project_df = pd.read_csv('../data/dbt_project_' + repo_name + '.csv')
dbt_repo_knowledge_df = create_rag_db.merge_dbt_models_and_project_dfs(dbt_models_df, dbt_project_df)

import nest_asyncio
nest_asyncio.apply()

import importlib
import src.llm_agents_flow
importlib.reload(src.llm_agents_flow)
from src.llm_agents_flow import dbtChatFlow

files = {
    'agents': '../config/agents.yml',
    'tasks': '../config/tasks.yml'
}

## EXECUTE FLOW

#### User OpenAI LLMs

In [None]:
flow = dbtChatFlow(files)
flow.plot()

user_input = "I want to add a new column 'overdue' to the model orders that come from raw_orders source, and have it available in customers. the overdue column is directly available in raw_orders, is not necessairy to calcylate it "
result = flow.kickoff(inputs={"request": user_input, "dbt_repo_knowledge_df": dbt_repo_knowledge_df, "vectorstore": loaded_vectorstore, "embedding_function":langchain_openai_embeddings})
display(Markdown(f"<div style='font-size: 18px;'><b>User input:</b> <i>{user_input}</i></div><hr>"))
display(Markdown(result.raw))

### Use local LLM model with LM Studio server mode

In [42]:
from crewai import LLM, Agent, Task, Crew
#local_llm_name = "Llama-3.2-3B-Instruct-4bit"
local_llm_name = "qwen2.5-coder-7b-instruct"
local_llm = LLM(model="lm_studio/"+local_llm_name, base_url="http://127.0.0.1:1234/v1")

#### Test local model

In [None]:
agent = Agent(
    role="Data Analyst",
    goal="Analyze eCommerce sales data",
    backstory="Expert in data analytics with years of experience",
    llm=local_llm
)

task = Task(
    description="Analyze sales trends from the last quarter and identify key insights.",
    agent=agent,
    expected_output="A detailed report summarizing sales trends, key insights, and recommendations."
)

crew = Crew(
    agents=[agent],
    tasks=[task],
    verbose=True
)

result = crew.kickoff() 
print(result)

#### Execute flow

In [None]:
local_flow = dbtChatFlow(files, local_llm)
local_flow.plot()

user_input = "I want to add a new column 'overdue' to the model orders that come from raw_orders source, and have it available in customers. the overdue column is directly available in raw_orders, is not necessairy to calcylate it "
result = local_flow.kickoff(inputs={"request": user_input, "dbt_repo_knowledge_df": dbt_repo_knowledge_df, "vectorstore": loaded_vectorstore, "embedding_function":langchain_openai_embeddings})
display(Markdown(f"<div style='font-size: 18px;'><b>User input:</b> <i>{user_input}</i></div><hr>"))
display(Markdown(result.raw))

## Streamlit interface

In [None]:
import streamlit as st

In [None]:
import requests

def get_available_models():
    url = "http://127.0.0.1:1234/v1/models"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            models = response.json()
            model_names = [model['id'] for model in models.get('data', [])]

            return model_names
        else:
            print(f"Error: {response.status_code}")
            return []
    except Exception as e:
        print(f"Error fetching models: {e}")
        return []

model_names = get_available_models()
print("Available models:", model_names)

In [5]:

repo_path = 'https://github.com/dbt-labs/jaffle-shop'
_, repo_name = generate_knowledge.extract_owner_and_repo(repo_path)

dbt_models_df = pd.read_csv('../data/dbt_models_' + repo_name + '.csv')
dbt_project_df = pd.read_csv('../data/dbt_project_' + repo_name + '.csv')
dbt_repo_knowledge_df = create_rag_db.merge_dbt_models_and_project_dfs(dbt_models_df, dbt_project_df)


CHROMADB_DIRECTORY = '../chromadb'
COLLECTION_NAME = repo_name

dbt_repo_knowledge_df['contextual_info'] = dbt_repo_knowledge_df.apply(create_rag_db.combine_contextual_fields, axis=1)
documents = create_rag_db.create_documents_from_df(dbt_repo_knowledge_df)
langchain_openai_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model="text-embedding-ada-002")

documents_cleaned = create_rag_db.clean_metadata(documents)
documents_chunked = create_rag_db.chunk_documents(documents_cleaned, chunk_size=500, chunk_overlap=100)
create_rag_db.save_vectorstore_to_chroma(documents_chunked, langchain_openai_embeddings, CHROMADB_DIRECTORY, COLLECTION_NAME)
print("chromadb for " + repo_name + " successfully created!", CHROMADB_DIRECTORY, COLLECTION_NAME)

Vectorstore saved to ../chromadb
chromadb for jaffle-shop successfully created! ../chromadb jaffle-shop


In [None]:
dbt_repo_knowledge_df.drop(columns='contextual_info')

#### Fix read folders

In [19]:
def extract_model_file_content(path, is_online, repo_base_url):
    try:
        if is_online:
            # Build complete URL
            file_url = f"{repo_base_url}/{path}" if repo_base_url else path
            response = requests.get(file_url)
            if response.status_code == 200:
                content = response.text
            else:
                return f"Error: {response.status_code} {response.reason}"
        else:
            # Read content locally
            open_path = repo_base_url.rstrip('/') + '/' + path

            print(open_path)
            with open(open_path, 'r', encoding='utf-8') as file:
                content = file.read()

        # Process content based on file type
        if path.endswith(('.yml', '.yaml')):
            try:
                return yaml.safe_load(content)  # Parse YAML and return as dictionary
            except yaml.YAMLError as e:
                return f"Error parsing YAML: {e}"
        elif path.endswith('.sql'):
            try:
                return sqlparse.format(content, reindent=True, keyword_case='upper')  # Format SQL
            except Exception as e:
                return f"Error parsing SQL: {e}"
        else:
            return content  # Return plain text for other types

    except Exception as e:
        return f"Error: {e}"

def add_model_code_column(df, is_online, repo_url):
    if is_online:
        pass
    else:
        repo_base_url = repo_url

    # Extract content for each file and process it based on type
    df['sql_code'] = df['path'].apply(lambda path: extract_model_file_content(path, is_online, repo_base_url))
    return df

In [20]:
repo_path = "/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main"
repo_elements = generate_knowledge.list_local_repo_structure(repo_path)
print(repo_elements)

repo_dbt_elements = generate_knowledge.select_dbt_elements_by_extension(repo_elements)
print(repo_dbt_elements)

repo_dbt_models = generate_knowledge.select_dbt_models(repo_dbt_elements)
print(repo_dbt_models)

dbt_project_df =  generate_knowledge.select_dbt_project_files(repo_dbt_elements)
display(dbt_project_df.head(20))

dbt_models_df = generate_knowledge.generate_dbt_models_df(repo_dbt_models)
display(dbt_models_df)

dbt_project_df, dbt_models_df = generate_knowledge.move_snapshots_to_models(dbt_project_df, dbt_models_df)

dbt_models_df = add_model_code_column(dbt_models_df, False, repo_path)
dbt_models_df.head(3)


['.DS_Store', 'requirements.txt', '.pre-commit-config.yaml', 'package-lock.yml', 'packages.yml', '.sqlfluffignore', 'README.md', '.gitignore', 'Taskfile.yml', '.sqlfluff', 'dbt_project.yml', 'requirements.in', 'data-tests/', 'data-tests/.gitkeep', 'models/', 'models/.DS_Store', 'models/staging/', 'models/staging/stg_products.sql', 'models/staging/stg_supplies.yml', 'models/staging/stg_customers.sql', 'models/staging/stg_orders.yml', 'models/staging/stg_products.yml', 'models/staging/stg_supplies.sql', 'models/staging/stg_customers.yml', 'models/staging/stg_orders.sql', 'models/staging/__sources.yml', 'models/staging/stg_order_items.sql', 'models/staging/stg_locations.sql', 'models/staging/stg_order_items.yml', 'models/staging/stg_locations.yml', 'models/marts/', 'models/marts/products.yml', 'models/marts/supplies.sql', 'models/marts/customers.yml', 'models/marts/orders.yml', 'models/marts/products.sql', 'models/marts/customers.sql', 'models/marts/supplies.yml', 'models/marts/orders.sql

Unnamed: 0,path,name,extension
0,package-lock.yml,package-lock.yml,.yml
1,packages.yml,packages.yml,.yml
2,Taskfile.yml,Taskfile.yml,.yml
3,dbt_project.yml,dbt_project.yml,.yml
4,macros/cents_to_dollars.sql,cents_to_dollars.sql,.sql
5,macros/generate_schema_name.sql,generate_schema_name.sql,.sql
6,jaffle-data/raw_items.csv,raw_items.csv,.csv
7,jaffle-data/raw_customers.csv,raw_customers.csv,.csv
8,jaffle-data/raw_stores.csv,raw_stores.csv,.csv
9,jaffle-data/raw_orders.csv,raw_orders.csv,.csv


Unnamed: 0,path,name,extension
0,models/staging/stg_products.sql,stg_products.sql,.sql
1,models/staging/stg_supplies.yml,stg_supplies.yml,.yml
2,models/staging/stg_customers.sql,stg_customers.sql,.sql
3,models/staging/stg_orders.yml,stg_orders.yml,.yml
4,models/staging/stg_products.yml,stg_products.yml,.yml
5,models/staging/stg_supplies.sql,stg_supplies.sql,.sql
6,models/staging/stg_customers.yml,stg_customers.yml,.yml
7,models/staging/stg_orders.sql,stg_orders.sql,.sql
8,models/staging/__sources.yml,__sources.yml,.yml
9,models/staging/stg_order_items.sql,stg_order_items.sql,.sql


/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/stg_products.sql
/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/stg_supplies.yml
/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/stg_customers.sql
/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/stg_orders.yml
/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/stg_products.yml
/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/stg_supplies.sql
/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/stg_customers.yml
/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/stg_orders.sql
/Users/jobandtalent/Documents/GitHub/llm-rag-dbt/test_repo/jaffle-shop-main/models/staging/__sources.yml
/Users/jobandtalent/Documents/Git

  snapshots_filter = dbt_project_df['path'].str.contains(r'(snapshots/|^snap)', case=False, regex=True)


Unnamed: 0,path,name,extension,sql_code
0,models/staging/stg_products.sql,stg_products.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...
1,models/staging/stg_supplies.yml,stg_supplies.yml,.yml,"{'models': [{'name': 'stg_supplies', 'descript..."
2,models/staging/stg_customers.sql,stg_customers.sql,.sql,WITH SOURCE AS\n (SELECT *\n FROM {{ source...
