In [11]:
! pip install -q -U langchain-ollama  python-dotenv gradio langchain plotly scikit-learn matplotlib langchain-community psycopg2-binary

! pip install pymilvus -q

In [4]:
# imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# imports for langchain, plotly and Chroma
from langchain.vectorstores import Milvus
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, ChatOllama
## banco postgresql
import psycopg2
from langchain.docstore.document import Document

In [2]:
from pymilvus import connections

# Conecte-se ao Milvus na porta 19530
connections.connect(alias="default", host="localhost", port="19530")

status = connections.has_connection("default")
print("Conectado:", status)


Conectado: True


In [6]:

MODEL = "nomic-embed-text"

In [7]:
folders = glob.glob("../knowledge_base/*")

# Dados diretorio knowledge_base
def add_metadata(doc, doc_type):
    doc.metadata["doc_type"] = doc_type
    return doc

text_loader_kwargs = {'encoding': 'utf-8'}

documents = []

for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])

# Dados de conexão com o PostgreSQL
conn = psycopg2.connect(
    host="localhost",
    port="5433",
    database="app_db",
    user="admin",
    password="admin"
)

cursor = conn.cursor()

# Obter todas as tabelas do schema 'public'
cursor.execute("""
    SELECT tablename
    FROM pg_tables
    WHERE schemaname = 'public'
""")
tables = [row[0] for row in cursor.fetchall()]

documents_pg = []

# Função auxiliar para formatar cada linha como texto
def row_to_content(column_names, row):
    return "\n".join(f"{col}: {val}" for col, val in zip(column_names, row))

# Para cada tabela, buscar os dados e criar Documentos
for table in tables:
    cursor.execute(f"SELECT * FROM {table}")
    rows = cursor.fetchall()
    colnames = [desc[0] for desc in cursor.description]

    for row in rows:
        content = row_to_content(colnames, row)
        doc = Document(
            page_content=content,
            metadata={"source": "postgresql", "doc_type": table}
        )
        documents_pg.append(doc)

cursor.close()
conn.close()

all_documents = documents + documents_pg

# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200, separators=["\n\n","\n","."," ", ""])
chunks = text_splitter.split_documents(all_documents)

print(f"Total de documentos carregados de Postgresql {len(tables)} tabelas: {len(documents_pg)}")
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in all_documents)}")

Total de documentos carregados de Postgresql 13 tabelas: 59
Total number of chunks: 91
Document types found: {'features', 'ndwi_insights', 'savi_interpretation', 'recl_interpretation', 'savi_insights', 'ndvi_interpretation', 'osavi_interpretation', 'osavi_insights', 'recl_insights', 'company', 'osavi_fenologico', 'gndvi_insights', 'ndwi_interpretation', 'gndvi_interpretation'}


In [13]:
embeddings = OllamaEmbeddings(model=MODEL)

In [9]:
vector_store = Milvus.from_documents(
    chunks,
    embedding=embeddings,
    connection_args={"host": "localhost", "port": "19530"},
    collection_name="knowledge_base_prediza"
)

In [13]:
from pymilvus import connections, utility

collections = utility.list_collections()
print("Coleções no Milvus:")
for collection in collections:
    print("-", collection)

Coleções no Milvus:
- prediza_chunks


In [15]:
from pymilvus import Collection
#collection_name = "knowledge_base_prediza"

collection_name = "prediza_chunks"
collection = Collection(name=collection_name)

num_documents = collection.num_entities
print(f"A coleção '{collection_name}' tem {num_documents} documentos.")


A coleção 'prediza_chunks' tem 91 documentos.


In [31]:
from sklearn.decomposition import PCA
import plotly.express as px
import pandas as pd

# Gera embeddings e metadados
emb_vectors = [embeddings.embed_query(chunk.page_content) for chunk in chunks]
texts = [chunk.page_content for chunk in chunks]
doc_types = [chunk.metadata.get("doc_type", "desconhecido") for chunk in chunks]

# Reduz os vetores para 2D
pca = PCA(n_components=2)
emb_2d = pca.fit_transform(emb_vectors)

# Cria DataFrame
df = pd.DataFrame({
    "x": emb_2d[:, 0],
    "y": emb_2d[:, 1],
    "text": texts,
    "doc_type": doc_types
})

# Plota com Plotly, colorido por tipo de documento
fig = px.scatter(
    df, x="x", y="y",
    color="doc_type",
    hover_data=["text"],
    title="Visualização dos Embeddings por Tipo de Documento"
)
fig.show()

In [32]:

# Reduz para 3D
pca = PCA(n_components=3)
emb_3d = pca.fit_transform(emb_vectors)

# Cria DataFrame
df = pd.DataFrame({
    "x": emb_3d[:, 0],
    "y": emb_3d[:, 1],
    "z": emb_3d[:, 2],
    "text": texts,
    "doc_type": doc_types
})

# Gráfico 3D com Plotly
fig = px.scatter_3d(
    df, x="x", y="y", z="z",
    color="doc_type",
    hover_data=["text"],
    title="Visualização 3D dos Embeddings por Tipo de Documento"
)

fig.show()

In [14]:
# create a new Chat
llm = ChatOllama(temperature=0.7, model="phi4-mini")
# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vector_store.as_retriever()

# putting it together: set up the conversation chain with the Ollama, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


In [15]:
# teste
query = "O que ou quem é Prediza ?"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Prediza é uma empresa de ciência da computação aplicada na agricultura, energia e meio ambiente. Ela apoia tomadas de decisões com base em inteligência derivada de dados do meio ambiente, do solo, das plantas, microclimas e manejo agrícola para aumentar a produtividade e rentabilidade. A Prediza oferece serviços como Agricultura Preditiva, que utiliza soluções de IA e IoT para fornecer informações valiosas na agricultura, incluindo detecção precoce de doenças e estresse hídrico com ações recomendadas para correção rápida.


In [16]:

system_message = "Você é especialista em responder perguntas precisas sobre a empresa Prediza. Seja breve e preciso. Se não souber a resposta, diga. Não invente nada se não tiver recebido contexto relevante."
idioma = "Sempre responda no idioma Português, Brasil."


def chat(question,system_message=system_message, idioma=idioma):
    messages = [{"role": "system", "content": system_message}]
    messages.append({"role": "user", "content": question})
    messages.append({"role": "language", "content": idioma})
    result = conversation_chain.invoke({"question": messages})
    return result["answer"]

In [17]:
# Gradio:
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.
