In [None]:
import os
from enum import Enum

import pandas as pd
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain_core.output_parsers import PydanticOutputParser
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from pinecone.exceptions import NotFoundException
from pydantic import BaseModel, Field

In [None]:
# Names of dataset columns (presumably the ones this notebook cares about;
# only 'ds_fatos' is referenced in the nearby cells).
colunas = [
    "cd_causa",
    "cd_atendimento",
    "ds_Acao_Judicial",
    "ds_fatos",
    "ds_Pedidos",
    "ds_Qualificacao",
]

In [None]:
# Input selection: load the pipe-separated, UTF-8 encoded dataset file.
df = pd.read_csv("dataset_clinica20252.csv", encoding="utf-8", sep="|")

# Quick look at what was loaded: dimensions, then the first 20 rows.
print(df.shape)
print(df.head(20))

In [None]:
# Literal excerpt used to select the rows kept in `df_validado_true`.
TRECHO_VALIDADO = "A parte Autora é beneficiária da Previdência Social, sendo tal sua única fonte de renda.  Portanto, por"

# regex=False makes this a plain substring match: the excerpt contains '.'
# characters, which `str.contains` would otherwise interpret as regex
# wildcards. na=False treats missing `ds_fatos` values as non-matching.
df_validado_true = df.loc[
    df['ds_fatos'].str.contains(TRECHO_VALIDADO, regex=False, na=False)
]
print(df_validado_true.shape)
print(df_validado_true.head())

In [None]:
# Load variables from a .env file into the process environment; override=True
# lets values from the file replace variables that are already set.
load_dotenv(override=True)
def cria_vector_store(df_origem=None, index_name="clinicas"):
    """Rebuild the Pinecone index from ``ds_fatos`` texts and return its vector store.

    Every non-null ``ds_fatos`` value becomes a ``Document`` carrying the
    metadata ``{"label": "SIM"}``. The index is created when it does not exist,
    emptied, and then repopulated with embeddings produced by the OpenAI
    ``text-embedding-3-large`` model.

    Args:
        df_origem: DataFrame with a ``ds_fatos`` column. When omitted, the
            notebook-level ``df_validado_true`` is used.
        index_name: Name of the Pinecone index to (re)build.

    Returns:
        The ``PineconeVectorStore`` returned by
        ``PineconeVectorStore.from_documents`` for the indexed documents.

    Raises:
        ValueError: If there is no non-null ``ds_fatos`` text to index. This is
            raised before the index is touched, so existing vectors are never
            wiped when there is nothing to replace them with.
    """
    if df_origem is None:
        # Backward-compatible default: fall back to the notebook global.
        df_origem = df_validado_true

    # Validate the input *before* any destructive operation on the index.
    textos = df_origem['ds_fatos'].dropna().tolist()
    if not textos:
        raise ValueError(
            "No non-null 'ds_fatos' texts to index; the Pinecone index was left untouched."
        )

    docs = [
        Document(page_content=texto, metadata={"label": "SIM"})
        for texto in textos
    ]
    print(len(docs))
    print(docs[0])

    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

    if not pc.has_index(index_name):
        pc.create_index(
            name=index_name,
            # 3072 is the vector size of text-embedding-3-large, used below.
            dimension=3072,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )

    index = pc.Index(index_name)

    try:
        # Start from a clean index so stale vectors from earlier runs are gone.
        index.delete(delete_all=True)
    except NotFoundException:
        # NOTE(review): a freshly created / already empty serverless index is
        # reported to answer delete_all with 404 "namespace not found"; there
        # is nothing to delete in that case, so it is safe to continue.
        pass

    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-large",
        openai_api_key=os.getenv("OPENAI_API_KEY")
    )

    return PineconeVectorStore.from_documents(
        docs,
        embeddings,
        index_name=index_name
    )