In [1]:
import time
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from pydrive.files import GoogleDriveFile

In [None]:
from tqdm.auto import tqdm
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.document_loaders import ToMarkdownLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import pinecone
import os
import re

In [3]:
# Credentials and service handles, all sourced from environment variables.
api_key = os.environ.get('TOMD_API_KEY')

# Self-assignment: a no-op when the variable is already set; when it is unset
# the right-hand side is None and os.environ rejects it with a TypeError.
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY')

pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENVIRONMENT'),
)

# Name passed as `index_name` to the (currently commented-out) Pinecone upload
# in the polling cell.
index = 'xrpl'

# Chat model handle; temperature 0 requests the least random sampling.
llm = ChatOpenAI(model='gpt-3.5-turbo-16k', temperature=0)


In [4]:
# Splitter used to cut loaded documents into overlapping chunks before indexing.
# NOTE(review): `length_function=len` counts characters; a token-based splitter
# presumably measures chunks with its own tokenizer instead — confirm whether
# this argument has any effect here.
doc_splitter = TokenTextSplitter(chunk_size=2000, chunk_overlap=200, length_function=len)

In [5]:
# Build the PyDrive auth helper and the Drive client shared by the helper
# functions below (`drive` is read as a global by them).
# NOTE(review): no explicit authentication call is made here; presumably PyDrive
# runs its auth flow lazily on the first API request — confirm.
gauth = GoogleAuth()
drive = GoogleDrive(gauth)

In [6]:
def list_files_in_folder(folder_id):
    """List the Drive entries directly inside a folder, excluding trashed ones.

    Args:
        folder_id: Drive ID of the parent folder; interpolated into the query.

    Returns:
        The list produced by ``drive.ListFile(...).GetList()`` for that query.
    """
    listing = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"})
    return listing.GetList()

In [7]:
def get_file_info(file_list):
    """Reduce each entry of ``file_list`` to its title, id and MIME type.

    Args:
        file_list: Iterable of mapping-like objects exposing the keys
            ``'title'``, ``'id'`` and ``'mimeType'``.

    Returns:
        A new list with one ``{'title', 'id', 'mimeType'}`` dict per entry,
        in the same order as the input.

    Raises:
        KeyError: If an entry lacks one of the three keys.
    """
    wanted_keys = ('title', 'id', 'mimeType')
    return [{key: entry[key] for key in wanted_keys} for entry in file_list]

In [8]:
def download_file(file_id, file_name, mime_type):
    """Download one Drive file's content to a local file.

    Args:
        file_id: Drive ID of the file to fetch.
        file_name: Local filename (or path) handed to ``GetContentFile``.
        mime_type: MIME type forwarded as the ``mimetype`` argument.

    Returns:
        None.
    """
    remote = drive.CreateFile({'id': file_id})
    remote.GetContentFile(filename=file_name, mimetype=mime_type)

In [9]:
# Drive folder IDs read from the environment (None when the variable is unset).
# `current_files_folders` is the folder polled for incoming files;
# `old_files_folders` is the folder set as the new parent of processed files.
current_files_folders = os.getenv('current_files_folders')
old_files_folders = os.getenv('old_files_folders')

In [None]:
# Local directory that receives the Drive downloads and is then scanned by
# DirectoryLoader.
path = '/home/renisonmoita/Downloads/vega-automation/'
# Seconds to wait between two polls of the Drive folder; without a pause the
# loop would spin and issue Drive API requests back to back.
poll_interval_seconds = 30
current_qtd = 0
documents = []
os.makedirs(path, exist_ok=True)

# Poll the "current" Drive folder forever. Whenever it holds files and the file
# count differs from the previous poll: download them, load them as documents,
# split them, and move the processed files to the "old" folder.
while True:
    files_info = get_file_info(list_files_in_folder(current_files_folders))
    documents.clear()

    if files_info and current_qtd != len(files_info):
        print('Arquivos encontrados:\n')

        # Download each file into `path` (the directory the loader reads).
        # Failures are handled per file so one bad file does not block the rest.
        downloaded = []
        for infos in files_info:
            try:
                download_file(infos['id'], os.path.join(path, infos['title']), infos['mimeType'])
                print(f"Download complet:\nTítulo: {infos['title']}\nID: {infos['id']}\n")
                downloaded.append(infos)
            except Exception as e:
                print('Erro ao baixar arquivos')
                print(e)

        # NOTE(review): DirectoryLoader reads every file under `path`, so files
        # left over from earlier polls are loaded again — confirm this is wanted.
        loaded_docs = DirectoryLoader(path, loader_cls=TextLoader).load()

        for document in loaded_docs:
            content = document.page_content
            first_line = content.split('\n')[0]
            # The first line is used as the source when it contains a URL.
            # (The previous pattern lacked the ':' after the scheme and so
            # never matched a real URL.)
            if re.search(r'https?://\S+', first_line):
                source = first_line
            else:
                source = '*'
            documents.append(Document(page_content=content, metadata={'source': source}))

        # Summary only; dumping full document contents floods the cell output.
        print(f'{len(documents)} documento(s) carregado(s)\n')

        embedding = OpenAIEmbeddings(
            model='text-embedding-ada-002',
        )

        indexed = False
        try:
            # split_documents expects an iterable of Documents, not a single one.
            splitted_docs = doc_splitter.split_documents(documents)
            # TODO: the upload is still disabled; re-enable when ready.
            # Pinecone.from_documents(splitted_docs, embedding, index_name=index)
            print('Upado no Pinecone\n')
            indexed = True
        except Exception as e:
            print(f'Erro ao indexar documento\n{e}')

        # Move the processed files to the "old" folder. Changing 'parents' only
        # edits the local object; Upload() is what sends the change to Drive.
        # Files are moved only if they were downloaded and splitting succeeded.
        moved = 0
        if indexed:
            for infos in downloaded:
                try:
                    drive_file = drive.CreateFile({'id': infos['id']})
                    drive_file['parents'] = [{'kind': 'drive#fileLink', 'id': old_files_folders}]
                    drive_file.Upload()
                    moved += 1
                except Exception as e:
                    print(f"Erro ao mover arquivo {infos['title']}\n{e}")

        # Remember how many files are still expected in the folder, so files
        # arriving later change the count and trigger a new run.
        current_qtd = len(files_info) - moved
    else:
        current_qtd = len(files_info)

    time.sleep(poll_interval_seconds)
