In [None]:

import os.path
import requests

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

import os
from io import BytesIO
import mimetypes

from googleapiclient.http import MediaIoBaseDownload


In [None]:
from langchain_google_community import GoogleDriveLoader
from langchain_community.document_loaders import UnstructuredFileIOLoader

# Location of the Google OAuth client credentials file. It can be overridden
# with the GOOGLE_DRIVE_CREDENTIALS_PATH environment variable so the notebook
# does not depend on one user's home directory; the previous hard-coded path is
# kept as the default.
GOOGLE_CREDENTIALS_PATH = os.environ.get(
    "GOOGLE_DRIVE_CREDENTIALS_PATH",
    "/Users/gardille/.credentials/credentials.json",
)

# Loader configured for the "root" folder, restricted to the "document" and
# "pdf" file types.
loader = GoogleDriveLoader(
    folder_id="root",
    file_types=["document", "pdf"],
    credentials_path=GOOGLE_CREDENTIALS_PATH,
    # Optional: configure whether to recursively fetch files from subfolders. Defaults to False.
    recursive=True,
)

# `items` is consumed by the later sync cell, which reads `page_content` and
# `metadata` from each element.
items = loader.load()


In [None]:
# Display the result of to_json() for the first loaded item.
# NOTE(review): raises IndexError if `items` is empty.
items[0].to_json()

In [None]:
# Configuration for Dify API
# DIFY_API_URL and DIFY_API_KEY can be supplied through environment variables
# of the same name; the literals below are only fallbacks so existing runs keep
# working.
# NOTE(review): the fallback key is a credential committed with the notebook --
# rotate it and remove the literal once DIFY_API_KEY is set in the environment.
# NOTE(review): the default URL uses plain http, so the Bearer token is sent
# unencrypted -- switch to https if the server supports it.
DIFY_API_URL = os.environ.get('DIFY_API_URL', 'http://my.vigie.ai/v1')
DIFY_API_KEY = os.environ.get('DIFY_API_KEY', 'dataset-eWoK4OqxwncqWQfmPDpXcr9t')
KNOWLEDGE_NAME = 'Google Drive'

# If modifying these scopes, delete the file token.json.
# NOTE(review): SCOPES is not referenced by any other visible cell.
SCOPES = [
    "https://www.googleapis.com/auth/drive.metadata.readonly",
    "https://www.googleapis.com/auth/drive.readonly",
    "https://www.googleapis.com/auth/drive.file"
]


def get_or_create_knowledge(name):
    """Return the id of the Dify knowledge base named ``name``, creating it if absent.

    The dataset listing is paginated, so every page is scanned before deciding
    that the knowledge base does not exist; reading only the first page would
    miss an existing knowledge base and trigger a create request for it.

    Args:
        name: Exact knowledge-base name to look up or create.

    Returns:
        The ``id`` field of the matching or newly created dataset.

    Raises:
        Exception: If the create request answers with a status other than 200/201.
    """
    url = f"{DIFY_API_URL}/datasets"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }

    page = 1
    while True:
        response = requests.get(url, headers=headers, params={'page': page, 'limit': 100})
        if response.status_code != 200:
            # Listing failed: keep the original behaviour of falling through
            # to the create request, which raises if it also fails.
            break

        payload = response.json()
        datasets = payload.get('data', [])
        for dataset in datasets:
            if dataset['name'] == name:
                print(f"Found knowledge base: {name}")
                return dataset['id']

        # Stop on the last page; the empty-page check guards against a server
        # that keeps reporting has_more without returning rows.
        if not payload.get('has_more') or not datasets:
            break
        page += 1

    # Create new knowledge base if it doesn't exist
    print(f"Creating new knowledge base: {name}")
    response = requests.post(url, headers=headers, json={"name": name})

    if response.status_code in [200, 201]:
        return response.json()['id']
    raise Exception(f"Failed to create knowledge base: {response.text}")

def get_existing_documents(knowledge_id):
    """Map document name to document id for every document in a knowledge base.

    The documents listing is paginated; all pages are collected so the caller's
    "already exists?" check is not limited to the first page (which would cause
    existing documents to be uploaded again).

    Args:
        knowledge_id: Id of the Dify dataset to list.

    Returns:
        dict of ``{document name: document id}``. If several documents share a
        name, the one listed last wins.

    Raises:
        Exception: If any page request answers with a status other than 200.
    """
    url = f"{DIFY_API_URL}/datasets/{knowledge_id}/documents"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }

    documents = {}
    page = 1
    while True:
        response = requests.get(url, headers=headers, params={'page': page, 'limit': 100})
        if response.status_code != 200:
            raise Exception(f"Failed to fetch existing documents: {response.text}")

        payload = response.json()
        batch = payload.get('data', [])
        documents.update({doc['name']: doc['id'] for doc in batch})

        # An empty page also ends the loop, in case has_more is never cleared.
        if not payload.get('has_more') or not batch:
            return documents
        page += 1

def upload_document_to_dify(knowledge_id, name, content, mime_type):
    """Create a new document in a Dify knowledge base from in-memory bytes.

    Sends a multipart request with two parts: ``file`` (the content wrapped in
    a BytesIO) and ``data`` (the JSON indexing settings). The outcome is
    printed; nothing is returned.

    Args:
        knowledge_id: Id of the target dataset.
        name: File name sent for the uploaded part.
        content: Document body as bytes.
        mime_type: Content type declared for the file part.
    """
    url = f"{DIFY_API_URL}/datasets/{knowledge_id}/document/create_by_file"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}'
    }

    # No need to convert here, content is already bytes
    files = {
        'file': (name, BytesIO(content), mime_type),
        'data': (None, '{"indexing_technique":"high_quality","process_rule":{"mode": "automatic"}}', 'application/json')
    }

    response = requests.post(url, headers=headers, files=files)

    # Accept both 200 and 201, matching get_or_create_knowledge; treating only
    # 201 as success reports a failure for a 200 response.
    if response.status_code in (200, 201):
        print(f'Successfully uploaded {name} to Dify')
        return

    # Error bodies are not guaranteed to be JSON objects (e.g. an HTML page from
    # a proxy); fall back to the raw text instead of raising.
    try:
        message = response.json().get("message", response.text)
    except (ValueError, AttributeError):
        message = response.text
    print(f'Failed to upload {name} to Dify: {message}')

def update_document_in_dify(knowledge_id, document_id, name, content):
    """Replace the text of an existing Dify document and print the outcome.

    Args:
        knowledge_id: Id of the dataset that owns the document.
        document_id: Id of the document to overwrite.
        name: Document name sent in the request body.
        content: New document text as UTF-8 encoded bytes.
    """
    endpoint = f"{DIFY_API_URL}/datasets/{knowledge_id}/documents/{document_id}/update_by_text"
    request_headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }
    # The endpoint takes text, so the bytes are decoded before sending.
    body = {
        "name": name,
        "text": content.decode('utf-8'),
        "process_rule": {"mode": "automatic"},
    }

    reply = requests.post(endpoint, headers=request_headers, json=body)

    if reply.status_code != 200:
        print(f'Failed to update {name} in Dify: {reply.text}')
        return
    print(f'Successfully updated {name} in Dify')



In [None]:
# 2. Get or Create the Knowledge Base in Dify
knowledge_id = get_or_create_knowledge(KNOWLEDGE_NAME)
existing_documents = get_existing_documents(knowledge_id)

# MIME_TYPE_MAPPING is not defined in any visible cell, so a fresh
# "Restart & Run All" used to fail here with NameError. Reuse a mapping already
# present in the kernel if there is one, otherwise fall back to an empty mapping
# (every document then uses its own mimeType, or text/plain).
MIME_TYPE_MAPPING = globals().get("MIME_TYPE_MAPPING", {})

# 3. Upload or Update Documents in Dify
for doc in items:
    page_content = doc.page_content.encode("utf-8")  # Convert text to bytes

    # Prefer the Drive title. The last URL segment of `source` is the literal
    # "edit" for Google Docs URLs of the form .../document/d/<id>/edit, which
    # made such documents collide on the same name.
    document_name_in_dify_format = doc.metadata.get("title") or doc.metadata["source"].split("/")[-1]

    # Mapped type if one exists, else the document's own type, else plain text.
    source_mime_type = doc.metadata.get('mimeType')
    mime_type = MIME_TYPE_MAPPING.get(source_mime_type) or source_mime_type or 'text/plain'

    if document_name_in_dify_format in existing_documents:
        update_document_in_dify(knowledge_id, existing_documents[document_name_in_dify_format],
                                document_name_in_dify_format, page_content)
    else:
        upload_document_to_dify(knowledge_id, document_name_in_dify_format,
                                page_content, mime_type)


In [None]:
import os
import requests
from io import BytesIO
from langchain_google_community import GoogleDriveLoader
from langchain_community.document_loaders import UnstructuredFileIOLoader

# Configuration for Dify API
# The URL and key are read from the DIFY_API_URL / DIFY_API_KEY environment
# variables when set; the literals are fallbacks that preserve the previous
# behaviour.
# NOTE(review): the fallback key is a credential stored in the notebook --
# rotate it and delete the literal once the environment variable is in place.
# NOTE(review): plain http means the Bearer token is sent unencrypted; prefer
# https if the server supports it.
DIFY_API_URL = os.environ.get('DIFY_API_URL', 'http://my.vigie.ai/v1')
DIFY_API_KEY = os.environ.get('DIFY_API_KEY', 'dataset-eWoK4OqxwncqWQfmPDpXcr9t')
KNOWLEDGE_NAME = 'Google Drive'

# If modifying these scopes, delete the file token.json.
# NOTE(review): SCOPES is not used by any function in this cell.
SCOPES = [
    "https://www.googleapis.com/auth/drive.metadata.readonly",
    "https://www.googleapis.com/auth/drive.readonly",
    "https://www.googleapis.com/auth/drive.file"
]

def get_or_create_knowledge(name):
    """Look up a Dify knowledge base by name and create it when it is missing.

    All pages of the dataset listing are scanned before creating, because the
    listing is paginated and a match may sit beyond the first page.

    Args:
        name: Exact name of the knowledge base.

    Returns:
        The ``id`` of the existing or newly created dataset.

    Raises:
        Exception: If creation answers with a status other than 200/201.
    """
    url = f"{DIFY_API_URL}/datasets"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }

    page = 1
    while True:
        response = requests.get(url, headers=headers, params={'page': page, 'limit': 100})
        if response.status_code != 200:
            # As before, a failed listing is not fatal: fall through to the
            # create request and let it decide.
            break

        payload = response.json()
        datasets = payload.get('data', [])
        for dataset in datasets:
            if dataset['name'] == name:
                print(f"Found knowledge base: {name}")
                return dataset['id']

        # Last page reached (or an empty page returned): stop paging.
        if not payload.get('has_more') or not datasets:
            break
        page += 1

    print(f"Creating new knowledge base: {name}")
    data = {"name": name}
    response = requests.post(url, headers=headers, json=data)

    if response.status_code in [200, 201]:
        return response.json()['id']
    raise Exception(f"Failed to create knowledge base: {response.text}")

def get_existing_documents(knowledge_id):
    """Return ``{name: id}`` for all documents stored in a Dify knowledge base.

    Every page of the paginated listing is fetched; stopping at the first page
    would make later documents look new and cause duplicate uploads.

    Args:
        knowledge_id: Id of the dataset whose documents are listed.

    Returns:
        dict mapping document name to document id (last one wins on duplicates).

    Raises:
        Exception: If a page request answers with a status other than 200.
    """
    url = f"{DIFY_API_URL}/datasets/{knowledge_id}/documents"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }

    name_to_id = {}
    page = 1
    while True:
        response = requests.get(url, headers=headers, params={'page': page, 'limit': 100})
        if response.status_code != 200:
            raise Exception(f"Failed to fetch existing documents: {response.text}")

        payload = response.json()
        batch = payload.get('data', [])
        for doc in batch:
            name_to_id[doc['name']] = doc['id']

        # Stop when the server reports no further pages or returns no rows.
        if not payload.get('has_more') or not batch:
            return name_to_id
        page += 1

def upload_document_to_dify(knowledge_id, name, content, mime_type):
    """Upload bytes as a new document of a Dify knowledge base.

    The request is multipart: a ``file`` part holding the content and a
    ``data`` part holding the JSON indexing settings. The result is reported
    with ``print``; the function returns ``None``.

    Args:
        knowledge_id: Id of the target dataset.
        name: File name attached to the upload.
        content: Document body as bytes.
        mime_type: Content type declared for the file part.
    """
    url = f"{DIFY_API_URL}/datasets/{knowledge_id}/document/create_by_file"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}'
    }

    files = {
        'file': (name, BytesIO(content), mime_type),
        'data': (None, '{"indexing_technique":"high_quality","process_rule":{"mode": "automatic"}}', 'application/json')
    }

    response = requests.post(url, headers=headers, files=files)

    # 200 and 201 both mean success, consistent with get_or_create_knowledge.
    if response.status_code in (200, 201):
        print(f'Successfully uploaded {name} to Dify')
        return

    # The error body may not be a JSON object; report the raw text then.
    try:
        message = response.json().get("message", response.text)
    except (ValueError, AttributeError):
        message = response.text
    print(f'Failed to upload {name} to Dify: {message}')

def update_document_in_dify(knowledge_id, document_id, name, content):
    """Overwrite an existing Dify document with new text and print the result.

    Args:
        knowledge_id: Id of the dataset containing the document.
        document_id: Id of the document being replaced.
        name: Name sent along with the new text.
        content: UTF-8 encoded bytes; decoded to ``str`` for the JSON body.
    """
    target = f"{DIFY_API_URL}/datasets/{knowledge_id}/documents/{document_id}/update_by_text"
    auth_headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }
    payload = dict(
        name=name,
        text=content.decode('utf-8'),
        process_rule={"mode": "automatic"},
    )

    result = requests.post(target, headers=auth_headers, json=payload)

    # Only an exact 200 is treated as success; anything else is reported with
    # the raw response text.
    outcome = (
        f'Successfully updated {name} in Dify'
        if result.status_code == 200
        else f'Failed to update {name} in Dify: {result.text}'
    )
    print(outcome)

def main():
    """Synchronize the documents returned by the Google Drive loader into Dify.

    Loads items with ``GoogleDriveLoader``, resolves the knowledge base named
    ``KNOWLEDGE_NAME``, then for each item either updates the Dify document
    with the same title or uploads a new one.

    The credentials file location can be overridden with the
    ``GOOGLE_DRIVE_CREDENTIALS_PATH`` environment variable; the previous
    hard-coded path remains the default.
    """
    credentials_path = os.environ.get(
        "GOOGLE_DRIVE_CREDENTIALS_PATH",
        "/Users/gardille/.credentials/credentials.json",
    )

    # Initialize Google Drive Loader
    loader = GoogleDriveLoader(
        folder_id="root",
        file_types=["document", "pdf"],
        credentials_path=credentials_path,
        recursive=True,
    )

    # Load items from Google Drive
    items = loader.load()

    # Get or create knowledge base ID
    knowledge_id = get_or_create_knowledge(KNOWLEDGE_NAME)

    # Get existing documents from Dify
    existing_docs = get_existing_documents(knowledge_id)

    # Synchronize each document from Google Drive with Dify
    for item in items:
        doc_name = item.metadata['title']
        doc_content_bytes = item.page_content.encode('utf-8')  # Ensure content is bytes
        mime_type = item.metadata.get('mimeType', 'application/octet-stream')

        if doc_name in existing_docs:
            # Update document if it already exists
            update_document_in_dify(knowledge_id, existing_docs[doc_name], doc_name, doc_content_bytes)
        else:
            # Upload document if it does not exist
            upload_document_to_dify(knowledge_id, doc_name, doc_content_bytes, mime_type)

# Entry-point guard: main() is called only when __name__ equals "__main__".
# NOTE(review): presumably this holds inside a notebook kernel, so executing the
# cell would start the sync immediately -- confirm.
if __name__ == "__main__":
   main()

In [None]:
import os
import requests
from io import BytesIO

# Configuration for Dify API
# DIFY_API_URL / DIFY_API_KEY environment variables take precedence; the
# literals are fallbacks so the cell behaves as before when they are unset.
# NOTE(review): the fallback key is a credential saved in the notebook -- rotate
# it and drop the literal once the environment variable is configured.
# NOTE(review): the default URL is plain http, so the Bearer token travels
# unencrypted; use https if the server supports it.
DIFY_API_URL = os.environ.get('DIFY_API_URL', 'http://my.vigie.ai/v1')
DIFY_API_KEY = os.environ.get('DIFY_API_KEY', 'dataset-eWoK4OqxwncqWQfmPDpXcr9t')
KNOWLEDGE_NAME = 'Google Drive'

def get_or_create_knowledge(name):
    """Find the Dify knowledge base called ``name`` or create it.

    The dataset listing is paginated, so each page is checked in turn; only
    when no page contains the name is a create request sent.

    Args:
        name: Exact knowledge-base name.

    Returns:
        The ``id`` of the found or created dataset.

    Raises:
        Exception: If the create request answers with a status other than 200/201.
    """
    url = f"{DIFY_API_URL}/datasets"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }

    page = 1
    while True:
        response = requests.get(url, headers=headers, params={'page': page, 'limit': 100})
        if response.status_code != 200:
            # Unchanged behaviour: a failed listing falls through to creation.
            break

        payload = response.json()
        datasets = payload.get('data', [])
        match = next((d for d in datasets if d['name'] == name), None)
        if match is not None:
            print(f"Found knowledge base: {name}")
            return match['id']

        # No more pages, or an empty page: nothing further to scan.
        if not payload.get('has_more') or not datasets:
            break
        page += 1

    print(f"Creating new knowledge base: {name}")
    data = {"name": name}
    response = requests.post(url, headers=headers, json=data)

    if response.status_code in [200, 201]:
        return response.json()['id']
    raise Exception(f"Failed to create knowledge base: {response.text}")

def get_existing_documents(knowledge_id):
    """Collect ``{document name: document id}`` for a Dify knowledge base.

    The listing is paginated; all pages are merged so that the caller can tell
    reliably which documents already exist.

    Args:
        knowledge_id: Id of the dataset to inspect.

    Returns:
        dict mapping each document name to its id (last one wins on duplicates).

    Raises:
        Exception: If a page request answers with a status other than 200.
    """
    url = f"{DIFY_API_URL}/datasets/{knowledge_id}/documents"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }

    found = {}
    page = 1
    more = True
    while more:
        response = requests.get(url, headers=headers, params={'page': page, 'limit': 100})
        if response.status_code != 200:
            raise Exception(f"Failed to fetch existing documents: {response.text}")

        payload = response.json()
        batch = payload.get('data', [])
        found.update((doc['name'], doc['id']) for doc in batch)

        # Continue only while the server reports more pages and returned rows.
        more = bool(payload.get('has_more')) and bool(batch)
        page += 1
    return found

def upload_document_to_dify(knowledge_id, name, content, mime_type):
    """Upload bytes as a new Dify document, logging the request and its outcome.

    Args:
        knowledge_id: Id of the target dataset.
        name: File name attached to the upload.
        content: Document body as bytes.
        mime_type: Content type declared for the file part.

    Returns:
        The response object of the POST request, or ``None`` if sending the
        request raised an exception (the exception is printed, not re-raised).
    """
    import json  # local import: this cell's import list does not include json

    url = f"{DIFY_API_URL}/datasets/{knowledge_id}/document/create_by_file"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}'
    }

    data_field = {
        "indexing_technique": "high_quality",
        "process_rule": {
            "mode": "automatic",
            "rules": {}
        }
    }

    files = {
        'file': (name, BytesIO(content), mime_type),
        # json.dumps instead of str(dict).replace("'", '"'): the string hack
        # yields invalid JSON as soon as a value holds True/False/None or an
        # apostrophe.
        'data': (None, json.dumps(data_field), 'application/json')
    }

    # Log request details, with the Bearer token masked so the API key does not
    # end up in the saved notebook output.
    logged_headers = {
        key: ('Bearer ***' if key == 'Authorization' else value)
        for key, value in headers.items()
    }
    print("Uploading document:")
    print(f"URL: {url}")
    print(f"Headers: {logged_headers}")
    print(f"Files: {list(files)}")

    try:
        response = requests.post(url, headers=headers, files=files)
    except Exception as e:
        # Best-effort, as before: report the failure and keep the caller going.
        print(f"An error occurred: {e}")
        return None

    print(response.text)  # Print raw response text for debugging

    # 200 and 201 both count as success, consistent with get_or_create_knowledge.
    if response.status_code in (200, 201):
        print(f'Successfully uploaded {name} to Dify')
    else:
        # The error body may not be a JSON object; fall back to the raw text.
        try:
            message = response.json().get("message", response.text)
        except (ValueError, AttributeError):
            message = response.text
        print(f'Failed to upload {name} to Dify: {message}')

    return response

def update_document_in_dify(knowledge_id, document_id, name, content):
    """Replace the text of an existing Dify document, logging request and outcome.

    Args:
        knowledge_id: Id of the dataset containing the document.
        document_id: Id of the document to overwrite.
        name: Name sent along with the new text.
        content: UTF-8 encoded bytes; decoded to ``str`` for the JSON body.

    Returns:
        The response object of the POST request, or ``None`` if sending the
        request raised an exception (the exception is printed, not re-raised).
    """
    url = f"{DIFY_API_URL}/datasets/{knowledge_id}/documents/{document_id}/update_by_text"
    headers = {
        'Authorization': f'Bearer {DIFY_API_KEY}',
        'Content-Type': 'application/json'
    }

    data = {
        "name": name,
        "text": content.decode('utf-8'),
        "process_rule": {
            "mode": "automatic"
        }
    }

    # Log request details. The Bearer token is masked and the document text is
    # summarised by length, so neither the API key nor whole documents are
    # written into the notebook output.
    logged_headers = {
        key: ('Bearer ***' if key == 'Authorization' else value)
        for key, value in headers.items()
    }
    print("Updating document:")
    print(f"URL: {url}")
    print(f"Headers: {logged_headers}")
    print(f"Data: name={name!r}, text=<{len(data['text'])} characters>, process_rule={data['process_rule']}")

    try:
        response = requests.post(url, headers=headers, json=data)
    except Exception as e:
        # Best-effort, as before: report the failure and keep the caller going.
        print(f"An error occurred: {e}")
        return None

    if response.status_code == 200:
        print(f'Successfully updated {name} in Dify')
    else:
        # The error body may not be a JSON object; fall back to the raw text.
        try:
            message = response.json().get("message", response.text)
        except (ValueError, AttributeError):
            message = response.text
        print(f'Failed to update {name} in Dify: {message}')

    return response

def main():
    """Sync a hard-coded sample item into the Dify knowledge base.

    The sample is a dict whose ``kwargs`` entry holds ``metadata`` and
    ``page_content``. Each item is updated in Dify when a document with the
    same title already exists there, and uploaded otherwise.
    """
    # Assuming items is already loaded and available
    sample_documents = [
        # Example item structure based on the provided JSON structure
        {
            "lc": 1,
            "type": "constructor",
            "id": ["langchain", "schema", "document", "Document"],
            "kwargs": {
                "metadata": {
                    "source": "https://docs.google.com/document/d/1deKTX0UnlqNNZVNUW1yaQX6DRb5EGT4wkW2rzA4gQLM/edit",
                    "title": "docker-compose.yaml",
                    "when": "2024-07-10T13:03:09.630Z"
                },
                "page_content": "I'm a very interesting document.\n you should read me!",
                "type": "Document"
            }
        }
    ]

    # Resolve the knowledge base, then fetch the name -> id map of what it holds.
    kb_id = get_or_create_knowledge(KNOWLEDGE_NAME)
    known = get_existing_documents(kb_id)

    for entry in sample_documents:
        fields = entry["kwargs"]
        title = fields["metadata"]['title']
        payload = fields["page_content"].encode('utf-8')  # the helpers expect bytes
        declared_type = fields["metadata"].get('mimeType', 'application/octet-stream')

        if title not in known:
            # Not in Dify yet: create it.
            upload_document_to_dify(kb_id, title, payload, declared_type)
            continue
        # Already in Dify: overwrite its text.
        update_document_in_dify(kb_id, known[title], title, payload)

# The `if __name__ == "__main__":` guard below is commented out, so main() is
# called unconditionally every time this cell is executed.
#if __name__ == "__main__":
main()

In [None]:
# Display the current value of knowledge_id.
# NOTE(review): in the visible cells this name is assigned at module level only
# by the cell that calls get_or_create_knowledge(KNOWLEDGE_NAME) directly; inside
# the main() functions it is a local variable. Confirm that cell has been run,
# otherwise this raises NameError.
knowledge_id