In [1]:
import os, json
import urllib.parse
from io import BytesIO
from uuid import uuid4
from llama_parse import LlamaParse
from langchain_chroma import Chroma
from office365.sharepoint.client_context import ClientContext, UserCredential
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from office365.sharepoint.files.file import File
from langchain_core.documents import Document
from tempfile import NamedTemporaryFile
from datetime import datetime, timedelta

from dotenv import load_dotenv
# Load the .env file first so the environment lookups below can see its values.
load_dotenv()

# SharePoint account and location settings.
USERNAME = os.environ.get('sharepoint_email')
PASSWORD = os.environ.get('sharepoint_password')
SHAREPOINT_URL_SITE = os.environ.get('sharepoint_url_site')
SHAREPOINT_SITE_NAME = os.environ.get('sharepoint_site_name')
SHAREPOINT_DOC_LIB = os.environ.get('sharepoint_doc_library')

# API keys for the parsing and embedding services.
LLAMA_API_KEY = os.environ.get('LLAMA_CLOUD_API_KEY')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

# Embedding model and chunking configuration.
embeddings = GoogleGenerativeAIEmbeddings(
    model='models/embedding-001',
    google_api_key=GEMINI_API_KEY,
)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
properties_file = 'Metadata.json'

# SharePoint client context authenticated with the user credentials above.
conn = ClientContext(SHAREPOINT_URL_SITE).with_credentials(
    UserCredential(USERNAME, PASSWORD)
)

[nltk_data] Downloading package punkt_tab to c:\Users\Devashish.Revadk
[nltk_data]     ar\AppData\Local\miniconda3\envs\presales-
[nltk_data]     pipeline\Lib\site-
[nltk_data]     packages\llama_index\core\_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _get_files_and_folders(FOLDER_NAME=''):
    """Fetch the files and sub-folders of one folder in the document library.

    Args:
        FOLDER_NAME: Folder path below ``SHAREPOINT_DOC_LIB``. An empty value
            targets the document library itself.

    Returns:
        tuple: ``(files, folders)`` collections of the requested folder.
    """
    if FOLDER_NAME:
        server_relative_url = f'{SHAREPOINT_DOC_LIB}/{FOLDER_NAME}'
    else:
        server_relative_url = SHAREPOINT_DOC_LIB

    folder = conn.web.get_folder_by_server_relative_url(server_relative_url)
    # Ask for the "Files" and "Folders" collections together with the folder
    # and run the query before handing the collections back.
    folder.expand(["Files", "Folders"]).get().execute_query()
    return folder.files, folder.folders


In [3]:
def get_file_properties(last_download, folder_name=''):
    """Walk a document-library folder tree and report files newer than a cut-off.

    Every file whose ``time_last_modified`` is greater than ``last_download``
    (or every file when ``last_download`` is ``None``) is selected. The name of
    each selected file is printed as it is found. Sub-folders are visited
    recursively, except any folder named "forms" (case-insensitive).

    Args:
        last_download: Cut-off value compared against each file's
            ``time_last_modified``; ``None`` selects every file.
        folder_name: Folder path below the document library to start from.
            An empty value starts at the library itself.

    Returns:
        list: The ``time_last_modified`` value of each selected file, in
        traversal order (a folder's own files before its sub-folders).

    NOTE(review): ``all_files_properties`` and ``file_id_list`` are collected
    but not returned; the return value is kept as the list of timestamps
    because the calling cell takes ``max()`` of it. Decide whether the
    metadata should be exposed or dropped.
    """
    all_files_properties = []
    file_id_list = []
    new_files = []

    # The library segment is the same for every folder, so encode it once.
    encoded_doc_lib = urllib.parse.quote(SHAREPOINT_DOC_LIB, safe=":/")

    def process_folder(current_folder):
        files, folders = _get_files_and_folders(current_folder)

        # Percent-encode the folder path the same way as the library and file
        # name, so folders containing spaces do not yield a half-encoded URL.
        if current_folder:
            folder_segment = urllib.parse.quote(current_folder, safe=":/") + '/'
        else:
            folder_segment = ''

        for file in files:
            file_last_modified = file.time_last_modified
            if last_download is not None and not file_last_modified > last_download:
                continue

            new_files.append(file_last_modified)
            encoded_file_name = urllib.parse.quote(file.name, safe=":/")
            resource_url = (
                f'{SHAREPOINT_URL_SITE}/{encoded_doc_lib}/'
                f'{folder_segment}{encoded_file_name}'
            )
            all_files_properties.append({
                'file_name': file.name,
                'file_unique_id': file.unique_id,
                'resource_url': resource_url,
                'file_size': file.length,
            })
            file_id_list.append(file.unique_id)
            print(file.name)

        for folder in folders:
            # Skip the "Forms" folder of the library.
            if folder.name.lower() == "forms":
                continue
            # The unencoded path is what the folder lookup expects.
            child_path = f'{current_folder}/{folder.name}' if current_folder else folder.name
            process_folder(child_path)

    process_folder(folder_name)
    return new_files
    

In [7]:
timestamp_file = 'last_download_time.json'

# Shift applied to the value written to disk (5h30m; the original code names
# the shifted value "ist"). Shared by save and load so the two cannot drift.
_IST_OFFSET = timedelta(hours=5, minutes=30)


def get_last_download_timestamp():
    """Return the checkpoint stored by ``save_last_download_timestamp``.

    The file holds the saved timestamp shifted forward by ``_IST_OFFSET``.
    The shift is removed here so the returned value equals the timestamp that
    was originally passed to ``save_last_download_timestamp`` and can be
    compared directly against values of the same kind.

    Returns:
        datetime | None: The saved timestamp, or ``None`` when
        ``timestamp_file`` does not exist.

    Raises:
        json.JSONDecodeError, KeyError, ValueError: If the file exists but
        does not contain a valid ``{"last_download": <ISO string>}`` payload.
    """
    if not os.path.exists(timestamp_file):
        return None
    with open(timestamp_file, 'r') as file:
        data = json.load(file)
    return datetime.fromisoformat(data['last_download']) - _IST_OFFSET


def save_last_download_timestamp(timestamp):
    """Persist ``timestamp`` to ``timestamp_file`` as an ISO-8601 string.

    The value is written shifted forward by ``_IST_OFFSET`` (the on-disk
    format is unchanged); ``get_last_download_timestamp`` undoes the shift.

    Args:
        timestamp: ``datetime`` to store.
    """
    ist_timestamp = timestamp + _IST_OFFSET
    with open(timestamp_file, 'w') as file:
        json.dump({'last_download': ist_timestamp.isoformat()}, file)

In [8]:
# Resume from the stored checkpoint (None when no checkpoint file exists).
last_download_timestamp = get_last_download_timestamp()
new_files = get_file_properties(last_download_timestamp)

# Move the checkpoint forward only when the scan reported at least one file.
if len(new_files) > 0:
    latest_timestamp = max(new_files)
    save_last_download_timestamp(latest_timestamp)

Machine_Learning.pptx
cricket - Copy.pdf
obesity.pdf
sample.pptx
CriticalSoftSkills.pdf
Best Practices- GenAI.pdf
