In [None]:
import os
import time
import pandas as pd
import pdfplumber
from PyPDF2 import PdfReader
from docx import Document
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

# Folder to monitor
FOLDER_PATH = r"E:\file formats"

# The watcher can only be scheduled on a directory, so require one here
# (os.path.exists would also accept a regular file with this name).
if os.path.isdir(FOLDER_PATH):
    print(f"✅ Folder path exists: {FOLDER_PATH}")
else:
    print(f"❌ Folder path does NOT exist: {FOLDER_PATH}")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module for interactive use and is not guaranteed to exist.
    # A non-zero status reports the failure to the calling process.
    raise SystemExit(1)


def is_valid_file(file_path):
    """Check whether *file_path* contains tabular data and print the verdict.

    The file type is chosen from the extension, matched case-insensitively:

    * ``.csv``, ``.xlsx``, ``.xls``, ``.txt`` are loaded with pandas and are
      valid when the resulting frame has at least one row and one column
      (``.txt`` is read as tab-delimited without a header row).
    * ``.pdf`` is valid when any page yields a table via pdfplumber.
    * ``.docx`` is valid when the document contains at least one table.
    * Any other extension is reported as unsupported.

    Any exception raised while reading the file is caught and reported as
    an error message rather than propagated.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        bool: True when the file passed its check, False when it failed,
        is unsupported, or could not be read.
    """
    try:
        # Lower-case once so "REPORT.CSV" is treated like "report.csv".
        lowered = os.fspath(file_path).lower()

        if lowered.endswith(".csv"):
            df = pd.read_csv(file_path)
        elif lowered.endswith(".xlsx"):
            df = pd.read_excel(file_path, engine="openpyxl")
        elif lowered.endswith(".xls"):
            df = pd.read_excel(file_path, engine="xlrd")
        elif lowered.endswith(".txt"):
            df = pd.read_csv(file_path, delimiter="\t", header=None)
        elif lowered.endswith(".pdf"):
            with pdfplumber.open(file_path) as pdf:
                # any() stops at the first page that yields a table.
                has_table = any(page.extract_table() for page in pdf.pages)
            if has_table:
                print(f"✅ {file_path} - Correct file format (PDF with tables detected).")
                return True
            print(f"❌ {file_path} - No tables found in PDF.")
            return False
        elif lowered.endswith(".docx"):
            if Document(file_path).tables:
                print(f"✅ {file_path} - DOCX with tables found.")
                return True
            print(f"❌ {file_path} - No tables found in DOCX.")
            return False
        else:
            print(f"❌ {file_path} - Unsupported file format.")
            return False

        # Check rows and columns for the pandas-loaded formats
        if df.shape[0] > 0 and df.shape[1] > 0:
            print(f"✅ {file_path} - Correct file format (Rows and columns present).")
            return True
        print(f"❌ {file_path} - Not Correct: Empty rows/columns.")
        return False
    except Exception as e:
        # Best-effort validation: unreadable or corrupt files are reported,
        # not raised, so the caller keeps running.
        print(f"❌ {file_path} - Error: {e}")
        return False


# Custom event handler
class FileHandler(FileSystemEventHandler):
    """Validate newly created files that have a supported extension."""

    def on_created(self, event):
        """Handle a file-creation event.

        Directories and temporary files whose name starts with ``~$`` are
        skipped. Files with a supported extension (matched
        case-insensitively) are passed to ``is_valid_file``; everything
        else is reported as ignored.

        Args:
            event: The creation event; ``is_directory`` and ``src_path``
                are read from it.
        """
        if event.is_directory:
            return
        # Ignore temporary files (starting with ~$)
        if os.path.basename(event.src_path).startswith("~$"):
            print(f"🛑 Ignored temp file: {event.src_path}")
            return
        # Lower-case before matching so "REPORT.PDF" is not silently ignored.
        if event.src_path.lower().endswith(('.csv', '.xlsx', '.txt', '.pdf', '.docx', '.xls')):
            print(f"📂 New file detected: {event.src_path}")
            # Short pause before reading; the original note says this avoids
            # duplicate detection. NOTE(review): presumably it also gives the
            # writer time to finish - confirm.
            time.sleep(1)
            is_valid_file(event.src_path)
        else:
            print(f"❌ Ignored file: {event.src_path}")


# Setup Observer: watch only FOLDER_PATH itself, not its subfolders
event_handler = FileHandler()
observer = Observer()
observer.schedule(event_handler, path=FOLDER_PATH, recursive=False)

print(f"👀 Monitoring folder: {FOLDER_PATH}")
# Start outside the try block: if start() fails there is no running thread
# to stop or join.
observer.start()
try:
    # Wake up every second so Ctrl+C is handled promptly, and leave the loop
    # if the observer thread ever dies instead of spinning forever.
    while observer.is_alive():
        observer.join(timeout=1)
except KeyboardInterrupt:
    pass
finally:
    # Always shut the observer thread down, whatever ended the loop.
    observer.stop()
    observer.join()
    print("🛑 Monitoring stopped.")



✅ Folder path exists: E:\file formats
👀 Monitoring folder: E:\file formats
📂 New file detected: E:\file formats\processed_medical_reviewsm.csv
✅ E:\file formats\processed_medical_reviewsm.csv - Correct file format (Rows and columns present).
📂 New file detected: E:\file formats\HoI letter (3).docx
❌ E:\file formats\HoI letter (3).docx - No tables found in DOCX.
