In [16]:
import os
import sqlite3
import tempfile
from datetime import datetime

import boto3
import fitz  # PyMuPDF
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [17]:
# Shared NLP resources used by preprocess_text: the small English spaCy
# pipeline and the NLTK English stopword list (held as a set for fast lookup).
nlp = spacy.load(name="en_core_web_sm")
stop_words = {word for word in stopwords.words('english')}

In [14]:
# Initialize MinIO connection (S3-compatible).
# Connection settings are read from the environment (MINIO_ENDPOINT_URL,
# MINIO_ACCESS_KEY, MINIO_SECRET_KEY) so real credentials never have to be
# stored in the notebook. The fallbacks below are the local-development
# defaults only; override them for any shared or production deployment.
s3_client = boto3.client(
    's3',
    endpoint_url=os.environ.get('MINIO_ENDPOINT_URL', 'http://localhost:9000'),
    aws_access_key_id=os.environ.get('MINIO_ACCESS_KEY', 'admin'),
    aws_secret_access_key=os.environ.get('MINIO_SECRET_KEY', 'admin123'),
)

def list_pdf_files_from_minio(bucket_name):
    """Return the keys of all PDF objects stored in ``bucket_name``.

    The listing goes through the module-level ``s3_client`` using the
    ``list_objects_v2`` paginator: a single ``list_objects_v2`` call returns
    only one page of results, so larger buckets would otherwise be silently
    truncated.

    Args:
        bucket_name: Name of the bucket to list.

    Returns:
        A list of object keys whose name ends in ``.pdf`` (compared
        case-insensitively, so ``REPORT.PDF`` is included). Empty if the
        bucket has no matching objects.
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    pdf_files = []
    for page in paginator.paginate(Bucket=bucket_name):
        # A page for an empty bucket carries no 'Contents' entry.
        for item in page.get('Contents', []):
            if item['Key'].lower().endswith('.pdf'):
                pdf_files.append(item['Key'])
    return pdf_files

In [15]:
def extract_text_with_fitz(pdf_path):
    """Extract the plain text of every page of a PDF using PyMuPDF (fitz).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages.
    """
    doc = fitz.open(pdf_path)
    try:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text("text") for page in doc)
    finally:
        # Always release the underlying file handle, even if a page fails.
        doc.close()

In [18]:
def preprocess_text(text):
    """Remove English stopwords from ``text``, then lemmatize what is left.

    The text is split with NLTK's ``word_tokenize``; tokens whose lowercase
    form is in the module-level ``stop_words`` set are discarded. The
    surviving tokens are re-joined with single spaces and run through the
    spaCy pipeline ``nlp``.

    Returns:
        The lemmas of all non-punctuation spaCy tokens, joined by single
        spaces.
    """
    kept_tokens = []
    for token_text in word_tokenize(text):
        if token_text.lower() in stop_words:
            continue
        kept_tokens.append(token_text)

    lemmas = []
    for token in nlp(" ".join(kept_tokens)):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)

    return " ".join(lemmas)

In [19]:
def download_and_process_pdf(file_name, bucket_name='raw-reports'):
    """Download one PDF from MinIO and return its preprocessed text.

    The object is downloaded into a private temporary directory rather than
    to ``/tmp/<key>``: object keys may contain ``/`` (which would point at a
    directory that does not exist) and a fixed path can collide with other
    runs. The temporary directory and the downloaded file are removed even
    if the download or the text extraction raises.

    Args:
        file_name: Key of the PDF object inside the bucket.
        bucket_name: Bucket to download from.

    Returns:
        The result of ``preprocess_text`` applied to the text extracted by
        ``extract_text_with_fitz``.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Keep only the last path component of the key; fall back to a fixed
        # name when the key ends in '/' and therefore has no basename.
        local_name = os.path.basename(file_name) or 'document.pdf'
        local_file = os.path.join(scratch_dir, local_name)

        # Download the file from MinIO
        s3_client.download_file(bucket_name, file_name, local_file)

        # Extract text while the local copy still exists
        full_text = extract_text_with_fitz(local_file)

    # The local copy is gone by now; preprocessing only needs the text.
    return preprocess_text(full_text)

In [20]:
def save_processed_text(file_name, processed_text, bucket_name='processed-reports'):
    """Upload ``processed_text`` to MinIO as ``processed_<file_name>.txt``.

    The text is first written to a file inside a private temporary directory
    and then uploaded with ``s3_client.upload_file``. Using a temporary
    directory (instead of ``/tmp/processed_<key>.txt``) keeps this working
    for keys that contain ``/`` and guarantees the local file is removed
    even when the upload fails. The file is written as UTF-8 explicitly so
    the result does not depend on the platform's default encoding.

    Args:
        file_name: Key of the original PDF; used to build the target key.
        processed_text: Text to store.
        bucket_name: Bucket that receives the processed file.
    """
    object_key = f'processed_{file_name}.txt'

    with tempfile.TemporaryDirectory() as scratch_dir:
        local_processed_file = os.path.join(scratch_dir, 'processed.txt')

        # Save the processed text locally first
        with open(local_processed_file, 'w', encoding='utf-8') as f:
            f.write(processed_text)

        # Upload processed file to MinIO
        s3_client.upload_file(local_processed_file, bucket_name, object_key)

In [21]:
def save_metadata_to_sqlite(file_name, file_size, features, db_path='metadata.db'):
    """Append one metadata row for a processed file to a SQLite database.

    The ``file_metadata`` table is created on first use. The row's
    ``processed_date`` is today's local date formatted as ``YYYY-MM-DD``.

    Args:
        file_name: Name/key of the processed file.
        file_size: Size value to record (stored in an INTEGER column).
        features: Free-form feature description stored as text.
        db_path: Path of the SQLite database file. Defaults to
            ``metadata.db`` in the current working directory.
    """
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back if a statement raises;
        # it does not close the connection, hence the surrounding finally.
        with conn:
            # Create the metadata table if it doesn't exist
            conn.execute('''CREATE TABLE IF NOT EXISTS file_metadata
                 (filename TEXT, processed_date TEXT, size INTEGER, features TEXT)''')

            # Insert metadata
            processed_date = datetime.now().strftime('%Y-%m-%d')
            conn.execute("INSERT INTO file_metadata VALUES (?, ?, ?, ?)",
                         (file_name, processed_date, file_size, features))
    finally:
        conn.close()

In [22]:
def main():
    """Run the full pipeline over every PDF in the ``raw-reports`` bucket.

    For each PDF key: download and preprocess it, upload the processed text,
    then record a metadata row whose size is the UTF-8 byte length of the
    processed text and whose features are a fixed placeholder string.
    """
    for pdf_key in list_pdf_files_from_minio('raw-reports'):
        # Fetch the PDF and turn it into cleaned, lemmatized text.
        cleaned_text = download_and_process_pdf(pdf_key)

        # Store the cleaned text back in MinIO.
        save_processed_text(pdf_key, cleaned_text)

        # Size is measured on the processed text (in bytes), not the PDF.
        byte_count = len(cleaned_text.encode('utf-8'))
        # Placeholder features (can be expanded, e.g. keyword extraction).
        placeholder_features = 'keywords: AI, ML'

        # Record the metadata row in SQLite.
        save_metadata_to_sqlite(pdf_key, byte_count, placeholder_features)

# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()