## TextHomogenizer


In [18]:
# Importing useful dependencies
import io
import os
import boto3
import chardet
import docx
import fitz
import json

In [2]:
# Setup S3 client for MinIO (MinIO implements the Amazon S3 API).
# Endpoint and credentials can be overridden through environment variables so
# that real secrets never have to be committed with the notebook; the
# fallbacks are the values that used to be hard-coded here.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ.get("MINIO_ENDPOINT", "http://127.0.0.1:9000"),  # MinIO API endpoint
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),  # User name
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),  # Password
)

In [33]:
# Define the functions that extract plain text from other file formats

# Extraction of docx file

def extract_from_docx(body):
    """Return the text of a .docx document given as raw bytes.

    Each paragraph of the document is placed on its own line, and leading and
    trailing whitespace of the combined text is stripped.
    """
    document = docx.Document(io.BytesIO(body))
    lines = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(lines).strip()

def extract_from_pdf(body):
    """Return the concatenated text of every page of a PDF given as raw bytes.

    Page texts are joined in page order without any extra separator, exactly
    as returned by ``page.get_text()``.
    """
    # Open the document as a context manager so it is always closed, even
    # when extracting the text of a page raises.
    with fitz.open(stream=io.BytesIO(body), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

In [36]:
def _store_utf8_text(bucket, key, text):
    """Upload *text* to ``bucket/key`` as a UTF-8 encoded ``text/plain`` object."""
    s3.put_object(
        Bucket=bucket,
        Key=key,
        Body=text.encode("utf-8"),
        ContentType="text/plain",
    )


def _replace_with_txt(bucket, key, text, suffix):
    """Store *text* next to *key* (extension swapped for *suffix*), then delete *key*."""
    new_key = os.path.splitext(key)[0] + suffix
    _store_utf8_text(bucket, new_key, text)
    # Only remove the source once the converted copy has been written.
    s3.delete_object(Bucket=bucket, Key=key)


def _reencode_txt_as_utf8(bucket, key, body):
    """Overwrite the ``.txt`` object *key* with a UTF-8 version of *body* if needed."""
    detected = chardet.detect(body)["encoding"]
    if detected is None:
        # chardet could not guess an encoding; decoding with None would raise
        # an unhelpful TypeError, so report the real reason instead.
        print(f"Skipping {key}: could not detect its encoding")
        return
    # Compare case-insensitively so the check does not depend on how the
    # detector happens to capitalise the encoding name.
    if detected.lower() in ("utf-8", "ascii"):
        return

    print(f"Converting {key} from {detected} to UTF-8")
    # Characters that cannot be decoded are dropped (errors='ignore').
    text = body.decode(detected, errors="ignore")
    _store_utf8_text(bucket, key, text)
    print(f"Successfully converted {key} to UTF-8.")


def convert_texts_to_txt(bucket, prefix=""):
    """Convert every text document under *prefix* in *bucket* to UTF-8 ``.txt``.

    For each non-empty object listed under the prefix:

    * ``.docx`` -> text is extracted, stored as ``<name>.txt`` and the
      original object is deleted;
    * ``.pdf``  -> text is extracted, stored as ``<name>1.txt`` and the
      original object is deleted (the ``1`` suffix is kept from the previous
      implementation -- presumably it avoids clashing with a ``.docx`` of the
      same base name; TODO confirm);
    * ``.txt``  -> re-encoded in place to UTF-8 unless it is already detected
      as UTF-8 or ASCII.

    Extensions are matched case-insensitively and objects with any other
    extension are left untouched (and are not downloaded). A failure while
    converting one object is printed and does not stop the remaining objects
    from being processed.

    Args:
        bucket: Name of the bucket to scan.
        prefix: Key prefix restricting which objects are considered.
    """
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            # Zero-byte objects are folder placeholders (or empty files).
            if obj["Size"] == 0:
                continue

            key = obj["Key"]
            lowered = key.lower()
            if not lowered.endswith((".docx", ".pdf", ".txt")):
                continue

            body = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
            try:
                if lowered.endswith(".docx"):
                    _replace_with_txt(bucket, key, extract_from_docx(body), ".txt")
                elif lowered.endswith(".pdf"):
                    _replace_with_txt(bucket, key, extract_from_pdf(body), "1.txt")
                else:
                    _reencode_txt_as_utf8(bucket, key, body)
            except Exception as e:
                # Per-object boundary: report and carry on with the next key.
                print(f"Failed to process {key}: {e}")

# NOTE: the triple-quoted string below is disabled image-to-PNG conversion
# code kept as a bare string expression; it is never executed. It refers to
# `Image`, which is not imported in the visible part of this file.
"""
            # New key with .png extension
            new_key = os.path.splitext(key)[0] + ".png"

            # Download the image
            resp = s3.get_object(Bucket=bucket, Key=key)
            body = resp["Body"].read()
            
            # Convert to PNG
            img = Image.open(io.BytesIO(body)).convert("RGBA")
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            buf.seek(0)

            # Upload the image back as PNG (replace original with .png)
            s3.upload_fileobj(buf, Bucket=bucket, Key=new_key, ExtraArgs={"ContentType": "image/png"})

            # Delete the old image
            s3.delete_object(Bucket=bucket, Key=key)

            print(f"Replaced: {key} -> {new_key}")
"""

'\n            # New key with .png extension\n            new_key = os.path.splitext(key)[0] + ".png"\n\n            # Download the image\n            resp = s3.get_object(Bucket=bucket, Key=key)\n            body = resp["Body"].read()\n\n            # Convert to PNG\n            img = Image.open(io.BytesIO(body)).convert("RGBA")\n            buf = io.BytesIO()\n            img.save(buf, format="PNG")\n            buf.seek(0)\n\n            # Upload the image back as PNG (replace original with .png)\n            s3.upload_fileobj(buf, Bucket=bucket, Key=new_key, ExtraArgs={"ContentType": "image/png"})\n\n            # Delete the old image\n            s3.delete_object(Bucket=bucket, Key=key)\n\n            print(f"Replaced: {key} -> {new_key}")\n'

In [38]:
# Homogenize every document stored under texts/ in the formatted-zone bucket.
convert_texts_to_txt(bucket="formatted-zone", prefix="texts/")