In [2]:
# Importing useful dependencies
import io
import os
import boto3
import chardet
import pandas as pd
import tiktoken
# Tokenizer using OpenAI's standard encoding (currently disabled)
#enc = tiktoken.get_encoding("cl100k_base")

In [3]:
# Set up the S3 client for MinIO (MinIO implements the Amazon S3 API).
# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook; the fallbacks are the values this
# notebook originally hard-coded.
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ.get("MINIO_ENDPOINT_URL", "http://127.0.0.1:9000"),  # MinIO API endpoint
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),  # User name
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),  # Password
)

In [4]:
# ---- Quality checks ----
def check_text_quality(body: bytes, key: str, count_tokens=None):
    """Return a flat dict of basic quality stats for one file.

    Args:
        body: Raw bytes of the file.
        key: Identifier of the file; copied into the result unchanged.
        count_tokens: Optional callable that maps a paragraph string to its
            length. Defaults to ``len``, so the ``*_tokens_*`` fields are
            character counts unless a real tokenizer is supplied, e.g.
            ``lambda p: len(tokenizer.encode(p))``.

    Returns:
        A dict with the same keys for every input, including an empty
        ``body``: ``key``, ``size_bytes``, ``encoding`` (``None`` when there
        were no bytes to decode), ``empty``, ``too_short``,
        ``low_printable_ratio``, ``lines``, ``avg_tokens_per_paragraph``,
        ``max_tokens_paragraph``, ``min_tokens_paragraph`` and
        ``paragraph_token_list``.
    """
    if count_tokens is None:
        count_tokens = len

    if body:
        # Detection may yield no encoding at all; fall back to UTF-8 then.
        encoding = chardet.detect(body).get("encoding") or "utf-8"
        try:
            text = body.decode(encoding, errors="replace")
        except LookupError:
            # The detected name is not guaranteed to be a codec this Python
            # knows; decode as UTF-8 instead of aborting the whole scan.
            encoding = "utf-8"
            text = body.decode(encoding, errors="replace")
    else:
        # Nothing to detect or decode; continue with empty text so the
        # returned dict has the same keys as for any other file.
        encoding, text = None, ""

    stripped = text.strip()

    # Share of printable/whitespace characters (flags binary garbage).
    # Text without any characters has nothing unprintable in it.
    if text:
        printable = sum(ch.isprintable() or ch.isspace() for ch in text)
        printable_ratio = printable / len(text)
    else:
        printable_ratio = 1.0

    # --- Paragraph length stats ---
    # Paragraphs are the non-blank chunks separated by an empty line.
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    paragraph_tokens = [count_tokens(p) for p in paragraphs]

    if paragraph_tokens:
        avg_tokens_per_paragraph = sum(paragraph_tokens) / len(paragraph_tokens)
        max_tokens_paragraph = max(paragraph_tokens)
        min_tokens_paragraph = min(paragraph_tokens)
    else:
        avg_tokens_per_paragraph = max_tokens_paragraph = min_tokens_paragraph = 0

    return {
        "key": key,
        "size_bytes": len(body),
        "encoding": encoding,
        "empty": not stripped,
        "too_short": len(stripped) < 20,
        "low_printable_ratio": printable_ratio < 0.9,
        "lines": len(text.splitlines()),
        "avg_tokens_per_paragraph": round(avg_tokens_per_paragraph, 2),
        "max_tokens_paragraph": max_tokens_paragraph,
        "min_tokens_paragraph": min_tokens_paragraph,
        "paragraph_token_list": paragraph_tokens,  # drop this if the full list is not needed
    }

In [5]:
# ---- Run checks on every object under a prefix ----
def extract_datas(bucket, prefix="", client=None):
    """Download every object under ``prefix`` and collect its quality stats.

    Args:
        bucket: Name of the bucket to scan.
        prefix: Only keys starting with this prefix are listed.
        client: Optional S3 client exposing ``get_paginator`` and
            ``get_object``. Defaults to the notebook-level ``s3`` client.

    Returns:
        A list with one ``check_text_quality`` result per downloaded object,
        in listing order. Zero-byte keys ending in ``/`` are skipped.
    """
    client = s3 if client is None else client

    results = []
    paginator = client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # A page has no "Contents" entry when nothing matched.
        for obj in page.get("Contents", []):
            key = obj["Key"]

            if obj["Size"] == 0 and key.endswith("/"):  # skip the folder itself
                continue

            # Download the object and always release the response stream,
            # even if reading fails part-way through.
            stream = client.get_object(Bucket=bucket, Key=key)["Body"]
            try:
                body = stream.read()
            finally:
                stream.close()

            results.append(check_text_quality(body, key))
    return results

In [6]:
# Collect quality stats for every object stored under "texts" in the bucket.
data = extract_datas(bucket="trusted-zone", prefix="texts")

In [7]:
# Build a DataFrame from the collected stats (one row per entry of `data`),
# then display the rows ordered from the most lines to the fewest.
df_data = pd.DataFrame(data)
df_data.sort_values("lines", ascending=False)

Unnamed: 0,key,size_bytes,encoding,empty,too_short,low_printable_ratio,lines,avg_tokens_per_paragraph,max_tokens_paragraph,min_tokens_paragraph,paragraph_token_list
152,texts/text_1759415320342.txt,1277,utf-8,False,False,False,4,1274.0,1274,1274,[1274]
6,texts/text_1759415299210.txt,625,utf-8,False,False,False,1,575.0,575,575,[575]
7,texts/text_1759415299328.txt,1412,utf-8,False,False,False,1,1380.0,1380,1380,[1380]
8,texts/text_1759415299451.txt,742,ascii,False,False,False,1,742.0,742,742,[742]
9,texts/text_1759415299570.txt,958,ascii,False,False,False,1,958.0,958,958,[958]
...,...,...,...,...,...,...,...,...,...,...,...
589,texts/text_1759415388037.txt,580,ascii,False,False,False,1,580.0,580,580,[580]
590,texts/text_1759415388201.txt,944,ascii,False,False,False,1,944.0,944,944,[944]
591,texts/text_1759415388348.txt,2519,utf-8,False,False,False,1,2513.0,2513,2513,[2513]
592,texts/text_1759415388486.txt,950,utf-8,False,False,False,1,944.0,944,944,[944]
