<a href="https://colab.research.google.com/github/Cody9494/LEGALSKEPSIS-DATA/blob/main/LAWSKPEPSIS_STEP1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ========= VERIFY XML CAPTURE + WRITE MANIFEST =========
from google.colab import drive
drive.mount('/content/drive')

import os, pandas as pd, json
from datetime import datetime

# ---- CONFIG ----
source_folder = '/content/drive/MyDrive/LAWSKEPSIS/EN'
output_folder = '/content/drive/MyDrive/LAWSKEPSIS_OUTPUTS/XML'
os.makedirs(output_folder, exist_ok=True)

# Collect every XML file under source_folder (case-insensitive suffix match).
# A single '.xml' test is enough: names ending in '.doc.xml' also end in '.xml'.
# os.walk yields entries in arbitrary order, so the result is sorted to keep the
# manifest rows and the "first 50" header sample reproducible between runs.
xml_paths = sorted(
    os.path.join(root, fn)
    for root, _, files in os.walk(source_folder)
    for fn in files
    if fn.lower().endswith('.xml')
)

# counts: '.doc.xml' files are reported separately from the remaining '.xml' files
total = len(xml_paths)
docxml = sum(p.lower().endswith('.doc.xml') for p in xml_paths)
plainxml = total - docxml

# zero-byte check
zero_size = [p for p in xml_paths if os.path.getsize(p) == 0]
zero_count = len(zero_size)

# quick header sanity for a small sample (does not do a full parse)
def looks_like_xml(path, nbytes=256):
    """Cheaply guess whether a file is XML by inspecting its first bytes.

    Args:
        path: Path of the file to inspect.
        nbytes: How many leading bytes to read.

    Returns:
        True if, after skipping a byte-order mark and leading whitespace, the
        content starts with '<'. False otherwise, including when the file
        cannot be opened or read.
    """
    import codecs

    try:
        with open(path, 'rb') as f:
            head = f.read(nbytes)
    except (OSError, ValueError):
        # Unreadable file or invalid path: treat as "not XML" rather than abort the scan.
        return False

    if head.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        # The 'utf-16' codec consumes the BOM and picks the matching byte order.
        text = head.decode('utf-16', errors='ignore')
    else:
        # 'utf-8-sig' drops a leading UTF-8 BOM, which str.lstrip() would not remove.
        text = head.decode('utf-8-sig', errors='ignore')
    return text.lstrip().startswith('<')

sample_check = xml_paths[:50]  # light check on the first 50 paths only
sample_ok = sum(looks_like_xml(p) for p in sample_check)

# save manifest: the full list goes to Parquet, a short preview goes to CSV
manifest_path = os.path.join(output_folder, 'xml_manifest.parquet')
manifest_csv  = os.path.join(output_folder, 'xml_manifest.csv')
m = pd.DataFrame({'filepath': xml_paths})
m['filename'] = m['filepath'].apply(os.path.basename)
m['size_bytes'] = m['filepath'].apply(os.path.getsize)
m.to_parquet(manifest_path, index=False)
m.head(200).to_csv(manifest_csv, index=False)  # small preview (first 200 rows)

print("‚úÖ XML capture verification")
print(f"‚Ä¢ Total XML found: {total:,}  (plain .xml: {plainxml:,} | .doc.xml: {docxml:,})")
print(f"‚Ä¢ Zero-byte files: {zero_count:,}")
# Report the number of files actually checked; it is below 50 when fewer files exist.
print(f"‚Ä¢ Quick header check (first 50): {sample_ok}/{len(sample_check)} look like XML")
print(f"‚Ä¢ Manifest saved:\n  - {manifest_path}\n  - {manifest_csv}")

# Optional: show a few random paths
m.sample(min(10, len(m))).sort_values('filename').reset_index(drop=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ XML capture verification
‚Ä¢ Total XML found: 94,422  (plain .xml: 70,349 | .doc.xml: 24,073)
‚Ä¢ Zero-byte files: 0
‚Ä¢ Quick header check (first 50): 50/50 look like XML
‚Ä¢ Manifest saved:
  - /content/drive/MyDrive/LAWSKEPSIS_OUTPUTS/XML/xml_manifest.parquet
  - /content/drive/MyDrive/LAWSKEPSIS_OUTPUTS/XML/xml_manifest.csv


Unnamed: 0,filepath,filename,size_bytes
0,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2015290...,L_2015290EN.01000701.xml,4013
1,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2016205...,L_2016205EN.01000501.xml,28989
2,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2018089...,L_2018089EN.01000701.doc.xml,1605
3,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2019192...,L_2019192EN.01002601.xml,3185
4,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2019312...,L_2019312EN.01005501.doc.xml,1887
5,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2022026...,L_2022026EN.01001101.xml,24366
6,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2022227...,L_2022227EN.01003801.xml,16407
7,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2022256...,L_2022256EN.01000301.xml,5537
8,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2023034...,L_2023034EN.01000401.doc.xml,1588
9,/content/drive/MyDrive/LAWSKEPSIS/EN/L_2024903...,L_202490324EN.toc.fmx.xml,907


In [None]:
!pip install xmltodict



In [None]:
# ========= XML → ONE PARQUET (using manifest) =========
import os, json, pandas as pd, xmltodict
import pyarrow as pa, pyarrow.parquet as pq

# Input and output locations for this step, all inside the shared output folder.
output_folder = '/content/drive/MyDrive/LAWSKEPSIS_OUTPUTS/XML'
manifest_path, OUT_FILE, LOG_ERRORS = (
    os.path.join(output_folder, name)
    for name in ('xml_manifest.parquet', 'xml_all.parquet', 'xml_errors.csv')
)
BATCH_SIZE = 3000  # rows buffered in memory before each Parquet write

# Read the manifest and keep only the list of file paths to convert.
manifest = pd.read_parquet(manifest_path)
xml_paths = manifest['filepath'].to_list()
print(f"üóÇ Using manifest with {len(xml_paths):,} XML files")

def parse_xml_file(fp):
    """Parse one XML file into a nested dict with xmltodict.

    The file is read as raw bytes and handed to the parser undecoded, so the
    encoding declared in the XML prolog (or a BOM) is honoured instead of
    forcing UTF-8 on every document.

    Args:
        fp: Path of the XML file.

    Returns:
        The dict produced by ``xmltodict.parse``. On any failure (I/O or
        parse error), ``{'__parse_error__': <message>}`` is returned so the
        caller can record the failure and carry on.
    """
    try:
        with open(fp, 'rb') as f:
            return xmltodict.parse(f.read())
    except Exception as e:
        # Deliberate best-effort: one bad file must not abort the whole batch run.
        return {'__parse_error__': str(e)}

errors_xml, buffer = [], []
writer, total = None, 0

try:
    # Walk the manifest one batch at a time so there is a single write path
    # instead of separate "in-loop" and "leftover" flush code.
    for start in range(0, len(xml_paths), BATCH_SIZE):
        for fp in xml_paths[start:start + BATCH_SIZE]:
            total += 1
            data = parse_xml_file(fp)
            # parse_xml_file reports failures as a dict holding '__parse_error__'.
            failed = '__parse_error__' in data
            if failed:
                errors_xml.append({'filepath': fp, 'error': data['__parse_error__']})
            buffer.append({
                'filename': os.path.basename(fp),
                'filepath': fp,
                'file_type': 'XML-Error' if failed else 'XML',
                'content_json': json.dumps(data, ensure_ascii=False),
            })

        # Only the last batch can be shorter than BATCH_SIZE.
        is_full_batch = len(buffer) == BATCH_SIZE
        table = pa.Table.from_pandas(pd.DataFrame(buffer), preserve_index=False)
        if writer is None:
            # The writer is created lazily so its schema comes from the first batch.
            writer = pq.ParquetWriter(OUT_FILE, table.schema)
        writer.write_table(table)
        buffer.clear()

        if is_full_batch:
            print(f"üíæ appended row-group, processed: {total:,}")
        else:
            print(f"üíæ appended FINAL row-group, total: {total:,}")
finally:
    # Close the writer even if the run is interrupted.
    if writer is not None:
        writer.close()

if errors_xml:
    pd.DataFrame(errors_xml).to_csv(LOG_ERRORS, index=False)
    print(f"‚ö†Ô∏è Errors logged at: {LOG_ERRORS}")

if writer is not None:
    print(f"‚úÖ DONE. One Parquet at: {OUT_FILE}")
else:
    # An empty manifest never opens the writer, so no Parquet file was produced.
    print("No XML files in manifest; nothing was written.")


üóÇ Using manifest with 94,422 XML files
üíæ appended row-group, processed: 3,000
üíæ appended row-group, processed: 6,000
üíæ appended row-group, processed: 9,000
üíæ appended row-group, processed: 12,000
üíæ appended row-group, processed: 15,000
üíæ appended row-group, processed: 18,000
üíæ appended row-group, processed: 21,000
üíæ appended row-group, processed: 24,000
üíæ appended row-group, processed: 27,000
üíæ appended row-group, processed: 30,000
üíæ appended row-group, processed: 33,000
üíæ appended row-group, processed: 36,000
üíæ appended row-group, processed: 39,000
üíæ appended row-group, processed: 42,000
üíæ appended row-group, processed: 45,000
üíæ appended row-group, processed: 48,000
üíæ appended row-group, processed: 51,000
üíæ appended row-group, processed: 54,000
üíæ appended row-group, processed: 57,000
üíæ appended row-group, processed: 60,000
üíæ appended row-group, processed: 63,000
üíæ appended row-group, processed: 66,000
üíæ appended r