In [1]:
!git clone https://github.com/EliasSalameh/datathon2025.git


Cloning into 'datathon2025'...
remote: Enumerating objects: 51, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 51 (delta 26), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (51/51), 24.82 MiB | 19.89 MiB/s, done.
Resolving deltas: 100% (26/26), done.


Unzipping JSON files

In [3]:
import zipfile
import os
from pathlib import Path
import tempfile
import shutil

# Paths
top_level_zip_dir = Path("datathon2025")    # <- CHANGE THIS
final_output_dir = Path("datathon2025_2")   # <- CHANGE THIS
final_output_dir.mkdir(parents=True, exist_ok=True)

def extract_inner_zip_preserve_structure(inner_zip_path, output_base_dir):
    client_name = inner_zip_path.stem  # e.g., client_001
    output_client_dir = output_base_dir / client_name
    output_client_dir.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(inner_zip_path, 'r') as inner_zip:
        for file_name in inner_zip.namelist():
            if file_name.endswith('.json'):
                inner_zip.extract(file_name, output_client_dir)

# Process each top-level zip
for top_zip in top_level_zip_dir.glob("*.zip"):
    print(f"Processing: {top_zip.name}")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        with zipfile.ZipFile(top_zip, 'r') as zip_ref:
            zip_ref.extractall(tmpdir_path)
        
        # Find all inner client zips inside extracted folder
        for inner_zip_path in tmpdir_path.rglob("*.zip"):
            print(f"  ↳ Extracting client zip: {inner_zip_path.name}")
            extract_inner_zip_preserve_structure(inner_zip_path, final_output_dir)

print("✅ All JSON files extracted with folder structure preserved.")

Processing: datathon_part4.zip
  ↳ Extracting client zip: client_8128.zip
  ↳ Extracting client zip: client_8617.zip
  ↳ Extracting client zip: client_8914.zip
  ↳ Extracting client zip: client_9700.zip
  ↳ Extracting client zip: client_8896.zip
  ↳ Extracting client zip: client_7723.zip
  ↳ Extracting client zip: client_8657.zip
  ↳ Extracting client zip: client_9595.zip
  ↳ Extracting client zip: client_7621.zip
  ↳ Extracting client zip: client_8250.zip
  ↳ Extracting client zip: client_9603.zip
  ↳ Extracting client zip: client_8778.zip
  ↳ Extracting client zip: client_9534.zip
  ↳ Extracting client zip: client_9912.zip
  ↳ Extracting client zip: client_7627.zip
  ↳ Extracting client zip: client_9326.zip
  ↳ Extracting client zip: client_9741.zip
  ↳ Extracting client zip: client_8378.zip
  ↳ Extracting client zip: client_7823.zip
  ↳ Extracting client zip: client_8482.zip
  ↳ Extracting client zip: client_7917.zip
  ↳ Extracting client zip: client_9663.zip
  ↳ Extracting client z

  ↳ Extracting client zip: client_8228.zip
  ↳ Extracting client zip: client_8698.zip
  ↳ Extracting client zip: client_9271.zip
  ↳ Extracting client zip: client_8117.zip
  ↳ Extracting client zip: client_8188.zip
  ↳ Extracting client zip: client_8724.zip
  ↳ Extracting client zip: client_8893.zip
  ↳ Extracting client zip: client_9905.zip
  ↳ Extracting client zip: client_7560.zip
  ↳ Extracting client zip: client_7514.zip
  ↳ Extracting client zip: client_9994.zip
  ↳ Extracting client zip: client_9825.zip
  ↳ Extracting client zip: client_7802.zip
  ↳ Extracting client zip: client_8338.zip
  ↳ Extracting client zip: client_9567.zip
  ↳ Extracting client zip: client_9264.zip
  ↳ Extracting client zip: client_8094.zip
  ↳ Extracting client zip: client_8779.zip
  ↳ Extracting client zip: client_8545.zip
  ↳ Extracting client zip: client_8818.zip
  ↳ Extracting client zip: client_9490.zip
  ↳ Extracting client zip: client_8825.zip
  ↳ Extracting client zip: client_7596.zip
  ↳ Extract

# Data Analysis

Passport MRZ consistency (All MRZ are of length 45, some have spaces, about 50% of those with spaces are accepted, turns out space can be ok if name has space in it)

In [12]:
import json
from pathlib import Path

# Change this to your extracted base folder
base_dir = Path("datathon2025_2")

def validate_mrz(mrz):
    if not isinstance(mrz, list) or len(mrz) != 2:
        return False, "MRZ should be a list of two strings."
    for i, line in enumerate(mrz):
        if not isinstance(line, str):
            return False, f"Line {i+1} is not a string."
        if len(line) != 45:
            return False, f"Line {i+1} has length {len(line)} (expected 44)."
        if " " in line:
            return False, f"Line {i+1} contains spaces."
    return True, ""
def validate_label(label):
    if not isinstance(label, str):
        return False, "Label should be a dictionary."
    if label == "Accept":
        return True, "Accepted label despite space in MRZ"
    elif label == "Reject":
        return False, "Rejected label with space in MRZ"

count_isvalid = 0
count_isrejected = 0
# Loop through each client folder
for client_dir in base_dir.iterdir():
    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"  
    if not passport_path.exists():
        print(f"❌ Missing passport.json in {client_dir.name}")
        continue

    with open(passport_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"❌ JSON decode error in {passport_path}")
            continue

    mrz = data.get("passport_mrz")
    is_valid, message = validate_mrz(mrz)
    if not is_valid:
        print(f"⚠️ Invalid MRZ in {client_dir.name}: {message}")
        with open(label_path, "r", encoding="utf-8") as f:
            try:
                data_label = json.load(f)
            except json.JSONDecodeError:
                print(f"❌ JSON decode error in {label_path}")
                continue
            label = data_label.get("label")
            is_valid_label, message_label = validate_label(label)
            if not is_valid_label:
                print(f"⚠️ Invalid label in {client_dir.name}: {message_label}")
                count_isrejected += 1
            else:
                print(f"⚠️ Valid label in {client_dir.name}: {message_label} despite space in MRZ")
                

    else:
        count_isvalid += 1
        print(f"✅ Valid MRZ in {client_dir.name}")
print(f"Total valid MRZ: {count_isvalid}")
print(f"Total rejected with space in MRZ: {count_isrejected}")

✅ Valid MRZ in client_1487
✅ Valid MRZ in client_7559
✅ Valid MRZ in client_7515
⚠️ Invalid MRZ in client_1784: Line 1 contains spaces.
⚠️ Invalid label in client_1784: Rejected label with space in MRZ
✅ Valid MRZ in client_9817
✅ Valid MRZ in client_3415
✅ Valid MRZ in client_2772
✅ Valid MRZ in client_613
✅ Valid MRZ in client_976
✅ Valid MRZ in client_1694
✅ Valid MRZ in client_7501
✅ Valid MRZ in client_3284
✅ Valid MRZ in client_9531
✅ Valid MRZ in client_9820
✅ Valid MRZ in client_4511
✅ Valid MRZ in client_8030
✅ Valid MRZ in client_5149
✅ Valid MRZ in client_2146
✅ Valid MRZ in client_4218
✅ Valid MRZ in client_1047
✅ Valid MRZ in client_2440
✅ Valid MRZ in client_9243
✅ Valid MRZ in client_8255
✅ Valid MRZ in client_2088
✅ Valid MRZ in client_7573
✅ Valid MRZ in client_2779
✅ Valid MRZ in client_9876
✅ Valid MRZ in client_6633
✅ Valid MRZ in client_5097
✅ Valid MRZ in client_5648
✅ Valid MRZ in client_3434
✅ Valid MRZ in client_3017
⚠️ Invalid MRZ in client_9995: Line 1 contai

⚠️ Valid label in client_9995: Accepted label despite space in MRZ despite space in MRZ
✅ Valid MRZ in client_4706
✅ Valid MRZ in client_3059
⚠️ Invalid MRZ in client_6515: Line 1 contains spaces.
⚠️ Valid label in client_6515: Accepted label despite space in MRZ despite space in MRZ
✅ Valid MRZ in client_1239
✅ Valid MRZ in client_7654
✅ Valid MRZ in client_3128
✅ Valid MRZ in client_1595
✅ Valid MRZ in client_3377
✅ Valid MRZ in client_6657
✅ Valid MRZ in client_7874
✅ Valid MRZ in client_9103
✅ Valid MRZ in client_4494
✅ Valid MRZ in client_5307
✅ Valid MRZ in client_6550
⚠️ Invalid MRZ in client_1603: Line 1 contains spaces.
⚠️ Invalid label in client_1603: Rejected label with space in MRZ
✅ Valid MRZ in client_5931
⚠️ Invalid MRZ in client_8554: Line 1 contains spaces.
⚠️ Valid label in client_8554: Accepted label despite space in MRZ despite space in MRZ
✅ Valid MRZ in client_7315
✅ Valid MRZ in client_2493
✅ Valid MRZ in client_5581
✅ Valid MRZ in client_3855
✅ Valid MRZ in clie

Date Format Consistency (Date Format is always YYYY-MM-DD)

In [14]:
import json
from pathlib import Path
from datetime import datetime

# Folder containing client subdirectories
base_dir = Path("datathon2025_2")  # 🔁 Change to your path

def is_valid_yyyy_mm_dd(date_str):
    try:
        datetime.strptime(date_str, "%Y-%m-%d")
        return True
    except (ValueError, TypeError):
        return False

date_fields = ["birth_date", "passport_issue_date", "passport_expiry_date"]

for client_dir in base_dir.iterdir():
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"

    # Skip if required files are missing
    if not passport_path.exists() or not label_path.exists():
        print(f"⚠️ Missing passport.json or label.json in {client_dir.name}")
        continue

    try:
        passport_data = json.load(passport_path.open("r", encoding="utf-8"))
        label_data = json.load(label_path.open("r", encoding="utf-8"))
    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error in {client_dir.name}: {e}")
        continue

    label = label_data.get("label", "Unknown")

    # Check date format for each field
    for field in date_fields:
        date_value = passport_data.get(field)
        print(date_value)
        if not is_valid_yyyy_mm_dd(date_value):
            print(f"❌ {client_dir.name} - Invalid {field}: {date_value} | Label: {label}")


1990-05-28
2022-12-16
2032-12-15
1993-03-18
2024-01-14
2034-01-13
1983-06-19
2022-05-13
2032-05-12
1969-04-13
2017-11-14
2027-11-13
1970-09-06
2016-09-21
2026-09-20
1977-10-23
2022-03-20
2032-03-19
1977-03-02
2023-05-14
2033-05-13
2001-05-31
2023-11-23
2033-11-22
1961-04-09
2021-04-11
2031-04-10
2004-02-12
2019-04-08
2029-04-07
2003-08-12
2022-11-02
2032-11-01
1966-03-30
2023-08-12
2033-08-11
1995-09-11
2017-06-07
2027-06-06
1996-07-23
2023-07-05
2030-07-04
1980-08-04
2019-12-23
2029-12-22
1956-11-17
2018-07-02
2028-07-01
1996-09-10
2020-12-06
2030-12-05
1992-09-28
2019-01-20
2029-01-19
1976-08-19
2016-04-08
2026-04-07
1969-03-10
2017-06-18
2037-06-17
1987-03-02
2022-05-02
2032-05-01
1972-08-31
2019-05-08
2029-05-07
1983-12-31
2023-04-18
2030-04-17
1979-08-27
2021-09-16
2031-09-15
1992-03-10
2022-05-05
2032-05-04
1971-12-30
2018-01-08
2028-01-07
1978-09-28
1973-12-25
1983-12-24
1977-05-09
2022-05-15
2032-05-14
2002-11-29
2024-03-31
2034-03-30
1976-05-27
2017-08-19
2027-08-18
1964-09-14

Putting Country, Country code and Nationality of valid clients in a JSON file with Country as key