In [None]:
!git clone https://github.com/EliasSalameh/datathon2025.git


Unzipping JSON files

In [None]:
import zipfile
import os
from pathlib import Path
import tempfile
import shutil

# Paths (relative paths resolve against the notebook's working directory).
# top_level_zip_dir is searched for "*.zip" archives later in this cell;
# final_output_dir receives one sub-folder per inner client zip.
top_level_zip_dir = Path("datathon2025")    # <- CHANGE THIS if the archives live elsewhere
final_output_dir = Path("datathon2025_2")   # <- CHANGE THIS to pick another output folder
# Create the output folder up front; exist_ok=True keeps re-runs of the cell harmless.
final_output_dir.mkdir(parents=True, exist_ok=True)

def extract_inner_zip_preserve_structure(inner_zip_path, output_base_dir):
    """Extract every ``.json`` member of one client zip into its own folder.

    The destination is ``output_base_dir / <zip file stem>`` (e.g. ``client_001``);
    it is created when missing. Members keep the relative path they have inside
    the archive, and members not ending in ``.json`` are ignored.

    Args:
        inner_zip_path: ``Path`` of the client zip archive to read.
        output_base_dir: ``Path`` of the folder under which the per-client
            folder is created.
    """
    destination = output_base_dir / inner_zip_path.stem
    destination.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(inner_zip_path, 'r') as archive:
        # Select the JSON members first, then extract them in one call.
        json_members = [name for name in archive.namelist() if name.endswith('.json')]
        archive.extractall(destination, members=json_members)

# Walk over every outer archive that sits directly inside the top-level folder
for outer_archive in top_level_zip_dir.glob("*.zip"):
    print(f"Processing: {outer_archive.name}")
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        # Unpack the outer archive into scratch space that is removed afterwards
        with zipfile.ZipFile(outer_archive, 'r') as outer:
            outer.extractall(scratch_dir)

        # Any zip found anywhere below the scratch folder is handled as a client zip
        for client_zip in scratch_dir.rglob("*.zip"):
            print(f"  ↳ Extracting client zip: {client_zip.name}")
            extract_inner_zip_preserve_structure(client_zip, final_output_dir)

print("✅ All JSON files extracted with folder structure preserved.")

# Data Analysis

Passport MRZ consistency: every MRZ line has length 45. Some MRZ lines contain spaces, and about 50% of the clients with such spaces are still accepted — it turns out a space can be fine when the name itself contains a space.

In [None]:
import json
from pathlib import Path

# Folder whose entries are the per-client folders; the loop below looks for
# passport.json and label.json inside each one.
# Change this to your extracted base folder.
base_dir = Path("datathon2025_2")

def validate_mrz(mrz, expected_length=45):
    """Check the structure of a passport MRZ value.

    Args:
        mrz: The value to check; a valid MRZ is a list of exactly two strings.
        expected_length: Required length of each line. Defaults to 45, the
            length this analysis checks for.

    Returns:
        A ``(is_valid, message)`` tuple: ``(True, "")`` when both lines are
        strings of ``expected_length`` characters without spaces, otherwise
        ``(False, reason)`` describing the first problem found.
    """
    if not isinstance(mrz, list) or len(mrz) != 2:
        return False, "MRZ should be a list of two strings."
    for line_number, line in enumerate(mrz, start=1):
        if not isinstance(line, str):
            return False, f"Line {line_number} is not a string."
        if len(line) != expected_length:
            # Report the length that is actually enforced, so message and check agree.
            return False, f"Line {line_number} has length {len(line)} (expected {expected_length})."
        if " " in line:
            return False, f"Line {line_number} contains spaces."
    return True, ""
def validate_label(label):
    """Classify a client label for the MRZ analysis.

    Args:
        label: The value to classify; expected to be the string ``"Accept"``
            or ``"Reject"``.

    Returns:
        A ``(is_accepted, message)`` tuple. ``is_accepted`` is ``True`` only
        for ``"Accept"``. A tuple is returned for every input, so callers can
        always unpack the result.
    """
    if not isinstance(label, str):
        return False, "Label should be a string."
    if label == "Accept":
        return True, "Accepted label despite space in MRZ"
    if label == "Reject":
        return False, "Rejected label with space in MRZ"
    # Any other string used to fall through and return None, which broke
    # tuple unpacking at the call site.
    return False, f"Unexpected label value: {label!r}"

# Counters reported at the end of the cell.
count_isvalid = 0
count_isrejected = 0
# Loop through each client folder
for client_dir in base_dir.iterdir():
    # Ignore stray files in the base folder; only directories are client folders.
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"
    if not passport_path.exists():
        print(f"❌ Missing passport.json in {client_dir.name}")
        continue

    with open(passport_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"❌ JSON decode error in {passport_path}")
            continue

    mrz = data.get("passport_mrz")
    is_valid, message = validate_mrz(mrz)
    if is_valid:
        count_isvalid += 1
        print(f"✅ Valid MRZ in {client_dir.name}")
        continue

    print(f"⚠️ Invalid MRZ in {client_dir.name}: {message}")

    # The label is only consulted for clients whose MRZ failed validation.
    # Report a missing label file instead of crashing with FileNotFoundError.
    if not label_path.exists():
        print(f"❌ Missing label.json in {client_dir.name}")
        continue

    with open(label_path, "r", encoding="utf-8") as f:
        try:
            data_label = json.load(f)
        except json.JSONDecodeError:
            print(f"❌ JSON decode error in {label_path}")
            continue

    label = data_label.get("label")
    is_valid_label, message_label = validate_label(label)
    if not is_valid_label:
        print(f"⚠️ Invalid label in {client_dir.name}: {message_label}")
        # Count only genuine rejections; a missing or malformed label is not one.
        if label == "Reject":
            count_isrejected += 1
    else:
        print(f"⚠️ Valid label in {client_dir.name}: {message_label} despite space in MRZ")

print(f"Total valid MRZ: {count_isvalid}")
print(f"Total rejected with space in MRZ: {count_isrejected}")

Date Format Consistency (Date Format is always YYYY-MM-DD)

In [None]:
import json
from pathlib import Path
from datetime import datetime

# Folder containing the client subdirectories; each one is expected to hold
# passport.json and label.json (clients missing either file are reported and skipped).
base_dir = Path("datathon2025_2")  # Change to your path

def is_valid_yyyy_mm_dd(date_str):
    """Return True only for a real calendar date written exactly as YYYY-MM-DD.

    ``datetime.strptime`` alone is too lenient for a format check: it also
    accepts non-zero-padded values such as ``"2020-1-5"``. The shape of the
    string (4-2-2 ASCII digits separated by ``-``) is therefore verified
    first, and ``strptime`` is then used to reject impossible dates such as
    ``"2020-02-30"``.

    Args:
        date_str: The value to check. Non-string values (including ``None``)
            are reported as invalid.

    Returns:
        ``True`` if ``date_str`` is a valid zero-padded YYYY-MM-DD date,
        ``False`` otherwise.
    """
    if not isinstance(date_str, str):
        return False

    parts = date_str.split("-")
    if [len(part) for part in parts] != [4, 2, 2]:
        return False
    # Restrict to ASCII digits; str.isdigit() would also accept other Unicode digits.
    if not all(ch in "0123456789" for part in parts for ch in part):
        return False

    try:
        datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError:
        return False
    return True

# Passport fields that are expected to hold YYYY-MM-DD dates.
date_fields = ["birth_date", "passport_issue_date", "passport_expiry_date"]

clients_checked = 0
invalid_dates = 0

for client_dir in base_dir.iterdir():
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"

    # Skip if required files are missing
    if not passport_path.exists() or not label_path.exists():
        print(f"⚠️ Missing passport.json or label.json in {client_dir.name}")
        continue

    try:
        # Context managers close the file handles even when parsing fails.
        with passport_path.open("r", encoding="utf-8") as passport_file:
            passport_data = json.load(passport_file)
        with label_path.open("r", encoding="utf-8") as label_file:
            label_data = json.load(label_file)
    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error in {client_dir.name}: {e}")
        continue

    label = label_data.get("label", "Unknown")
    clients_checked += 1

    # Check date format for each field; only offending values are printed so
    # the output is not flooded with one line per date.
    for field in date_fields:
        date_value = passport_data.get(field)
        if not is_valid_yyyy_mm_dd(date_value):
            invalid_dates += 1
            print(f"❌ {client_dir.name} - Invalid {field}: {date_value} | Label: {label}")

# Summary, so a run with no invalid dates still produces visible output.
print(f"Checked {clients_checked} clients; {invalid_dates} invalid date values found.")


Collect the country, country code and nationality of accepted clients (label `Accept`) into a JSON file keyed by country

In [1]:
import json
from pathlib import Path

base_dir = Path("datathon2025_2")  # 🔁 Change to your actual path
# Maps a country name (or "<country>_<n>" for conflicting variants) to the
# (country_code, nationality) pair seen on accepted clients.
output = {}

# iterdir() yields entries in arbitrary order; sorting makes the assignment of
# "_<n>" suffixes reproducible between runs.
for client_dir in sorted(base_dir.iterdir()):
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"

    if not passport_path.exists() or not label_path.exists():
        continue

    try:
        # Context managers close the file handles even when parsing fails.
        with passport_path.open("r", encoding="utf-8") as passport_file:
            passport = json.load(passport_file)
        with label_path.open("r", encoding="utf-8") as label_file:
            label = json.load(label_file).get("label")
    except json.JSONDecodeError:
        continue

    # Only accepted clients contribute to the mapping.
    if label != "Accept":
        continue

    country = passport.get("country")
    country_code = passport.get("country_code")
    nationality = passport.get("nationality")

    # Skip clients with any of the three values missing or empty.
    if not all([country, country_code, nationality]):
        continue

    key_base = country
    value = (country_code, nationality)

    # Use the plain country name when it is free or already holds this value;
    # otherwise try "<country>_1", "<country>_2", ... until a key is free or
    # already holds this value.
    new_key = key_base
    suffix = 0
    while new_key in output and output[new_key] != value:
        suffix += 1
        new_key = f"{key_base}_{suffix}"
    output[new_key] = value

# Save to output file (tuples are written as JSON arrays)
with open("country_mappings.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=4)

print("✅ Finished writing country_mappings.json")


✅ Finished writing country_mappings.json


Next, check whether any client labelled `Reject` has a country that is NOT present in country_mappings.json

In [2]:
import json
from pathlib import Path

base_dir = Path("datathon2025_2")  # 🔁 Update to your actual folder
mapping_file = Path("country_mappings.json")


def _base_country(key):
    """Return the country name for a mapping key of the form "<country>" or "<country>_<n>".

    Only a trailing "_<digits>" suffix is removed, so a country name that
    itself contains an underscore is kept intact.
    """
    base, separator, suffix = key.rpartition("_")
    if separator and suffix.isdigit():
        return base
    return key


# Load your Accept country mappings
with mapping_file.open("r", encoding="utf-8") as f:
    mappings = json.load(f)

# Extract all base country names from keys
mapped_countries = {_base_country(key) for key in mappings.keys()}

# Track countries found only in rejected clients
reject_countries = set()

for client_dir in base_dir.iterdir():
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"

    if not passport_path.exists() or not label_path.exists():
        continue

    try:
        # Context managers close the file handles even when parsing fails.
        with passport_path.open("r", encoding="utf-8") as passport_file:
            passport = json.load(passport_file)
        with label_path.open("r", encoding="utf-8") as label_file:
            label = json.load(label_file).get("label")
    except json.JSONDecodeError:
        continue

    # Only rejected clients are of interest here.
    if label != "Reject":
        continue

    country = passport.get("country")
    if country and country not in mapped_countries:
        reject_countries.add(country)

# Report results
if reject_countries:
    print("❌ Countries in rejected clients NOT found in mappings:")
    for country in sorted(reject_countries):
        print(f" - {country}")
else:
    print("✅ All reject countries are represented in the Accept mappings.")


✅ All reject countries are represented in the Accept mappings.
