In [None]:
!git clone https://github.com/EliasSalameh/datathon2025.git


Unzipping JSON files

In [None]:
import zipfile
import os
from pathlib import Path
import tempfile
import shutil

# Paths
top_level_zip_dir = Path("datathon2025")    # <- CHANGE THIS
final_output_dir = Path("datathon2025_2")   # <- CHANGE THIS
final_output_dir.mkdir(parents=True, exist_ok=True)

def extract_inner_zip_preserve_structure(inner_zip_path, output_base_dir):
    client_name = inner_zip_path.stem  # e.g., client_001
    output_client_dir = output_base_dir / client_name
    output_client_dir.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(inner_zip_path, 'r') as inner_zip:
        for file_name in inner_zip.namelist():
            if file_name.endswith('.json'):
                inner_zip.extract(file_name, output_client_dir)

# Process each top-level zip
for top_zip in top_level_zip_dir.glob("*.zip"):
    print(f"Processing: {top_zip.name}")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        with zipfile.ZipFile(top_zip, 'r') as zip_ref:
            zip_ref.extractall(tmpdir_path)
        
        # Find all inner client zips inside extracted folder
        for inner_zip_path in tmpdir_path.rglob("*.zip"):
            print(f"  ↳ Extracting client zip: {inner_zip_path.name}")
            extract_inner_zip_preserve_structure(inner_zip_path, final_output_dir)

print("✅ All JSON files extracted with folder structure preserved.")

# Data Analysis

Passport MRZ consistency (All MRZ are of length 45, some have spaces, about 50% of those with spaces are accepted, turns out space can be ok if name has space in it)

In [None]:
import json
from pathlib import Path

# Change this to your extracted base folder
base_dir = Path("datathon2025_2")

def validate_mrz(mrz):
    if not isinstance(mrz, list) or len(mrz) != 2:
        return False, "MRZ should be a list of two strings."
    for i, line in enumerate(mrz):
        if not isinstance(line, str):
            return False, f"Line {i+1} is not a string."
        if len(line) != 45:
            return False, f"Line {i+1} has length {len(line)} (expected 44)."
        if " " in line:
            return False, f"Line {i+1} contains spaces."
    return True, ""
def validate_label(label):
    if not isinstance(label, str):
        return False, "Label should be a dictionary."
    if label == "Accept":
        return True, "Accepted label despite space in MRZ"
    elif label == "Reject":
        return False, "Rejected label with space in MRZ"

count_isvalid = 0
count_isrejected = 0
# Loop through each client folder
for client_dir in base_dir.iterdir():
    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"  
    if not passport_path.exists():
        print(f"❌ Missing passport.json in {client_dir.name}")
        continue

    with open(passport_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            print(f"❌ JSON decode error in {passport_path}")
            continue

    mrz = data.get("passport_mrz")
    is_valid, message = validate_mrz(mrz)
    if not is_valid:
        print(f"⚠️ Invalid MRZ in {client_dir.name}: {message}")
        with open(label_path, "r", encoding="utf-8") as f:
            try:
                data_label = json.load(f)
            except json.JSONDecodeError:
                print(f"❌ JSON decode error in {label_path}")
                continue
            label = data_label.get("label")
            is_valid_label, message_label = validate_label(label)
            if not is_valid_label:
                print(f"⚠️ Invalid label in {client_dir.name}: {message_label}")
                count_isrejected += 1
            else:
                print(f"⚠️ Valid label in {client_dir.name}: {message_label} despite space in MRZ")
                

    else:
        count_isvalid += 1
        print(f"✅ Valid MRZ in {client_dir.name}")
print(f"Total valid MRZ: {count_isvalid}")
print(f"Total rejected with space in MRZ: {count_isrejected}")

Date Format Consistency (Date Format is always YYYY-MM-DD)

In [None]:
import json
from pathlib import Path
from datetime import datetime

# Folder containing client subdirectories
base_dir = Path("datathon2025_2")  # 🔁 Change to your path

def is_valid_yyyy_mm_dd(date_str):
    try:
        datetime.strptime(date_str, "%Y-%m-%d")
        return True
    except (ValueError, TypeError):
        return False

date_fields = ["birth_date", "passport_issue_date", "passport_expiry_date"]

for client_dir in base_dir.iterdir():
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"

    # Skip if required files are missing
    if not passport_path.exists() or not label_path.exists():
        print(f"⚠️ Missing passport.json or label.json in {client_dir.name}")
        continue

    try:
        passport_data = json.load(passport_path.open("r", encoding="utf-8"))
        label_data = json.load(label_path.open("r", encoding="utf-8"))
    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error in {client_dir.name}: {e}")
        continue

    label = label_data.get("label", "Unknown")

    # Check date format for each field
    for field in date_fields:
        date_value = passport_data.get(field)
        print(date_value)
        if not is_valid_yyyy_mm_dd(date_value):
            print(f"❌ {client_dir.name} - Invalid {field}: {date_value} | Label: {label}")


Putting Country, Country code and Nationality of valid clients in a JSON file with Country as key

In [1]:
import json
from pathlib import Path

base_dir = Path("datathon2025_2")  # 🔁 Change to your actual path
output = {}

for client_dir in base_dir.iterdir():
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"

    if not passport_path.exists() or not label_path.exists():
        continue

    try:
        passport = json.load(passport_path.open("r", encoding="utf-8"))
        label = json.load(label_path.open("r", encoding="utf-8")).get("label")
    except json.JSONDecodeError:
        continue

    if label != "Accept":
        continue

    country = passport.get("country")
    country_code = passport.get("country_code")
    nationality = passport.get("nationality")

    if not all([country, country_code, nationality]):
        continue

    key_base = country
    value = (country_code, nationality)

    # If country already exists with same value, skip
    if key_base in output and output[key_base] == value:
        continue

    # If country exists but with a different value, find a unique key
    if key_base in output and output[key_base] != value:
        i = 1
        new_key = f"{key_base}_{i}"
        while new_key in output and output[new_key] != value:
            i += 1
            new_key = f"{key_base}_{i}"
        output[new_key] = value
    else:
        output[key_base] = value

# Save to output file
with open("country_mappings.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=4)

print("✅ Finished writing country_mappings.json")


✅ Finished writing country_mappings.json


Now, I want to check if there is a country for clients with label reject that is NOT in country_mappings.json

In [2]:
import json
from pathlib import Path

base_dir = Path("datathon2025_2")  # 🔁 Update to your actual folder
mapping_file = Path("country_mappings.json")

# Load your Accept country mappings
with mapping_file.open("r", encoding="utf-8") as f:
    mappings = json.load(f)

# Extract all base country names from keys
mapped_countries = {key.split("_")[0] for key in mappings.keys()}

# Track countries found only in rejected clients
reject_countries = set()

for client_dir in base_dir.iterdir():
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"

    if not passport_path.exists() or not label_path.exists():
        continue

    try:
        passport = json.load(passport_path.open("r", encoding="utf-8"))
        label = json.load(label_path.open("r", encoding="utf-8")).get("label")
    except json.JSONDecodeError:
        continue

    if label != "Reject":
        continue

    country = passport.get("country")
    if country and country not in mapped_countries:
        reject_countries.add(country)

# Report results
if reject_countries:
    print("❌ Countries in rejected clients NOT found in mappings:")
    for country in sorted(reject_countries):
        print(f" - {country}")
else:
    print("✅ All reject countries are represented in the Accept mappings.")


✅ All reject countries are represented in the Accept mappings.


Checking for "birth_date", "passport_issue_date", "passport_expiry_date" with regards to current date (No passport issued after current_date, passports expiring before current date can get accepted, but those expired a long time ago seem to be rejected. 9 people were under 18 and all were reject)

In [None]:
import json
from pathlib import Path
from datetime import datetime

CURRENT_DATE = datetime(2025, 3, 1)
MIN_BIRTHDATE = datetime(2007, 3, 1)  # 18 years ago

base_dir = Path("datathon2025_2")  # 🔁 Replace with your actual path

def parse_date(date_str, field, client_id):
    try:
        return datetime.strptime(date_str, "%Y-%m-%d")
    except Exception as e:
        print(f"❗ Error parsing '{field}' for client {client_id}: '{date_str}' — {e}")
        return None

for client_dir in base_dir.iterdir():
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"
    client_id = client_dir.name

    if not passport_path.exists() or not label_path.exists():
        print(f"⚠️ Missing passport or label file for client {client_id}")
        continue

    try:
        with passport_path.open("r", encoding="utf-8") as f:
            passport = json.load(f)
        with label_path.open("r", encoding="utf-8") as f:
            label = json.load(f).get("label", "Unknown")
    except Exception as e:
        print(f"❌ Failed to load JSON for client {client_id}: {e}")
        continue

    birth_date = parse_date(passport.get("birth_date", ""), "birth_date", client_id)
    issue_date = parse_date(passport.get("passport_issue_date", ""), "passport_issue_date", client_id)
    expiry_date = parse_date(passport.get("passport_expiry_date", ""), "passport_expiry_date", client_id)

    # 1. Passport issued in future
    if issue_date and issue_date > CURRENT_DATE:
        print(f"📌 [{client_id}] Passport issued in the future: {issue_date.date()} — Label: {label}")

    # 2. Client under 21
    if birth_date and birth_date > MIN_BIRTHDATE:
        age = (CURRENT_DATE - birth_date).days // 365
        print(f"👶 [{client_id}] Client is under 18 (age {age}) — Label: {label}")

    # 3. Passport expired
    if expiry_date and expiry_date < CURRENT_DATE:
        print(f"⌛ [{client_id}] Passport expired on {expiry_date.date()} — Label: {label}")

    # 4. Expiry before issue
    if expiry_date and issue_date and expiry_date < issue_date:
        print(f"⚠️ [{client_id}] Expiry date {expiry_date.date()} is before issue date {issue_date.date()} — Label: {label}")

    # 5. Expiry before birth
    if expiry_date and birth_date and expiry_date < birth_date:
        print(f"⚠️ [{client_id}] Expiry date {expiry_date.date()} is before birth date {birth_date.date()} — Label: {label}")


Checking if there are genders other than "M" or "F" (Turns out there are empty gender and they are all rejected)

In [None]:
import json
from pathlib import Path

base_dir = Path("datathon2025_2")  # 🔁 Replace with your actual path

for client_dir in base_dir.iterdir():
    if not client_dir.is_dir():
        continue

    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"
    client_id = client_dir.name

    if not passport_path.exists() or not label_path.exists():
        print(f"⚠️ Missing passport or label file for client {client_id}")
        continue

    try:
        with passport_path.open("r", encoding="utf-8") as f:
            passport = json.load(f)
        with label_path.open("r", encoding="utf-8") as f:
            label = json.load(f).get("label", "Unknown")
    except Exception as e:
        print(f"❌ Failed to load JSON for client {client_id}: {e}")
        continue

    gender = passport.get("gender")
    
    if gender not in ["F", "M"]:
        print(f"⚠️ [{client_id}] Invalid gender value '{gender}' — Label: {label}")


Checking if client graduated at a very young age and whether the client was accepted (This won't work but dropped since Omar did it)

In [None]:
import json
from pathlib import Path
from datetime import datetime

# Current date is assumed to be 2025-03-01
current_date = datetime(2025, 3, 1)

def calculate_age(birth_date_str, event_year):
    birth_date = datetime.strptime(birth_date_str, "%Y-%m-%d")
    age_at_event = event_year - birth_date.year - ((current_date.month, current_date.day) < (birth_date.month, birth_date.day))
    return age_at_event

def check_inconsistencies(client_id, profile, label):
    birth_date = profile.get("birth_date")
    
    # Check if birth_date exists
    if not birth_date:
        print(f"❌ [{client_id}] Missing birth_date")
        return

    # High School Graduation Year (secondary_school)
    high_school_graduation = profile.get("secondary_school", {}).get("graduation_year")
    # University Graduation Year (higher_education)
    higher_education = profile.get("higher_education", {}) #[0].get("graduation_year")
    for uni in higher_education:
        if uni.get("graduation_year"):
            university_graduation = uni.get("graduation_year")
            break
    # Employment Start Year (employment_history)
    employment_start = profile.get("employment_history", {})[0].get("start_year")

    # 1. Calculate Age at High School Start (typically 12-14 years old)
    if high_school_graduation:
        age_at_high_school = calculate_age(birth_date, high_school_graduation - 4)  # Assuming high school start around age 12-14
        if age_at_high_school < 12:
            print(f"❗ [{client_id}] Started high school at an unusually young age ({age_at_high_school}) — Label: {label}")

    # 2. Calculate Age at University Start (typically 18-20 years old)
    if university_graduation:
        age_at_university = calculate_age(birth_date, university_graduation - 3)  # Assuming university start around age 18-20
        if age_at_university < 18:
            print(f"❗ [{client_id}] Started university at an unusually young age ({age_at_university}) — Label: {label}")

    # 3. Calculate Age at Employment Start (typically 18+ years old)
    if employment_start:
        age_at_employment = calculate_age(birth_date, employment_start)
        if age_at_employment < 18:
            print(f"❗ [{client_id}] Started employment at an unusually young age ({age_at_employment}) — Label: {label}")


for client_dir in Path("datathon2025_2").iterdir():
    if not client_dir.is_dir():
        continue

    profile_path = client_dir / "client_profile.json"
    passport_path = client_dir / "passport.json"
    label_path = client_dir / "label.json"
    client_id = client_dir.name

    if not profile_path.exists() or not passport_path.exists() or not label_path.exists():
        print(f"⚠️ Missing profile, passport, or label file for client {client_id}")
        continue

    try:
        with profile_path.open("r", encoding="utf-8") as f:
            profile = json.load(f)
        with passport_path.open("r", encoding="utf-8") as f:
            passport = json.load(f)
        with label_path.open("r", encoding="utf-8") as f:
            label = json.load(f).get("label", "Unknown")
    except Exception as e:
        print(f"❌ Failed to load JSON for client {client_id}: {e}")
        continue

    # Check for inconsistencies
    check_inconsistencies(client_id, profile, label)


❗ [client_1784] Started high school at an unusually young age (2) — Label: Reject


IndexError: list index out of range

Checking consistency between country of domicile and postal code (No mismatch in country patterns, but some postal_codes are empty and the client is always rejected in this case. Some clients have multiple countries of domicile and always get rejectd)

In [None]:
import json
import re
import os

# Define the postal code format for different countries
postal_code_formats = {
    "Netherlands": r"\d{4} [A-Za-z0-9]{2}",
    "Denmark": r"\d{4}",
    "France": r"\d{5}",
    "Italy": r"\d{5}",
    "Austria": r"\d{4}",
    "Spain": r"\d{5}",
    "Germany": r"\d{5}",
    "Belgium": r"\d{4}",
    "Switzerland": r"\d{4}",
    "Finland": r"\d{5}",
}

# Function to check postal code format and client status
def check_postal_code_format(client_dir, account_form_path, label_path):
    # Read the client data from the JSON file
    with open(account_form_path, "r") as file:
        client_data = json.load(file)
        
    with open(label_path, "r") as file:
        label_data = json.load(file)

    country = client_data["country_of_domicile"]
    postal_code = client_data["address"]["postal code"]
    label = label_data["label"]

    # Get the correct regex pattern for the country
    pattern = postal_code_formats.get(country)

    if pattern:
        # Check if the postal code matches the format
        if not re.match(pattern, postal_code):
            print(f"❗ Client {client_dir.name} has an invalid postal code ({postal_code}) for {country}. Status: {label}")
            
        # else:
        #     print(f"Client {client_data['name']} has a valid postal code ({postal_code}) for {country}. Status: {label}")
    else:
        print(f"❗ Client {client_dir.name}'s Postal code format not defined for {country}. Satatus: {label}")

# Example usage: Assuming you have multiple client files stored in a directory

directory = "datathon2025_2"  # Change this to your directory path

# Loop through all JSON files in the directory
for client_dir in Path(directory).iterdir():
    account_form_path = client_dir / "account_form.json"
    label_path = client_dir / "label.json"
    if account_form_path.exists():
        check_postal_code_format(client_dir, account_form_path, label_path)
