In [None]:
import uuid
from datetime import timezone

import numpy as np
import polars as pl
from faker import Faker

# Faker instance used for every synthetic text and timestamp field below.
# NOTE: neither Faker nor NumPy is seeded in this cell, so each run yields
# different data.
fake = Faker()

# Number of synthetic user records to generate
n_records = 100

# Draw two UTC timestamps (from the start of the current year up to now) for
# each record and sort the pair. Drawing created_at and updated_at
# independently would leave many rows "updated" before they were "created".
timestamp_pairs = [
    sorted(fake.date_time_this_year(tzinfo=timezone.utc) for _ in range(2))
    for _ in range(n_records)
]

# Column-oriented data: every value is a sequence of length n_records.
data = {
    "id": [str(uuid.uuid4()) for _ in range(n_records)],
    "email": [fake.email() for _ in range(n_records)],
    "username": [fake.user_name() for _ in range(n_records)],
    "first_name": [fake.first_name() for _ in range(n_records)],
    "last_name": [fake.last_name() for _ in range(n_records)],
    # Weighted draw: 10% Admin, 20% Manager, 30% Collaborator, 40% Viewer.
    "permission": np.random.choice(
        ["Admin", "Manager", "Collaborator", "Viewer"],
        size=n_records,
        p=[0.1, 0.2, 0.3, 0.4],
    ),
    # Drawn independently of "permission"; True with probability 0.1.
    "system_admin": np.random.choice(
        [True, False],
        size=n_records,
        p=[0.1, 0.9],
    ),
    # Earlier timestamp of each pair.
    "created_at": [earlier for earlier, _ in timestamp_pairs],
    # Later timestamp of each pair, so updated_at >= created_at on every row.
    "updated_at": [later for _, later in timestamp_pairs],
}

# Build the frame with every column's dtype pinned explicitly rather than
# inferred: the five text columns are plain strings, permission is
# categorical, system_admin is boolean, and both timestamps are declared
# timezone-aware (UTC).
df = pl.DataFrame(
    data,
    schema={
        **dict.fromkeys(
            ("id", "email", "username", "first_name", "last_name"),
            pl.Utf8,
        ),
        "permission": pl.Categorical,
        "system_admin": pl.Boolean,
        "created_at": pl.Datetime(time_zone="UTC"),
        "updated_at": pl.Datetime(time_zone="UTC"),
    },
)

# Save the frame as users.parquet (path is relative to the working
# directory), explicitly requesting that statistics be written to the file.
df.write_parquet("users.parquet", statistics=True)