In [2]:
!pip install pyiceberg==0.10.0
!pip install pyarrow==22.0.0



In [3]:
import os

import pyarrow as pa
import pyarrow.parquet as pq
from pyiceberg.catalog.rest import RestCatalog


# Connect to the Polaris REST catalog.
# The credential is taken from the POLARIS_CREDENTIAL environment variable so
# that real secrets never have to be committed with the notebook; the demo
# value is kept as the default so the local setup keeps working unchanged.
catalog = RestCatalog(
    name='polaris',
    uri='http://polaris:8181/api/catalog',
    warehouse='demo_catalog',
    credential=os.environ.get('POLARIS_CREDENTIAL', 'admin:admin'),
    scope='PRINCIPAL_ROLE:ALL',
)

In [4]:
# Show the namespaces currently registered in the catalog.
print(catalog.list_namespaces())

[('my_warehouse',), ('test_db',)]


In [5]:
# Namespace whose tables are listed later in the notebook.
db = "test_db"

In [6]:
# Arrow schema of the demo table: two int64 columns and one string column,
# all declared nullable.
schema = pa.schema(
    [
        pa.field("f1", pa.int64(), nullable=True),
        pa.field("f2", pa.string(), nullable=True),
        pa.field("rand", pa.int64(), nullable=True),
    ]
)

# Build the identifier from `db` so the namespace is defined in one place.
table_identifier = f"{db}.pydemo"

# Remove a table left over from a previous run, but only if it exists:
# an unconditional drop_table() raises when the table is missing, which
# breaks "Restart Kernel -> Run All" on a fresh catalog.
if catalog.table_exists(table_identifier):
    catalog.drop_table(table_identifier)

table = catalog.create_table(
    identifier=table_identifier,
    schema=schema,
)

# List the tables of the namespace to confirm the new table is present.
print(catalog.list_tables(db))

[('test_db', 'test_table'), ('test_db', 'users'), ('test_db', 'call_center'), ('test_db', 'pydemo')]


In [7]:

# Sample rows, keyed by column name.
data = dict(
    f1=[1, 2, 3],
    f2=["Alice", "Bob", "Charlie"],
    rand=[42, 99, 123],
)

# Build an Arrow table from the column dictionary.
arrow_table = pa.Table.from_pydict(data)

# Disabled, kept for reference: cast f1 to int32.
# arrow_table = arrow_table.set_column(
#     0, "f1", arrow_table["f1"].cast(pa.int32())
# )

# Write the rows into the Iceberg table.
table.append(arrow_table)

In [10]:
# Read the table data itself with the scan() method.
# The result can be converted with to_arrow, to_pandas or to_duckdb.

table.scan().to_pandas()

Unnamed: 0,f1,f2,rand
0,1,Alice,42
1,2,Bob,99
2,3,Charlie,123


In [11]:
# Count rows through the scan itself instead of materialising every row as an
# Arrow table only to take its length.
table.scan().count()

3

# Исследование параметров таблиц - table.inspect.method()

## Возвращает pyarrow.Table. Для просмотра удобно перегонять через to_pydict()

Аналогичную информацию можно достать через Trino в виде служебных таблиц.
В PyIceberg больше возможностей, чем в аналогах Trino, и работает он быстрее.

In [12]:
# Snapshots of the table: commit time, snapshot and parent ids, operation,
# manifest list path and the summary map.
table.inspect.snapshots()

pyarrow.Table
committed_at: timestamp[ms] not null
snapshot_id: int64 not null
parent_id: int64
operation: string
manifest_list: string not null
summary: map<string, string>
  child 0, entries: struct<key: string not null, value: string> not null
      child 0, key: string not null
      child 1, value: string
----
committed_at: [[2025-12-12 09:30:34.399]]
snapshot_id: [[6952705983024221710]]
parent_id: [[null]]
operation: [["append"]]
manifest_list: [["s3://warehouse/test_db/pydemo/metadata/snap-6952705983024221710-0-fb333b7b-488e-4e28-ba09-348a9926 (... 9 chars omitted)"]]
summary: [[keys:["added-files-size","added-data-files","added-records","total-data-files","total-delete-files","total-records","total-files-size","total-position-deletes","total-equality-deletes"]values:["1342","1","3","1","0","3","1342","0","0"]]]

In [13]:
# Partition statistics: record and file counts, data size, delete-file
# counters and the last update time / snapshot id.
table.inspect.partitions()

pyarrow.Table
record_count: int64 not null
file_count: int32 not null
total_data_file_size_in_bytes: int64 not null
position_delete_record_count: int64 not null
position_delete_file_count: int32 not null
equality_delete_record_count: int64 not null
equality_delete_file_count: int32 not null
last_updated_at: timestamp[ms]
last_updated_snapshot_id: int64
----
record_count: [[3]]
file_count: [[1]]
total_data_file_size_in_bytes: [[1342]]
position_delete_record_count: [[0]]
position_delete_file_count: [[0]]
equality_delete_record_count: [[0]]
equality_delete_file_count: [[0]]
last_updated_at: [[2025-12-12 09:30:34.399]]
last_updated_snapshot_id: [[6952705983024221710]]

In [14]:
# Manifest entries: status, snapshot id, sequence numbers and a nested
# data_file struct (path, format, record count, size, column statistics).
table.inspect.entries()

pyarrow.Table
status: int8 not null
snapshot_id: int64 not null
sequence_number: int64 not null
file_sequence_number: int64 not null
data_file: struct<content: int8 not null, file_path: string not null, file_format: string not null, partition:  (... 404 chars omitted) not null
  child 0, content: int8 not null
  child 1, file_path: string not null
  child 2, file_format: string not null
  child 3, partition: struct<> not null
  child 4, record_count: int64 not null
  child 5, file_size_in_bytes: int64 not null
  child 6, column_sizes: map<int32, int64>
      child 0, entries: struct<key: int32 not null, value: int64> not null
          child 0, key: int32 not null
          child 1, value: int64
  child 7, value_counts: map<int32, int64>
      child 0, entries: struct<key: int32 not null, value: int64> not null
          child 0, key: int32 not null
          child 1, value: int64
  child 8, null_value_counts: map<int32, int64>
      child 0, entries: struct<key: int32 not null, value:

In [15]:
# Table references: name, type (the output shows a BRANCH named "main"),
# snapshot id and the retention settings.
table.inspect.refs()

pyarrow.Table
name: string not null
type: dictionary<values=string, indices=int32, ordered=0> not null
snapshot_id: int64 not null
max_reference_age_in_ms: int64
min_snapshots_to_keep: int32
max_snapshot_age_in_ms: int64
----
name: [["main"]]
type: [  -- dictionary:
["BRANCH"]  -- indices:
[0]]
snapshot_id: [[6952705983024221710]]
max_reference_age_in_ms: [[null]]
min_snapshots_to_keep: [[null]]
max_snapshot_age_in_ms: [[null]]

In [16]:
# Manifest files: path, length, partition spec id, the snapshot that added
# them and added/existing/deleted counters for data and delete files.
table.inspect.manifests()

pyarrow.Table
content: int8 not null
path: string not null
length: int64 not null
partition_spec_id: int32 not null
added_snapshot_id: int64 not null
added_data_files_count: int32 not null
existing_data_files_count: int32 not null
deleted_data_files_count: int32 not null
added_delete_files_count: int32 not null
existing_delete_files_count: int32 not null
deleted_delete_files_count: int32 not null
partition_summaries: list<item: struct<contains_null: bool not null, contains_nan: bool, lower_bound: string, upper_bound (... 10 chars omitted) not null
  child 0, item: struct<contains_null: bool not null, contains_nan: bool, lower_bound: string, upper_bound: string>
      child 0, contains_null: bool not null
      child 1, contains_nan: bool
      child 2, lower_bound: string
      child 3, upper_bound: string
----
content: [[0]]
path: [["s3://warehouse/test_db/pydemo/metadata/fb333b7b-488e-4e28-ba09-348a9926dd3b-m0.avro"]]
length: [[4348]]
partition_spec_id: [[0]]
added_snapshot_id: [[695

In [17]:
# Metadata log: one row per metadata.json file with its timestamp and the
# latest snapshot id, schema id and sequence number.
table.inspect.metadata_log_entries()

pyarrow.Table
timestamp: timestamp[ms] not null
file: string not null
latest_snapshot_id: int64
latest_schema_id: int32
latest_sequence_number: int64
----
timestamp: [[2025-12-12 09:29:56.000,2025-12-12 09:30:34.399]]
file: [["s3://warehouse/test_db/pydemo/metadata/00000-c548083c-386c-4134-81a9-a07886b5cbfb.metadata.json","s3://warehouse/test_db/pydemo/metadata/00001-eadcba65-a775-404f-b9f4-05264d19a12d.metadata.json"]]
latest_snapshot_id: [[null,6952705983024221710]]
latest_schema_id: [[null,0]]
latest_sequence_number: [[null,1]]

In [18]:
# Snapshot history: when each snapshot was made current, its parent id and
# the is_current_ancestor flag.
table.inspect.history()

pyarrow.Table
made_current_at: timestamp[ms] not null
snapshot_id: int64 not null
parent_id: int64
is_current_ancestor: bool not null
----
made_current_at: [[2025-12-12 09:30:34.399]]
snapshot_id: [[6952705983024221710]]
parent_id: [[null]]
is_current_ancestor: [[true]]

In [19]:
# Files of the table: path, format, spec id, record count, size and the
# per-column statistics maps (sizes, value/null/NaN counts, bounds).
table.inspect.files()

pyarrow.Table
content: int8 not null
file_path: string not null
file_format: dictionary<values=string, indices=int32, ordered=0> not null
spec_id: int32 not null
partition: struct<> not null
record_count: int64 not null
file_size_in_bytes: int64 not null
column_sizes: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
null_value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
nan_value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
lower_bounds: map<int32, binary>
  child 0, entries: struct<key: int32 n