In [70]:
import pandas as pd
import trino
import json

# Cấu hình để Pandas hiển thị đầy đủ nội dung (quan trọng khi xem JSON)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

print("Các thư viện đã sẵn sàng.")

Các thư viện đã sẵn sàng.


In [71]:
# Kết nối tới Trino service
conn = trino.dbapi.connect(
    host="127.0.0.1",
    port=8083,
    user="don",
    catalog="lakehouse",   # hoặc "iceberg" tùy SHOW CATALOGS
    schema="rva",       # đây là database trong iceberg
)


cursor = conn.cursor()
print(f"Kết nối Trino (Catalog: {conn.catalog}, Schema: {conn.schema}) thành công!")

Kết nối Trino (Catalog: lakehouse, Schema: rva) thành công!


In [72]:
cursor.execute("SHOW TABLES")
tables = cursor.fetchall()
print("Các bảng có trong schema 'retail':")
print(tables)

# Kết quả mong đợi: [('bronze_raw',)]

Các bảng có trong schema 'retail':
[['bronze_raw'], ['gold_people_per_minute'], ['gold_track_summary'], ['gold_zone_dwell'], ['gold_zone_heatmap'], ['silver_detections_v2'], ['silver_detections_v3']]

[['bronze_raw'], ['gold_people_per_minute'], ['gold_track_summary'], ['gold_zone_dwell'], ['gold_zone_heatmap'], ['silver_detections_v2'], ['silver_detections_v3']]


#### Đọc dữ liệu Bronze bằng Trino

In [73]:
cursor.execute("SELECT * FROM bronze_raw LIMIT 10")
rows = cursor.fetchall()
print(rows)


TrinoExternalError: TrinoExternalError(type=EXTERNAL, name=ICEBERG_CATALOG_ERROR, message="Failed to load table: rva.bronze_raw", query_id=20251124_120923_00078_7v8pb)

#### Kiểm tra cấu trúc bảng (Schema)

In [None]:
cursor.execute("DESCRIBE bronze_raw")
columns = cursor.fetchall()
print("Cấu trúc bảng bronze_raw:")
pd.DataFrame(columns, columns=['Column', 'Type', 'Extra', 'Comment'])

Cấu trúc bảng bronze_raw:


Unnamed: 0,Column,Type,Extra,Comment
0,schema_version,varchar,,
1,pipeline_run_id,varchar,,
2,frame_index,bigint,,
3,payload,varchar,,
4,camera_id,varchar,,
5,store_id,varchar,,
6,ingest_ts,timestamp(6),,


#### Số lượng record trong Bronze

In [None]:

cursor.execute("SELECT COUNT(*) FROM bronze_raw")
rows = cursor.fetchall()
print(rows)


[[464]]


#### Xem schema bảng bronze_raw

In [None]:
cursor.execute("DESCRIBE bronze_raw")
schema_rows = cursor.fetchall()
for r in schema_rows:
    print(r)


['schema_version', 'varchar', '', '']
['pipeline_run_id', 'varchar', '', '']
['frame_index', 'bigint', '', '']
['payload', 'varchar', '', '']
['camera_id', 'varchar', '', '']
['store_id', 'varchar', '', '']
['ingest_ts', 'timestamp(6)', '', '']


1) schema_version (varchar)

Phiên bản schema của metadata JSON từ module vision.
Giúp bạn biết lúc nào format JSON thay đổi.
Ví dụ: "v1".

2) pipeline_run_id (varchar)

ID duy nhất cho mỗi lần chạy vision pipeline.
Một video chạy lại → một pipeline_run_id khác.
Dùng để debug hoặc trace theo từng lần chạy.

3) frame_index (bigint)

Số thứ tự frame trong video.
Ví dụ: 1, 2, 3, 4…
Quan trọng cho tracking thời gian thực.

4) payload (varchar)

JSON thô từ vision.
Đây là phần to nhất: detections, bbox, timestamp, centroid…
Silver sẽ parse từ đây.

5) camera_id (varchar)

ID camera gửi dữ liệu (ví dụ: "cam_01").
Bạn dùng để partition/cluster data theo camera.

6) store_id (varchar)

ID cửa hàng (ví dụ "store_01").
Giúp truy vấn BI theo từng cửa hàng.

7) ingest_ts (timestamp(6))

Thời điểm Flink ghi record vào Iceberg.
Không phải capture_ts từ JSON → mà là timestamp ingestion.
Dùng để kiểm tra trễ, latency pipeline.

---

In [None]:
cursor.execute("""
SELECT 
    json_extract(payload, '$.capture_ts') AS ts,
    json_extract(payload, '$.detections') AS dets
FROM bronze_raw
LIMIT 3
""")
print(cursor.fetchall())


[['"2025-11-24T12:00:25.321241+00:00"', '[{"det_id":"1-0","class":"person","class_id":0,"conf":0.8935845494270325,"bbox":{"x1":1001.053955078125,"y1":255.43377685546875,"x2":1156.245361328125,"y2":677.5283203125},"bbox_norm":{"x":0.5213822682698568,"y":0.23651275634765626,"w":0.080828857421875,"h":0.3908282809787326},"centroid":{"x":1078,"y":466},"centroid_norm":{"x":0.5617966969807943,"y":0.43192689683702257},"track_id":1},{"det_id":"1-1","class":"person","class_id":0,"conf":0.6988767981529236,"bbox":{"x1":1216.485595703125,"y1":294.24261474609375,"x2":1359.431396484375,"y2":746.9419555664062},"bbox_norm":{"x":0.6335862477620443,"y":0.27244686550564234,"w":0.07445093790690104,"h":0.4191660563151042},"centroid":{"x":1287,"y":520},"centroid_norm":{"x":0.6708117167154948,"y":0.48202989366319443},"track_id":2},{"det_id":"1-2","class":"person","class_id":0,"conf":0.6859533786773682,"bbox":{"x1":680.6466064453125,"y1":53.300048828125,"x2":745.2310180664062,"y2":210.07730102539062},"bbox_nor

### Lớp silver

#### Lấy capture_ts + số người / frame

In [None]:
cursor.execute("""
SELECT
    json_extract_scalar(payload, '$.frame_index') AS frame_index,
    json_extract_scalar(payload, '$.capture_ts') AS capture_ts_str,
    json_array_length(json_extract(payload, '$.detections')) AS num_person,
    camera_id,
    store_id,
    ingest_ts
FROM bronze_raw
LIMIT 5
""")
rows = cursor.fetchall()
for r in rows:
    print(r)


['2', '2025-11-24T12:00:25.517747+00:00', 4, 'cam_01', 'store_01', datetime.datetime(2025, 11, 24, 11, 58, 38, 911000)]
['3', '2025-11-24T12:00:25.569256+00:00', 5, 'cam_01', 'store_01', datetime.datetime(2025, 11, 24, 11, 58, 38, 914000)]
['1', '2025-11-24T12:00:25.321241+00:00', 4, 'cam_01', 'store_01', datetime.datetime(2025, 11, 24, 11, 58, 38, 164000)]
['4', '2025-11-24T12:00:25.614361+00:00', 5, 'cam_01', 'store_01', datetime.datetime(2025, 11, 24, 11, 58, 38, 917000)]
['5', '2025-11-24T12:00:25.671653+00:00', 5, 'cam_01', 'store_01', datetime.datetime(2025, 11, 24, 11, 58, 38, 919000)]


#### Bung mảng detections thành từng dòng (UNNEST)

In [None]:
cursor.execute("""
SELECT
    CAST(json_extract_scalar(payload, '$.frame_index') AS bigint) AS frame_index,
    from_iso8601_timestamp(json_extract_scalar(payload, '$.capture_ts')) AS capture_ts,
    camera_id,
    store_id,
    json_extract_scalar(det, '$.det_id') AS det_id,
    json_extract_scalar(det, '$.class') AS class,
    CAST(json_extract_scalar(det, '$.class_id') AS integer) AS class_id,
    CAST(json_extract_scalar(det, '$.conf') AS double) AS conf,
    CAST(json_extract_scalar(det, '$.bbox.x1') AS double) AS bbox_x1,
    CAST(json_extract_scalar(det, '$.bbox.y1') AS double) AS bbox_y1,
    CAST(json_extract_scalar(det, '$.bbox.x2') AS double) AS bbox_x2,
    CAST(json_extract_scalar(det, '$.bbox.y2') AS double) AS bbox_y2,
    CAST(json_extract_scalar(det, '$.centroid.x') AS double) AS centroid_x,
    CAST(json_extract_scalar(det, '$.centroid.y') AS double) AS centroid_y,
    CAST(json_extract_scalar(det, '$.track_id') AS bigint) AS track_id,
    ingest_ts
FROM bronze_raw
CROSS JOIN UNNEST(
    CAST(json_extract(payload, '$.detections') AS array(json))
) AS t(det)
LIMIT 10
""")
rows = cursor.fetchall()
for r in rows:
    print(r)


[1, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000, tzinfo=zoneinfo.ZoneInfo(key='UTC')), 'cam_01', 'store_01', '1-0', 'person', 0, 0.8935845494270325, 1001.053955078125, 255.43377685546875, 1156.245361328125, 677.5283203125, 1078.0, 466.0, 1, datetime.datetime(2025, 11, 24, 11, 58, 38, 164000)]
[1, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000, tzinfo=zoneinfo.ZoneInfo(key='UTC')), 'cam_01', 'store_01', '1-1', 'person', 0, 0.6988767981529236, 1216.485595703125, 294.24261474609375, 1359.431396484375, 746.9419555664062, 1287.0, 520.0, 2, datetime.datetime(2025, 11, 24, 11, 58, 38, 164000)]
[1, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000, tzinfo=zoneinfo.ZoneInfo(key='UTC')), 'cam_01', 'store_01', '1-2', 'person', 0, 0.6859533786773682, 680.6466064453125, 53.300048828125, 745.2310180664062, 210.07730102539062, 712.0, 131.0, 3, datetime.datetime(2025, 11, 24, 11, 58, 38, 164000)]
[1, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000, tzinfo=zoneinfo.ZoneInfo(key='UTC')), 'cam

#### Tạo bảng Silver trong Iceberg (Trino)

In [None]:
cursor.execute("""
CREATE TABLE silver_detections_v2 (
    frame_index bigint,
    capture_ts timestamp(6),
    camera_id varchar,
    store_id varchar,
    det_id varchar,
    class varchar,
    class_id integer,
    conf double,
    bbox_x1 double,
    bbox_y1 double,
    bbox_x2 double,
    bbox_y2 double,
    centroid_x double,
    centroid_y double,
    track_id bigint,
    ingest_ts timestamp(6)
)
WITH (
    format = 'PARQUET'
)
""")
print("created silver_detections_v2")


created silver_detections_v3


#### Insert dữ liệu từ Bronze → Silver

In [None]:
cursor.execute("""
INSERT INTO silver_detections_v2
SELECT
    CAST(json_extract_scalar(payload, '$.frame_index') AS bigint) AS frame_index,
    CAST(from_iso8601_timestamp(json_extract_scalar(payload, '$.capture_ts')) AS timestamp(6)) AS capture_ts,
    camera_id,
    store_id,
    json_extract_scalar(det, '$.det_id') AS det_id,
    json_extract_scalar(det, '$.class') AS class,
    CAST(json_extract_scalar(det, '$.class_id') AS integer) AS class_id,
    CAST(json_extract_scalar(det, '$.conf') AS double) AS conf,
    CAST(json_extract_scalar(det, '$.bbox.x1') AS double) AS bbox_x1,
    CAST(json_extract_scalar(det, '$.bbox.y1') AS double) AS bbox_y1,
    CAST(json_extract_scalar(det, '$.bbox.x2') AS double) AS bbox_x2,
    CAST(json_extract_scalar(det, '$.bbox.y2') AS double) AS bbox_y2,
    CAST(json_extract_scalar(det, '$.centroid.x') AS double) AS centroid_x,
    CAST(json_extract_scalar(det, '$.centroid.y') AS double) AS centroid_y,
    CAST(json_extract_scalar(det, '$.track_id') AS bigint) AS track_id,
    CAST(ingest_ts AS timestamp(6)) AS ingest_ts
FROM bronze_raw
CROSS JOIN UNNEST(
    CAST(json_extract(payload, '$.detections') AS array(json))
) AS t(det)
""")
print("inserted into silver_detections_v2")


inserted into silver_detections_v3


#### Kiểm tra lại

In [None]:
cursor.execute("SELECT COUNT(*) FROM silver_detections_v2")
print(cursor.fetchall())

cursor.execute("""
SELECT * FROM silver_detections_v2
ORDER BY capture_ts
LIMIT 10
""")
print(cursor.fetchall())


[[3562]]
[['1a70d903a6e147a187bd28d0cd31f54c', 1, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), 'cam_01', 'store_01', '1-2', 'person', 0, 0.6859533786773682, 680.6466064453125, 53.300048828125, 745.2310180664062, 210.07730102539062, 64.58441162109375, 156.77725219726562, 712.0, 131.0, 3, datetime.datetime(2025, 11, 24, 11, 58, 38, 164000)], ['1a70d903a6e147a187bd28d0cd31f54c', 1, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), 'cam_01', 'store_01', '1-0', 'person', 0, 0.8935845494270325, 1001.053955078125, 255.43377685546875, 1156.245361328125, 677.5283203125, 155.19140625, 422.09454345703125, 1078.0, 466.0, 1, datetime.datetime(2025, 11, 24, 11, 58, 38, 164000)], ['1a70d903a6e147a187bd28d0cd31f54c', 1, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), 'cam_01', 'store_01', '1-1', 'person', 0, 0.6988767981529236, 1216.485595703125, 294.24261474609375, 1359.431396484375, 746.9419555664062, 142.94580078125, 452.6993408203125, 1287.0, 520.0, 2, datetime.datetime(2025, 11, 24, 

#### Khám phá dữ liệu (silver)

##### 1.1 – Tổng số detection, frame, track

In [None]:
cursor.execute("""
SELECT
    COUNT(*) AS total_detections,
    COUNT(DISTINCT frame_index) AS total_frames,
    COUNT(DISTINCT track_id) AS total_tracks
FROM silver_detections_v2
""")
print(cursor.fetchall())


[[3562, 464, 34]]


##### 1.2 – Kiểm tra missing frame (frame bị nhảy)

In [None]:
cursor.execute("""
WITH f AS (
    SELECT DISTINCT frame_index
    FROM silver_detections_v2
)
SELECT
    MIN(frame_index) AS min_frame,
    MAX(frame_index) AS max_frame,
    COUNT(*) AS frame_count,
    (MAX(frame_index) - MIN(frame_index) + 1) AS expected_frames,
    (MAX(frame_index) - MIN(frame_index) + 1) - COUNT(*) AS missing_frames
FROM f
""")
print(cursor.fetchall())


[[1, 464, 464, 464, 0]]



#### 1.3 – Kiểm tra số người mỗi frame (để xem có outlier)

In [None]:
cursor.execute("""
SELECT
    frame_index,
    COUNT(*) AS num_person
FROM silver_detections_v2
GROUP BY frame_index
ORDER BY frame_index
LIMIT 20
""")
print(cursor.fetchall())


[[1, 4], [2, 4], [3, 5], [4, 5], [5, 5], [6, 5], [7, 5], [8, 6], [9, 6], [10, 6], [11, 6], [12, 6], [13, 5], [14, 6], [15, 6], [16, 6], [17, 6], [18, 7], [19, 7], [20, 5]]


#### 1.4 – Kiểm tra phân bố confidence (độ tự tin của YOLO)

In [None]:
cursor.execute("""
SELECT
    approx_percentile(conf, 0.1) AS p10,
    approx_percentile(conf, 0.5) AS p50,
    approx_percentile(conf, 0.9) AS p90
FROM silver_detections_v2
""")
print(cursor.fetchall())


[[0.41667386115430954, 0.7273711076560747, 0.9092994358552537]]


##### 1.5 – Kiểm tra timestamp đều hay không

In [None]:
cursor.execute("""
WITH t AS (
    SELECT
        capture_ts,
        LAG(capture_ts) OVER (ORDER BY capture_ts) AS prev_ts,
        ROW_NUMBER() OVER (ORDER BY capture_ts) AS rn
    FROM silver_detections_v2
)
SELECT
    capture_ts,
    prev_ts,
    (capture_ts - prev_ts) AS gap
FROM t
WHERE rn <= 20
""")
print(cursor.fetchall())


[[datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.timedelta(0)], [datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.timedelta(0)], [datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), None, None], [datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.timedelta(0)], [datetime.datetime(2025, 11, 24, 12, 0, 25, 517000), datetime.datetime(2025, 11, 24, 12, 0, 25, 517000), datetime.timedelta(0)], [datetime.datetime(2025, 11, 24, 12, 0, 25, 517000), datetime.datetime(2025, 11, 24, 12, 0, 25, 517000), datetime.timedelta(0)], [datetime.datetime(2025, 11, 24, 12, 0, 25, 517000), datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.timedelta(microseconds=196000)], [datetime.datetime(2025, 11, 24, 12, 0, 25, 517000), datetime.datetime(2025, 11, 24, 12, 0, 25, 517000), datetime.timedelta(0)], [datetime.da

#### Bước 2 – Phân tích track_id (hành vi từng người)

##### 2.1. Thống kê mỗi track_id

In [None]:
cursor.execute("""
SELECT
    track_id,
    COUNT(*) AS frames_visible,
    MIN(capture_ts) AS start_time,
    MAX(capture_ts) AS end_time,
    MAX(capture_ts) - MIN(capture_ts) AS duration
FROM silver_detections_v2
GROUP BY track_id
ORDER BY track_id
""")
print(cursor.fetchall())


[[1, 70, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 28, 408000), datetime.timedelta(seconds=3, microseconds=87000)], [2, 414, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 44, 755000), datetime.timedelta(seconds=19, microseconds=434000)], [3, 455, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 44, 755000), datetime.timedelta(seconds=19, microseconds=434000)], [4, 98, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 29, 544000), datetime.timedelta(seconds=4, microseconds=223000)], [5, 442, datetime.datetime(2025, 11, 24, 12, 0, 25, 569000), datetime.datetime(2025, 11, 24, 12, 0, 43, 950000), datetime.timedelta(seconds=18, microseconds=381000)], [6, 22, datetime.datetime(2025, 11, 24, 12, 0, 25, 802000), datetime.datetime(2025, 11, 24, 12, 0, 26, 908000), datetime.timedelta(seconds=1, microseconds=106000)], [7, 

##### 2.2. Vị trí trung bình mỗi người (để sau này dựa vào làm heatmap / zone)

In [None]:
cursor.execute("""
SELECT
    track_id,
    AVG(centroid_x) AS avg_x,
    AVG(centroid_y) AS avg_y
FROM silver_detections_v2
GROUP BY track_id
ORDER BY track_id
""")
print(cursor.fetchall())


[[1, 1250.9285714285713, 716.4857142857143], [2, 1283.6183574879226, 448.65458937198065], [3, 719.1736263736263, 128.73626373626374], [4, 1087.908163265306, 83.91836734693878], [5, 1195.2262443438915, 558.8122171945702], [6, 1358.909090909091, 477.22727272727275], [7, 1241.95, 358.60714285714283], [8, 335.10769230769233, 399.0923076923077], [9, 211.3170731707317, 372.219512195122], [10, 719.5454545454545, 63.90909090909091], [12, 1130.4313725490197, 105.50980392156863], [13, 785.5760869565217, 63.92391304347826], [14, 1000.5535714285714, 90.91071428571429], [15, 58.45161290322581, 455.48387096774195], [17, 1181.6666666666667, 55.69047619047619], [18, 1290.75, 280.5], [19, 1153.436018957346, 770.5545023696683], [21, 556.3698630136986, 606.2465753424658], [22, 1202.4, 298.8702702702703], [23, 136.672131147541, 597.8688524590164], [24, 310.1333333333333, 207.4], [26, 1214.8724279835392, 115.69135802469135], [27, 60.111111111111114, 408.44444444444446], [28, 1128.576923076923, 85.692307692

##### 2.3. Di chuyển (độ lệch x, y) mỗi track

In [None]:
cursor.execute("""
SELECT
    track_id,
    MIN(centroid_x) AS min_x,
    MAX(centroid_x) AS max_x,
    MAX(centroid_x) - MIN(centroid_x) AS delta_x,
    MIN(centroid_y) AS min_y,
    MAX(centroid_y) AS max_y,
    MAX(centroid_y) - MIN(centroid_y) AS delta_y
FROM silver_detections_v2
GROUP BY track_id
ORDER BY track_id
""")
print(cursor.fetchall())


[[1, 1078.0, 1479.0, 401.0, 466.0, 1021.0, 555.0], [2, 1221.0, 1350.0, 129.0, 310.0, 532.0, 222.0], [3, 709.0, 738.0, 29.0, 99.0, 138.0, 39.0], [4, 991.0, 1213.0, 222.0, 77.0, 89.0, 12.0], [5, 1040.0, 1834.0, 794.0, 284.0, 995.0, 711.0], [6, 1355.0, 1362.0, 7.0, 407.0, 488.0, 81.0], [7, 1201.0, 1380.0, 179.0, 276.0, 403.0, 127.0], [8, 190.0, 449.0, 259.0, 283.0, 429.0, 146.0], [9, 97.0, 311.0, 214.0, 242.0, 478.0, 236.0], [10, 705.0, 735.0, 30.0, 59.0, 68.0, 9.0], [12, 1025.0, 1264.0, 239.0, 87.0, 128.0, 41.0], [13, 783.0, 791.0, 8.0, 57.0, 88.0, 31.0], [14, 962.0, 1065.0, 103.0, 44.0, 133.0, 89.0], [15, 11.0, 172.0, 161.0, 327.0, 537.0, 210.0], [17, 1162.0, 1200.0, 38.0, 40.0, 71.0, 31.0], [18, 1286.0, 1295.0, 9.0, 280.0, 281.0, 1.0], [19, 1027.0, 1454.0, 427.0, 437.0, 1003.0, 566.0], [21, 552.0, 560.0, 8.0, 598.0, 611.0, 13.0], [22, 1188.0, 1245.0, 57.0, 239.0, 371.0, 132.0], [23, 54.0, 275.0, 221.0, 385.0, 925.0, 540.0], [24, 295.0, 331.0, 36.0, 189.0, 220.0, 31.0], [26, 1204.0, 124

#### BƯỚC 3 – TẠO HEATMAP

##### 3.1 – SQL tạo heatmap grid 10×10

In [None]:
cursor.execute("""
WITH grid AS (
    SELECT
        floor(centroid_x / (1280 / 10)) AS gx,
        floor(centroid_y / (720 / 10)) AS gy
    FROM silver_detections_v2
)
SELECT
    gx, gy,
    COUNT(*) AS hits
FROM grid
GROUP BY gx, gy
ORDER BY gy, gx
""")
rows = cursor.fetchall()
for r in rows:
    print(r)


[5.0, 0.0, 33]
[6.0, 0.0, 82]
[7.0, 0.0, 141]
[9.0, 0.0, 84]
[5.0, 1.0, 460]
[6.0, 1.0, 12]
[7.0, 1.0, 218]
[8.0, 1.0, 159]
[9.0, 1.0, 297]
[2.0, 2.0, 9]
[7.0, 2.0, 5]
[8.0, 2.0, 1]
[1.0, 3.0, 10]
[2.0, 3.0, 17]
[7.0, 3.0, 3]
[8.0, 3.0, 60]
[9.0, 3.0, 146]
[10.0, 3.0, 4]
[1.0, 4.0, 31]
[2.0, 4.0, 6]
[8.0, 4.0, 45]
[9.0, 4.0, 161]
[10.0, 4.0, 21]
[0.0, 5.0, 23]
[1.0, 5.0, 59]
[2.0, 5.0, 20]
[3.0, 5.0, 70]
[8.0, 5.0, 39]
[9.0, 5.0, 143]
[10.0, 5.0, 23]
[0.0, 6.0, 16]
[1.0, 6.0, 24]
[8.0, 6.0, 180]
[9.0, 6.0, 128]
[10.0, 6.0, 189]
[0.0, 7.0, 15]
[1.0, 7.0, 2]
[8.0, 7.0, 13]
[9.0, 7.0, 48]
[10.0, 7.0, 70]
[0.0, 8.0, 5]
[4.0, 8.0, 73]
[8.0, 8.0, 17]
[9.0, 8.0, 16]
[10.0, 8.0, 31]
[8.0, 9.0, 9]
[9.0, 9.0, 37]
[10.0, 9.0, 7]
[0.0, 10.0, 5]
[9.0, 10.0, 29]
[10.0, 10.0, 16]
[0.0, 11.0, 4]
[8.0, 11.0, 102]
[9.0, 11.0, 6]
[10.0, 11.0, 23]
[11.0, 11.0, 3]
[0.0, 12.0, 20]
[8.0, 12.0, 6]
[9.0, 12.0, 9]
[10.0, 12.0, 7]
[11.0, 12.0, 14]
[12.0, 12.0, 11]
[13.0, 12.0, 3]
[0.0, 13.0, 6]
[9.0, 13.0, 2]
[1

#### BƯỚC 4 – PHÂN TÍCH DWELL TIME (người đứng bao lâu)

In [None]:
cursor.execute("""
WITH grid AS (
    SELECT
        track_id,
        floor(centroid_x / (1280 / 10)) AS gx,
        floor(centroid_y / (720 / 10)) AS gy,
        capture_ts
    FROM silver_detections_v2
),
dwell AS (
    SELECT
        track_id,
        gx,
        gy,
        MIN(capture_ts) AS start_time,
        MAX(capture_ts) AS end_time,
        MAX(capture_ts) - MIN(capture_ts) AS duration
    FROM grid
    GROUP BY track_id, gx, gy
)
SELECT *
FROM dwell
ORDER BY duration DESC
LIMIT 20
""")
print(cursor.fetchall())


[[3, 5.0, 1.0, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 44, 755000), datetime.timedelta(seconds=19, microseconds=434000)], [2, 10.0, 5.0, datetime.datetime(2025, 11, 24, 12, 0, 26, 908000), datetime.datetime(2025, 11, 24, 12, 0, 44, 755000), datetime.timedelta(seconds=17, microseconds=847000)], [2, 10.0, 6.0, datetime.datetime(2025, 11, 24, 12, 0, 26, 833000), datetime.datetime(2025, 11, 24, 12, 0, 44, 116000), datetime.timedelta(seconds=17, microseconds=283000)], [7, 10.0, 4.0, datetime.datetime(2025, 11, 24, 12, 0, 26, 92000), datetime.datetime(2025, 11, 24, 12, 0, 36, 440000), datetime.timedelta(seconds=10, microseconds=348000)], [14, 7.0, 1.0, datetime.datetime(2025, 11, 24, 12, 0, 28, 491000), datetime.datetime(2025, 11, 24, 12, 0, 38, 834000), datetime.timedelta(seconds=10, microseconds=343000)], [26, 9.0, 1.0, datetime.datetime(2025, 11, 24, 12, 0, 34, 575000), datetime.datetime(2025, 11, 24, 12, 0, 44, 755000), datetime.timedelt

#### BƯỚC 5 – PATTERN THEO THỜI GIAN (Temporal Analytics)

Trong bước này ta phân tích:

số người mỗi giây

số người mỗi frame

peak time

flow direction (trend di chuyển theo thời gian)

timeline của từng track

##### 5.1 – Số người theo giây

In [None]:
cursor.execute("""
SELECT
    date_trunc('second', capture_ts) AS sec,
    COUNT(*) AS detections,
    COUNT(DISTINCT track_id) AS unique_people
FROM silver_detections_v2
GROUP BY 1
ORDER BY 1
""")
print(cursor.fetchall())


[[datetime.datetime(2025, 11, 24, 12, 0, 25), 63, 6], [datetime.datetime(2025, 11, 24, 12, 0, 26), 134, 8], [datetime.datetime(2025, 11, 24, 12, 0, 27), 162, 8], [datetime.datetime(2025, 11, 24, 12, 0, 28), 236, 12], [datetime.datetime(2025, 11, 24, 12, 0, 29), 230, 12], [datetime.datetime(2025, 11, 24, 12, 0, 30), 203, 12], [datetime.datetime(2025, 11, 24, 12, 0, 31), 223, 10], [datetime.datetime(2025, 11, 24, 12, 0, 32), 224, 9], [datetime.datetime(2025, 11, 24, 12, 0, 33), 175, 11], [datetime.datetime(2025, 11, 24, 12, 0, 34), 209, 12], [datetime.datetime(2025, 11, 24, 12, 0, 35), 216, 12], [datetime.datetime(2025, 11, 24, 12, 0, 36), 204, 13], [datetime.datetime(2025, 11, 24, 12, 0, 37), 158, 9], [datetime.datetime(2025, 11, 24, 12, 0, 38), 181, 8], [datetime.datetime(2025, 11, 24, 12, 0, 39), 162, 9], [datetime.datetime(2025, 11, 24, 12, 0, 40), 177, 8], [datetime.datetime(2025, 11, 24, 12, 0, 41), 184, 10], [datetime.datetime(2025, 11, 24, 12, 0, 42), 146, 8], [datetime.datetime(

##### 5.2 – Hướng di chuyển theo thời gian (x movement)

In [None]:
cursor.execute("""
SELECT
    track_id,
    MIN(centroid_x) AS start_x,
    MAX(centroid_x) AS end_x,
    MAX(centroid_x) - MIN(centroid_x) AS movement_x
FROM silver_detections_v2
GROUP BY track_id
ORDER BY track_id
""")
print(cursor.fetchall())


[[1, 1078.0, 1479.0, 401.0], [2, 1221.0, 1350.0, 129.0], [3, 709.0, 738.0, 29.0], [4, 991.0, 1213.0, 222.0], [5, 1040.0, 1834.0, 794.0], [6, 1355.0, 1362.0, 7.0], [7, 1201.0, 1380.0, 179.0], [8, 190.0, 449.0, 259.0], [9, 97.0, 311.0, 214.0], [10, 705.0, 735.0, 30.0], [12, 1025.0, 1264.0, 239.0], [13, 783.0, 791.0, 8.0], [14, 962.0, 1065.0, 103.0], [15, 11.0, 172.0, 161.0], [17, 1162.0, 1200.0, 38.0], [18, 1286.0, 1295.0, 9.0], [19, 1027.0, 1454.0, 427.0], [21, 552.0, 560.0, 8.0], [22, 1188.0, 1245.0, 57.0], [23, 54.0, 275.0, 221.0], [24, 295.0, 331.0, 36.0], [26, 1204.0, 1249.0, 45.0], [27, 47.0, 74.0, 27.0], [28, 1091.0, 1169.0, 78.0], [30, 137.0, 253.0, 116.0], [32, 326.0, 341.0, 15.0], [33, 49.0, 65.0, 16.0], [34, 1023.0, 1202.0, 179.0], [35, 1203.0, 1210.0, 7.0], [36, 98.0, 188.0, 90.0], [37, 202.0, 281.0, 79.0], [38, 1210.0, 1271.0, 61.0], [39, 749.0, 775.0, 26.0], [40, 981.0, 983.0, 2.0]]


##### 5.3 – Timeline của từng track

In [None]:
cursor.execute("""
SELECT
    track_id,
    MIN(capture_ts),
    MAX(capture_ts),
    MAX(capture_ts) - MIN(capture_ts) AS duration
FROM silver_detections_v2
GROUP BY track_id
ORDER BY track_id
""")
print(cursor.fetchall())


[[1, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 28, 408000), datetime.timedelta(seconds=3, microseconds=87000)], [2, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 44, 755000), datetime.timedelta(seconds=19, microseconds=434000)], [3, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 44, 755000), datetime.timedelta(seconds=19, microseconds=434000)], [4, datetime.datetime(2025, 11, 24, 12, 0, 25, 321000), datetime.datetime(2025, 11, 24, 12, 0, 29, 544000), datetime.timedelta(seconds=4, microseconds=223000)], [5, datetime.datetime(2025, 11, 24, 12, 0, 25, 569000), datetime.datetime(2025, 11, 24, 12, 0, 43, 950000), datetime.timedelta(seconds=18, microseconds=381000)], [6, datetime.datetime(2025, 11, 24, 12, 0, 25, 802000), datetime.datetime(2025, 11, 24, 12, 0, 26, 908000), datetime.timedelta(seconds=1, microseconds=106000)], [7, datetime.datetime(2025, 11,

### Lớp Gold

#### Bảng Gold 1: gold_people_per_minute
Ý nghĩa:
Mỗi dòng = 1 phút / 1 camera / 1 store:

có bao nhiêu detection (số bounding box)

có bao nhiêu người unique (track_id)

In [None]:
cursor.execute("""
CREATE TABLE IF NOT EXISTS gold_people_per_minute (
    store_id varchar,
    camera_id varchar,
    ts_minute timestamp(6),
    detections bigint,
    unique_people bigint
)
WITH (format = 'PARQUET')
""")
print("Created gold_people_per_minute")


Created gold_people_per_minute


In [None]:
cursor.execute("""
INSERT INTO gold_people_per_minute
SELECT
    store_id,
    camera_id,
    date_trunc('minute', capture_ts) AS ts_minute,
    COUNT(*) AS detections,
    COUNT(DISTINCT track_id) AS unique_people
FROM silver_detections_v2
GROUP BY store_id, camera_id, date_trunc('minute', capture_ts)
""")
print("Inserted gold_people_per_minute")


Inserted gold_people_per_minute


#### Bảng Gold 2: gold_zone_heatmap
Ý nghĩa:
Heatmap tổng hợp: mỗi dòng = 1 zone (ô lưới) trong khung hình + số lần xuất hiện:

dùng cho heatmap overlay

cluster xem khu vực nào đông khách

Ta tiếp tục dùng lưới 10×10 (như lúc nãy):

zone_x = 0..9 (trái → phải)

zone_y = 0..9 (trên → dưới)

In [None]:
cursor.execute("""
CREATE TABLE IF NOT EXISTS gold_zone_heatmap (
    store_id varchar,
    camera_id varchar,
    zone_x integer,
    zone_y integer,
    hits bigint,
    unique_tracks bigint
)
WITH (format = 'PARQUET')
""")
print("Created gold_zone_heatmap")


Created gold_zone_heatmap


In [None]:
cursor.execute("""
INSERT INTO gold_zone_heatmap
SELECT
    store_id,
    camera_id,
    CAST(floor(centroid_x / (1280 / 10)) AS integer) AS zone_x,
    CAST(floor(centroid_y / (720 / 10)) AS integer) AS zone_y,
    COUNT(*) AS hits,
    COUNT(DISTINCT track_id) AS unique_tracks
FROM silver_detections_v2
GROUP BY
    store_id,
    camera_id,
    CAST(floor(centroid_x / (1280 / 10)) AS integer),
    CAST(floor(centroid_y / (720 / 10)) AS integer)
""")
print("Inserted gold_zone_heatmap")


Inserted gold_zone_heatmap


#### Bảng Gold 3: gold_zone_dwell

Ý nghĩa:
Đây là bảng rất “xịn”:

mỗi dòng = 1 store + camera + zone

chứa thông tin thời gian khách đứng lại (dwell)

Từ Silver mình đã tính dwell theo track_id + zone_x + zone_y.
Giờ Gold sẽ tổng hợp lại:

tổng thời gian đứng (sum_dwell_seconds)

dwell trung bình (avg_dwell_seconds)

số lượt (số track) vào zone (visits)

In [None]:
cursor.execute("""
CREATE TABLE IF NOT EXISTS gold_zone_dwell (
    store_id varchar,
    camera_id varchar,
    zone_x integer,
    zone_y integer,
    visits bigint,
    total_dwell_seconds double,
    avg_dwell_seconds double
)
WITH (format = 'PARQUET')
""")
print("Created gold_zone_dwell")


Created gold_zone_dwell


In [None]:
cursor.execute("""
INSERT INTO gold_zone_dwell
WITH per_track_zone AS (
    SELECT
        store_id,
        camera_id,
        track_id,
        CAST(floor(centroid_x / (1280 / 10)) AS integer) AS zone_x,
        CAST(floor(centroid_y / (720 / 10)) AS integer) AS zone_y,
        MIN(capture_ts) AS start_time,
        MAX(capture_ts) AS end_time,
        date_diff('millisecond', MIN(capture_ts), MAX(capture_ts)) / 1000.0 AS dwell_seconds
    FROM silver_detections_v2
    GROUP BY
        store_id,
        camera_id,
        track_id,
        CAST(floor(centroid_x / (1280 / 10)) AS integer),
        CAST(floor(centroid_y / (720 / 10)) AS integer)
)
SELECT
    store_id,
    camera_id,
    zone_x,
    zone_y,
    COUNT(*) AS visits,
    SUM(dwell_seconds) AS total_dwell_seconds,
    AVG(dwell_seconds) AS avg_dwell_seconds
FROM per_track_zone
GROUP BY store_id, camera_id, zone_x, zone_y
""")
print("Inserted gold_zone_dwell")


Inserted gold_zone_dwell


#### Thiết kế bảng gold_track_summary
Schema đề xuất:

store_id – cửa hàng

camera_id – camera

track_id – id của người/track

frames – số frame xuất hiện

start_time, end_time – thời gian bắt đầu/kết thúc

duration_seconds – thời gian tồn tại của track

min_x, max_x, delta_x – biên độ di chuyển theo chiều ngang

min_y, max_y, delta_y – biên độ di chuyển theo chiều dọc

avg_x, avg_y – vị trí trung bình (điểm trung tâm “quen thuộc”)

avg_conf – confidence trung bình của YOLO cho track đó

In [None]:
cursor.execute("""
CREATE TABLE IF NOT EXISTS gold_track_summary (
    store_id varchar,
    camera_id varchar,
    track_id bigint,
    frames bigint,
    start_time timestamp(6),
    end_time timestamp(6),
    duration_seconds double,
    min_x double,
    max_x double,
    delta_x double,
    min_y double,
    max_y double,
    delta_y double,
    avg_x double,
    avg_y double,
    avg_conf double
)
WITH (format = 'PARQUET')
""")
print("Created gold_track_summary")


Created gold_track_summary


In [None]:
cursor.execute("""
INSERT INTO gold_track_summary
SELECT
    store_id,
    camera_id,
    CAST(track_id AS bigint) AS track_id,
    COUNT(*) AS frames,
    MIN(capture_ts) AS start_time,
    MAX(capture_ts) AS end_time,
    date_diff('millisecond', MIN(capture_ts), MAX(capture_ts)) / 1000.0 AS duration_seconds,
    MIN(centroid_x) AS min_x,
    MAX(centroid_x) AS max_x,
    MAX(centroid_x) - MIN(centroid_x) AS delta_x,
    MIN(centroid_y) AS min_y,
    MAX(centroid_y) AS max_y,
    MAX(centroid_y) - MIN(centroid_y) AS delta_y,
    AVG(centroid_x) AS avg_x,
    AVG(centroid_y) AS avg_y,
    AVG(conf) AS avg_conf
FROM silver_detections_v2
GROUP BY
    store_id,
    camera_id,
    CAST(track_id AS bigint)
""")
print("Inserted gold_track_summary")


Inserted gold_track_summary


##### query kiểm tra nhanh 

In [None]:
# Top track đứng lâu nhất
cursor.execute("""
SELECT track_id, duration_seconds
FROM gold_track_summary
ORDER BY duration_seconds DESC
LIMIT 10
""")
print(cursor.fetchall())

# Track di chuyển xa nhất theo trục X
cursor.execute("""
SELECT track_id, delta_x, delta_y
FROM gold_track_summary
ORDER BY delta_x DESC
LIMIT 10
""")
print(cursor.fetchall())


[[3, 19.4], [2, 19.4], [5, 18.4], [14, 16.3], [7, 11.4], [22, 11.2], [26, 10.2], [19, 8.7], [8, 6.3], [34, 5.4]]
[[5, 794.0, 711.0], [19, 427.0, 566.0], [1, 401.0, 555.0], [8, 259.0, 146.0], [12, 239.0, 41.0], [4, 222.0, 12.0], [23, 221.0, 540.0], [9, 214.0, 236.0], [7, 179.0, 127.0], [34, 179.0, 261.0]]
[[5, 794.0, 711.0], [19, 427.0, 566.0], [1, 401.0, 555.0], [8, 259.0, 146.0], [12, 239.0, 41.0], [4, 222.0, 12.0], [23, 221.0, 540.0], [9, 214.0, 236.0], [7, 179.0, 127.0], [34, 179.0, 261.0]]
