In [2]:
%pip install duckdb numpy pandas

Collecting numpy
  Downloading numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m388.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pandas
  Downloading pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m653.6 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0mm
Downloading numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m169.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hDownloading pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)
[2K   [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/12.8 MB[0m [31m102.4 kB/s[0m eta [36m0:01:41[0m^C
[

In [3]:
import os

import duckdb
import numpy as np
import pandas as pd


# -----------------------------------------
# MinIO / S3 configuration
# -----------------------------------------
# Credentials and the endpoint can be overridden through environment variables
# so real secrets never have to be written into the notebook. The fallbacks are
# the local MinIO development values this notebook was written against.

AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "amin")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "password")
S3_ENDPOINT    = os.environ.get("S3_ENDPOINT", "http://localhost:9000")
WAREHOUSE_PATH = "s3://warehouse/default"   # Iceberg storage location on MinIO
# NOTE(review): "user_evens" looks like a typo for "user_events" — confirm
# against the table that actually exists in the warehouse before renaming.
TABLE          = "user_evens"            # Iceberg table name

# -----------------------------------------
# Connect DuckDB + S3 + Iceberg
# -----------------------------------------

def _sql_str(value):
    """Return ``value`` as a single-quoted SQL string literal (quotes doubled)."""
    return "'" + str(value).replace("'", "''") + "'"


con = duckdb.connect()

# DuckDB talks to S3/MinIO through the httpfs extension and reads Iceberg
# tables through the iceberg extension; load both before using them.
con.execute("""
INSTALL httpfs;
LOAD httpfs;
INSTALL iceberg;
LOAD iceberg;
""")

# Enable S3 access in DuckDB.
# - s3_endpoint expects host:port only, so the URL scheme is stripped.
# - s3_use_ssl follows the scheme of S3_ENDPOINT instead of being hard-coded.
# - s3_url_style='path' because MinIO serves buckets as path segments rather
#   than as virtual-host subdomains.
# (There is no `s3_url` setting in DuckDB; setting it raises an error.)
_endpoint_host = (
    S3_ENDPOINT.replace("http://", "").replace("https://", "").rstrip("/")
)
_use_ssl = "true" if S3_ENDPOINT.startswith("https://") else "false"
con.execute(f"""
SET s3_endpoint={_sql_str(_endpoint_host)};
SET s3_access_key_id={_sql_str(AWS_ACCESS_KEY)};
SET s3_secret_access_key={_sql_str(AWS_SECRET_KEY)};
SET s3_use_ssl={_use_ssl};
SET s3_url_style='path';
SET enable_http_metadata_cache=true;
""")

# Register the Iceberg table stored on MinIO.
# DuckDB has no `CREATE DATABASE ... LOCATION` statement, so the table is
# exposed as a view over iceberg_scan() inside a schema named iceberg_catalog.
# Queries written as `iceberg_catalog.<TABLE>` therefore keep working.
# NOTE(review): iceberg_scan() on a table directory needs a version-hint file;
# if the writer did not produce one, point at a specific metadata JSON file or
# enable the extension's `unsafe_enable_version_guessing` setting — confirm.
_table_path = WAREHOUSE_PATH.rstrip("/") + "/" + TABLE
con.execute(f"""
CREATE SCHEMA IF NOT EXISTS iceberg_catalog;
CREATE OR REPLACE VIEW iceberg_catalog.{TABLE} AS
SELECT * FROM iceberg_scan({_sql_str(_table_path)});
""")

print("Connected to Iceberg via MinIO successfully.")


# -----------------------------------------
# 1) Read data from Iceberg
# -----------------------------------------

# Fetch every row of the table and materialise it as a pandas DataFrame.
load_query = f"""
    SELECT * 
    FROM iceberg_catalog.{TABLE}
"""
df = con.execute(load_query).df()

print("Data loaded from Iceberg:")
print(df.head())
print("\nRows:", df.shape[0])


# -----------------------------------------
# 2) Basic analysis with Pandas / NumPy
# -----------------------------------------

print("\n--- Analysis ---")

# Number of events per action
action_counts = df["action"].value_counts()
print("\nAction distribution:")
print(action_counts)

# Mean of the duration column
avg_duration = df["duration"].mean()
print("\nAverage duration:", avg_duration)

# The 10 users with the most events
top_users = df["user_id"].value_counts().head(10)
print("\nTop users:")
print(top_users)

# Users who performed a checkout more than MIN_CHECKOUTS times.
# The threshold was described in the original comment but never applied, so
# every user with at least one checkout used to be reported as an active buyer.
# NOTE(review): the original comment also mentions "purchase"; only the
# "checkout" action is counted here — confirm whether a separate purchase
# action exists in the data.
MIN_CHECKOUTS = 10
checkout_counts = df.loc[df["action"] == "checkout", "user_id"].value_counts()
active_buyers = checkout_counts[checkout_counts > MIN_CHECKOUTS]
print("\nActive buyers:")
print(active_buyers.head(10))

# Sections ranked by number of events (traffic)
section_stats = df["section"].value_counts()
print("\nSection stats:")
print(section_stats)


# -----------------------------------------
# 3) Example analytical query run directly in DuckDB
# -----------------------------------------

print("\n--- DuckDB SQL Analysis ---")

# Count rows per action inside DuckDB, most frequent action first, and bring
# only the aggregated result back as a DataFrame.
action_totals_query = f"""
    SELECT action, COUNT(*) AS total
    FROM iceberg_catalog.{TABLE}
    GROUP BY action
    ORDER BY total DESC
"""
sql_result = con.execute(action_totals_query).df()

print(sql_result)

print("\nDone.")


ModuleNotFoundError: No module named 'pandas'