## 2.2 ibis主要功能

### 2.2.1 构建具有便携性的分析逻辑

In [1]:
import numpy as np
import pandas as pd

# 生成测试用数据并导出为parquet格式
(
    pd
    .DataFrame(
        {
            '类别': np.random.choice(list('abcdef'), 10000000),
            '数值': np.random.uniform(0, 100, 10000000)
        }
    )
    .assign(数值=lambda df: df['数值'].round(3))
    .to_parquet('demo.parquet')
)

In [2]:
import ibis

ibis.options.interactive = True # 开启适用jupyter环境的交互模式

In [3]:
%%time

# 默认使用DuckDB后端
demo_t = ibis.read_parquet('demo.parquet')

# 执行简单的分组运算

expr = (
    demo_t
    .group_by('类别')
    .aggregate(
        组内样本数=lambda t: t.数值.count(),
        组内平均数=lambda t: t.数值.mean(),
        组内中位数=lambda t: t.数值.median(),
    )
    .order_by('类别')
)
expr

CPU times: total: 172 ms
Wall time: 161 ms


In [4]:
%%time

# 切换到pandas后端
con = ibis.pandas.connect()
demo_t = con.read_parquet('demo.parquet')

# 执行简单的分组运算

expr = (
    demo_t
    .group_by('类别')
    .aggregate(
        组内样本数=lambda t: t.数值.count(),
        组内平均数=lambda t: t.数值.mean(),
        组内中位数=lambda t: t.数值.median(),
    )
    .order_by('类别')
)
expr

CPU times: total: 1.83 s
Wall time: 1.54 s


### 2.2.2 充分搭配Python与SQL

In [5]:
# 默认使用DuckDB后端
demo_t = ibis.read_parquet('demo.parquet')

expr = (
    demo_t
    .group_by('类别')
    .aggregate(
        组内样本数=lambda t: t.数值.count(),
        组内平均数=lambda t: t.数值.mean(),
        组内中位数=lambda t: t.数值.median(),
    )
    .order_by('类别')
)

In [6]:
# 转换为sql
ibis.to_sql(expr)

```sql
SELECT
  *
FROM (
  SELECT
    "t0"."类别",
    COUNT("t0"."数值") AS "组内样本数",
    AVG("t0"."数值") AS "组内平均数",
    MEDIAN("t0"."数值") AS "组内中位数"
  FROM "ibis_read_parquet_qvyjomcxznf5hfsdjtgvenbsiq" AS "t0"
  GROUP BY
    1
) AS "t1"
ORDER BY
  "t1"."类别" ASC
```

In [7]:
# 获取SQL中可以使用的表名
ibis.to_sql(demo_t)

```sql
SELECT
  *
FROM "ibis_read_parquet_qvyjomcxznf5hfsdjtgvenbsiq"
```

In [8]:
# 注意下方表名需与上方打印结果一致
demo_t.sql('''
SELECT
  *
FROM (
  SELECT
    "t0"."类别",
    COUNT("t0"."数值") AS "组内样本数",
    AVG("t0"."数值") AS "组内平均数",
    MEDIAN("t0"."数值") AS "组内中位数"
  FROM "ibis_read_parquet_qvyjomcxznf5hfsdjtgvenbsiq" AS "t0"
  GROUP BY
    1
) AS "t1"
ORDER BY
  "t1"."类别" ASC
''')