In [6]:
!pip install duckdb



In [7]:
import duckdb

In [15]:
# สร้างการเชื่อมต่อแบบ in-memory
con = duckdb.connect(database=':memory:', read_only=False)

In [18]:
# Query ข้อมูลจาก CSV โดยตรง
query = "SELECT * FROM 'customers.csv'"

In [19]:
result = con.execute(query).fetchdf()

In [20]:
result.head(2)

Unnamed: 0,customer_id,first_name,last_name,email,created_at,created_date,status,country
0,1001,John,Doe,john.doe@example.com,2024-01-15 08:23:45,2024-01-15,active,US
1,1002,Emily,Smith,emily.smith@example.co.uk,2024-02-10 14:12:30,2024-02-10,active,UK


In [23]:
result.shape

(10, 8)

In [25]:
filename = 'Titanic-Dataset.csv'

In [26]:
query = f"SELECT * FROM '{filename}'"

In [28]:
titanic_result = con.execute(query).fetchdf()

In [29]:
titanic_result.shape

(891, 12)

In [30]:
titanic_result.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [31]:
query = "select * from read_json_auto('products.json')"

In [33]:
product_result = con.execute(query).fetchdf()

In [34]:
product_result.head()

Unnamed: 0,id,name,price,category,in_stock
0,1,Laptop,35000,Electronics,True
1,2,Phone,18000,Electronics,False
2,3,Desk Chair,4500,Furniture,True


In [35]:
query = """
SELECT
  customer_id,
  first_name,
  last_name,
  email,
  country,
  created_date
FROM 'customers.csv'
WHERE status = 'active'
  AND created_date >= '2024-01-01'
  AND country IN ('TH');
"""

thai_customers = con.execute(query).fetchdf()

In [36]:
titanic_result.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [41]:
query = """
SELECT
    PassengerId,
    Name,
    Age,
    Fare,
    CASE
        WHEN Survived = 1 THEN 'Survived'
        ELSE 'Not Survived'
    END as SurvivalStatus
FROM 'Titanic-Dataset.csv'
"""

transformed_data = con.execute(query).fetchdf()

In [43]:
query = """
SELECT
    c.customer_id,
    c.customer_name,
    c.segment,
    COUNT(DISTINCT o.order_id) as order_count,
    SUM(o.amount) as total_revenue,
    AVG(o.amount) as avg_order_value
FROM new_customers.csv c
LEFT JOIN orders.csv o
    ON c.customer_id = o.customer_id
    AND o.status != 'cancelled'
WHERE c.is_active = true
GROUP BY 1, 2, 3
"""

customer_metrics = con.execute(query).fetchdf()

In [45]:
customers_metric = con.execute(query).fetchdf()

In [46]:
customers_metric.head()

Unnamed: 0,customer_id,customer_name,segment,orders_last_12m,revenue_last_12m,avg_order_value
0,C007,Cancelled Cust,Consumer,1,100.0,100.0
1,C009,Future Buyer,Enterprise,1,1000.0,1000.0
2,C010,Small Biz,SMB,1,45.0,45.0
3,C001,John Doe,Consumer,0,0.0,
4,C002,Jane Smith,Corporate,0,0.0,


In [47]:
query = """
with
active_orders AS (
    SELECT
        customer_id,
        count(*) as order_count,
        sum(amount) as total_amount
    FROM 'orders.csv'
    WHERE order_date >= date_add(current_date(), interval '-12 month')
    GROUP BY 1
),

active_customers AS (
    SELECT
        customer_id,
        customer_name,
        segment
    FROM new_customers.csv
    WHERE is_active = true
),

final_result AS (
    SELECT
        ac.*,
    from active_customers ac
    left join active_orders ao
    on ac.customer_id = ao.customer_id
)

select * from final_result
"""

In [48]:
result2 = con.execute(query).fetchdf()

In [49]:
result2.head()

Unnamed: 0,customer_id,customer_name,segment
0,C007,Cancelled Cust,Consumer
1,C009,Future Buyer,Enterprise
2,C010,Small Biz,SMB
3,C001,John Doe,Consumer
4,C002,Jane Smith,Corporate


In [50]:
query = """
WITH enriched_orders AS (
    SELECT
        o.*,
        c.customer_name,
        c.segment
    FROM orders.csv o
    LEFT JOIN new_customers.csv c
        ON o.customer_id = c.customer_id
)
SELECT
    order_id,
    customer_id,
    customer_name,
    order_date,
    amount,

    -- 1. ลำดับของ order ของลูกค้าแต่ละคน
    ROW_NUMBER() OVER (
        PARTITION BY customer_id
        ORDER BY order_date
    ) as order_sequence,

    -- 2. อันดับยอดขายในเดือนนั้น
    RANK() OVER (
        PARTITION BY strftime('%Y-%m', order_date)
        ORDER BY amount DESC
    ) as monthly_sales_rank,

    -- 3. ยอดสะสมของลูกค้า
    SUM(amount) OVER (
        PARTITION BY customer_id
        ORDER BY order_date
    ) as running_total,

    -- 4. ค่าเฉลี่ยเคลื่อนที่ 4 orders
    AVG(amount) OVER (
        PARTITION BY customer_id
        ORDER BY order_date
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) as moving_avg_4_orders,

    -- 5. ยอด order ก่อนหน้า
    LAG(amount, 1, 0) OVER (
        PARTITION BY customer_id
        ORDER BY order_date
    ) as previous_order,

    -- 6. ยอด order ถัดไป
    LEAD(amount, 1, 0) OVER (
        PARTITION BY customer_id
        ORDER BY order_date
    ) as next_order,

    -- 7. ยอด order แรกของลูกค้า
    FIRST_VALUE(amount) OVER (
        PARTITION BY customer_id
        ORDER BY order_date
    ) as first_order_amount

FROM enriched_orders
ORDER BY customer_id, order_date;
"""


In [51]:
window_analysis = con.execute(query).fetchdf()

In [52]:
window_analysis.head()

Unnamed: 0,order_id,customer_id,customer_name,order_date,amount,order_sequence,monthly_sales_rank,running_total,moving_avg_4_orders,previous_order,next_order,first_order_amount
0,O1001,C001,John Doe,2024-01-15,120.5,1,1,120.5,120.5,0.0,75.0,120.5
1,O1002,C001,John Doe,2024-02-20,75.0,2,1,195.5,97.75,120.5,50.0,120.5
2,O1003,C001,John Doe,2024-03-05,50.0,3,1,245.5,81.833333,75.0,0.0,120.5
3,O1004,C002,Jane Smith,2024-04-25,200.0,1,1,200.0,200.0,0.0,150.0,200.0
4,O1005,C002,Jane Smith,2024-05-01,150.0,2,1,350.0,175.0,200.0,0.0,200.0
