In [None]:
!pip install polars

In [None]:
!pip install seaborn

In [None]:
!pip install pyarrow

In [1]:
import os
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
def read_parquet_by_type(train_path: str) -> "dict[str, pl.DataFrame | None]":
    """Load the parquet chunks found directly inside ``train_path``, grouped by kind.

    A file belongs to a group when its *file name* (not its full path) contains
    the group's marker substring:

    * ``"user_chunk"``                   -> key ``"user_chunk"``
    * ``"purchase_history_daily_chunk"`` -> key ``"purchase_history_chunk"``
    * ``"item_chunk"``                   -> key ``"item_chunk"``

    Args:
        train_path: Directory that holds the ``*.parquet`` files. Only the
            directory's direct entries are considered.

    Returns:
        A dictionary with the three keys above. Each value is the concatenation
        of every matching file, or ``None`` when no file matched that group.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``train_path`` cannot be listed.
    """
    # os.listdir returns entries in arbitrary order; sort so that the row order
    # of the concatenated frames is the same on every run and every machine.
    parquet_names = sorted(
        name for name in os.listdir(train_path) if name.endswith('.parquet')
    )

    # Result key -> substring that identifies the files of that group.
    name_markers = {
        "user_chunk": 'user_chunk',
        "purchase_history_chunk": 'purchase_history_daily_chunk',
        "item_chunk": 'item_chunk',
    }

    frames = {}
    for key, marker in name_markers.items():
        # Test the marker against the bare file name: testing the joined path
        # would misclassify every file whenever a parent directory's name
        # happens to contain one of the markers.
        paths = [
            os.path.join(train_path, name)
            for name in parquet_names
            if marker in name
        ]
        frames[key] = (
            pl.concat([pl.read_parquet(path) for path in paths]) if paths else None
        )

    return frames

In [4]:
# Dataset location. Set the RECOMMENDATION_DATASET_DIR environment variable to
# point the notebook at another copy of the data; otherwise the original
# local path is used.
train_path = os.environ.get(
    'RECOMMENDATION_DATASET_DIR',
    'E:/KHMT2023_CS_UIT/05_C_Python_For_ML/recommendation_dataset',
)
dataframes = read_parquet_by_type(train_path)

# Unpack the three tables; an entry is None when no matching parquet file exists.
df_user = dataframes["user_chunk"]
df_purchase = dataframes["purchase_history_chunk"]
df_item = dataframes["item_chunk"]

In [5]:
# Title line followed by the first rows of the user table.
print("User Chunk DataFrame:", df_user.head(), sep="\n")

User Chunk DataFrame:
shape: (5, 18)
┌────────────┬────────┬──────────┬────────────┬───┬────────────┬──────────┬────────────┬───────────┐
│ customer_i ┆ gender ┆ location ┆ province   ┆ … ┆ install_da ┆ district ┆ user_id    ┆ is_delete │
│ d          ┆ ---    ┆ ---      ┆ ---        ┆   ┆ te         ┆ ---      ┆ ---        ┆ d         │
│ ---        ┆ str    ┆ i32      ┆ str        ┆   ┆ ---        ┆ str      ┆ str        ┆ ---       │
│ i32        ┆        ┆          ┆            ┆   ┆ i64        ┆          ┆            ┆ bool      │
╞════════════╪════════╪══════════╪════════════╪═══╪════════════╪══════════╪════════════╪═══════════╡
│ 14732      ┆ Nam    ┆ 155      ┆ Hồ Chí     ┆ … ┆ 1306281600 ┆ 7        ┆ e1e4820665 ┆ false     │
│            ┆        ┆          ┆ Minh       ┆   ┆            ┆          ┆ 2bf8c279ff ┆           │
│            ┆        ┆          ┆            ┆   ┆            ┆          ┆ 0206c69a80 ┆           │
│            ┆        ┆          ┆            ┆   ┆   

In [7]:
# Title line followed by the first rows of the purchase-history table.
print("Purchase Chunk DataFrame:", df_purchase.head(), sep="\n")

Purchase Chunk DataFrame:
shape: (5, 16)
┌────────────┬───────────┬───────────┬───────────┬───┬──────────┬──────────┬───────────┬───────────┐
│ timestamp  ┆ user_id   ┆ item_id   ┆ event_typ ┆ … ┆ payment  ┆ location ┆ discount  ┆ is_delete │
│ ---        ┆ ---       ┆ ---       ┆ e         ┆   ┆ ---      ┆ ---      ┆ ---       ┆ d         │
│ i64        ┆ str       ┆ str       ┆ ---       ┆   ┆ str      ┆ i32      ┆ decimal[3 ┆ ---       │
│            ┆           ┆           ┆ str       ┆   ┆          ┆          ┆ 8,4]      ┆ bool      │
╞════════════╪═══════════╪═══════════╪═══════════╪═══╪══════════╪══════════╪═══════════╪═══════════╡
│ 1735064221 ┆ ca12702dd ┆ 711500000 ┆ Purchase  ┆ … ┆ VietQR   ┆ 656      ┆ 0.0000    ┆ false     │
│            ┆ f55acaa9f ┆ 0004      ┆           ┆   ┆          ┆          ┆           ┆           │
│            ┆ b767e10fa ┆           ┆           ┆   ┆          ┆          ┆           ┆           │
│            ┆ aa6…      ┆           ┆           ┆

In [8]:
# Title line followed by the first rows of the item table.
print("Item Chunk DataFrame:", df_item.head(), sep="\n")

Item Chunk DataFrame:
shape: (5, 34)
┌───────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ p_id  ┆ item_id    ┆ price      ┆ category_l ┆ … ┆ volume    ┆ material  ┆ sale_stat ┆ descripti │
│ ---   ┆ ---        ┆ ---        ┆ 1_id       ┆   ┆ ---       ┆ ---       ┆ us        ┆ on_new    │
│ i32   ┆ str        ┆ decimal[38 ┆ ---        ┆   ┆ str       ┆ str       ┆ ---       ┆ ---       │
│       ┆            ┆ ,4]        ┆ i32        ┆   ┆           ┆           ┆ i32       ┆ str       │
╞═══════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 17065 ┆ 0502020000 ┆ 99000.0000 ┆ 1          ┆ … ┆ Không xác ┆ Không xác ┆ 0         ┆ Chi tiết  │
│       ┆ 004        ┆            ┆            ┆   ┆ định      ┆ định      ┆           ┆ sản phẩm  │
│       ┆            ┆            ┆            ┆   ┆           ┆           ┆           ┆ …         │
│ 72370 ┆ 0010290040 ┆ 69000.0000 ┆ 3292       ┆ … ┆ K