**Load libraries**

In [1]:
import pandas as pd
import numpy as np
import polars as pl
from tqdm import tqdm
from collections import defaultdict

**Load CSV File**

In [3]:
# Read the three raw session tables with pandas, in train / test order.
# NOTE(review): the frame named `session_test_phase1` is read from
# 'sessions_test_task1.csv' while `session_test_phase2` is read from
# 'sessions_test_task1_phase1.csv' — confirm this mapping is intended and
# not swapped.
session_train, session_test_phase1, session_test_phase2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/root/KDDCUP/KDDCUP数据集/sessions_train.csv',
        '/root/KDDCUP/KDDCUP数据集/sessions_test_task1.csv',
        '/root/KDDCUP/KDDCUP数据集/sessions_test_task1_phase1.csv',
    )
)

**Convert the pandas DataFrames to polars DataFrames**

In [4]:
# Convert each pandas DataFrame into a polars DataFrame, rebinding the same
# name so that the rest of the notebook works on polars frames only.
# Rebinding one frame at a time drops the reference to each pandas frame
# right after it has been converted.
session_train = pl.from_pandas(session_train)
session_test_phase1 = pl.from_pandas(session_test_phase1)
session_test_phase2 = pl.from_pandas(session_test_phase2)

**Task 1 only uses data from the DE, UK and JP locales; drop the rows from all other locales**

In [5]:
# Locales kept for this task; rows with any other locale are dropped.
location = ["DE", "UK", "JP"]

# Build the row predicate once and apply it to all three session frames.
keep_task1_locale = pl.col("locale").is_in(location)

session_train = session_train.filter(keep_task1_locale)
session_test_phase1 = session_test_phase1.filter(keep_task1_locale)
session_test_phase2 = session_test_phase2.filter(keep_task1_locale)

**Looking at the data, we find that `prev_items` is stored as a single string, not as a list of strings (shown here for `session_train`; the two test frames are parsed the same way below)**

In [6]:
# Preview the first five rows to inspect the column dtypes.
session_train.head(5)

prev_items,next_item,locale
str,str,str
"""['B09W9FND7K' 'B09JSPLN1M']""","""B09M7GY217""","""DE"""
"""['B076THCGSG' 'B007MO8IME' 'B0…","""B001B4THSA""","""DE"""
"""['B0B1LGXWDS' 'B00AZYORS2' 'B0…","""B0767DTG2Q""","""DE"""
"""['B09XMTWDVT' 'B0B4MZZ8MB' 'B0…","""B0B4R9NN4B""","""DE"""
"""['B09Y5CSL3T' 'B09Y5DPTXN' 'B0…","""B0BGVBKWGZ""","""DE"""


**Therefore, a further step is needed: parse `prev_items` into a list of item ids**

In [7]:
%%time
session_train = session_train.with_columns(
    pl.col("prev_items")
    .str.replace_all(r"[\[\]']", "")  # Remove [ ] and ' 
    .str.split(" ")  # Return a list of strings
    .alias("prev_items")
)

session_test_phase1 = session_test_phase1.with_columns(
    pl.col("prev_items")
    .str.replace_all(r"[\[\]']", "")
    .str.split(" ")
    .alias("prev_items")
)

session_test_phase2 = session_test_phase2.with_columns(
    pl.col("prev_items")
    .str.replace_all(r"[\[\]']", "")
    .str.split(" ")
    .alias("prev_items")
)


CPU times: user 1.38 s, sys: 165 ms, total: 1.54 s
Wall time: 1.54 s


**Create the session id**

In [8]:
# Give every session a unique string id: "<split prefix>_<row position>".
session_train = session_train.with_columns(
    pl.Series("session_id", [f"train_{row}" for row in range(session_train.height)])
)
session_test_phase1 = session_test_phase1.with_columns(
    pl.Series("session_id", [f"test_phase1_{row}" for row in range(session_test_phase1.height)])
)
session_test_phase2 = session_test_phase2.with_columns(
    pl.Series("session_id", [f"test_phase2_{row}" for row in range(session_test_phase2.height)])
)

**Done, save them**

In [10]:
# Persist the three cleaned session frames as parquet, in train / test order.
for frame, target in (
    (session_train, '/root/KDDCUP/KDDCUP_DATA_CLEANED/task1_session_train.parquet'),
    (session_test_phase1, '/root/KDDCUP/KDDCUP_DATA_CLEANED/task1_session_test_phase1.parquet'),
    (session_test_phase2, '/root/KDDCUP/KDDCUP_DATA_CLEANED/task1_session_test_phase2.parquet'),
):
    frame.write_parquet(target)


**Now, clean the product dataset**

In [12]:
# Read the raw product table directly into a polars DataFrame.
product_csv_path = '/root/KDDCUP/KDDCUP数据集/products_train.csv'
product = pl.read_csv(product_csv_path)

In [13]:
# The price column contains the outlier value 40000000.07; rows carrying exactly
# that value get a null price, every other price is kept unchanged.
# NOTE(review): presumably a placeholder for a missing price — confirm.
price = pl.col("price")
product = product.with_columns(
    pl.when(price == 40000000.07)
    .then(None)
    .otherwise(price)
    .alias("price")
)

**Add a column listing the available locales for each product**

In [14]:
# Map every product id to the list of locales it appears under, in row order.
product_id2locales = defaultdict(list)

ids = product["id"].to_list()
locales = product["locale"].to_list()

# The loop variable is `product_id`, not `id`: it lives in the notebook's
# global namespace and would otherwise shadow the builtin `id()` for every
# later cell.
for product_id, locale in tqdm(zip(ids, locales), total=len(ids)):
    product_id2locales[product_id].append(locale)

# Attach the per-id locale list to every product row. `return_dtype` is given
# explicitly so the output dtype does not depend on inference from the Python
# return values.
# NOTE(review): assumes `locale` is a string column — confirm.
product = product.with_columns(
    pl.col("id")
    .map_elements(
        lambda key: product_id2locales.get(key, []),
        return_dtype=pl.List(pl.Utf8),
    )
    .alias("available_locales")
)

 16%|█▌        | 126467/778639 [00:00<00:01, 609854.58it/s]

100%|██████████| 778639/778639 [00:00<00:00, 793957.44it/s]
  product = product.with_columns(


**Save**

In [15]:
# Persist the cleaned product table (with `available_locales`) as parquet.
product_parquet_path = "/root/KDDCUP/KDDCUP_DATA_CLEANED/product.parquet"
product.write_parquet(product_parquet_path)