# Dataset Cleaning 

The main objectives of this notebook are: 
- Import and merge together multiple datasets into a features df 
- Get familiar with the features df 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [1]:
import polars as pl 
import polars.selectors as cs 

In [2]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [3]:
df = pl.read_csv("../data/2019-Nov.csv")

In [4]:
df.head()

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-11-01 00:00:00 UTC""","""view""",1003461,2053013555631882655,"""electronics.smartphone""","""xiaomi""",489.07,520088904,"""4d3b30da-a5e4-49df-b1a8-ba5943…"
"""2019-11-01 00:00:00 UTC""","""view""",5000088,2053013566100866035,"""appliances.sewing_machine""","""janome""",293.65,530496790,"""8e5f4f83-366c-4f70-860e-ca7417…"
"""2019-11-01 00:00:01 UTC""","""view""",17302664,2053013553853497655,,"""creed""",28.31,561587266,"""755422e7-9040-477b-9bd2-6a6e8f…"
"""2019-11-01 00:00:01 UTC""","""view""",3601530,2053013563810775923,"""appliances.kitchen.washer""","""lg""",712.87,518085591,"""3bfb58cd-7892-48cc-8020-2f17e6…"
"""2019-11-01 00:00:01 UTC""","""view""",1004775,2053013555631882655,"""electronics.smartphone""","""xiaomi""",183.27,558856683,"""313628f1-68b8-460d-84f6-cec7a8…"


In [5]:
df.shape

(67501979, 9)

In [6]:
df.sample

<bound method DataFrame.sample of shape: (67_501_979, 9)
┌─────────────────────────┬────────────┬────────────┬─────────────────────┬───┬─────────┬─────────┬───────────┬─────────────────────────────────┐
│ event_time              ┆ event_type ┆ product_id ┆ category_id         ┆ … ┆ brand   ┆ price   ┆ user_id   ┆ user_session                    │
│ ---                     ┆ ---        ┆ ---        ┆ ---                 ┆   ┆ ---     ┆ ---     ┆ ---       ┆ ---                             │
│ str                     ┆ str        ┆ i64        ┆ i64                 ┆   ┆ str     ┆ f64     ┆ i64       ┆ str                             │
╞═════════════════════════╪════════════╪════════════╪═════════════════════╪═══╪═════════╪═════════╪═══════════╪═════════════════════════════════╡
│ 2019-11-01 00:00:00 UTC ┆ view       ┆ 1003461    ┆ 2053013555631882655 ┆ … ┆ xiaomi  ┆ 489.07  ┆ 520088904 ┆ 4d3b30da-a5e4-49df-b1a8-ba5943… │
│ 2019-11-01 00:00:00 UTC ┆ view       ┆ 5000088    ┆ 2053013566100

In [7]:
df.columns

['event_time',
 'event_type',
 'product_id',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_id',
 'user_session']

# Column Selection

In [8]:
df.select(
    pl.col('event_type'),
    pl.col('brand')
    )

event_type,brand
str,str
"""view""","""xiaomi"""
"""view""","""janome"""
"""view""","""creed"""
"""view""","""lg"""
"""view""","""xiaomi"""
…,…
"""view""",
"""view""","""baden"""
"""view""","""samsung"""
"""view""","""samsung"""


# Row selection

In [9]:
df.filter(pl.col('price') > 1000)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-11-01 00:00:27 UTC""","""view""",1004237,2053013555631882655,"""electronics.smartphone""","""apple""",1091.33,565404816,"""d9cf0ac4-c3ec-4e1f-83aa-326b1e…"
"""2019-11-01 00:00:40 UTC""","""view""",1005116,2053013555631882655,"""electronics.smartphone""","""apple""",1013.86,532647354,"""d2d3d2c6-631d-489e-9fb5-06f340…"
"""2019-11-01 00:00:41 UTC""","""view""",1004241,2053013555631882655,"""electronics.smartphone""","""apple""",1173.6,565404816,"""d9cf0ac4-c3ec-4e1f-83aa-326b1e…"
"""2019-11-01 00:00:41 UTC""","""view""",1004237,2053013555631882655,"""electronics.smartphone""","""apple""",1091.33,519277091,"""62fadce6-aa4a-4dde-92fc-c04c22…"
"""2019-11-01 00:00:51 UTC""","""view""",1307012,2053013558920217191,"""computers.notebook""","""apple""",2342.15,530857208,"""c69f9c63-7098-426c-97d9-8cb94a…"
…,…,…,…,…,…,…,…,…
"""2019-11-30 23:59:17 UTC""","""view""",1005124,2053013555631882655,"""electronics.smartphone""","""apple""",1436.56,571072384,"""8f414ee3-5bf5-48ce-a57c-f19a9b…"
"""2019-11-30 23:59:29 UTC""","""view""",1005105,2053013555631882655,"""electronics.smartphone""","""apple""",1302.48,556695836,"""ca5eefc5-11f9-450c-91ed-380285…"
"""2019-11-30 23:59:38 UTC""","""view""",1005105,2053013555631882655,"""electronics.smartphone""","""apple""",1302.48,561181104,"""b2c61552-576c-4532-8d1e-ad2a0a…"
"""2019-11-30 23:59:55 UTC""","""view""",16800304,2053013558316237377,"""furniture.kitchen.table""","""aero""",1106.85,579175262,"""b8ff34a9-41be-4497-b373-408c6e…"


In [10]:
df.select(pl.all().is_null().sum())

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,21898171,9218235,0,0,10


In [15]:
df.filter(pl.col('event_type') == 'purchase')

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-11-01 00:00:41 UTC""","""purchase""",13200605,2053013557192163841,"""furniture.bedroom.bed""",,566.3,559368633,"""d6034fa2-41fb-4ac0-9051-55ea9f…"
"""2019-11-01 00:01:04 UTC""","""purchase""",1005161,2053013555631882655,"""electronics.smartphone""","""xiaomi""",211.92,513351129,"""e6b7ce9b-1938-4e20-976c-8b4163…"
"""2019-11-01 00:04:51 UTC""","""purchase""",1004856,2053013555631882655,"""electronics.smartphone""","""samsung""",128.42,562958505,"""0f039697-fedc-40fa-8830-39c1a0…"
"""2019-11-01 00:05:34 UTC""","""purchase""",26401669,2053013563651392361,,"""lucente""",109.66,541854711,"""c41c44d5-ef9b-41b9-9cd6-8d96dd…"
"""2019-11-01 00:06:33 UTC""","""purchase""",1801881,2053013554415534427,"""electronics.video.tv""","""samsung""",488.8,557746614,"""4d76d6d3-fff5-4880-8327-e9e57b…"
…,…,…,…,…,…,…,…,…
"""2019-11-30 23:58:14 UTC""","""purchase""",1004874,2053013555631882655,"""electronics.smartphone""","""samsung""",346.7,547804983,"""717566cf-ef93-4078-ba8f-169a3a…"
"""2019-11-30 23:58:22 UTC""","""purchase""",1005130,2053013555631882655,"""electronics.smartphone""","""apple""",1437.02,515582054,"""829c20b5-696e-4a8a-8a9f-171014…"
"""2019-11-30 23:58:57 UTC""","""purchase""",1004767,2053013555631882655,"""electronics.smartphone""","""samsung""",235.6,579876821,"""ca50e291-43f3-4ca2-9e13-20ee6b…"
"""2019-11-30 23:59:15 UTC""","""purchase""",3701309,2053013565983425517,"""appliances.environment.vacuum""","""polaris""",89.32,543733099,"""a65116f4-ac53-4a41-ad68-660678…"
