# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [17]:
import polars as pl 
import polars.selectors as cs 

In [18]:
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [19]:
df = pl.read_csv("../data/events.csv")

In [20]:
df.head()

timestamp,visitorid,event,itemid,transactionid
i64,i64,str,i64,str
1433221332117,257597,"""view""",355908,
1433224214164,992329,"""view""",248676,
1433221999827,111016,"""view""",318965,
1433221955914,483717,"""view""",253185,
1433221337106,951259,"""view""",367447,


In [21]:
df.shape

(2756101, 5)

In [22]:
df.sample

<bound method DataFrame.sample of shape: (2_756_101, 5)
┌───────────────┬───────────┬───────┬────────┬───────────────┐
│ timestamp     ┆ visitorid ┆ event ┆ itemid ┆ transactionid │
│ ---           ┆ ---       ┆ ---   ┆ ---    ┆ ---           │
│ i64           ┆ i64       ┆ str   ┆ i64    ┆ str           │
╞═══════════════╪═══════════╪═══════╪════════╪═══════════════╡
│ 1433221332117 ┆ 257597    ┆ view  ┆ 355908 ┆ null          │
│ 1433224214164 ┆ 992329    ┆ view  ┆ 248676 ┆ null          │
│ 1433221999827 ┆ 111016    ┆ view  ┆ 318965 ┆ null          │
│ 1433221955914 ┆ 483717    ┆ view  ┆ 253185 ┆ null          │
│ 1433221337106 ┆ 951259    ┆ view  ┆ 367447 ┆ null          │
│ …             ┆ …         ┆ …     ┆ …      ┆ …             │
│ 1438398785939 ┆ 591435    ┆ view  ┆ 261427 ┆ null          │
│ 1438399813142 ┆ 762376    ┆ view  ┆ 115946 ┆ null          │
│ 1438397820527 ┆ 1251746   ┆ view  ┆ 78144  ┆ null          │
│ 1438398530703 ┆ 1184451   ┆ view  ┆ 283392 ┆ null          │

In [23]:
df.columns

['timestamp', 'visitorid', 'event', 'itemid', 'transactionid']

In [25]:
df.select(
    pl.col('event'),
    pl.col('itemid')
    )

event,itemid
str,i64
"""view""",355908
"""view""",248676
"""view""",318965
"""view""",253185
"""view""",367447
…,…
"""view""",261427
"""view""",115946
"""view""",78144
"""view""",283392


In [27]:
df.filter(pl.col('event') == 'transaction')

timestamp,visitorid,event,itemid,transactionid
i64,i64,str,i64,str
1433222276276,599528,"""transaction""",356475,"""4000"""
1433193500981,121688,"""transaction""",15335,"""11117"""
1433193915008,552148,"""transaction""",81345,"""5444"""
1433176736375,102019,"""transaction""",150318,"""13556"""
1433174518180,189384,"""transaction""",310791,"""7244"""
…,…,…,…,…
1438377176570,1050575,"""transaction""",31640,"""8354"""
1438379878779,861299,"""transaction""",456602,"""3643"""
1438357730123,855941,"""transaction""",235771,"""4385"""
1438355560300,548772,"""transaction""",29167,"""13872"""


In [28]:
df.select(pl.all().is_null().sum())

timestamp,visitorid,event,itemid,transactionid
u32,u32,u32,u32,u32
0,0,0,0,2733644
