# Dataset Cleaning 

The main objectives of this notebook are: 
- Import dataset 
- Get familiar with the data 
- Determine if there are data quality issues 
- Resolve any data quality issues 

In [29]:
import polars as pl 
import polars.selectors as cs 

In [30]:
# Set the character width polars uses when rendering tables to 200
pl.Config.set_tbl_width_chars(200)

polars.config.Config

In [46]:
# Load the four raw CSV exports in one pass; the names on the left line up
# one-to-one, in order, with the paths listed below.
events_df, category_tree_df, item1, item2 = (
    pl.read_csv(csv_path)
    for csv_path in (
        "../data/events.csv",
        "../data/category_tree.csv",
        "../data/item_properties_part1.csv",
        "../data/item_properties_part2.csv",
    )
)

In [54]:
# Stack the two item-property parts row-wise into one frame,
# then persist the combined table next to the input files.
item_df = pl.concat([item1, item2], how="vertical")
item_df.write_csv("../data/item_properties.csv")

In [32]:
# Preview the first rows of the events table
events_df.head()

timestamp,visitorid,event,itemid,transactionid
i64,i64,str,i64,str
1433221332117,257597,"""view""",355908,
1433224214164,992329,"""view""",248676,
1433221999827,111016,"""view""",318965,
1433221955914,483717,"""view""",253185,
1433221337106,951259,"""view""",367447,


In [33]:
# (rows, columns) of the events table
events_df.shape

(2756101, 5)

In [35]:
# Column names of the events table
events_df.columns

['timestamp', 'visitorid', 'event', 'itemid', 'transactionid']

In [38]:
# Number of null values in each column of the events table
events_df.select(pl.all().is_null().sum())

timestamp,visitorid,event,itemid,transactionid
u32,u32,u32,u32,u32
0,0,0,0,2733644


### Data Pre-processing Pipeline

In [64]:
def preprocessing_pipeline(df: pl.DataFrame):
    """
    Print a quick overview of a dataframe.

    Four labelled sections are written to stdout, each separated from the
    next by a line holding a single space: the first rows (``df.head()``),
    the shape, the column names, and the number of nulls per column.

    Args:
        df : pl.DataFrame
            The dataframe to summarise.

    Returns:
        None: the insights are only printed; nothing is returned.
    """
    # Each section is (heading, producer). Producers are called lazily inside
    # the loop so every value is computed right after its heading is printed.
    sections = (
        ("Dataframe first 5 rows", lambda: df.head()),
        ("Dataframe shape", lambda: df.shape),
        ("Dataframe columns", lambda: df.columns),
        ("Dataframe null values", lambda: df.select(pl.all().is_null().sum())),
    )

    for position, (heading, produce) in enumerate(sections):
        # Separator line goes between sections only, never before the first
        # or after the last.
        if position > 0:
            print(" ")
        print(heading)
        print(produce())

In [66]:
# Print the overview (head, shape, columns, null counts) for the category tree
preprocessing_pipeline(category_tree_df)

Dataframe first 5 rows
shape: (5, 2)
┌────────────┬──────────┐
│ categoryid ┆ parentid │
│ ---        ┆ ---      │
│ i64        ┆ i64      │
╞════════════╪══════════╡
│ 1016       ┆ 213      │
│ 809        ┆ 169      │
│ 570        ┆ 9        │
│ 1691       ┆ 885      │
│ 536        ┆ 1691     │
└────────────┴──────────┘
 
Dataframe shape
(1669, 2)
 
Dataframe columns
['categoryid', 'parentid']
 
Dataframe null values
shape: (1, 2)
┌────────────┬──────────┐
│ categoryid ┆ parentid │
│ ---        ┆ ---      │
│ u32        ┆ u32      │
╞════════════╪══════════╡
│ 0          ┆ 25       │
└────────────┴──────────┘


In [67]:
# Print the overview (head, shape, columns, null counts) for the combined item properties
preprocessing_pipeline(item_df)

Dataframe first 5 rows
shape: (5, 4)
┌───────────────┬────────┬────────────┬─────────────────────────────────┐
│ timestamp     ┆ itemid ┆ property   ┆ value                           │
│ ---           ┆ ---    ┆ ---        ┆ ---                             │
│ i64           ┆ i64    ┆ str        ┆ str                             │
╞═══════════════╪════════╪════════════╪═════════════════════════════════╡
│ 1435460400000 ┆ 460429 ┆ categoryid ┆ 1338                            │
│ 1441508400000 ┆ 206783 ┆ 888        ┆ 1116713 960601 n277.200         │
│ 1439089200000 ┆ 395014 ┆ 400        ┆ n552.000 639502 n720.000 42456… │
│ 1431226800000 ┆ 59481  ┆ 790        ┆ n15360.000                      │
│ 1431831600000 ┆ 156781 ┆ 917        ┆ 828513                          │
└───────────────┴────────┴────────────┴─────────────────────────────────┘
 
Dataframe shape
(20275902, 4)
 
Dataframe columns
['timestamp', 'itemid', 'property', 'value']
 
Dataframe null values
shape: (1, 4)
┌───────────┬─

### Conclusions 
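- The datasets are largely clean. The only nulls shown above for the events and category-tree tables are in `transactionid` (2,733,644 of 2,756,101 event rows) and `parentid` (25 of 1,669 category rows). These presumably reflect events that are not transactions and categories without a parent rather than missing data, but this should be confirmed.
- Consequently, they will not require any cleaning.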