# Import modules & data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

pd.set_option("display.precision", 3)

In [2]:
# Load the cleaned CSV exports into DataFrames.

# Locations of the source CSV files (relative to the notebook's working directory)
orders_path, orderlines_path, products_path, brands_path = (
    'clean_data/orders_cl.csv',
    'clean_data/orderlines_cl.csv',
    'clean_data/products_cl.csv',
    'clean_data/brands.csv',
)

# Read each file with pandas' default CSV settings, in the same order as the paths
orders_df, orderlines_df, products_df, brands_df = map(
    pd.read_csv,
    (orders_path, orderlines_path, products_path, brands_path),
)

In [3]:
# Work on copies so the freshly loaded *_df frames stay untouched
orders, orderlines, products, brands = (
    loaded.copy()
    for loaded in (orders_df, orderlines_df, products_df, brands_df)
)

# Quick Overview of all tables

In [4]:
# Summary of the orders table: columns, non-null counts, dtypes and memory usage
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226904 entries, 0 to 226903
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      226904 non-null  int64  
 1   created_date  226904 non-null  object 
 2   total_paid    226904 non-null  float64
 3   status        226904 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.9+ MB


In [6]:
# Summary of the orderlines table: columns, non-null counts, dtypes and memory usage
orderlines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216250 entries, 0 to 216249
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                216250 non-null  int64  
 1   order_id          216250 non-null  int64  
 2   product_quantity  216250 non-null  int64  
 3   sku               216250 non-null  object 
 4   unit_price        216250 non-null  float64
 5   date              216250 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.9+ MB


In [7]:
# Summary of the products table: columns, non-null counts, dtypes and memory usage
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9992 entries, 0 to 9991
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sku       9992 non-null   object 
 1   name      9992 non-null   object 
 2   desc      9992 non-null   object 
 3   price     9992 non-null   float64
 4   in_stock  9992 non-null   bool   
 5   p_type    9992 non-null   object 
dtypes: bool(1), float64(1), object(4)
memory usage: 400.2+ KB


In [50]:
# Summary of the brands table: columns, non-null counts, dtypes and memory usage
brands.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   short   187 non-null    object
 1   long    187 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB


# Data Quality

## 1. Order Consistency
### Orders present in both orders and orderlines tables

In [12]:
# Inner join on order_id: only orders that occur in BOTH tables survive
oo = pd.merge(orders, orderlines, how='inner', on='order_id')
oo.info()
print('-'*50)

# orders_qu: rows of orders whose order_id appears in the joined table
orders_qu = orders[orders['order_id'].isin(oo['order_id'])]
orders_qu.info()
print('-'*50)

# orderlines_qu: rows of orderlines whose order_id appears in the joined table
orderlines_qu = orderlines[orderlines['order_id'].isin(oo['order_id'])]
orderlines_qu.info()
print('-'*50)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216057 entries, 0 to 216056
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   order_id          216057 non-null  int64  
 1   created_date      216057 non-null  object 
 2   total_paid        216057 non-null  float64
 3   status            216057 non-null  object 
 4   id                216057 non-null  int64  
 5   product_quantity  216057 non-null  int64  
 6   sku               216057 non-null  object 
 7   unit_price        216057 non-null  float64
 8   date              216057 non-null  object 
dtypes: float64(2), int64(3), object(4)
memory usage: 16.5+ MB
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 170069 entries, 0 to 226903
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      170069 non-null  int64  
 1   c

In [13]:
# Distinct order_ids in the filtered orderlines and orders tables (shown in that order)
display(
    orderlines_qu['order_id'].nunique(),
    orders_qu['order_id'].nunique(),
)

170069

170069

In [10]:
# Grand total of total_paid across the orders kept in orders_qu
orders_qu['total_paid'].sum()

39598242.38

In [11]:
# Sum of the unit_price column over the rows kept in orderlines_qu.
# NOTE(review): each row contributes unit_price once; product_quantity is not
# factored in. If this figure is meant to be compared with the sum of
# orders_qu.total_paid, confirm whether unit_price * product_quantity was intended.
orderlines_qu.unit_price.sum()

35573658.82999999

## 2. Products - SKU Consistency
### Exclude orders with unknown products

In [14]:
# How many distinct SKUs appear in the filtered orderlines
orderlines_qu['sku'].nunique()

6798

In [22]:
# Distinct SKUs in orderlines_qu that have no entry in the products table
orderlines_qu.loc[
    ~orderlines_qu['sku'].isin(products['sku']),
    'sku',
].nunique()

368

In [26]:
# Orderline rows split by whether their SKU exists in products (True = known)
orderlines_qu['sku'].isin(products.sku).value_counts()

True     209510
False      6547
Name: sku, dtype: int64

In [34]:
# Count of distinct orders that include at least one SKU missing from products
orderlines_qu.loc[
    ~orderlines_qu['sku'].isin(products['sku']),
    'order_id',
].unique().size

6319

In [40]:
# Count of distinct orders that include at least one SKU present in products
orderlines_qu.loc[
    orderlines_qu['sku'].isin(products['sku']),
    'order_id',
].unique().size

166144

We will drop all orders, and their corresponding orderlines, that contain unknown SKUs.

#### List of order_id with unknown skus

In [36]:
# Rows of orderlines_qu whose SKU has no match in the products table
orderlines_unknown_sku = orderlines_qu[~orderlines_qu['sku'].isin(products['sku'])]

# Blacklist: every distinct order_id touched by at least one such row
bl_order_id = list(orderlines_unknown_sku.order_id.unique())

#### Orderlines containing known sku's

In [42]:
# Drop every orderline that belongs to a blacklisted order (one containing an unknown SKU)
orderlines_qu = orderlines_qu[~orderlines_qu['order_id'].isin(bl_order_id)]
orderlines_qu.info()
print('-'*50)

# Remaining distinct orders and distinct SKUs after the filter
print("no of orders in orderlines = ",orderlines_qu['order_id'].nunique())
print('no of known skus : ',orderlines_qu['sku'].nunique())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205403 entries, 0 to 216249
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                205403 non-null  int64  
 1   order_id          205403 non-null  int64  
 2   product_quantity  205403 non-null  int64  
 3   sku               205403 non-null  object 
 4   unit_price        205403 non-null  float64
 5   date              205403 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 11.0+ MB
--------------------------------------------------
no of orders in orderlines =  163750
no of known skus :  6411


#### Orders containing known sku's

In [44]:
# Drop every order whose order_id is on the blacklist (orders containing unknown SKUs)
orders_qu = orders_qu[~orders_qu['order_id'].isin(bl_order_id)]
orders_qu.info()

# Last expression of the cell: rendered as the cell's output string
f'There are {orders_qu.order_id.nunique()} orders which have known skus.'

<class 'pandas.core.frame.DataFrame'>
Int64Index: 163750 entries, 0 to 226903
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      163750 non-null  int64  
 1   created_date  163750 non-null  object 
 2   total_paid    163750 non-null  float64
 3   status        163750 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.2+ MB


'There are 163750 orders which have known skus.'

#### Products containing known sku's

In [45]:
# Keep only the products whose SKU still occurs in the filtered orderlines
products_qu = products[products['sku'].isin(orderlines_qu['sku'])]
products_qu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6411 entries, 0 to 9986
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sku       6411 non-null   object 
 1   name      6411 non-null   object 
 2   desc      6411 non-null   object 
 3   price     6411 non-null   float64
 4   in_stock  6411 non-null   bool   
 5   p_type    6411 non-null   object 
dtypes: bool(1), float64(1), object(4)
memory usage: 306.8+ KB


The orders and orderlines tables now contain only products that are listed in the `products` table, and both tables cover the same set of orders. All three tables (orders, orderlines and products) are therefore consistent with each other.

## Export quality checked data as csv file

In [49]:
# Export switch: leave False for normal runs; set to True and re-run this cell
# to (re)write the quality-checked tables after upstream changes.
# This replaces the previous "comment / uncomment the code" workflow.
EXPORT_QUALITY_DATA = False

if EXPORT_QUALITY_DATA:
    # index=False keeps the DataFrame index out of the CSV files.
    # Assumes the quality_data/ directory already exists.
    products_qu.to_csv('quality_data/products_qu.csv', index=False)
    orders_qu.to_csv('quality_data/orders_qu.csv', index=False)
    orderlines_qu.to_csv('quality_data/orderlines_qu.csv', index=False)

Outlier removal is handled in the Analysis file.