# orders.csv 
Every row in this file represents an order.

* **order_id** – a unique identifier for each order
* **created_date** – a timestamp for when the order was created
* **total_paid** – the total amount paid by the customer for this order, in euros
* **state** –
    * “Shopping basket” - products have been placed in the shopping basket
    - “Place Order” – the order has been placed, but is awaiting shipment details 
    - “Pending” – the order is awaiting payment confirmation
    - “Completed” – the order has been placed and paid, and the transaction is completed.
    - “Cancelled” – the order has been cancelled and the payment returned to the customer.

## Import the data

In [92]:
import pandas as pd
import numpy as np
import re

pd.options.display.max_rows = 500

orders = pd.read_csv('../data/orders.csv')
orders.head()

Unnamed: 0,order_id,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled


## Data exploration

In [93]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226909 entries, 0 to 226908
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      226909 non-null  int64  
 1   created_date  226909 non-null  object 
 2   total_paid    226904 non-null  float64
 3   state         226909 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.9+ MB


<div class="alert alert-block alert-danger">
    One variable need to be modified: 
    <br>
    <span>&#8226;</span>  <b>created_date</b>: needs to be transformed to a date time format.
</div>

In [94]:
orders['created_date'] = pd.to_datetime(orders['created_date'])
orders.describe()

Unnamed: 0,order_id,created_date,total_paid
count,226909.0,226909,226904.0
mean,413296.48248,2017-09-06 15:00:34.791753472,569.225818
min,241319.0,2017-01-01 00:07:19,0.0
25%,356263.0,2017-05-17 08:18:59,34.19
50%,413040.0,2017-10-16 21:11:16,112.99
75%,470553.0,2017-12-24 15:52:56,525.98
max,527401.0,2018-03-14 13:58:36,214747.53
std,65919.250331,,1761.778002


In [95]:
orders.nunique()

order_id        226909
created_date    224828
total_paid       31236
state                5
dtype: int64

In [96]:
orders.isna().sum()

order_id        0
created_date    0
total_paid      5
state           0
dtype: int64

In [97]:
orders[orders.total_paid.isna()]

Unnamed: 0,order_id,created_date,total_paid,state
127701,427314,2017-11-20 18:54:39,,Pending
132013,431655,2017-11-22 12:15:24,,Pending
147316,447411,2017-11-27 10:32:37,,Pending
148833,448966,2017-11-27 18:54:15,,Pending
149434,449596,2017-11-27 21:52:08,,Pending


<div class="alert alert-block alert-danger">
    There are 5 NaN values in the <b>total_paid</b> column which we should remove.
</div>

In [98]:
orders_original = orders.copy()

# Remove nan values
orders_before = orders.shape[0]
orders.dropna(inplace=True)
orders_after = orders.shape[0]

rows_removed_from_orders = orders_before - orders_after
print(f'{rows_removed_from_orders} rows were removed from orders after removing NaN values')
print(f'This represents {round(rows_removed_from_orders/orders_before*100, 4)}% of the data')

5 rows were removed from orders after removing NaN values
This represents 0.0022% of the data


In [99]:
orders.duplicated().sum() # parameters keep=False
# df.drop_duplicates()

0

## Compare data with orderlines to ensure data consistency

In [110]:
orderlines = pd.read_csv('../data/orderlines.csv', 
                         dtype={'id': int, 
                                'id_order': int, 
                                'product_id': int,
                                'sku': str, 
                                'unit_price': str}, 
                         parse_dates=['date'])

In [101]:
orders

Unnamed: 0,order_id,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled
...,...,...,...,...
226904,527397,2018-03-14 13:56:38,42.99,Place Order
226905,527398,2018-03-14 13:57:25,42.99,Shopping Basket
226906,527399,2018-03-14 13:57:34,141.58,Shopping Basket
226907,527400,2018-03-14 13:57:41,19.98,Shopping Basket


In [102]:
# Find order_id values in orders that are not in orderlines
orders_not_in_orderlines = orders[~orders['order_id'].isin(orderlines['id_order'])]

print(f"There are {orders_not_in_orderlines.shape[0]} order IDs in 'orders' that are not in 'orderlines'.")

There are 22213 order IDs in 'orders' that are not in 'orderlines'.


In [103]:
orders_not_in_orderlines.state.value_counts()

state
Place Order        12304
Shopping Basket     9810
Completed             45
Cancelled             41
Pending               13
Name: count, dtype: int64

In [104]:
# Find id_order values in orderlines that are not in orders
orderlines_not_in_orders = orderlines[~orderlines['id_order'].isin(orders['order_id'])]

print(f"There are {orderlines_not_in_orders.shape[0]} order IDs in 'orderlines' that are not in 'orders'.")

There are 240 order IDs in 'orderlines' that are not in 'orders'.


**After** cleaning `orders` we will have to remove any rows which do not have a corresponding order ID in `order_lines` and then **before** cleaning `order_lines` we will remove and rows which do not have a corresponding order ID in `orders`

<div class="alert alert-block alert-info">
     <br>
     <b>After</b>  we will have to remove any rows which do not have a corresponding order ID in <b>order_lines</b>  and then <b>before</b>  cleaning <b>order_lines</b>  we will remove and rows which do not have a corresponding order ID in <b>orders</b>
    
</div>

## Data cleaning

In [105]:
orders = orders_original.copy()
print(orders.shape)

# Remove nan values
orders_before = orders.shape[0]
orders.dropna(inplace=True)
orders_after = orders.shape[0]

rows_removed_from_orders = orders_before - orders_after
print(f'{rows_removed_from_orders} rows were removed from orders after removing NaN values')
print(f'This represents {round(rows_removed_from_orders/orders_before*100, 2)}% of the data')

# Remove 22213 order ids in orders that aren't in orderlines
orders_before = orders.shape[0]
orders = orders[orders.order_id.isin(orderlines_original.id_order)]
orders_after = orders.shape[0]
rows_removed_from_orders = orders_before - orders_after

# Of the order ids in orders but not in orderlines, 12304 orders have state = Place Order and 
# 9810 have state = Shopping Basket for a total of 22114.
# There are a total of 40883 orders with state = Place Order and 117809 with state = Shopping Basket.
print('\n')
print(f'{rows_removed_from_orders} rows were removed from orders after removing order_ids which do not exist in orderlines')
print(f'This represents {rows_removed_from_orders/orders_before*100:.2f}% of the data')

orders.created_date.dt.minute

#orders.to_csv(path + 'orders_clean.csv', index=False)
orders.shape


(226909, 4)
5 rows were removed from orders after removing NaN values
This represents 0.0% of the data


22213 rows were removed from orders after removing order_ids which do not exist in orderlines
This represents 9.79% of the data


(204691, 4)

In [106]:
def start_pipeline(df):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()

def remove_missing_data(df, col):
    return df[~df[col].isna()]

def drop_unmatched_rows(df, comparison_df, col, comparison_col):
    return df[df[col].isin(comparison_df[comparison_col])]
    

print(orders_original.shape)
orders_clean = (orders_original
        .pipe(start_pipeline)
        .pipe(remove_missing_data, col='total_paid')
        .pipe(drop_unmatched_rows, comparison_df=orderlines, col='order_id', comparison_col='id_order')
)
orders_clean.shape

(226909, 4)


(204691, 4)