# orderlines.csv 
Each row represents one product line within an order, so an order that contains several different products spans several rows.

* **id** – a unique identifier for each row in this file
* **id_order** – corresponds to orders.order_id
* **product_id** – an old identifier for each product, nowadays not in use
* **product_quantity** – how many units of that product were purchased on that order
* **sku** – stock keeping unit: a unique identifier for each product
* **unit_price** – the unitary price (in euros) of each product at the moment of placing that order
* **date** – timestamp for the processing of that product

## Import the data

In [38]:
import pandas as pd

# Read the order lines from the CSV file (the path is relative to the notebook).
orderlines = pd.read_csv('../data/orderlines.csv')
# Preview the first rows.
orderlines.head()

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
0,1119109,299539,0,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,0,1,LGE0043,399.0,2017-01-01 00:19:45
2,1119111,299541,0,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,0,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,0,1,JBL0104,23.74,2017-01-01 01:06:38


## Data exploration

In [39]:
# Column names, non-null counts and dtypes.
orderlines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


<div class="alert alert-block alert-danger">
    Two variables need to be modified: 
    <br>
    <span>&#8226;</span> <b>unit_price</b>: it is detected as an object but it should be a float.
    <br>
    <span>&#8226;</span> <b>date</b>: needs to be transformed to a date time format.
</div>

In [40]:
# Basic size of the loaded table: rows, columns and total number of cells.
# Uses `orderlines`, the frame loaded above, so the cell runs on a fresh kernel.
print("The number of rows is", orderlines.shape[0])
print("The number of columns is", orderlines.shape[1])
print("The number of values is ", orderlines.size)

The number of rows is 293983
The number of columns is 7
The number of values is  2057881


In [43]:
# Summary statistics; unit_price does not appear because it is still an object column.
orderlines.describe()

Unnamed: 0,id,id_order,product_id,product_quantity,date
count,293983.0,293983.0,293983.0,293983.0,293983
mean,1397918.0,419999.116544,0.0,1.121126,2017-09-19 03:19:26.305779712
min,1119109.0,241319.0,0.0,1.0,2017-01-01 00:07:19
25%,1262542.0,362258.5,0.0,1.0,2017-06-06 16:20:34.500000
50%,1406940.0,425956.0,0.0,1.0,2017-11-13 21:13:53
75%,1531322.0,478657.0,0.0,1.0,2018-01-02 04:47:03
max,1650203.0,527401.0,0.0,999.0,2018-03-14 13:58:36
std,153009.6,66344.486479,0.0,3.396569,


In [19]:
# Number of distinct values in each column.
orderlines.nunique()

id                  293983
id_order            204855
product_id               1
product_quantity        67
sku                   7951
unit_price           11329
date                251631
dtype: int64

In [21]:
# Look for any row whose product_id is something other than 0.
orderlines.query('product_id != 0')

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date


We should drop the `product_id` column because it carries no information: every row has the value 0.

In [22]:
# First ten distinct SKUs, in order of first appearance.
orderlines['sku'].drop_duplicates().head(10).tolist()

['OTT0133',
 'LGE0043',
 'PAR0071',
 'WDT0315',
 'JBL0104',
 'WDT0249',
 'APP1582',
 'OWC0100',
 'IOT0014',
 'APP0700']

In [23]:
# Number of missing values in each column.
orderlines.isna().sum()

id                  0
id_order            0
product_id          0
product_quantity    0
sku                 0
unit_price          0
date                0
dtype: int64

In [24]:
# Count rows that exactly repeat an earlier row (duplicated() defaults to keep='first');
# pass keep=False to flag every copy of a repeated row instead.
orderlines.duplicated().sum()
# If any duplicates were found, orderlines.drop_duplicates() would remove them.

0

In [25]:
# The five rows with the lowest product_quantity.
orderlines.nsmallest(5, 'product_quantity')

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
0,1119109,299539,0,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,0,1,LGE0043,399.0,2017-01-01 00:19:45
2,1119111,299541,0,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,0,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,0,1,JBL0104,23.74,2017-01-01 01:06:38


In [26]:
# The five rows with the highest product_quantity.
orderlines.nlargest(5, 'product_quantity')

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
53860,1228150,346221,0,999,APP1190,55.99,2017-04-14 21:50:52
68712,1254032,358747,0,999,SEV0028,19.99,2017-05-24 14:51:58
57796,1234924,349475,0,800,KIN0137,7.49,2017-04-25 09:59:00
57306,1234111,349133,0,555,APP0665,70.99,2017-04-24 10:20:13
40813,1204788,335057,0,201,THU0029,80.99,2017-03-14 15:25:53


### Explore the corrupted values in the unit_price column

In [42]:
# Parse the timestamps into datetime values.
orderlines['date'] = pd.to_datetime(orderlines['date'])

# Some unit_price strings cannot be parsed as numbers, so this conversion raises.
# The error is the point of this cell: catch it and display the message so the
# notebook still runs from top to bottom. When parsing fails the column is left
# unchanged (still strings).
try:
    orderlines['unit_price'] = pd.to_numeric(orderlines['unit_price'], errors='raise')
except ValueError as parse_error:
    print(f"ValueError: {parse_error}")

ValueError: Unable to parse string "1.137.99" at position 6

<div class="alert alert-block alert-danger">
    There are corrupted values in the <b>unit_price</b> column.
</div>

In [61]:
# we create a copy of the dataset
# Build the scratch frame in one step: assign() returns a new DataFrame, so
# orderlines itself is left untouched. The extra column holds the number of
# dots found in each unit_price string.
orderlines_temp = orderlines.assign(
    num_decimals=orderlines['unit_price'].str.count(r'\.')
)

# keep only the prices that contain more than one dot
orderlines_temp.query('num_decimals > 1')

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date,num_decimals
6,1119115,299544,0,1,APP1582,1.137.99,2017-01-01 01:17:21,2
11,1119126,299549,0,1,PAC0929,2.565.99,2017-01-01 02:07:42,2
15,1119131,299553,0,1,APP1854,3.278.99,2017-01-01 02:14:47,2
43,1119195,299582,0,1,PAC0961,2.616.99,2017-01-01 08:54:00,2
59,1119214,299596,0,1,PAC1599,2.873.99,2017-01-01 09:53:11,2
...,...,...,...,...,...,...,...,...
293862,1649999,452946,0,1,APP2075,2.999.00,2018-03-14 13:03:33,2
293887,1650045,527321,0,1,PAC2148,3.497.00,2018-03-14 13:10:15,2
293889,1650050,527324,0,1,PAC2117,3.075.00,2018-03-14 13:10:56,2
293911,1650088,527342,0,1,APP2492,1.329.00,2018-03-14 13:24:51,2


In [62]:
# Summarise the dot counts of the multi-dot prices only.
orderlines_temp.loc[orderlines_temp['num_decimals'] > 1, 'num_decimals'].describe()

count    36169.0
mean         2.0
std          0.0
min          2.0
25%          2.0
50%          2.0
75%          2.0
max          2.0
Name: num_decimals, dtype: float64

<div class="alert alert-block alert-danger">
    There are 36169 values which have been corrupted. These values have two decimal points.
    <br>
    We will remove the leftmost of the two dots and convert the values to floats.
    <br>
    We will then compare the values to those in products and orders to deduce the correct position for the decimal point.
</div>

In [65]:
# Load the orders table, then look up order 299544 (one of the orders with a
# multi-dot unit_price) to see which total was recorded for it.
orders = pd.read_csv('../data/orders.csv')
orders.query('order_id == 299544')

Unnamed: 0,order_id,created_date,total_paid,state
206,299544,2017-01-01 01:17:21,1137.99,Shopping Basket


After checking multiple values we can conclude that simply removing the first decimal point (from left) will fix the values.

### Check for order_ids which exist in orderlines but not in orders

In [68]:
# Order lines whose id_order is absent from orders.order_id.
orderlines.loc[~orderlines['id_order'].isin(orders['order_id'])]

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
5,1119114,295310,0,10,WDT0249,231.79,2017-01-01 01:14:27
63,1119218,296284,0,1,BNQ0042,699.00,2017-01-01 09:58:35
67,1119223,294806,0,1,APP1849,2.558.99,2017-01-01 10:09:15
69,1119226,294806,0,1,APP1864,2.797.99,2017-01-01 10:15:14
70,1119235,297261,0,1,QNA0177,304.99,2017-01-01 10:17:59
...,...,...,...,...,...,...,...
275665,1621177,244328,0,1,OWC0260,349.00,2018-02-20 13:00:45
280055,1628699,261391,0,1,APP2352,3.343.00,2018-02-27 11:43:25
280352,1629247,287797,0,1,SYN0182,484.11,2018-02-27 19:16:26
280856,1630150,261391,0,1,SAT0091,44.99,2018-02-28 12:18:26


<div class="alert alert-block alert-danger">
    There are 234 order lines whose order ID does not appear in orders; these rows need to be removed.
</div>

In [71]:
# unit_price is still stored as text, so comparing it directly with the float 6.59
# can never match. Coerce to numbers for the comparison only (strings that cannot
# be parsed become NaN and simply do not match); the column itself is not modified.
orderlines.loc[pd.to_numeric(orderlines.unit_price, errors='coerce') == 6.59]

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date


In [84]:
orderlines  # show the whole frame; the rendered output is truncated to the first and last rows

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
0,1119109,299539,0,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,0,1,LGE0043,399.00,2017-01-01 00:19:45
2,1119111,299541,0,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,0,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,0,1,JBL0104,23.74,2017-01-01 01:06:38
...,...,...,...,...,...,...,...
293978,1650199,527398,0,1,JBL0122,42.99,2018-03-14 13:57:25
293979,1650200,527399,0,1,PAC0653,141.58,2018-03-14 13:57:34
293980,1650201,527400,0,2,APP0698,9.99,2018-03-14 13:57:41
293981,1650202,527388,0,1,BEZ0204,19.99,2018-03-14 13:58:01


## Data cleaning

In [94]:
def start_pipeline(df):
    '''Return a copy of ``df`` so the following pipeline steps do not touch the original DataFrame.'''
    duplicate = df.copy()
    return duplicate

def remove_missing_data(df, col):
    '''Keep only the rows of ``df`` whose value in column ``col`` is not missing.'''
    is_present = df[col].notna()
    return df[is_present]

def drop_unmatched_rows(df, comparison_df, col, comparison_col):
    '''Keep only the rows of ``df`` whose ``col`` value occurs in ``comparison_df[comparison_col]``.'''
    allowed_values = comparison_df[comparison_col]
    has_match = df[col].isin(allowed_values)
    return df[has_match]
    

# Work on a copy of orderlines and keep only the rows whose id_order
# also exists in orders.order_id, then report how many rows were dropped.
orderlines_clean = (
    orderlines
    .pipe(start_pipeline)
    .pipe(drop_unmatched_rows, comparison_df=orders, col='id_order', comparison_col='order_id')
)
print(f"{orderlines.shape[0] - orderlines_clean.shape[0]} rows were removed.")


234 rows were removed.


<div class="alert alert-block alert-info">
    Upon further analysis, since the price data is so corrupted the data will have to be merged into a single file.
    <br>
    It makes more sense to remove unmatched orders once the data has been merged.
</div>