In [None]:
import pandas as pd
from datetime import date
from scripts.load import *

### Cleaning line item products

In [None]:
# Fetch the three raw line-item product extracts by file name.
# `all` is subscripted by file name here, so it is presumably the lookup exported by
# the star import from scripts.load rather than the builtin - verify against that module.
df_item_products1, df_item_products2, df_item_products3 = (
    all['line_item_data_products1.csv'],
    all['line_item_data_products2.csv'],
    all['line_item_data_products3.parquet'],
)
# Cell output: distinct-value count per column of the parquet extract.
df_item_products3.nunique()

In [None]:
# Stack the three product extracts row-wise into one frame.
# Row labels of the parts are kept as-is, so index values may repeat in the result.
df_item_products = pd.concat(
    objs=[df_item_products1, df_item_products2, df_item_products3],
    axis=0,
)


In [None]:
# Overview before cleaning: dimensions, distinct values per column, missing values per column.
print(
    df_item_products.shape,
    df_item_products.nunique(),
    df_item_products.isna().sum(),
    sep='\n',
)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an earlier export -
# TODO confirm). errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df_item_products = df_item_products.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Keep only the first row encountered for each order_id (keep='first' is the pandas default).
# NOTE(review): the original comment read "keep no empty values". Nothing in this call
# prefers rows without NaNs, and an order with several line items is reduced to a single
# row - confirm that this is the intended behaviour.
df_item_products = df_item_products.drop_duplicates(subset='order_id', keep='first')

In [None]:
# Post-cleaning check of the line item products table:
# dimensions, distinct values per column, missing values per column.
print(
    df_item_products.shape,
    df_item_products.nunique(),
    df_item_products.isna().sum(),
    sep='\n',
)

### Cleaning line item prices

In [None]:
# Fetch the three raw line-item price extracts by file name from `all`
# (presumably the lookup exported by scripts.load - verify against that module).
df_item_prices1, df_item_prices2, df_item_prices3 = (
    all['line_item_data_prices1.csv'],
    all['line_item_data_prices2.csv'],
    all['line_item_data_prices3.parquet'],
)

# Stack them row-wise; the parts' row labels are kept, so index values may repeat.
df_item_prices = pd.concat(
    objs=[df_item_prices1, df_item_prices2, df_item_prices3],
    axis=0,
)

In [None]:
# Overview before cleaning: dimensions, distinct values per column, missing values per column.
print(
    df_item_prices.shape,
    df_item_prices.nunique(),
    df_item_prices.isna().sum(),
    sep='\n',
)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an earlier export -
# TODO confirm). errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df_item_prices = df_item_prices.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Standardize quantity: delete every non-digit character, then cast the result to int.
# The pattern is written as a raw string because '\D' in a normal literal is an invalid
# escape sequence, which Python reports with a warning; r'\D' is the same regex without it.
# Note that the .str accessor needs string values, so this cell cannot be re-run once the
# column has been converted to int.
df_item_prices['quantity'] = (
    df_item_prices['quantity']
    .str.replace(r'\D', '', regex=True)
    .astype(int)
)

In [None]:
# Keep only the first row encountered for each order_id (keep='first' is the pandas default).
# NOTE(review): the original comment read "keep no empty values". Nothing in this call
# prefers rows without NaNs, and an order with several line items is reduced to a single
# row - confirm that this is the intended behaviour.
df_item_prices = df_item_prices.drop_duplicates(subset='order_id', keep='first')

In [None]:
# Post-cleaning check of the line item prices table:
# dimensions, distinct values per column, missing values per column.
print(
    df_item_prices.shape,
    df_item_prices.nunique(),
    df_item_prices.isna().sum(),
    sep='\n',
)

### Cleaning order data

In [None]:
# Fetch the six raw order extracts by file name from `all`
# (presumably the lookup exported by scripts.load - verify against that module).
# The html entry is indexed with [0]; presumably the loader returns a list of tables for
# html sources and the first one holds the orders - TODO confirm.
df_order1, df_order2, df_order3, df_order4, df_order5, df_order6 = (
    all['order_data_20200101-20200701.parquet'],
    all['order_data_20200701-20211001.pickle'],
    all['order_data_20211001-20220101.csv'],
    all['order_data_20220101-20221201.xlsx'],
    all['order_data_20221201-20230601.json'],
    all['order_data_20230601-20240101.html'][0],
)

In [None]:
# Remove the 'Unnamed: 0' column from the csv, xlsx and html order extracts (presumably a
# row index written out by an earlier export - TODO confirm). errors='ignore' keeps this
# cell re-runnable: once the column is gone, a second execution is a no-op instead of
# raising KeyError.
df_order3 = df_order3.drop(columns='Unnamed: 0', errors='ignore')
df_order4 = df_order4.drop(columns='Unnamed: 0', errors='ignore')
df_order6 = df_order6.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Stack the six order extracts row-wise into one frame; the parts' row labels are kept,
# so index values may repeat in the result.
df_orders = pd.concat(
    objs=[df_order1, df_order2, df_order3, df_order4, df_order5, df_order6],
    axis=0,
)
# Overview before cleaning: dimensions, distinct values per column, missing values per column.
print(
    df_orders.shape,
    df_orders.nunique(),
    df_orders.isna().sum(),
    sep='\n',
)

In [None]:
# Give the 'estimated arrival' column an underscore name so it matches the other
# snake_case column labels used in this notebook.
df_orders = df_orders.rename(columns={'estimated arrival': 'estimated_arrival'})

In [None]:
# Convert estimated_arrival to int: delete every non-digit character, then cast.
# The pattern is written as a raw string because '\D' in a normal literal is an invalid
# escape sequence, which Python reports with a warning; r'\D' is the same regex without it.
# Note that the .str accessor needs string values, so this cell cannot be re-run once the
# column has been converted to int.
df_orders['estimated_arrival'] = (
    df_orders['estimated_arrival']
    .str.replace(r'\D', '', regex=True)
    .astype(int)
)

In [None]:
# Discard transactions dated after the day this cell is executed.
# NOTE(review): the cutoff is the ISO 'YYYY-MM-DD' text of today's date, so the filter is
# only meaningful if transaction_date holds ISO-formatted strings or datetime values
# consistently across all six sources - confirm the column's format.
today = date.today().isoformat()
not_in_future = df_orders['transaction_date'] <= today
df_orders = df_orders.loc[not_in_future]

In [None]:
# Post-cleaning check of the order table:
# dimensions, distinct values per column, missing values per column.
print(
    df_orders.shape,
    df_orders.nunique(),
    df_orders.isna().sum(),
    sep='\n',
)

### Cleaning order delays

In [None]:
# Fetch the order delays table by file name from `all` (presumably the lookup exported by
# scripts.load). The entry is indexed with [0]; presumably the loader returns a list of
# tables for html sources - TODO confirm.
df_delays = all['order_delays.html'][0]

# Overview before cleaning: dimensions, distinct values per column, missing values per column.
print(
    df_delays.shape,
    df_delays.nunique(),
    df_delays.isna().sum(),
    sep='\n',
)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an earlier export -
# TODO confirm). errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df_delays = df_delays.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Post-cleaning check of the order delays table:
# dimensions, distinct values per column, missing values per column.
print(
    df_delays.shape,
    df_delays.nunique(),
    df_delays.isna().sum(),
    sep='\n',
)