In [None]:
import pandas as pd
from scripts.load import *

### Cleaning order data

In [None]:
# Fetch the three raw order extracts (two parquet files, one CSV) by file name
# and report the shape of each.
# NOTE(review): `all` is subscripted by file name, so it is not the builtin --
# presumably it is a mapping exported by the star import from scripts.load; confirm.
df_order1, df_order2, df_order3 = (
    all[source_name]
    for source_name in (
        'order_with_merchant_data1.parquet',
        'order_with_merchant_data2.parquet',
        'order_with_merchant_data3.csv',
    )
)

print(df_order1.shape, df_order2.shape, df_order3.shape, sep='\n')

In [None]:
# Drop the unnecessary 'Unnamed: 0' indexing column from the CSV extract.
# errors='ignore' makes the cell idempotent: once the column is gone, re-running
# the cell is a no-op instead of raising KeyError.
df_order3 = df_order3.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Stack the three order extracts into a single frame. ignore_index=True discards
# each file's own row labels and renumbers the combined rows from 0.
df_orders = pd.concat(
    objs=(df_order1, df_order2, df_order3),
    ignore_index=True,
)

In [None]:
# Sanity check on the combined order data (order_with_merchant_data 1-3):
# overall shape, distinct values per column, and missing values per column.
print(
    df_orders.shape,
    df_orders.nunique(),
    df_orders.isna().sum(),
    sep='\n',
)

### Cleaning merchant data

In [None]:
# Load the merchant table: take element [0] of the 'merchant_data.html' entry.
# NOTE(review): presumably that entry is the list of tables parsed from the HTML
# file -- confirm against scripts.load.
df_merchant_data = all['merchant_data.html'][0]

# Pre-cleaning profile: shape, distinct values per column, missing values per column.
print(
    df_merchant_data.shape,
    df_merchant_data.nunique(),
    df_merchant_data.isna().sum(),
    sep='\n',
)

In [None]:
# Remove the 'Unnamed: 0' column.
# errors='ignore' makes the cell idempotent: once the column is gone, re-running
# the cell is a no-op instead of raising KeyError. `columns=` is used for
# consistency with the order-data cleanup.
df_merchant_data = df_merchant_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# De-duplicate merchants: order rows by merchant_id, then creation_date, so the
# row with the smallest creation_date comes first within each merchant_id, and
# keep only that first row per merchant_id.
# NOTE(review): assumes creation_date sorts chronologically (datetime dtype or
# ISO-formatted text) -- confirm.
df_merchant_data = (
    df_merchant_data
    .sort_values(['merchant_id', 'creation_date'])
    .drop_duplicates('merchant_id', keep='first')
)

In [None]:
# Standardize contact_number in two passes:
#   1. turn every literal '.' separator into '-';
#   2. strip every character that is not a digit, '+', '(', ')' or '-'.
# The patterns are raw strings: '\.' in a normal string literal is an invalid
# escape sequence and triggers a DeprecationWarning/SyntaxWarning.
df_merchant_data['contact_number'] = (
    df_merchant_data['contact_number']
    .str.replace(r'\.', '-', regex=True)
    .str.replace(r'[^0-9+()-]', '', regex=True)
)

In [None]:
# Title-case the address parts (street, state, city); country is left untouched.
for address_field in ('street', 'state', 'city'):
    df_merchant_data[address_field] = df_merchant_data[address_field].str.title()


In [None]:
# Post-cleaning check on the merchant data: shape, distinct values per column,
# and missing values per column.
print(
    df_merchant_data.shape,
    df_merchant_data.nunique(),
    df_merchant_data.isna().sum(),
    sep='\n',
)

### Cleaning staff data

In [None]:
# Load the staff table: take element [0] of the 'staff_data.html' entry.
# NOTE(review): presumably that entry is the list of tables parsed from the HTML
# file -- confirm against scripts.load.
df_staff_data = all['staff_data.html'][0]

# Pre-cleaning profile: shape, distinct values per column, missing values per column.
print(
    df_staff_data.shape,
    df_staff_data.nunique(),
    df_staff_data.isna().sum(),
    sep='\n',
)

In [None]:
# Remove the 'Unnamed: 0' column.
# errors='ignore' makes the cell idempotent: once the column is gone, re-running
# the cell is a no-op instead of raising KeyError. `columns=` is used for
# consistency with the order-data cleanup.
df_staff_data = df_staff_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# De-duplicate staff: order rows by staff_id, then creation_date, so the row
# with the smallest creation_date comes first within each staff_id, and keep
# only that first row per staff_id.
# NOTE(review): assumes creation_date sorts chronologically (datetime dtype or
# ISO-formatted text) -- confirm.
df_staff_data = (
    df_staff_data
    .sort_values(['staff_id', 'creation_date'])
    .drop_duplicates('staff_id', keep='first')
)

In [None]:
# Standardize contact_number in two passes:
#   1. turn every literal '.' separator into '-';
#   2. strip every character that is not a digit, '+', '(', ')' or '-'.
# The patterns are raw strings: '\.' in a normal string literal is an invalid
# escape sequence and triggers a DeprecationWarning/SyntaxWarning.
df_staff_data['contact_number'] = (
    df_staff_data['contact_number']
    .str.replace(r'\.', '-', regex=True)
    .str.replace(r'[^0-9+()-]', '', regex=True)
)

In [None]:
# Title-case the free-text staff fields (name, job level and the address parts);
# country is left untouched.
for text_field in ('name', 'job_level', 'street', 'state', 'city'):
    df_staff_data[text_field] = df_staff_data[text_field].str.title()

In [None]:
# Post-cleaning check on the staff data: shape, distinct values per column,
# and missing values per column.
print(
    df_staff_data.shape,
    df_staff_data.nunique(),
    df_staff_data.isna().sum(),
    sep='\n',
)