In [None]:
import numpy as np
import pandas as pd
import os
# import sklearn.mo
import wrangle_mob as w

In [None]:
def acquire_mob_sales(path='mob_sales.xlsx'):
    '''
    Read the sales order tables from the sales Excel workbook.

    The workbook is opened once and three of its sheets are loaded by
    position: sheet 0 (sales orders), sheet 1 (sales order details) and
    sheet 3 (customer list).

    Parameters
    ----------
    path : str, default 'mob_sales.xlsx'
        Location of the workbook to read.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        sales, sorder_details, customer_list -- in that order.
    '''
    # passing a list of sheet positions returns a dict keyed by those positions,
    # so the workbook is parsed once instead of once per sheet
    sheets = pd.read_excel(path, sheet_name=[0, 1, 3])
    # return the 3 dataframes
    return sheets[0], sheets[1], sheets[3]

In [None]:
def acquire_mob_item_history():
    '''
    Load the item sales history table.

    Reads the sheet at position 2 of the 'mob_sales.xlsx' workbook in the
    current working directory and returns it as a dataframe.
    '''
    return pd.read_excel('mob_sales.xlsx', sheet_name=2)

In [None]:
def prepare_mob_item_history(all_sales_history):
    '''
    Reshape the raw item sales history into a weekly time series.

    Rows whose Forecast value is not True are discarded, the Lifetime and
    Forecast columns are dropped along with every column that contains a
    null, and the table is transposed so that each SKU becomes a column and
    the former column headers become the index. The index is then parsed as
    datetime, values are summed into weekly ('W') bins, and the SKU headers
    are replaced with generic 'prod_<n>' labels.
    '''
    # keep only the rows flagged Forecast == True
    flagged = all_sales_history[all_sales_history.Forecast == True]
    # drop the bookkeeping columns, then any column with missing data
    trimmed = flagged.drop(columns=['Lifetime', 'Forecast'])
    complete = trimmed.dropna(axis=1)
    # one column per SKU, one row per original column header
    by_period = complete.set_index('SKU').T
    # parse the index as datetime and sum into standard weekly bins
    by_period.index = pd.to_datetime(by_period.index)
    weekly = by_period.resample('W').sum()
    # swap the product skus for generic numbered labels (infosec)
    weekly.columns = ['prod_' + str(position)
                      for position, _ in enumerate(weekly.columns)]
    return weekly

In [None]:
def prepare_mob_sales_order(sales, sorder_details, customer_list):
    '''
    Build the prepared sales-order time series from the three raw tables.

    Steps:
      1. left-join the order details onto the orders by OrderID
      2. fill missing QtyOrdered with 1 and missing QtyShipped with 0
      3. keep only orders whose customer has Suspended == False in the
         customer list
      4. lowercase the column names, parse orderdate as datetime and use it
         as a sorted index
      5. drop rows whose ordered quantity is not a whole number and cast
         both quantity columns to int
      6. rename the nine remaining columns positionally

    Parameters
    ----------
    sales : DataFrame
        Sales orders; must contain OrderID, CustomerID and OrderDate.
    sorder_details : DataFrame
        Order lines; must contain OrderID, QtyOrdered and QtyShipped.
    customer_list : DataFrame
        Customers; must contain CustomerID and Suspended.

    Returns
    -------
    DataFrame
        Indexed by order date (ascending) with columns order_id, order_no,
        customer_id, order_status, order_amount, seq, qty_ordered,
        qty_shipped, item_id.

    Raises
    ------
    ValueError
        If the merged table does not have exactly nine columns left to
        rename, or if an order date does not match the '%Y-%m-%d' format.
    '''
    # merge sales orders with the sales order details
    sales_orders = pd.merge(left=sales,
                            right=sorder_details,
                            how='left',
                            on='OrderID')
    # fill na values; done on the frame (not with inplace=True on a column
    # accessed by attribute) so the fill is applied under every pandas version
    sales_orders = sales_orders.fillna({'QtyOrdered': 1, 'QtyShipped': 0})
    # customers that are NOT suspended; joining against this list leaves a
    # null Suspended value on every order placed by a suspended customer
    active_customers = customer_list.loc[customer_list['Suspended'] == False,
                                         ['CustomerID', 'Suspended']]
    sorders = pd.merge(left=sales_orders,
                       right=active_customers,
                       how='left',
                       on='CustomerID')
    # drop the orders of suspended customers, then the helper column
    sorders = sorders[sorders['Suspended'].notna()].drop(columns='Suspended')
    # change column names to lowercase
    sorders.columns = [col.lower() for col in sorders.columns]
    # make sure the order dates are in datetime format; this has to target the
    # orderdate column, since the row index at this point is only row labels
    sorders['orderdate'] = pd.to_datetime(sorders['orderdate'], format='%Y-%m-%d')
    # index by order date so the data can be handled as a time series
    sorders = sorders.set_index('orderdate').sort_index()
    # keep only rows whose ordered quantity is a whole number
    is_whole = sorders['qtyordered'].astype(int) == sorders['qtyordered']
    # cast the quantity columns to int on a fresh frame rather than assigning
    # into the filtered slice
    sorders = sorders[is_whole].astype({'qtyordered': int, 'qtyshipped': int})
    # change column names to be more readable (positional rename)
    cols = ['order_id', 'order_no', 'customer_id', 'order_status', 'order_amount',
            'seq', 'qty_ordered', 'qty_shipped', 'item_id']
    sorders.columns = cols
    # return the prepared sales orders dataframe
    return sorders

In [None]:
def wrangle_mob_item_sales():
    '''
    Return the prepared item sales history, using a local csv as a cache.

    When 'item_history.csv' is missing from the working directory, the data
    is acquired from the excel workbook, prepared, and written to that csv
    before being returned. Otherwise the csv is read back with its first
    column as a parsed datetime index.
    '''
    cache_file = 'item_history.csv'
    # no cache yet: build the dataframe from the workbook and save it
    if not os.path.exists(cache_file):
        prepared = prepare_mob_item_history(acquire_mob_item_history())
        prepared.to_csv(cache_file)
        return prepared
    # cache present: load it straight from disk
    return pd.read_csv(cache_file, index_col=0, parse_dates=True)

In [None]:
def wrangle_mob_sales():
    '''
    Return the prepared sales order history, using a local csv as a cache.

    When 'sales_history.csv' is missing from the working directory, the three
    raw tables are acquired from the excel workbook, prepared into a single
    dataframe, and written to that csv before being returned. Otherwise the
    csv is read back with its first column as a parsed datetime index.
    '''
    cache_file = 'sales_history.csv'
    # no cache yet: build the dataframe from the workbook and save it
    if not os.path.exists(cache_file):
        raw_orders, raw_details, raw_customers = acquire_mob_sales()
        prepared = prepare_mob_sales_order(raw_orders, raw_details, raw_customers)
        prepared.to_csv(cache_file)
        return prepared
    # cache present: load it straight from disk
    return pd.read_csv(cache_file, index_col=0, parse_dates=True)

In [None]:
# Load the prepared item history through the wrangle_mob module (imported as `w`).
# NOTE(review): this calls the module's copy of the function, not the one defined
# in this notebook -- confirm the two are in sync.
items = w.wrangle_mob_item_sales()

In [None]:
items

In [None]:
# Coerce the index to datetime before inspecting it.
items.index = pd.to_datetime(items.index)

In [None]:
items.index

In [None]:
# Load the prepared sales orders through the wrangle_mob module.
sales = w.wrangle_mob_sales()

In [None]:
# sales.index = pd.to_datetime(sales.index)

In [None]:
# Inspect the index type of the prepared sales orders.
sales.index

In [None]:
# sales.resample('Y').order_amount.mean().plot()

In [None]:
# Acquire the three raw tables. `sales` is rebound here, so from this cell on it
# refers to the raw orders table rather than the prepared frame loaded earlier.
sales, sorder_details, customer_list = w.acquire_mob_sales()

In [None]:
sales.head()

In [None]:
# Number of rows per OrderDate value, sorted ascending by count.
sales.OrderDate.value_counts().sort_values()

In [None]:
# Parse OrderDate as datetime using the YYYY-MM-DD format.
sales.OrderDate = pd.to_datetime(sales.OrderDate, format='%Y-%m-%d')

In [None]:
sales.OrderDate

In [None]:
# Read the raw item sales history using the function defined in this notebook.
all_sales_history = acquire_mob_item_history()

In [None]:
all_sales_history.head()

In [None]:
# Transposed view: one column per row of the raw table.
all_sales_history.T

In [None]:
# The three descriptive columns on their own.
sale_1 = all_sales_history[['SKU', 'Forecast', 'Lifetime']]

In [None]:
sale_1

In [None]:
sale_1 = sale_1.drop(columns='Lifetime')

In [None]:
# Everything except the three descriptive columns.
sale_2 = all_sales_history.drop(
    columns=['SKU', 'Forecast', 'Lifetime'])

In [None]:
# Drop every column that contains at least one null.
sale_2 = sale_2.dropna(axis=1)

In [None]:
# Confirm no nulls remain in any column.
sale_2.isna().sum()

In [None]:
# Preview of the full preparation chain on the first five Forecast == True rows:
# filter, drop Lifetime/Forecast, drop columns with nulls, index by SKU, transpose.
all_sales_history[all_sales_history.Forecast == True
                 ].drop(columns=['Lifetime', 'Forecast'
                                ]).dropna(axis=1
                                         ).set_index('SKU').head().T

In [None]:
# Same chain without head(), kept as the working frame for the cells below.
sales_history = all_sales_history[all_sales_history.Forecast == True
                 ].drop(columns=['Lifetime', 'Forecast'
                                ]).dropna(axis=1
                                         ).set_index('SKU').T

In [None]:
sales_history

In [None]:
sales.index

In [None]:
# Read the cached item history csv directly to check how its index is parsed.
item_test = pd.read_csv('item_history.csv', index_col=0, parse_dates=True)

In [None]:
item_test.index

In [None]:
# sales_history.index = pd.to_datetime(sales_history.drop(
#     columns=['SKU', 'Desc']).index, format='%Y-%m-%d')

In [None]:
sales_history.index

In [None]:
# NOTE(review): `date` is not referenced by any later cell visible here.
date=pd.to_datetime(sales_history.index)

In [None]:
# Replace the index with its datetime form.
sales_history.index = pd.to_datetime(sales_history.index)

In [None]:
# Sum the values into weekly ('W') bins.
sales_history = sales_history.resample('W').sum()

In [None]:
# Generic labels prod_0 .. prod_<n-1>, one per column.
sku = ['prod_' + str(x) for x in range(0,len(sales_history.columns))]

In [None]:
sku

In [None]:
sales_history.columns = sku

In [None]:
# Same relabeling as the previous cell, written as a single statement.
sales_history.columns = ['prod_' + str(x) for x in range(0,len(sales_history.columns))]

In [None]:
sales_history

In [None]:
sales_history.shape

In [None]:
# Monthly totals of the weekly series.
sales_history.resample('M').sum()

In [None]:
# Alias (not a copy) of the raw sales orders frame.
sales_orders = sales

In [None]:
sales_orders.head()

In [None]:
sorder_details.head()

In [None]:
# Left-join the order details onto the orders by OrderID.
# NOTE: this cell rebinds sales_orders from itself; re-running it after the
# columns have been lowercased below fails because 'OrderID' no longer exists.
sales_orders = pd.merge(left=sales_orders, right=sorder_details, how='left', on='OrderID')

In [None]:
# Spot-check the merged rows of a single order.
sales_orders[sales_orders.OrderID == 40251]

In [None]:
# Lowercase all column names.
sales_orders.columns = [col.lower() for col in sales_orders]

In [None]:
sales_orders.head()

In [None]:
# Null count per column.
sales_orders.isna().sum()

In [None]:
# Rows with a missing qtyordered.
sales_orders[sales_orders.qtyordered.isna() == True]

In [None]:
# All rows for customer 213 -- presumably the customer on the row found above.
sales_orders[sales_orders.customerid == 213]

#### The only null in qtyordered has a qtyshipped of 1 and a completed status (3), so let's fill qtyordered with 1

In [None]:
# Fill the missing qtyordered with 1.
sales_orders.qtyordered = sales_orders.qtyordered.fillna(1)

In [None]:
# Should now return no rows.
sales_orders[sales_orders.qtyordered.isna() == True]

In [None]:
# Earliest and latest order date among rows with a missing qtyshipped.
sales_orders[sales_orders.qtyshipped.isna() == True].orderdate.agg(['min','max'])

In [None]:
# Rows with a missing qtyshipped whose order date is in 2019 or later.
sales_orders[(sales_orders.qtyshipped.isna() == True) & 
             (sales_orders.orderdate >= '2019')]

#### Let's fill the null values in qtyshipped with 0

In [None]:
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column reached by attribute access: that chained in-place form warns on recent
# pandas and leaves the frame untouched when Copy-on-Write is enabled.
sales_orders.qtyshipped = sales_orders.qtyshipped.fillna(0)

In [None]:
# Should now return no rows.
sales_orders[sales_orders.qtyshipped.isna() == True]

In [None]:
# Null count per column after the fills.
sales_orders.isna().sum()

In [None]:
customer_list.head()

In [None]:
# Despite the name, this holds the customers with Suspended == False.
suspended_list = customer_list[['CustomerID', 'Suspended'
                               ]][customer_list.Suspended == False]

In [None]:
suspended_list

In [None]:
# Lowercase the column names so the join key matches sales_orders.
suspended_list.columns = [col.lower() for col in suspended_list]

In [None]:
# Left-join on customerid; orders whose customer is not in suspended_list get a
# null in the suspended column.
sorders = pd.merge(left=sales_orders, right=suspended_list, how='left', on='customerid')

In [None]:
sorders

In [None]:
# Keep only rows that matched a customer in the list, then drop the helper column.
sorders = sorders[sorders.suspended.isna() == False].drop(columns='suspended')

In [None]:
sorders.info()

In [None]:
# Index by order date, sorted ascending.
sorders = sorders.set_index('orderdate').sort_index()

In [None]:
sorders.qtyordered.value_counts()

In [None]:
# Number of rows whose qtyordered changes when cast to int (non-whole values).
(sorders.qtyordered.astype(int) != sorders.qtyordered).sum()

In [None]:
# Same check for qtyshipped.
(sorders.qtyshipped.astype(int) != sorders.qtyshipped).sum()

In [None]:
# Show the rows with a non-whole qtyordered.
sorders[sorders.qtyordered.astype(int) != sorders.qtyordered]

#### There is only one row that has decimal values in qtyordered or qtyshipped

In [None]:
# Quick arithmetic check on the odd row -- presumably its order amount divided by
# a quantity to get a per-unit price; TODO confirm which figures these are.
1302.12 / 15

#### I know that this itemid is a product priced between 40 and 60 dollars, which doesn't add up with the figure computed above, so let's drop this row

In [None]:
# Keep only the rows whose qtyordered is unchanged by an int cast (whole numbers).
sorders = sorders[~(sorders.qtyordered.astype(int) != sorders.qtyordered)]

In [None]:
sorders

#### Now we can change qtyordered and qtyshipped to int

In [None]:
# Cast both quantity columns in a single astype call that returns a new frame;
# assigning the columns onto the boolean-filtered `sorders` one at a time raises
# SettingWithCopyWarning.
sorders = sorders.astype({'qtyordered': int, 'qtyshipped': int})

In [None]:
# Confirm the quantity columns are now integer typed.
sorders.info()

In [None]:
items.info()

In [None]:
# Current column names, in order, as the basis for the positional rename below.
sorders.columns.to_list()

In [None]:
# Readable replacements, listed in the same order as the existing columns.
cols = ['order_id', 'order_no', 'customer_id', 'order_status', 'order_amount',
     'seq', 'qty_ordered', 'qty_shipped', 'item_id']

In [None]:
# Positional rename: raises if the number of names differs from the number of columns.
sorders.columns = cols

In [None]:
sorders.columns

In [None]:
# Largest value in the order-date index.
sorders.index.max()