# Data Warehousing - Spring 2021
## HW 4 - Grocery Chain Data
### Group 3, Region A: Badal Juneja, Lucas Grebe, Luis Torres

In [1]:
import calendar
import collections
import datetime
import os
import random
import sqlite3

import numpy as np
import pandas as pd

In [2]:
pd.options.display.max_rows = 999

In [3]:
class SQLTableMaker():
    """Context manager that (re)creates one SQLite table and inserts rows into it.

    Parameters
    ----------
    filepath : str
        Path handed to ``sqlite3.connect``.
    table_name : str
        Name of the table. It is treated as a single identifier and quoted,
        so names containing spaces or keywords work. A name that is already
        wrapped in double quotes is used as given.
    table_column_name_and_type_dict : dict
        Mapping of column name -> SQLite type, in column order.

    Nothing is persisted until ``commit()`` is called.
    """

    def __init__(self, filepath, table_name, table_column_name_and_type_dict):
        self.filepath = filepath
        self.table_name = table_name
        self.table_column_name_and_type_dict = table_column_name_and_type_dict

    def __enter__(self):
        self.con = sqlite3.connect(self.filepath)
        self.cur = self.con.cursor()

        return self

    def __exit__(self, *exc):
        # Close the connection even if closing the cursor fails.
        try:
            self.cur.close()
        finally:
            self.con.close()

    @staticmethod
    def _quote_identifier(identifier):
        """Return ``identifier`` as a double-quoted SQL identifier."""
        escaped = str(identifier).replace('"', '""')
        return f'"{escaped}"'

    def _quoted_table_name(self):
        """Return the table name quoted, unless the caller already quoted it."""
        name = str(self.table_name)
        if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
            return name
        return self._quote_identifier(name)

    def _build_insert_query(self, column_names, placeholder):
        """Build the parameterized INSERT statement shared by both insert methods."""
        return f'INSERT INTO {self._quoted_table_name()} ({column_names}) VALUES ({placeholder})'

    def add_table(self):
        """Drop the table if it exists and create it again from the column dict."""
        column_definitions = ', '.join(
            f'{self._quote_identifier(name)} {sql_type}'
            for name, sql_type in self.table_column_name_and_type_dict.items()
        )
        table = self._quoted_table_name()

        self.cur.execute(f'drop table if exists {table}')
        self.cur.execute(f'create Table {table}({column_definitions})')

    def add_row_to_table_from_column_names_and_value_tuple(self, column_names, value_tuple, placeholder=None, number_of_columns=None):
        """Insert a single row.

        ``column_names`` is the ready-made, comma-separated (and already
        quoted) column list. Either ``placeholder`` (e.g. ``'?, ?'``) or
        ``number_of_columns`` must be given.

        Raises
        ------
        ValueError
            If neither ``placeholder`` nor ``number_of_columns`` is provided.
        """
        if not placeholder:
            if not number_of_columns:
                raise ValueError('Must provide either placeholder or number_of_columns')
            placeholder = ', '.join('?' * number_of_columns)

        query = self._build_insert_query(column_names, placeholder)

        # Best effort: an operational error is reported and the row is skipped.
        try:
            self.cur.execute(query, value_tuple)
        except sqlite3.OperationalError as err:
            print(f'insert error: {err}')

    def add_multiple_rows_to_table_from_column_names_and_list_of_value_tuples(self, column_names, number_of_columns, list_of_value_tuples):
        """Insert many rows with a single ``executemany`` call."""
        placeholder = ', '.join('?' * number_of_columns)
        query = self._build_insert_query(column_names, placeholder)

        # Best effort: an operational error is reported and the batch is skipped.
        try:
            self.cur.executemany(query, list_of_value_tuples)
        except sqlite3.OperationalError as err:
            print(f'insert error: {err}')

    def commit(self):
        """Commit the pending inserts on the underlying connection."""
        self.con.commit()
            

In [52]:
class ProductsInfoGetter():
    """Load the product catalogue and derive prices, item groups and day-1 inventory.

    Parameters
    ----------
    products_filepath : str
        Pipe-delimited product file. The code reads the columns ``Size``,
        ``BasePrice``, ``itemType`` and ``SKU`` from it.
    price_multiplier : float
        Factor applied to the base price to obtain ``Sale Price``.
    """

    def __init__(self, products_filepath, price_multiplier):
        self.products_filepath = products_filepath
        self.price_multiplier = price_multiplier
        # Item types that are tracked individually; everything else is grouped as 'Rest'.
        self.special_items_list = ['Bread', 'Milk', 'Cereal', 'Baby Food', 'Peanut Butter', 'Diapers', 'Jelly/Jam']

    def initiate_everything(self, single_day_inventory_file='single_day_inventory.csv'):
        """Run every derivation step in order and store the results on the instance.

        ``single_day_inventory_file`` is the CSV with one row per item group;
        it defaults to the file name the notebook has always used.
        """
        self.products_df = self.get_products_dataframe_and_prices()
        self.products_df_with_daily_sold_count = self.add_inventory_required_to_products_df(single_day_inventory_file)
        self.products_data_dict = self.generate_special_product_and_rest_data_dict()
        self.inventory_df = self.initiate_a_reference_inventory_dataframe_for_day_1()

    def get_products_dataframe_and_prices(self):
        """Read the product file and add price and item-group columns.

        Adds ``base_price_dollar`` (numeric part of ``BasePrice``),
        ``Sale Price`` (base price times the multiplier, rounded to cents) and
        ``limited_itemtypes`` (the item type for special items, else 'Rest').
        """
        products_df = pd.read_csv(self.products_filepath, sep='|')
        # Replace every non-ASCII character (e.g. a non-breaking space) with a plain space.
        products_df['Size'] = products_df['Size'].str.replace(r'[^\x00-\x7F]', ' ', regex=True)
        # Raw string: '\$' in a normal string literal is an invalid escape sequence.
        base_price = products_df['BasePrice'].str.extract(r'\$(.+)', expand=False)
        products_df['base_price_dollar'] = pd.to_numeric(base_price)
        products_df['Sale Price'] = np.round(products_df['base_price_dollar'] * self.price_multiplier, 2)
        is_special = products_df['itemType'].isin(self.special_items_list)
        products_df['limited_itemtypes'] = np.where(is_special, products_df['itemType'], 'Rest')
        return products_df

    def add_inventory_required_to_products_df(self, single_day_inventory_file):
        """Merge per-group daily counts onto the products and compute ``minimum_inventory``.

        The CSV must provide ``limited_itemtypes`` and ``ceil_mean_counts``
        (presumably the rounded-up mean units sold per day - confirm against
        the file). Milk keeps 1.5x that count as its minimum, every other
        group 3x, rounded up.
        """
        single_day_inventory = pd.read_csv(single_day_inventory_file)
        merged_df = self.products_df.merge(single_day_inventory, on='limited_itemtypes')
        is_milk = merged_df['limited_itemtypes'] == 'Milk'
        stock_factor = np.where(is_milk, 1.5, 3)
        merged_df['minimum_inventory'] = np.ceil(merged_df['ceil_mean_counts'] * stock_factor)
        return merged_df

    def generate_special_product_and_rest_data_dict(self):
        """Return ``{item group: [[SKU, Sale Price], ...]}`` for each special item type and 'Rest'."""
        useful_columns = ['SKU', 'Sale Price']
        item_types = self.products_df['itemType']

        products_data_dict = {}

        for item in self.special_items_list:
            products_data_dict[item] = self.products_df[item_types == item][useful_columns].to_dict(orient='split')['data']

        is_special = item_types.isin(self.special_items_list)
        products_data_dict['Rest'] = self.products_df[~is_special][useful_columns].to_dict(orient='split')['data']

        return products_data_dict

    def initiate_a_reference_inventory_dataframe_for_day_1(self):
        """Build the starting inventory, indexed by SKU.

        Stock is bought in cases of 12: ``cases`` is the number of cases
        needed to reach ``minimum_inventory`` and ``current_inventory`` is the
        resulting number of items.
        """
        inventory_df = self.products_df_with_daily_sold_count[['SKU', 'minimum_inventory']].set_index('SKU')
        inventory_df['cases'] = np.ceil(inventory_df['minimum_inventory'] / 12)
        inventory_df['current_inventory'] = inventory_df['cases'] * 12
        return inventory_df

In [5]:
class CustomerDataframeGenerator():
    """Simulate one year of customers and decide what each of them will buy.

    ``parameters_dict`` must provide ``year``, ``customer_range`` (inclusive
    low/high customers per day), ``weekend_customer_boost``, ``inventory_days``
    (weekday numbers, Monday == 0), ``items_range`` (inclusive low/high items
    per customer) and ``probability_dict`` (purchase probabilities).
    """

    # Purchase-flag columns that count as "special" items of a customer.
    _SPECIAL_ITEM_COLUMNS = [
        'will_buy_bread', 'will_buy_milk', 'will_buy_baby_food', 'will_buy_peanut_butter',
        'will_buy_cereal', 'will_buy_diapers', 'will_buy_jelly',
    ]

    def __init__(self, parameters_dict):
        self.year = parameters_dict['year']
        self.customer_low, self.customer_high = parameters_dict['customer_range']
        self.weekend_customer_boost = parameters_dict['weekend_customer_boost']
        self.inventory_days = parameters_dict['inventory_days']
        self.items_range = parameters_dict['items_range']
        self.probability_dict = parameters_dict['probability_dict']

    def initiate_everything(self):
        """Build ``year_df`` (one row per day) and ``customers_df`` (one row per customer)."""
        self.year_df = self.generate_numnber_of_customers_for_each_day_in_a_year()
        self.customers_df = self.generate_number_and_type_of_transictions_for_each_customer()

    def generate_numnber_of_customers_for_each_day_in_a_year(self):
        """Return one row per calendar day of ``self.year``.

        Columns: ``date`` (``'YYYYMMDD'`` string), ``number_of_customers``
        (random in the configured range, plus the boost on Saturday/Sunday)
        and ``inventory_day`` (True when the weekday is in ``inventory_days``).
        """
        first_day = datetime.date(self.year, 1, 1)
        last_day = datetime.date(self.year, 12, 31)
        year_df = pd.DataFrame({'date': pd.date_range(start=first_day, end=last_day, freq='D')})

        weekday = year_df['date'].dt.weekday
        # +1 because the upper bound of randint is exclusive.
        base_customers = np.random.randint(self.customer_low, self.customer_high + 1, year_df.shape[0])
        year_df['number_of_customers'] = np.where(weekday.isin([5, 6]), base_customers + self.weekend_customer_boost, base_customers)
        year_df['inventory_day'] = weekday.isin(self.inventory_days)
        year_df['date'] = year_df['date'].dt.strftime('%Y%m%d')

        return year_df

    def _draw_purchase_flags(self, size, probability):
        """Return ``size`` random booleans that are True with ``probability``."""
        return np.random.choice(a=[True, False], size=size, p=[probability, 1 - probability])

    def generate_number_and_type_of_transictions_for_each_customer(self):
        """Return one row per customer with purchase flags, item counts and restock markers.

        Customers are numbered 1..N across the whole year. The first customer
        of each date gets ``milk_intventory_time`` True; that customer also
        gets ``inventory_time`` True when the date is an inventory day.
        """
        probability_dict = self.probability_dict

        # Repeat every date once per customer of that day.
        dates = np.repeat(self.year_df['date'].to_numpy(), self.year_df['number_of_customers'].to_numpy())
        customers_df = pd.DataFrame({'customer_number': np.arange(1, len(dates) + 1), 'date': dates})
        total_customers = customers_df.shape[0]

        customers_df['will_buy_bread'] = self._draw_purchase_flags(total_customers, probability_dict['bread'])
        customers_df['will_buy_milk'] = self._draw_purchase_flags(total_customers, probability_dict['milk'])
        customers_df['will_buy_baby_food'] = self._draw_purchase_flags(total_customers, probability_dict['baby_food'])
        customers_df['will_buy_peanut_butter'] = self._draw_purchase_flags(total_customers, probability_dict['peanut_butter'])

        customers_df = self.get_df_with_conditional_probability_vector(customers_df, 'will_buy_milk', 'will_buy_cereal', probability_dict['cereal_with_milk'], probability_dict['cereal_without_milk'])
        customers_df = self.get_df_with_conditional_probability_vector(customers_df, 'will_buy_baby_food', 'will_buy_diapers', probability_dict['diapers_with_baby_food'], probability_dict['diapers_without_baby_food'])
        # NOTE: 'jam_withmout_peanut_butter' is misspelled in the parameters dict; the key must match it.
        customers_df = self.get_df_with_conditional_probability_vector(customers_df, 'will_buy_peanut_butter', 'will_buy_jelly', probability_dict['jam_with_peanut_butter'], probability_dict['jam_withmout_peanut_butter'])

        customers_df['number_of_special_itmes'] = customers_df[self._SPECIAL_ITEM_COLUMNS].sum(axis=1)
        # +1 because the upper bound of randint is exclusive.
        customers_df['number_of_items'] = np.random.randint(self.items_range[0], self.items_range[1] + 1, total_customers)
        customers_df['number_of_items'] = self.get_well_distributed_number_of_items(customers_df)
        customers_df['number_of_items'] = customers_df['number_of_items'].astype(int)
        customers_df['number_of_regular_items'] = (customers_df['number_of_items'] - customers_df['number_of_special_itmes']).astype(int)

        customers_df = customers_df.sort_values(['date', 'customer_number']).merge(self.year_df[['date', 'inventory_day']])

        # Flag the first customer of every date without groupby.apply, whose
        # index/column handling differs between pandas versions.
        first_of_day = ~customers_df['date'].duplicated()
        customers_df['inventory_time'] = first_of_day & customers_df['inventory_day']
        customers_df['milk_intventory_time'] = first_of_day

        return customers_df

    def get_df_with_conditional_probability_vector(self, df, column, new_column_name, probability_when_true, probability_when_false):
        """Add boolean ``new_column_name`` whose probability depends on ``column``.

        Rows where ``column`` is True are True with ``probability_when_true``,
        the other rows with ``probability_when_false``. ``df`` is modified in
        place and also returned.
        """
        condition = df[column].to_numpy(dtype=bool)
        true_count = int(condition.sum())
        false_count = int(condition.size - true_count)

        # Fill a boolean array directly so the new column never holds NaN/float values.
        flags = np.empty(condition.size, dtype=bool)
        flags[condition] = np.random.choice(a=[False, True], size=true_count, p=[1 - probability_when_true, probability_when_true])
        flags[~condition] = np.random.choice(a=[False, True], size=false_count, p=[1 - probability_when_false, probability_when_false])
        df[new_column_name] = flags

        return df

    def get_well_distributed_number_of_items(self, df):
        """Return item counts in which no customer has fewer items than special items.

        A customer whose drawn ``number_of_items`` is below
        ``number_of_special_itmes`` swaps counts with a random "donor": a
        customer with no special items and more items than there are special
        item types. Swapping keeps the overall distribution of item counts.
        Each donor is used once; if no donor is left the count is raised to
        the number of special items instead of failing.
        """
        special_counts = df['number_of_special_itmes'].to_numpy()
        drawn_counts = df['number_of_items'].to_numpy()
        max_special_items = len(self._SPECIAL_ITEM_COLUMNS)

        # Positions (not index labels) of the rows that can donate their count.
        donor_positions = np.flatnonzero((drawn_counts > max_special_items) & (special_counts == 0))

        number_of_items = list(drawn_counts)

        for position, special_count in enumerate(special_counts):
            if number_of_items[position] >= special_count:
                continue

            if donor_positions.size == 0:
                number_of_items[position] = special_count
                continue

            pick = np.random.randint(0, donor_positions.size)
            donor = donor_positions[pick]
            number_of_items[position], number_of_items[donor] = number_of_items[donor], number_of_items[position]
            donor_positions = np.delete(donor_positions, pick)

        return number_of_items

    def return_value_for_first_value_false_for_rest(self, df):
        """Mark the first row of ``df`` as restock time and return ``df``.

        ``milk_intventory_time`` is True for the first row only;
        ``inventory_time`` is the first row's ``inventory_day`` value there and
        False elsewhere. Whole columns are assigned, avoiding chained
        ``df[col].iloc[0] = ...`` assignment, which does not write through
        under pandas Copy-on-Write.
        """
        is_first_row = np.zeros(len(df), dtype=bool)
        if len(df):
            is_first_row[0] = True

        df['inventory_time'] = is_first_row & df['inventory_day'].to_numpy(dtype=bool)
        df['milk_intventory_time'] = is_first_row

        return df

In [6]:
class TransactionGenerator():
    """Turn the simulated customers into transaction rows in a SQLite table.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        One row per customer with the ``will_buy_*`` flags,
        ``number_of_regular_items``, ``date``, ``customer_number`` and the
        ``inventory_time`` / ``milk_intventory_time`` restock markers.
    product_info_getter : object
        Must expose ``products_data_dict`` (item group -> ``[[SKU, price], ...]``)
        and ``inventory_df`` (indexed by SKU with ``minimum_inventory``,
        ``cases`` and ``current_inventory``).
    """

    # (customers_df flag column, products_data_dict key), in purchase order.
    _SPECIAL_PURCHASES = (
        ('will_buy_milk', 'Milk'),
        ('will_buy_baby_food', 'Baby Food'),
        ('will_buy_bread', 'Bread'),
        ('will_buy_peanut_butter', 'Peanut Butter'),
        ('will_buy_cereal', 'Cereal'),
        ('will_buy_diapers', 'Diapers'),
        ('will_buy_jelly', 'Jelly/Jam'),
    )
    _ITEMS_PER_CASE = 12

    def __init__(self, customers_df, product_info_getter):
        self.customers_df = customers_df
        self.products_data_dict = product_info_getter.products_data_dict
        self.inventory_dict = product_info_getter.inventory_df.to_dict(orient='index')
        self.milk_skus = [x[0] for x in self.products_data_dict['Milk']]

    def define_sql_table_info(self):
        """Return the transactions table schema as column name -> SQLite type."""
        return {
            'Date': 'TEXT',
            'Customer #': 'INTEGER',
            'SKU': 'INTEGER',
            'Sale Price': 'REAL',
            'Items Left': 'INTEGER',
            'Total Cases Ordered': 'INTEGER',
        }

    def get_sql_table_builder_helpers(self):
        """Cache the schema, the quoted column list and the ``?`` placeholder string."""
        self.transactions_columns_and_type_dict = self.define_sql_table_info()
        columns = list(self.transactions_columns_and_type_dict.keys())

        self.column_names = ', '.join([f'"{column}"' for column in columns])
        self.placeholder = ', '.join(['?' for _ in range(len(columns))])

    def generate_and_write_transactions_to_sql_database(self, database_filepath):
        """Recreate the ``transactions`` table and write one row per purchased item.

        Before the first customer of each day is served, inventory is
        restocked: all products on inventory days, milk only on other days.
        Progress is printed every 10000 customers. Everything is committed
        once at the end.
        """
        self.get_sql_table_builder_helpers()

        with SQLTableMaker(database_filepath, 'transactions', self.transactions_columns_and_type_dict) as sql_table_maker:

            sql_table_maker.add_table()

            for customer in self.customers_df.itertuples(index=False):

                if customer.milk_intventory_time:
                    if customer.inventory_time:
                        self.update_all_inventory(self.inventory_dict)
                    else:
                        self.update_milk_inventory(self.inventory_dict, self.milk_skus)

                if customer.customer_number % 10000 == 0:
                    print(customer.customer_number)

                for flag_column, item_group in self._SPECIAL_PURCHASES:
                    if getattr(customer, flag_column):
                        self._add_purchase_with_fallback(sql_table_maker, customer, item_group)

                for _ in range(int(customer.number_of_regular_items)):
                    self._add_purchase_with_fallback(sql_table_maker, customer, 'Rest')

            sql_table_maker.commit()

    def _add_purchase_with_fallback(self, sql_table_maker, customer, item_group):
        """Write one purchase; if the group is sold out, substitute a 'Rest' product.

        Handling the failure per item keeps one sold-out group from dropping
        the customer's remaining purchases. If 'Rest' is sold out as well the
        item is skipped.
        """
        try:
            self.add_a_row_to_sql_table(sql_table_maker, customer, self.products_data_dict[item_group], self.inventory_dict)
            return
        except ValueError as err:
            print(err)

        if item_group == 'Rest':
            return

        try:
            self.add_a_row_to_sql_table(sql_table_maker, customer, self.products_data_dict['Rest'], self.inventory_dict)
        except ValueError as err:
            print(err)

    def get_sku_price_and_inventory(self, data, inventory_dict):
        """Pick a random in-stock product from ``data`` and take one item off its stock.

        Returns ``(sku, price, items left after the sale, total cases ordered)``.

        Raises
        ------
        ValueError
            If ``data`` is empty or none of its products is in stock.
        """
        sku, price = data[random.randrange(len(data))]

        if inventory_dict[sku]['current_inventory'] <= 0:
            # The random pick is sold out: choose among the products that still
            # have stock instead of retrying blindly.
            in_stock = [(candidate_sku, candidate_price) for candidate_sku, candidate_price in data
                        if inventory_dict[candidate_sku]['current_inventory'] > 0]
            if not in_stock:
                raise ValueError('no product in stock for this item type')
            sku, price = random.choice(in_stock)

        inventory = inventory_dict[sku]
        inventory['current_inventory'] -= 1

        return sku, price, inventory['current_inventory'], inventory['cases']

    def _restock(self, inventory):
        """Order whole cases so one product is back at or above its minimum inventory."""
        if inventory['current_inventory'] <= inventory['minimum_inventory']:
            shortfall = inventory['minimum_inventory'] - inventory['current_inventory']
            number_of_cases_to_buy = np.ceil(shortfall / self._ITEMS_PER_CASE)
            inventory['cases'] = inventory['cases'] + number_of_cases_to_buy
            inventory['current_inventory'] = inventory['current_inventory'] + (number_of_cases_to_buy * self._ITEMS_PER_CASE)

    def update_all_inventory(self, inventory_dict):
        """Restock every product in ``inventory_dict`` in place and return it."""
        for inventory in inventory_dict.values():
            self._restock(inventory)

        return inventory_dict

    def update_milk_inventory(self, inventory_dict, sku_list):
        """Restock only the SKUs in ``sku_list`` in place and return ``inventory_dict``."""
        for sku in sku_list:
            self._restock(inventory_dict[sku])

        return inventory_dict

    def add_a_row_to_sql_table(self, sql_table_maker_instance, customer, data_list, inventory_dict):
        """Sell one random product from ``data_list`` to ``customer`` and insert the row."""
        transaction_details = [customer.date, customer.customer_number, *self.get_sku_price_and_inventory(data_list, inventory_dict)]
        sql_table_maker_instance.add_row_to_table_from_column_names_and_value_tuple(self.column_names, transaction_details, placeholder=self.placeholder)

In [7]:
# Simulation settings read by CustomerDataframeGenerator and ProductsInfoGetter.
parameters_dict = {
    'year': 2020,  # calendar year to simulate
    'customer_range': [1000, 1040],  # inclusive low/high number of customers per day
    'items_range': [1, 70],  # inclusive low/high number of items per customer
    'weekend_customer_boost': 50,  # extra customers added on Saturday and Sunday
    'price_multiplier': 1.05,  # sale price = base price * multiplier
    'inventory_days': [0, 2, 4], # Monday, Wednesday, Friday
    # Purchase probabilities. The '*_with_*' / '*_without_*' entries are
    # conditional on whether the paired item was bought.
    'probability_dict': {
        'milk': 0.7,
        'baby_food': 0.2,
        'bread': 0.5,
        'peanut_butter': 0.1,
        'cereal_with_milk': 0.5,
        'cereal_without_milk': 0.05,
        'diapers_with_baby_food': 0.8,
        'diapers_without_baby_food': 0.01,
        'jam_with_peanut_butter': 0.9,
        # NOTE: misspelled key; CustomerDataframeGenerator looks it up under this exact name.
        'jam_withmout_peanut_butter': 0.05
    }
}

In [53]:
# Load the product catalogue and derive sale prices and the day-1 inventory.
product_info_getter = ProductsInfoGetter('Products1.txt', parameters_dict['price_multiplier'])
product_info_getter.initiate_everything()

# Simulate the customers of the year and what each of them will buy.
customer_df_generator = CustomerDataframeGenerator(parameters_dict)
customer_df_generator.initiate_everything()

# Write one row per purchased item into the 'transactions' table of transactions.db.
transaction_generator = TransactionGenerator(customer_df_generator.customers_df, product_info_getter)
transaction_generator.generate_and_write_transactions_to_sql_database('transactions.db')

# Validation

In [9]:
class SQLQueryExecuter():
    """Context manager that opens a SQLite database and returns query results as DataFrames.

    ``filepath`` is handed to ``sqlite3.connect``; the connection (``con``)
    and cursor (``cur``) exist only inside the ``with`` block.
    """

    def __init__(self, filepath):
        self.filepath = filepath

    def __enter__(self):
        self.con = sqlite3.connect(self.filepath)
        self.cur = self.con.cursor()

        return self

    def __exit__(self, *exc):
        # Close the connection even if closing the cursor fails.
        try:
            self.cur.close()
        finally:
            self.con.close()

    def get_df_from_cursor_result(self):
        """Return the rows of the last executed statement as a DataFrame.

        Column labels are taken verbatim from ``cursor.description``, so names
        that are not valid Python identifiers (e.g. ``Customer #``) are kept.

        Raises
        ------
        ValueError
            If the last statement did not produce a result set.
        """
        if not self.cur.description:
            raise ValueError('The last executed statement did not return a result set')

        column_names = [column[0] for column in self.cur.description]
        return pd.DataFrame(self.cur.fetchall(), columns=column_names)

    def get_query_results(self, query_string):
        """Execute ``query_string`` and return its result set as a DataFrame."""
        self.cur.execute(query_string)
        return self.get_df_from_cursor_result()
    

In [10]:
# SQLite file written by the transaction generator; every validation query reads from it.
database_address = 'transactions.db'

In [11]:
# Load the complete transactions table into a DataFrame for the checks that follow.
with SQLQueryExecuter(database_address) as sql_query_executer:

    query = '''
    SELECT *
    FROM transactions
    '''

    cursor = sql_query_executer.cur
    cursor.execute(query)
    # Column labels come from the cursor metadata, rows from the result set.
    column_names = [description[0] for description in cursor.description]
    records = cursor.fetchall()
    transactions_df = pd.DataFrame(records, columns=column_names)

transactions_df

Unnamed: 0,Date,Customer #,SKU,Sale Price,Items Left,Total Cases Ordered
0,20200101,1,42357001,3.87,191,16
1,20200101,1,43690001,3.45,23,2
2,20200101,1,42476001,1.54,71,6
3,20200101,1,42735001,1.89,71,6
4,20200101,1,42545001,9.44,71,6
...,...,...,...,...,...,...
13415063,20201231,378315,44056001,1.93,31,638
13415064,20201231,378315,43770001,0.56,36,642
13415065,20201231,378315,44073001,1.46,23,630
13415066,20201231,378315,43126001,2.51,31,634


In [54]:
# Number of times every SKU was sold, most sold first.
sales_per_sku = (transactions_df
                     .groupby('SKU')['Date']
                     .count()
                     .sort_values(ascending=False)
                     .reset_index()
                     .rename({'Date': 'COUNT'}, axis=1)
                )

# Stock figures recorded on the last transaction of every SKU. The sort must be
# stable: a customer can buy the same SKU several times, and a non-stable sort
# on 'Customer #' could leave any of those rows last. With a stable sort, ties
# keep the row order of transactions_df (presumably insertion order - the
# SELECT above has no ORDER BY, so confirm if that matters).
last_transaction_per_sku = (transactions_df
                                .sort_values(['Customer #'], kind='mergesort')
                                .drop_duplicates(subset='SKU', keep='last')
                                [['SKU', 'Items Left', 'Total Cases Ordered']]
                           )

times_sold_df = (sales_per_sku
                     .merge(last_transaction_per_sku, on='SKU')
                     .merge(product_info_getter.products_df[['Product Name', 'Size', 'itemType', 'SKU']], on=['SKU'])
                )

times_sold_df.head(25)

Unnamed: 0,SKU,COUNT,Items Left,Total Cases Ordered,Product Name,Size,itemType
0,42357001,44334,54,3699,2.00% Milk,1 gal,Milk
1,42359001,44210,58,3689,Whole Milk Milk,1 gal,Milk
2,42360001,44207,73,3690,Whole Milk Milk,1/2 gal,Milk
3,42355001,44113,59,3681,1.00% Milk,1 gal,Milk
4,42358001,44027,85,3676,2.00% Milk,1/2 gal,Milk
5,42356001,43920,72,3666,1.00% Milk,1/2 gal,Milk
6,42314001,12906,42,1079,Squeeze Jelly Grape,20 oz,Jelly/Jam
7,42313001,12797,55,1071,Jelly Grape,18 oz,Jelly/Jam
8,42311001,12631,53,1057,Jam Grape,18 oz,Jelly/Jam
9,42312001,12496,44,1045,Jam Strawberry,18 oz,Jelly/Jam


In [56]:
# Save the per-SKU sales and stock summary, without the row index, for the write-up.
times_sold_df.to_csv('inventory_validation.csv', index=False)

# Validating the numbers from HW #2

### Number of Customers

In [12]:
# Count the distinct customers that appear in the transactions table.
with SQLQueryExecuter(database_address) as sql_query_executer:

    query = '''
    SELECT count(DISTINCT "Customer #") AS customer_count
    FROM transactions
    '''

    cursor = sql_query_executer.cur
    cursor.execute(query)
    column_names = [description[0] for description in cursor.description]
    records = cursor.fetchall()
    total_customers_df = pd.DataFrame(records, columns=column_names)

total_customers_df

Unnamed: 0,customer_count
0,378315


### Total Sales

In [13]:
# Count all transaction rows, i.e. the total number of items sold.
with SQLQueryExecuter(database_address) as sql_query_executer:

    query = '''
    SELECT COUNT(*) AS total_counts
    FROM transactions
    '''

    cursor = sql_query_executer.cur
    cursor.execute(query)
    column_names = [description[0] for description in cursor.description]
    records = cursor.fetchall()
    total_sales_df = pd.DataFrame(records, columns=column_names)

total_sales_df

Unnamed: 0,total_counts
0,13415068


### Total Unique Items Bought

In [14]:
# Count the distinct SKUs that were sold at least once.
with SQLQueryExecuter(database_address) as sql_query_executer:

    query = '''
    SELECT COUNT(DISTINCT SKU) AS product_count
    FROM transactions
    '''

    cursor = sql_query_executer.cur
    cursor.execute(query)
    column_names = [description[0] for description in cursor.description]
    records = cursor.fetchall()
    product_count_df = pd.DataFrame(records, columns=column_names)

product_count_df

Unnamed: 0,product_count
0,2075


### Top 10 Selling Items.

In [16]:
# Fetch the ten most frequently sold SKUs.
with SQLQueryExecuter(database_address) as sql_query_executer:

    query = '''
    SELECT SKU, COUNT(SKU) AS times_sold
    FROM transactions
    GROUP BY SKU
    ORDER BY times_sold DESC
    LIMIT 10
    '''

    cursor = sql_query_executer.cur
    cursor.execute(query)
    column_names = [description[0] for description in cursor.description]
    records = cursor.fetchall()
    top_ten_items_df = pd.DataFrame(records, columns=column_names)

# Show the top sellers together with their product details.
pd.merge(top_ten_items_df, product_info_getter.products_df, on='SKU')

Unnamed: 0,SKU,times_sold,Manufacturer,Product Name,Size,itemType,BasePrice,base_price_dollar,Sale Price,limited_itemtypes
0,42357001,44334,Rowan Dairy,2.00% Milk,1�gal,Milk,$3.69,3.69,3.87,Milk
1,42359001,44210,Rowan Dairy,Whole Milk Milk,1�gal,Milk,$3.69,3.69,3.87,Milk
2,42360001,44207,Rowan Dairy,Whole Milk Milk,1/2�gal,Milk,$1.89,1.89,1.98,Milk
3,42355001,44113,Rowan Dairy,1.00% Milk,1�gal,Milk,$3.69,3.69,3.87,Milk
4,42358001,44027,Rowan Dairy,2.00% Milk,1/2�gal,Milk,$1.89,1.89,1.98,Milk
5,42356001,43920,Rowan Dairy,1.00% Milk,1/2�gal,Milk,$1.89,1.89,1.98,Milk
6,42314001,12906,Smuckers,Squeeze Jelly Grape,20�oz,Jelly/Jam,$2.14,2.14,2.25,Jelly/Jam
7,42313001,12797,Smuckers,Jelly Grape,18�oz,Jelly/Jam,$2.29,2.29,2.4,Jelly/Jam
8,42311001,12631,Smuckers,Jam Grape,18�oz,Jelly/Jam,$2.29,2.29,2.4,Jelly/Jam
9,42312001,12496,Smuckers,Jam Strawberry,18�oz,Jelly/Jam,$3.09,3.09,3.24,Jelly/Jam


# The next few steps re-validate the output: we rewrote the generator code, so we want to confirm that all the required conditions are still met

In [17]:
# Print the number of missing values per column; every count should be 0.
for column, missing_count in transactions_df.isna().sum().items():
    print(column, missing_count)

Date 0
Customer # 0
SKU 0
Sale Price 0
Items Left 0
Total Cases Ordered 0


### Date range

In [18]:
# Distinct transaction dates ('YYYYMMDD' strings, so min/max order them chronologically).
dates = transactions_df['Date'].unique()
# First date, last date and number of distinct dates with at least one sale.
print(min(dates), max(dates), len(dates))

20200101 20201231 366


### Number of items

In [19]:
# Number of times each SKU was sold.
unique_items_df = (
    transactions_df
    .groupby('SKU')['Date']
    .count()
    .reset_index(name='Item Count')
)
unique_items_df

Unnamed: 0,SKU,Item Count
0,42081001,7575
1,42082001,7441
2,42083001,7598
3,42084001,7626
4,42085001,7587
...,...,...
2070,44156001,7471
2071,44157001,7600
2072,44158001,7517
2073,44159001,7583


### Items Per Day

In [20]:
# Number of items bought by each customer on each date.
items_df = (
    transactions_df
    .groupby(['Date', 'Customer #'])['SKU']
    .count()
    .reset_index(name='Item Count')
)
items_df

Unnamed: 0,Date,Customer #,Item Count
0,20200101,1,32
1,20200101,2,36
2,20200101,3,56
3,20200101,4,41
4,20200101,5,54
...,...,...,...
378310,20201231,378311,70
378311,20201231,378312,16
378312,20201231,378313,37
378313,20201231,378314,13


In [21]:
# Smallest and largest basket size; compare with the configured items_range.
items_df['Item Count'].agg(['min', 'max'])

min     1
max    70
Name: Item Count, dtype: int64

In [22]:
# How many customers bought 1, 2, ..., 20 items (first 20 basket sizes in ascending order).
items_df['Item Count'].value_counts().sort_index().head(20)

1     5378
2     5387
3     5401
4     5488
5     5421
6     5279
7     5462
8     5357
9     5407
10    5321
11    5377
12    5512
13    5469
14    5389
15    5504
16    5585
17    5423
18    5391
19    5360
20    5408
Name: Item Count, dtype: int64

### Customers Per Day

In [23]:
# Customers per day, with each day labelled as weekday or weekend.
customer_count_df = (
    items_df
    .groupby(['Date'])['Customer #']
    .count()
    .reset_index(name='Customer Count')
)
customer_count_df['Date'] = pd.to_datetime(customer_count_df['Date'])
# weekday 5 and 6 are Saturday and Sunday
customer_count_df['day_type'] = customer_count_df['Date'].dt.weekday.map(
    lambda weekday: 'weekend' if weekday in (5, 6) else 'weekday'
)
customer_count_df.head()

Unnamed: 0,Date,Customer Count,day_type
0,2020-01-01,1004,weekday
1,2020-01-02,1030,weekday
2,2020-01-03,1005,weekday
3,2020-01-04,1073,weekend
4,2020-01-05,1071,weekend


In [24]:
# Summary statistics of daily customer counts, separately for weekdays and weekends.
customer_count_df.groupby(['day_type'])['Customer Count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
day_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
weekday,262.0,1019.706107,11.938278,1000.0,1008.0,1019.0,1030.0,1040.0
weekend,104.0,1068.769231,11.629583,1050.0,1059.0,1068.0,1078.25,1089.0


### Show that a customer can buy the same item multiple times

In [25]:
# (customer, SKU) pairs where the customer bought the same SKU more than once,
# most repeated first.
items_frequency_df = (
    transactions_df
    .groupby(['Customer #', 'SKU'])['Date']
    .count()
    .reset_index(name='MultipleItemsCount')
)
items_frequency_df = (
    items_frequency_df[items_frequency_df['MultipleItemsCount'] > 1]
    .sort_values('MultipleItemsCount', ascending=False)
)
items_frequency_df

Unnamed: 0,Customer #,SKU,MultipleItemsCount
1792300,51222,42103001,4
12038494,343731,43911001,4
11273175,321827,42451001,4
13231839,377895,43136001,4
7777342,222012,42894001,4
...,...,...,...
4446316,126856,42322001,2
4446429,126859,42962001,2
4446433,126859,43143001,2
4446526,126861,43657001,2


### Now for the hard part. Validating probabilities of special items

In [26]:
# Make sure SKU is numeric so it matches the SKU column of the product table.
transactions_df['SKU'] = pd.to_numeric(transactions_df['SKU'])

# Attach the item type of every sold product to its transaction row.
big_df = pd.merge(
    transactions_df,
    product_info_getter.products_df[['SKU', 'itemType']],
    on='SKU',
)
big_df

Unnamed: 0,Date,Customer #,SKU,Sale Price,Items Left,Total Cases Ordered,itemType
0,20200101,1,42357001,3.87,191,16,Milk
1,20200101,3,42357001,3.87,190,16,Milk
2,20200101,4,42357001,3.87,189,16,Milk
3,20200101,10,42357001,3.87,188,16,Milk
4,20200101,19,42357001,3.87,187,16,Milk
...,...,...,...,...,...,...,...
13415063,20201228,375206,43352001,0.72,14,39,Baby Food
13415064,20201229,375872,43352001,0.72,13,39,Baby Food
13415065,20201229,376251,43352001,0.72,12,39,Baby Food
13415066,20201230,376481,43352001,0.72,11,39,Baby Food


In [27]:
# All transaction rows of customer 1, used as a small sample for the helper defined next.
temp_df = big_df[big_df['Customer #'] == 1]

In [28]:
def find_existance_of_special_items(df):
    """Report which special item types occur in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction rows with an ``itemType`` column (typically the rows of
        one customer).

    Returns
    -------
    pandas.DataFrame
        A single row of seven booleans, in the order milk, baby food, bread,
        peanut butter, cereal, diapers, jelly/jam. Columns are labelled 0-6.
    """
    special_item_types = ('Milk', 'Baby Food', 'Bread', 'Peanut Butter', 'Cereal', 'Diapers', 'Jelly/Jam')

    # A set gives constant-time membership tests; this runs once per customer.
    item_types_present = set(df['itemType'].unique())

    return pd.DataFrame([tuple(item_type in item_types_present for item_type in special_item_types)])

# Try the helper on the single-customer sample.
find_existance_of_special_items(temp_df)

Unnamed: 0,0,1,2,3,4,5,6
0,True,False,False,False,True,False,False


### This step will run for 5 to 10ish minutes!

In [29]:
# One row per customer with a True/False flag for every special item type.
product_bought_df = big_df.groupby('Customer #').apply(find_existance_of_special_items)
product_bought_df.columns = ['milk', 'baby_food', 'bread', 'peanut_butter', 'cereal', 'diapers', 'jelly']
# The helper returns one-row frames, so apply adds an inner index level;
# drop it and turn 'Customer #' back into a regular column.
product_bought_df = product_bought_df.droplevel(1).reset_index()
product_bought_df

Unnamed: 0,Customer #,milk,baby_food,bread,peanut_butter,cereal,diapers,jelly
0,1,True,False,False,False,True,False,False
1,2,True,False,True,False,True,False,False
2,3,True,False,False,True,False,False,False
3,4,True,True,False,False,True,True,False
4,5,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...
378310,378311,True,True,False,False,True,True,False
378311,378312,True,False,True,False,True,False,False
378312,378313,True,True,False,False,True,True,False
378313,378314,True,True,True,False,True,False,False


In [30]:
def get_probabilities_of_conditional_items(df, initial_column, conditional_column):
    """Describe how often ``conditional_column`` is True given ``initial_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per customer; both columns hold True/False flags.
    initial_column : str
        Flag of the item the probability is conditioned on (e.g. milk).
    conditional_column : str
        Flag of the dependent item (e.g. cereal).

    Returns
    -------
    str
        ``'Probability when Bought: p1 | Probability when did not buy: p2'``
        where p1 is the share of True in ``conditional_column`` among rows with
        ``initial_column`` True and p2 the share among the remaining rows,
        each formatted with four decimals. A share is ``nan`` when its group
        has no rows.
    """
    bought_initial = df[initial_column].astype(bool)
    bought_conditional = df[conditional_column].astype(bool)

    # The mean of a boolean series is the fraction of True values. Working on
    # the two columns directly needs no third column to count rows with, and a
    # combination that never occurs contributes 0 instead of NaN.
    probability_when_bought = bought_conditional[bought_initial].mean()
    probability_when_not_bought = bought_conditional[~bought_initial].mean()

    return f'Probability when Bought: {probability_when_bought:.04f} | Probability when did not buy: {probability_when_not_bought:.04f}'

In [31]:
# Display the configured probabilities for comparison with the measured values.
parameters_dict['probability_dict']

{'milk': 0.7,
 'baby_food': 0.2,
 'bread': 0.5,
 'peanut_butter': 0.1,
 'cereal_with_milk': 0.5,
 'cereal_without_milk': 0.05,
 'diapers_with_baby_food': 0.8,
 'diapers_without_baby_food': 0.01,
 'jam_with_peanut_butter': 0.9,
 'jam_withmout_peanut_butter': 0.05}

In [34]:
# Share of customers that bought each independently drawn special item.
for label, column in [('Milk', 'milk'), ('Baby Food', 'baby_food'), ('Bread', 'bread'), ('Peanut Butter', 'peanut_butter')]:
    probability = product_bought_df[column].value_counts(normalize=True)[True]
    print(f"Will buy {label} Probability: {probability:.04f}")

Will buy Milk Probability: 0.7000
Will buy Baby Food Probability: 0.2002
Will buy Bread Probability: 0.5003
Will buy Peanut Butter Probability: 0.0994


In [35]:
# Measured conditional probabilities for each (initial item, dependent item) pair.
for initial_column, conditional_column in [('milk', 'cereal'), ('baby_food', 'diapers'), ('peanut_butter', 'jelly')]:
    print(get_probabilities_of_conditional_items(product_bought_df, initial_column, conditional_column))

Probability when Bought: 0.5015 | Probability when did not buy: 0.0489
Probability when Bought: 0.8033 | Probability when did not buy: 0.0103
Probability when Bought: 0.8982 | Probability when did not buy: 0.0500


### Scrap code

In [None]:
# The probabilities live inside parameters_dict; there is no module-level
# `probability_dict`, so the bare name raised NameError.
parameters_dict['probability_dict']

In [None]:
# Same check as above, but on the generated customers instead of the stored
# transactions. The DataFrame lives on the generator object; there is no
# module-level `customers_df`, so the bare name raised NameError.
customers_df = customer_df_generator.customers_df

print(f"Will buy Milk Probability: {customers_df['will_buy_milk'].value_counts(normalize=True)[True]:.04f}")
print(f"Will buy Baby Food Probability: {customers_df['will_buy_baby_food'].value_counts(normalize=True)[True]:.04f}")
print(f"Will buy Bread Probability: {customers_df['will_buy_bread'].value_counts(normalize=True)[True]:.04f}")
print(f"Will buy Peanut Butter Probability: {customers_df['will_buy_peanut_butter'].value_counts(normalize=True)[True]:.04f}")

In [None]:
# Conditional probabilities measured on the generated customers. The DataFrame
# is read from the generator object because no module-level `customers_df`
# exists; the bare name raised NameError.
generated_customers_df = customer_df_generator.customers_df

print(get_probabilities_of_conditional_items(generated_customers_df, 'will_buy_milk', 'will_buy_cereal'))
print(get_probabilities_of_conditional_items(generated_customers_df, 'will_buy_baby_food', 'will_buy_diapers'))
print(get_probabilities_of_conditional_items(generated_customers_df, 'will_buy_peanut_butter', 'will_buy_jelly'))