# Business Case #3 - Market Basket Analysis

## Authors:
#### Débora Santos (m20200748), Diana Furtado (m20200590), Pedro Henrique Medeiros (m20200742), Rebeca Pinheiro (m20201096)

#### Group D - D4B Consulting

### Installing and importing packages

You may need to install some additional packages for this notebook to work.

Please go through the next cells and install any package that is missing from your environment.

In [1]:
# Optional: install the mlxtend package.
# Uncomment the next line and run this cell once if mlxtend is not yet available in your environment.
#!pip install mlxtend

In [2]:
# Import packages
import pandas as pd
import numpy as np
import datetime as dt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
import plotly.io as pio
pio.renderers
import plotly.graph_objects as go
import dash
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
import dash_table
from dash.dependencies import Input, Output
import plotly.express as px

### Collect initial data

In [3]:
# Widen the pandas display limits so large frames are shown in full when inspected.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Load the four comma-separated input files from the working directory.
# (Comma is the default separator of read_csv, so it is not passed explicitly.)
orders = pd.read_csv('orders.csv')
orders_prod = pd.read_csv('order_products.csv')
departments = pd.read_csv('departments.csv')
products = pd.read_csv('products.csv')

### Customers' behaviors

In [4]:
# Number of orders per day of week or per hour of the day (used by the dashboard callback).
def frequent_time(option):
    """Count orders grouped by day of week or by hour of the day.

    Reads the module-level ``orders`` DataFrame.

    Parameters
    ----------
    option : str
        ``"Day of week"`` to count on the ``order_dow`` column, or
        ``"Hour of the day"`` to count on the ``order_hour_of_day`` column.

    Returns
    -------
    pandas.Series
        The ``value_counts()`` of the selected column (index = distinct
        values, values = number of orders).

    Raises
    ------
    ValueError
        If ``option`` is not one of the two supported labels. Previously this
        case surfaced as an ``UnboundLocalError`` on the return statement.
    """
    if option == "Day of week":
        column = 'order_dow'
    elif option == 'Hour of the day':
        column = 'order_hour_of_day'
    else:
        raise ValueError(
            f"Unsupported option {option!r}; expected 'Day of week' or 'Hour of the day'"
        )
    return orders[column].value_counts()

In [5]:
# Number of orders for each distinct 'days_since_prior_order' value.
# value_counts() skips missing values by default and sorts by frequency, most frequent first.
prior_orders = orders['days_since_prior_order'].value_counts()

In [6]:
# Step 1: basket size of every order = count of its 'add_to_cart_order' entries.
basket_sizes = (
    orders_prod
    .groupby("order_id")['add_to_cart_order']
    .agg(['count'])
    .rename(columns={"count": "basket_size"})
    .reset_index()
)

# Step 2: number of orders sharing each basket size, most common size first.
avg_basket = (
    basket_sizes
    .groupby('basket_size')["order_id"]
    .agg(['count'])
    .rename(columns={"count": "number_of_orders"})
    .reset_index()
    .sort_values(by='number_of_orders', ascending=False)
)

# Keep only the 25 most common basket sizes for plotting.
avg_basket_top25 = avg_basket.iloc[0:25]

### Products and departments most consumed

In [7]:
# Attach product information, then department information, to every order line.
# Left joins keep all rows of orders_prod even when a lookup key has no match.
df = (
    orders_prod
    .merge(products, how='left', on='product_id')
    .merge(departments, how='left', on='department_id')
)

In [8]:
# Most frequent products: number of order lines per product name, most frequent first.
# Expects `df` to be the merged order-lines frame built in the previous cell.
prod_freq = df['product_name'].value_counts()

In [9]:
# Pivot the data - orders as rows and products as columns.
# The aggregation returns 1 for every (order, product) pair that has at least one row;
# pairs that never occur come out as NaN and are filled with 0.
df1 = df.drop(["product_id", "department_id", 'add_to_cart_order','reordered','department'], axis=1)
df_pivot = pd.pivot_table(df1, index='order_id', columns='product_name',
                    aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
# Transpose so that each row is a product and each column an order.
df_pivot = df_pivot.T
# Number of orders in which each product appears: vectorized row sum
# (replaces a row-by-row Python-level apply(sum), same result, much faster on wide frames).
df_pivot['sum'] = df_pivot.sum(axis=1)
# Most widely ordered products first.
df_pivot = df_pivot.sort_values(by='sum', ascending=False)

In [10]:
# Number of order lines per department.
grouped = (
    df.groupby("department")["order_id"]
    .agg(['count'])
    .rename({"count": "Ordersitens_bydept"}, axis=1)
    .reset_index()
)
# Share of each department in the total: the total is computed once and divided
# vectorially (the previous per-row lambda recomputed the column sum for every row).
grouped['Ratio'] = grouped["Ordersitens_bydept"] / grouped["Ordersitens_bydept"].sum()
grouped = grouped.sort_values(by='Ordersitens_bydept', ascending=False)
# Relabel the listed departments as 'others'. Rows are only renamed here, not aggregated,
# so several rows may carry the 'others' label afterwards.
grouped['department'] = grouped['department'].replace(['personal care', 'babies', 'international','alcohol','pets',
                                                       'missing','other','bulk'],
                                                      'others')

In [11]:
# Pivot the data - orders as rows and departments as columns.
# The aggregation returns 1 for every (order, department) pair that has at least one row;
# pairs that never occur come out as NaN and are filled with 0.
df2 = df.drop(["product_id", "department_id", 'add_to_cart_order','reordered','product_name'], axis=1)
df_pivot_dept = pd.pivot_table(df2, index='order_id', columns='department',
                    aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
# Transpose so that each row is a department and each column an order.
df_pivot_dept = df_pivot_dept.T
# Number of orders in which each department appears: vectorized row sum
# (replaces a row-by-row Python-level apply(sum), same result, much faster on wide frames).
df_pivot_dept['sum'] = df_pivot_dept.sum(axis=1)
# Most widely ordered departments first.
df_pivot_dept = df_pivot_dept.sort_values(by='sum', ascending=False)

### Creating the components

In [12]:
# Static bar chart: number of orders for each 'days since prior order' value.
# The figure is built once here and handed to the layout as-is (no callback involved).
# `prior_orders` is read directly instead of rebinding the global `df`, so the merged
# order-lines frame stays intact and earlier cells can be re-run safely.
data_bar = dict(type='bar',
                x=prior_orders.index,
                y=prior_orders.values,
                marker_color='lightseagreen',
                showlegend=False)
layout_bar = dict(title=dict(text='Days since prior order'),
                  xaxis=dict(title='Number of days'),
                  yaxis=dict(title='Number of orders'),
                  title_x=0.5)  # centre the title horizontally
fig_bar = go.Figure(data=data_bar, layout=layout_bar)

In [13]:
# Basket size chart: bar plot of the 25 most common basket sizes
# (x = basket size, y = number of orders with that size).
data = {
    'type': 'bar',
    'x': avg_basket_top25['basket_size'],
    'y': avg_basket_top25['number_of_orders'],
    'name': 'Basket Size',
}
layout = {
    'title': {'text': 'Average Basket size'},
    'xaxis': {'title': 'Basket Size'},
    'yaxis': {'title': 'Number of orders'},
}
avgbasket_fig = go.Figure(data=data, layout=layout)


In [14]:
# Products plots

# Treemap of the number of items sold for the 30 most frequent products.
# `prod_freq` is read directly instead of rebinding the global `df`.
top_products = prod_freq.iloc[0:30]
treemap_trace = go.Treemap(
    labels=top_products.index,
    # One empty parent per displayed label (every tile sits at the root). The list was
    # previously sized to the whole series although only 30 labels are shown.
    parents=[""] * len(top_products),
    values=top_products.values,
)
treemap_layout = go.Layout({"margin": dict(t=10, b=10, l=5, r=5, pad=4)})
treemap_figure = go.Figure(data=treemap_trace, layout=treemap_layout)

In [15]:
# Treemap of the number of orders for the 16 products that appear in the most orders.
# `df_pivot` is read directly instead of rebinding the global `df`.
top_products_by_orders = df_pivot.iloc[0:16]
treemap_trace = go.Treemap(
    labels=top_products_by_orders.index,
    # One empty parent per displayed label (every tile sits at the root). The list was
    # previously sized to the whole frame although only 16 labels are shown.
    parents=[""] * len(top_products_by_orders),
    values=top_products_by_orders['sum'],
)
treemap_layout = go.Layout({"margin": dict(t=10, b=10, l=5, r=5, pad=4)})
treemap_prod = go.Figure(data=treemap_trace, layout=treemap_layout)

In [16]:
# Department plots

# Pie chart of each department's share of order lines.
# `grouped` is read directly instead of rebinding the global `df`, so the merged
# order-lines frame is not overwritten by this cell.
department_labels = grouped['department']
department_values = grouped['Ratio']
department_data = dict(type='pie',
                       labels=department_labels,
                       values=department_values)
department_layout = dict(title=dict(text='Orders itens by department - % share'))
department_fig = go.Figure(data=department_data, layout=department_layout)

In [17]:
# Treemap of the number of orders for the (up to) 16 departments that appear in the most orders.
# `df_pivot_dept` is read directly instead of rebinding the global `df`.
top_departments_by_orders = df_pivot_dept.iloc[0:16]
treemap_trace = go.Treemap(
    labels=top_departments_by_orders.index,
    # One empty parent per displayed label (every tile sits at the root). The list was
    # previously sized to the whole frame rather than to the labels actually shown.
    parents=[""] * len(top_departments_by_orders),
    values=top_departments_by_orders['sum'],
)
treemap_layout = go.Layout({"margin": dict(t=10, b=10, l=5, r=5, pad=4)})
treemap_dept = go.Figure(data=treemap_trace, layout=treemap_layout)

### Designing the dashboard

#### Layout

In [18]:
# Create the Dash application and expose the underlying Flask server.
app = dash.Dash(__name__)
# NOTE(review): no external stylesheet is passed (the original left
# `external_stylesheets=[dbc.themes.BOOTSTRAP], suppress_callback_exceptions=True`
# commented out); the dbc components presumably render without Bootstrap styling - confirm.
server = app.server

# Page layout: a header bar followed by two tabs.
#   * 'Consumer Behaviors': orders by period (driven by the 'option' radio buttons through
#     the 'bar_plot' graph), days since prior order, and basket size.
#   * 'Products and Departments Analysis': product and department treemaps plus the
#     department pie chart.
# All figures except 'bar_plot' are prebuilt module-level figures passed in directly.
app.layout = html.Div([
    # ---- Header ----
    html.Div([
        dbc.Navbar(
            dbc.Row([
                dbc.Col(
                    dbc.NavbarBrand('INSTACART MARKET BASKET ANALYSIS', className="ml-2"),
                    style={"text-align": "center"},
                ),
            ]),
            color="LightBlue",
            dark=True,
            sticky="top",
        ),
        html.Br(),
    ]),
    # ---- Tabs ----
    html.Div([
        dcc.Tabs(id='tabs', value='basketanalysis', persistence=False, children=[
            # Tab 1: customer behaviour
            dcc.Tab(label='Consumer Behaviors', value='basketanalysis', children=html.Div([
                html.Br([]),
                # First row: two charts side by side (flex container)
                html.Div([
                    html.Div([
                        dbc.CardHeader(html.H4('Number of orders by Period')),
                        dbc.CardBody(children=[
                            dcc.RadioItems(
                                id='option',
                                options=[
                                    {'label': 'Day of week', 'value': 'Day of week'},
                                    {'label': 'Hour of the day', 'value': 'Hour of the day'},
                                ],
                                value='Day of week',
                                labelStyle={'display': 'inline-block', "width": "40%"},
                            ),
                            # Figure is supplied by the callback bound to the 'option' radio buttons
                            dcc.Graph(id='bar_plot', style={"height": "50%", "width": "100%"}),
                        ]),
                    ], style={"width": "100%"}),
                    # Spacer between the two charts
                    html.Div(id='margin', style={"width": "5%"}),
                    html.Div([
                        dbc.CardHeader(html.H4('Days since prior order')),
                        html.Div(
                            dcc.Graph(id='prior_orders_plot', figure=fig_bar),
                            style={"height": "50%", "width": "95%"},
                        ),
                    ], style={"width": "100%"}),
                ], style=dict(display='flex')),

                html.Br([]),
                # Second row: basket size chart across the full width
                html.Div([
                    dbc.CardHeader(html.H4('Basket Size')),
                    html.Div(
                        dcc.Graph(id='basket_plot', figure=avgbasket_fig),
                        style={"height": "50%", "width": "95%"},
                    ),
                ], style={"width": "100%"}),
            ])),
            # Tab 2: products and departments (label typo 'Departmens' corrected)
            dcc.Tab(label='Products and Departments Analysis', value='tabproducts', children=[
                html.Br([]),
                html.Div([
                    dbc.CardHeader(html.H4('Products participation')),
                    html.Div([
                        html.Div(
                            dcc.Graph(id='treemap_itens', figure=treemap_figure),
                            style={"width": "100%"},
                        ),
                    ]),
                    html.Div(style={"width": "4%"}),
                    html.Div(
                        dcc.Graph(id='treemap_orders', figure=treemap_prod),
                        style={"width": "48%"},
                    ),
                ], style=dict(display="flex", width="100%")),
                html.Div([
                    dbc.CardHeader(html.H4('Departments participation')),
                    html.Div([
                        html.Div(
                            dcc.Graph(id='pie_graphic', figure=department_fig),
                            style={"width": "100%"},
                        ),
                    ]),
                    html.Div(style={"width": "4%"}),
                    html.Div(
                        dcc.Graph(id='treemap_dept', figure=treemap_dept),
                        style={"width": "48%"},
                    ),
                ], style=dict(display="flex", width="100%")),
            ]),
        ]),
    ]),
], style=dict(margin='1em'))

### Callbacks

In [19]:
#Plot Bar Transactions by period
@app.callback(Output('bar_plot', 'figure'),
            [Input('option', 'value')])
def update_barplot(option):
    """Rebuild the 'bar_plot' figure whenever the 'option' radio selection changes.

    Parameters
    ----------
    option : str
        Current value of the 'option' radio buttons; passed to ``frequent_time``
        and used in the chart title and x-axis label.

    Returns
    -------
    plotly.graph_objects.Figure
        Bar chart of the number of orders per value of the selected period.
    """
    # Compute the counts once and reuse them for both axes
    # (the function was previously called twice per update).
    freq = frequent_time(option)
    data_bar = dict(type='bar',
                    x=freq.index,
                    y=freq.values,
                    # stray leading space removed from the colour name
                    marker_color='cornflowerblue',
                    showlegend=False)
    layout_bar = dict(title=dict(text=f'Frequency by {option}'),
                      xaxis=dict(title=f'{option}'),
                      yaxis=dict(title='Number of orders'),
                      title_x=0.5)  # centre the title horizontally
    fig_bar = go.Figure(data=data_bar, layout=layout_bar)
    return fig_bar

In [20]:
# Start the Dash development server (debug mode off) only when executed as the main module.
if __name__ == '__main__':
    app.run_server(debug=False)

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [10/Apr/2021 20:07:08] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Apr/2021 20:07:08] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [10/Apr/2021 20:07:08] "GET /_favicon.ico?v=1.19.0 HTTP/1.1" 200 -
127.0.0.1 - - [10/Apr/2021 20:07:08] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [10/Apr/2021 20:07:08] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [10/Apr/2021 20:07:45] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [10/Apr/2021 20:14:17] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [10/Apr/2021 20:14:25] "POST /_dash-update-component HTTP/1.1" 200 -
