# Business Case #3 - Market Basket Analysis

## Authors:
#### Débora Santos (m20200748), Diana Furtado (m20200590),Pedro Henrique Medeiros (m20200742), Rebeca Pinheiro (m20201096)

#### Group D - D4B Consulting

### Installing and importing packages

You may need to install some additional packages for this notebook to work.

Please go through the next cells and check whether any installation is necessary.

In [1]:
#Install package mlxtend
#!pip install mlxtend

In [2]:
# Import packages
import pandas as pd
import numpy as np
import datetime as dt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
import plotly.io as pio
pio.renderers
import plotly.graph_objects as go
import dash
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
import dash_table
from dash.dependencies import Input, Output
import plotly.express as px

### Collect initial data

In [3]:
#Widen the pandas display limits so large frames are shown without truncation
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

#Load the four comma-separated source files (comma is the read_csv default separator)
orders = pd.read_csv('orders.csv')
orders_prod = pd.read_csv('order_products.csv')
departments = pd.read_csv('departments.csv')
products = pd.read_csv('products.csv')

### Creating the dataframes to plot the customers' behaviors

In [4]:
#Create a function to return the number of orders by day of week or per hour of the day to be used in the app
def frequent_time(option, data=None):
    """Count the orders per day of week or per hour of the day.

    Parameters
    ----------
    option : str
        Either ``"Day of week"`` (counts the ``order_dow`` column) or
        ``"Hour of the day"`` (counts the ``order_hour_of_day`` column).
    data : pandas.DataFrame, optional
        Frame holding the two columns above. When omitted, the notebook-level
        ``orders`` frame is used, which matches the original behaviour.

    Returns
    -------
    pandas.Series
        Result of ``value_counts()`` on the selected column: index holds the
        distinct values, values hold how many rows carry each one.

    Raises
    ------
    ValueError
        If ``option`` is not one of the two supported labels (previously this
        surfaced as an ``UnboundLocalError`` on ``freq``).
    """
    if option == "Day of week":
        column = 'order_dow'
    elif option == 'Hour of the day':
        column = 'order_hour_of_day'
    else:
        raise ValueError(
            f"Unknown option {option!r}; expected 'Day of week' or 'Hour of the day'"
        )
    #Only fall back to the global frame when the caller did not supply one
    if data is None:
        data = orders
    return data[column].value_counts()

In [5]:
#Count how many orders carry each value of days_since_prior_order
#(defaults spelled out: missing values are excluded and counts are sorted descending)
prior_orders = orders['days_since_prior_order'].value_counts(sort=True, dropna=True)

In [6]:
#Create a dataframe to plot the basket size in the dash
#Step 1: one row per order with the number of non-null add_to_cart_order entries (the basket size)
#Step 2: one row per basket size with the number of orders of that size
#Step 3: order the basket sizes from most to least frequent
avg_basket = (
    orders_prod
    .groupby("order_id")['add_to_cart_order']
    .count()
    .rename("basket_size")
    .reset_index()
    .groupby('basket_size')["order_id"]
    .count()
    .rename("number_of_orders")
    .reset_index()
    .sort_values(by='number_of_orders', ascending=False)
)
#Keep the 25 basket sizes that appear most often
avg_basket_top25 = avg_basket.iloc[:25]

### Creating the dataframes to plot the most consumed products and departments

In [7]:
#Enrich every order line with its product attributes and then its department attributes.
#Both joins are left joins, so every row of orders_prod is kept.
df = (
    orders_prod
    .merge(products, how='left', on='product_id')
    .merge(departments, how='left', on='department_id')
)

In [8]:
#Most frequent products by items sold: number of order lines per product name, largest first
prod_freq = df['product_name'].value_counts(ascending=False)

In [9]:
# Create dataframe to plot the number of orders by products

#Drop the columns that are not needed for the order x product matrix
df1 = df.drop(columns=["product_id", "department_id", 'add_to_cart_order', 'reordered', 'department'])
#Pivot the data - orders as rows and products as columns.
#Each (order, product) pair that exists becomes 1; combinations that never occur are filled with 0.
#NOTE(review): no `values` argument is given, so pandas aggregates whatever columns remain in df1 - confirm the schema.
df_pivot = pd.pivot_table(df1, index='order_id', columns='product_name',
                          aggfunc=lambda x: 1 if len(x) > 0 else 0).fillna(0)
#Transpose the pivot so that products become the rows and orders the columns
df_pivot = df_pivot.T
#Count the number of orders in which each product appears.
#Vectorised row sum replaces the per-row Python sum() that was applied through apply(axis=1).
df_pivot['sum'] = df_pivot.sum(axis=1)
#Sort by number of orders, most frequent product first
df_pivot = df_pivot.sort_values(by='sum', ascending=False)

In [10]:
#Most frequent departments by items sold
#One row per department with the number of order lines (items) it accounts for
grouped = (
    df.groupby("department")["order_id"]
    .count()
    .rename("Ordersitens_bydept")
    .reset_index()
)
#Share of each department in the total items sold.
#The total is computed once and divided column-wise instead of being recomputed for every row.
grouped['Ratio'] = grouped["Ordersitens_bydept"] / grouped["Ordersitens_bydept"].sum()
#Sort by the departments with the most items sold
grouped = grouped.sort_values(by='Ordersitens_bydept', ascending=False)
#Relabel a fixed list of departments as 'others'.
#The rows are not merged here, so several rows end up sharing the 'others' label.
grouped['department'] = grouped['department'].replace(['personal care', 'babies', 'international','alcohol','pets',
                                                       'missing','other','bulk'],
                                                      'others')

In [11]:
# Create dataframe to plot the number of orders by departments
#Drop the columns that are not needed for the order x department matrix
df2 = df.drop(columns=["product_id", "department_id", 'add_to_cart_order', 'reordered', 'product_name'])
# Pivot the data - orders as rows and departments as columns.
# Each (order, department) pair that exists becomes 1; combinations that never occur are filled with 0.
# NOTE(review): no `values` argument is given, so pandas aggregates whatever columns remain in df2 - confirm the schema.
df_pivot_dept = pd.pivot_table(df2, index='order_id', columns='department',
                               aggfunc=lambda x: 1 if len(x) > 0 else 0).fillna(0)
#Transpose the pivot so that departments become the rows and orders the columns
df_pivot_dept = df_pivot_dept.T
#Count the number of orders in which each department appears.
#Vectorised row sum replaces the per-row Python sum() that was applied through apply(axis=1).
df_pivot_dept['sum'] = df_pivot_dept.sum(axis=1)
#Sort by number of orders, most frequent department first
df_pivot_dept = df_pivot_dept.sort_values(by='sum', ascending=False)

### Creating the components

In [12]:
#Create a Plotly visualization to show the days since prior order - Bar plot
#prior_orders is read directly: rebinding the global `df` here would overwrite the merged
#orders frame that the dataframe-building cells rely on when they are re-run.
#Define the data to plot: one bar per distinct value, bar height = number of orders
data_bar = dict(type='bar',
                x=prior_orders.index,
                y=prior_orders.values,
                marker_color='lightseagreen',
                showlegend=False)
#Define the layout (title centred horizontally through x=0.5)
layout_bar = dict(title=dict(text='Days since prior order', x=0.5),
                  xaxis=dict(title='Number of days'),
                  yaxis=dict(title='Number of orders'))
fig_bar = go.Figure(data=data_bar, layout=layout_bar)


In [13]:
#Bar plot of the 25 most frequent basket sizes
#Trace definition: basket size on the x axis, number of orders with that size on the y axis
data = {
    'type': 'bar',
    'x': avg_basket_top25['basket_size'],
    'y': avg_basket_top25['number_of_orders'],
    'name': 'Basket Size',
}
#Titles for the figure and both axes
layout = {
    'title': {'text': 'Average Basket size'},
    'xaxis': {'title': 'Basket Size'},
    'yaxis': {'title': 'Number of orders'},
}
#Assemble the figure from the trace and the layout
avgbasket_fig = go.Figure(data=data, layout=layout)

In [14]:
#Products plots
#Plot number of items sold by product - Treemap
#Keep the first 30 entries of prod_freq (value_counts returns them in descending order).
#prod_freq is read directly instead of rebinding the global `df`, which holds the merged orders frame.
top_products = prod_freq.head(30)
#Define the data: one tile per product, every tile with an empty-string parent.
#`parents` now has exactly one entry per label instead of one per product in the whole series.
treemap_trace = go.Treemap(labels=top_products.index,
                           parents=[""] * len(top_products),
                           values=top_products.values)
#Define the layout
treemap_layout = go.Layout({"margin": dict(t=10, b=10, l=5, r=5, pad=4)})
#Create the figure
treemap_figure = go.Figure(data=treemap_trace, layout=treemap_layout)

In [15]:
#Plotly number of orders by product - Treemap
#Keep the first 16 rows of df_pivot (it is sorted by 'sum' in descending order).
#df_pivot is read directly instead of rebinding the global `df`, which holds the merged orders frame.
top_products_by_orders = df_pivot.head(16)
#Define the data: one tile per row, every tile with an empty-string parent.
#`parents` now has exactly one entry per label instead of one per row of the full pivot.
treemap_trace = go.Treemap(labels=top_products_by_orders.index,
                           parents=[""] * len(top_products_by_orders),
                           values=top_products_by_orders['sum'])
#Define the layout
treemap_layout = go.Layout({"margin": dict(t=10, b=10, l=5, r=5, pad=4)})
#Create the figure
treemap_prod = go.Figure(data=treemap_trace, layout=treemap_layout)


In [16]:
#Department plots
#Plotly % of items sold by department - Pie plot
#`grouped` is read directly instead of rebinding the global `df`, which holds the merged orders frame.
#Define the labels and the data
department_labels = grouped['department']
department_values = grouped['Ratio']
department_data = dict(type='pie',
                       labels=department_labels,
                       values=department_values)
#Define the layout
department_layout = dict(title=dict(text='Orders itens by department - % share'))
#Create the figure
department_fig = go.Figure(data=department_data, layout=department_layout)

In [17]:
#Plotly number of orders by department - Treemap
#Keep the first 16 rows of df_pivot_dept (it is sorted by 'sum' in descending order).
#NOTE(review): an earlier comment mentioned a "5% of orders" filter; no such threshold is applied here.
#df_pivot_dept is read directly instead of rebinding the global `df`, which holds the merged orders frame.
top_departments = df_pivot_dept.head(16)
#Define the data: one tile per row, every tile with an empty-string parent.
#`parents` now has exactly one entry per label instead of one per row of the full pivot.
treemap_trace = go.Treemap(labels=top_departments.index,
                           parents=[""] * len(top_departments),
                           values=top_departments['sum'])
#Define the layout
treemap_layout = go.Layout({"margin": dict(t=10, b=10, l=5, r=5, pad=4)})
#Create the figure
treemap_dept = go.Figure(data=treemap_trace, layout=treemap_layout)

### Designing the dashboard

#### Layout

In [18]:
#Create the Dash application styled with the Bootstrap theme.
#suppress_callback_exceptions=True is passed through unchanged from the original setup.
app = dash.Dash(
    __name__,
    external_stylesheets=[dbc.themes.BOOTSTRAP],
    suppress_callback_exceptions=True,
)
#Keep a module-level handle on the application's server object
server = app.server

#Page structure:
#  1. a navigation bar with the dashboard title
#  2. tab "Consumer Behaviors": orders by period (interactive), days since prior order, basket size
#  3. tab "Products and Departments Analysis": two product treemaps, department pie and treemap
app.layout = html.Div(
    [
        #Header
        html.Div([
            dbc.Navbar(
                dbc.Row([
                    dbc.Col(
                        dbc.NavbarBrand('INSTACART MARKET BASKET ANALYSIS', className="ml-2"),
                        #Dash style dictionaries use camelCased CSS keys
                        style={"textAlign": "center"},
                    ),
                ]),
                color="LightBlue",
                dark=True,
                sticky="top",
            ),
            html.Br(),
        ]),
        dcc.Tabs(id='tabs', value='tabStocks', persistence=False, children=[
            #First tab - customer behaviour
            dcc.Tab(label='Consumer Behaviors', value='tabStocks', children=html.Div([
                html.Br([]),
                #Top row: two panels side by side with a 5% spacer between them
                html.Div([
                    #Left panel: radio buttons drive the 'bar_plot' graph through the callback
                    html.Div([
                        dbc.CardHeader(html.H4('Number of orders by Period')),
                        dbc.CardBody(children=[
                            html.Div([
                                html.Label('Choose an option:', style=dict(margin="10px")),
                                dcc.RadioItems(
                                    id='option',
                                    options=[
                                        {'label': 'Day of week', 'value': 'Day of week'},
                                        {'label': 'Hour of the day', 'value': 'Hour of the day'},
                                    ],
                                    value='Day of week',
                                    labelStyle={'display': 'inline-block'},
                                ),
                            ], style=dict(display='flex', width="100%")),
                            dcc.Graph(id='bar_plot', style={"height": "50%", "width": "100%"}),
                        ]),
                    ], style={"width": "100%"}),
                    html.Div(id='margin', style={"width": "5%"}),
                    #Right panel: static figure built beforehand
                    html.Div([
                        dbc.CardHeader(html.H4('Days since prior order')),
                        html.Br([]),
                        html.Br([]),
                        html.Div(
                            dcc.Graph(id='prior_orders_plot', figure=fig_bar),
                            style={"height": "50%", "width": "95%"},
                        ),
                    ], style={"width": "100%"}),
                ], style=dict(display='flex')),

                html.Br([]),
                #Bottom row: basket size plot across the full width
                html.Div([
                    dbc.CardHeader(html.H4('Basket Size')),
                    html.Div(
                        dcc.Graph(id='basket_plot', figure=avgbasket_fig),
                        style={"height": "50%", "width": "95%"},
                    ),
                ], style={"width": "100%"}),
            ])),
            #Second tab - products and departments (label typo "Departmens" corrected)
            dcc.Tab(label='Products and Departments Analysis', value='tabproducts', children=[
                html.Br([]),
                dbc.CardHeader(html.H4('Products participation')),
                html.Div([
                    html.Div(dcc.Graph(id='treemap_itens', figure=treemap_figure), style={"width": "100%"}),
                    html.Div(style={"width": "5%"}),
                    html.Div(dcc.Graph(id='treemap_orders', figure=treemap_prod), style={"width": "100%"}),
                ], style=dict(display="flex", width="100%")),
                dbc.CardHeader(html.H4('Departments participation')),
                html.Div([
                    html.Div(dcc.Graph(id='pie_graphic', figure=department_fig), style={"width": "100%"}),
                    html.Div(style={"width": "5%"}),
                    html.Div(dcc.Graph(id='treemap_dept', figure=treemap_dept), style={"width": "100%"}),
                ], style=dict(display="flex", width="100%")),
            ]),
        ]),
    ],
    style=dict(margin='1em'),
)

### Callbacks

In [19]:
#Plot Bar Transactions by period
@app.callback(Output('bar_plot', 'figure'),
            [Input('option', 'value')])
def update_barplot(option):
    """Rebuild the 'bar_plot' figure whenever the selected radio option changes.

    Parameters
    ----------
    option : str
        Value of the 'option' radio group; passed to ``frequent_time`` and
        reused in the figure title and x-axis title.

    Returns
    -------
    plotly.graph_objects.Figure
        Bar chart with the distinct values on the x axis and their counts on
        the y axis.
    """
    #Compute the counts once; the original called frequent_time twice per update
    freq = frequent_time(option)
    data_bar = dict(type='bar',
                    x=freq.index,
                    y=freq.values,
                    #Leading space removed from the colour name
                    marker_color='cornflowerblue',
                    showlegend=False)
    layout_bar = dict(title=dict(text=f'Frequency by {option}', x=0.5),
                      xaxis=dict(title=f'{option}'),
                      yaxis=dict(title='Number of orders'))
    return go.Figure(data=data_bar, layout=layout_bar)

In [20]:
#Start the Dash server (debug mode off) only when this module is the entry point
if __name__ == '__main__':
    app.run_server(debug=False)

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [13/Apr/2021 22:27:37] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/Apr/2021 22:27:38] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/Apr/2021 22:27:38] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/Apr/2021 22:27:38] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/Apr/2021 22:27:46] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [13/Apr/2021 22:27:48] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
