In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

In [2]:
# Directory that holds the Instacart Market Basket Analysis CSV files.
# Every table below is read from here, so pointing the notebook at a different
# copy of the dataset only requires changing this one constant.
DATA_DIR = "../input/d/datasets/psparks/instacart-market-basket-analysis"

# Lookup tables (joined onto the order line items further down).
aisles_df = pd.read_csv(DATA_DIR + "/aisles.csv")
departments_df = pd.read_csv(DATA_DIR + "/departments.csv")
products_df = pd.read_csv(DATA_DIR + "/products.csv")

# Order line items: the "prior" and "train" splits shipped with the dataset.
order_products_prior_df = pd.read_csv(DATA_DIR + "/order_products__prior.csv")
order_products_train_df = pd.read_csv(DATA_DIR + "/order_products__train.csv")

# Order-level table (user_id, order_dow, order_hour_of_day,
# days_since_prior_order are read from it later in the notebook).
orders_df = pd.read_csv(DATA_DIR + "/orders.csv")

In [3]:
# Preview the first five rows of the orders table to see its columns.
orders_df.head(n=5)

In [4]:
# (rows, columns) of the prior-order line-item table, inspected before the
# lookup tables are merged onto it in the next cell.
order_products_prior_df.shape

In [5]:
# Enrich every prior-order line item with its product, aisle and department
# attributes. Left joins keep each row of order_products_prior_df even when a
# lookup key has no match in the table being joined.
merge_prior_df = (
    order_products_prior_df
    .merge(products_df, on='product_id', how='left')
    .merge(aisles_df, on='aisle_id', how='left')
    .merge(departments_df, on='department_id', how='left')
)
merge_prior_df.head()

In [6]:
# Same enrichment as for the prior split, applied to the train line items:
# product attributes first, then aisle, then department, all as left joins so
# no row of order_products_train_df is dropped.
merge_train_df = (
    order_products_train_df
    .merge(products_df, on='product_id', how='left')
    .merge(aisles_df, on='aisle_id', how='left')
    .merge(departments_df, on='department_id', how='left')
)
merge_train_df.head()

In [7]:
# Distinct product names that appear in the merged prior-order table.
all_products = pd.unique(merge_prior_df['product_name'])
product_total = len(all_products)
print("Total products: {}".format(product_total))

In [8]:
 merge_prior_df['product_name'].value_counts()

In [9]:
# Count how many rows of orders_df belong to each user_id, most frequent first
# (presumably one row per order — confirm against the dataset schema).
orders_df['user_id'].value_counts()

In [10]:
# Default seaborn palette; later plotting cells index into it as well.
color = sns.color_palette()

# Bar chart of how many orders fall on each value of order_dow.
dow_fig, dow_ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="order_dow", data=orders_df, color=color[0], ax=dow_ax)
dow_ax.set_title("Frequency of order by week day", fontsize=12)
dow_ax.set_xlabel('Day of week', fontsize=12)
dow_ax.set_ylabel('Count', fontsize=12)
# Rotate the category labels so they read vertically.
for tick_label in dow_ax.get_xticklabels():
    tick_label.set_rotation('vertical')
plt.show()

In [11]:
# Bar chart of how many orders were placed in each hour of the day.
# Uses the second colour of the `color` palette created in the weekday cell.
plt.figure(figsize=(12, 8))
hour_ax = sns.countplot(x="order_hour_of_day", data=orders_df, color=color[1])
hour_ax.set_xlabel('Hour of day', fontsize=12)
hour_ax.set_ylabel('Count', fontsize=12)
hour_ax.set_title("Frequency of order by hour", fontsize=12)
# Rotate the category labels so they read vertically.
plt.setp(hour_ax.get_xticklabels(), rotation='vertical')
plt.show()

In [12]:
# Bar chart of how many orders share each days_since_prior_order value.
# Uses the third colour of the `color` palette created in the weekday cell.
gap_fig, gap_ax = plt.subplots(figsize=(12, 8))
sns.countplot(
    x="days_since_prior_order",
    data=orders_df,
    color=color[2],
    ax=gap_ax,
)
gap_ax.set_ylabel('Count', fontsize=12)
gap_ax.set_xlabel('Days', fontsize=12)
gap_ax.set_title("Frequency of order since prior order", fontsize=12)
# Rotate the category labels so they read vertically.
for tick_label in gap_ax.get_xticklabels():
    tick_label.set_rotation('vertical')
plt.show()

In [13]:
# Twenty most frequent product names in the prior-order line items, as a
# two-column table (name, count) that the treemap cell consumes.
top_product_counts = merge_prior_df['product_name'].value_counts().head(20)
count_table = top_product_counts.reset_index().set_axis(
    ['product_name', 'frequency_count'], axis="columns"
)
count_table

In [14]:
# Treemap of the 20 most frequent products; each tile is sized by its
# frequency_count and labelled with the product name and that count.
# `px` (plotly.express) comes from the notebook's top import cell.
fig = px.treemap(count_table, path=['product_name'], values='frequency_count')
fig.update_layout(
    title_text='Frequency of Items Sold',
    title_x=0.5,  # centre the title horizontally
    title_font=dict(size=18),
)
fig.update_traces(textinfo="label+value")
fig.show()

In [15]:
# Twenty aisles with the most prior-order line items, shown as a treemap
# whose tiles are sized by frequency_count.
aisle_counts = merge_prior_df['aisle'].value_counts().head(20)
a_count_table = aisle_counts.reset_index()
a_count_table.columns = ['aisle', 'frequency_count']

fig = px.treemap(a_count_table, path=['aisle'], values='frequency_count')
fig.update_layout(
    title_text='Number of Aisle visited',
    title_x=0.5,  # centre the title horizontally
    title_font={'size': 18},
)
fig.update_traces(textinfo="label+value")
fig.show()

In [16]:
# Share of prior-order line items that falls in each department.
dept = merge_prior_df['department'].value_counts()
labels = dept.index.to_numpy()
# Convert the raw counts to percentages of the total.
sizes = dept.div(dept.sum()).mul(100).to_numpy()

plt.figure(figsize=(12, 12))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=200)
plt.title("Departments distribution", fontsize=15)
plt.show()