# Association rule

In [21]:
# importing libraries
import sys
import pandas as pd
import matplotlib
import numpy as np
import scipy as sp
import IPython
import sklearn
import mglearn
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [22]:
# Loading the data
df = pd.read_csv('jd_computer_final1.csv', index_col=0)
df

Unnamed: 0,Seq,Session_ID,sku,behavior_type,category,time_interval,dwell_time
2,0,0,1816417,9,678,44,6
4,1,0,10477058312,9,679,10,4
6,2,0,1279827,9,11303,16,17
8,3,0,3148032,9,11303,16,7
10,4,0,1853383,9,681,15,11
...,...,...,...,...,...,...,...
27808451,1429292,999997,1842770,9,687,395,18
27808454,1429293,999997,876228,9,691,14,24
27808491,1429294,999999,1916099,9,4840,16,6
27808495,1429295,999999,2269495,9,736,62,31


In [23]:
df.category.astype(str).describe()

count     1429297
unique         87
top           672
freq       132125
Name: category, dtype: object

## A priori

First, we discussed what attributes to use for association rule after doing our data exploration. There were too many products and too sparse to obtain any insight using them. Categories was our second option. Let's see why it didn't turn out. In sample data, there are 90 categories. Some of them have a lot of products but others have too few. 

### A priori with category

In [24]:
# First we will create our baskets based on category. 

# Grouping the categories
data_cb = df.groupby('Session_ID').agg(
    user_categories = ('category', lambda x: list(x)),
)
data_cb

Unnamed: 0_level_0,user_categories
Session_ID,Unnamed: 1_level_1
0,"[678, 679, 11303, 11303, 681, 691, 687]"
1,[672]
6,[672]
8,[1105]
9,"[688, 678, 11303, 683, 680, 691, 682, 687, 681..."
...,...
999989,[694]
999992,[11303]
999995,"[694, 694, 694, 694, 694, 694, 694, 694, 694, ..."
999997,"[681, 679, 11303, 11303, 683, 680, 687, 691]"


In [26]:
basket = data_cb['user_categories'].to_numpy() # first we get only the products as an array

In [27]:
# The apriori function expects data in a one-hot encoded pandas DataFrame. 
te = TransactionEncoder() 
te_ary = te.fit(basket).transform(basket)
category_df = pd.DataFrame(te_ary, columns=te.columns_)
category_df

Unnamed: 0,672,673,674,675,678,679,680,681,682,683,...,12370,12376,12798,12799,12801,12802,12803,12804,12805,13278
0,False,False,False,False,True,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488651,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
488652,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
488653,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
488654,False,False,False,False,False,True,True,True,False,True,...,False,False,False,False,False,False,False,False,False,False


In [28]:
apriori(category_df, min_support=0.2, use_colnames=True)

Unnamed: 0,support,itemsets


### Apriori of behavior of bought products 

In [44]:
# bought_products = data_pf[data_pf["behavior_type"] == "10"] we can check the behaviours of sessions where products have been bought
# bought_products

data_cb = df.groupby('user_id').agg(
    behavior_types = ('behavior_type', lambda x: list(x)),
)

In [54]:
bought_products = data_cb['behavior_types'].apply(lambda a: 10 in a) # get a list of sessions with bought products to make filter
basket = data_cb[bought_products]
basket

Unnamed: 0_level_0,behavior_types
user_id,Unnamed: 1_level_1
19,"[4, 6, 5, 5, 5, 9, 5, 5, 9, 5, 5, 6, 5, 5, 5, ..."
39,"[5, 9, 7, 5, 7, 5, 5, 7, 9, 7, 4, 5, 7, 10, 5,..."
43,"[5, 5, 5, 5, 5, 7, 5, 6, 5, 9, 6, 5, 6, 5, 6, ..."
51,"[4, 4, 6, 5, 5, 5, 5, 9, 7, 4, 5, 5, 5, 5, 5, ..."
63,"[5, 5, 5, 9, 5, 5, 8, 6, 5, 6, 5, 6, 5, 7, 5, ..."
...,...
15702,"[5, 9, 5, 7, 5, 7, 5, 7, 4, 4, 5, 7, 5, 7, 5, ..."
15704,"[5, 7, 5, 7, 5, 7, 5, 7, 5, 7, 5, 7, 5, 7, 5, ..."
15723,"[5, 7, 5, 5, 5, 7, 5, 9, 7, 9, 5, 7, 5, 7, 5, ..."
15730,"[5, 6, 5, 6, 5, 5, 9, 10, 2, 5, 5, 5, 5, 5, 5,..."


In [55]:
basket = basket['behavior_types'].to_numpy() # first we get only the products as an array

In [57]:
# The apriori function expects data in a one-hot encoded pandas DataFrame. 
te = TransactionEncoder() 
te_ary = te.fit(basket).transform(basket)
behavior_df = pd.DataFrame(te_ary, columns=te.columns_)

# We will get rid of column #10 (make order)
behavior_df = behavior_df.drop(columns=[10])
behavior_df

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,False,False,False,True,True,True,False,False,True
1,False,False,False,True,True,False,True,False,True
2,False,False,False,True,True,True,True,False,True
3,True,False,False,True,True,True,True,True,True
4,False,False,False,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...
768,False,False,False,True,True,False,True,False,True
769,False,True,False,False,True,False,True,False,False
770,False,False,False,False,True,True,True,True,True
771,False,True,False,True,True,True,True,False,True


In [59]:
# We decided on a min_support of 80% because all of these are common behaviors. Lower min_support offers too many options as there is a 
# very limited set of frequent behaviors users can do. 
apriori(behavior_df, min_support=0.7, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.701164,(4)
1,0.97542,(5)
2,0.812419,(7)
3,0.935317,(9)
4,0.791721,"(5, 7)"
5,0.917206,"(9, 5)"
6,0.752911,"(9, 7)"
7,0.737387,"(9, 5, 7)"


Let's remember the description to draw our conclusions. 

| behavior_type | Micro behaviors      | Description                                  |
| ------------- | -------------------- | -------------------------------------------- |
| 1             | Home2Product         | Browse the product from the homepage         |
| 2             | ShopList2Product     | Browse the product from the category page    |
| 3             | Sale2Product         | Browse the product from the sale page        |
| 4             | Cart2Product         | Browse the product from the carted page      |
| 5             | SearchList2Product   | Browse the product from the searched results |
| 6             | Detail_comments      | Read the comments of the product             |
| 7             | Detail_specification | Read the specification of the product        |
| 8             | Detail_bottom        | Read the bottom of page of the product       |
| 9             | Cart                 | Add the product to the shopping cart         |
| 10            | Order                | Make an order                                |

Adding the product to the shopping cart (9) is to be expected as it is common to do before making an order. The most frequent behaviors are browse the product from the searched results and read the specification of the product. Some relevant combinations:

- (5, 7, 9) --> Suggest that people browse the product from searched results, read the specification, and finally add to cart in order to buy.

For this scenario, we could offer some suggestions to the company: 
- Make it easy to your clients to access the search tool and offer insightful options
- Make sure all products have their specifications clear 
- If you don't have it, offer an option to buy from the search results 
- Clients buying after browsing the sales page does not have a lot of support. Why? Are they not good sales? Poorly designed? Not enough marketing? Research might make sales go up. 
- Clients buying after reading comments does not have a lot of support. As above, research might help to make the comments more relevant to the buying process.