## 使用mlxtend工具包得出频繁项集与规则
- pip install mlxtend

In [29]:
import pandas as pd 
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

## 实例一

In [30]:
# Hand-made shopping dataset: one entry per transaction ID, with a 0/1 flag
# per item marking whether that item was in the basket.
data = dict(
    ID=[1, 2, 3, 4, 5, 6],
    Onion=[1, 0, 0, 1, 1, 1],
    Potato=[1, 1, 0, 1, 1, 1],
    Burger=[1, 1, 0, 0, 1, 1],
    Mike=[0, 1, 1, 1, 0, 1],
    Beer=[0, 0, 1, 0, 1, 0],
)

In [31]:
# Build the transaction table and discard the ID column, leaving only the
# per-item 0/1 indicator columns.
df = pd.DataFrame(data).drop(columns=['ID'])

In [32]:
# Display the transaction table (one 0/1 indicator column per item).
df

Unnamed: 0,Onion,Potato,Burger,Mike,Beer
0,1,1,1,0,0
1,0,1,1,1,0
2,0,0,0,1,1
3,1,1,0,1,0
4,1,1,1,0,1
5,1,1,1,1,0


### 1.频繁项集：支持度不低于最小支持度的商品或商品组合

In [33]:
# Mine itemsets whose support meets the 0.5 minimum, labelled by column name.
# The frame is cast to bool first: mlxtend still accepts 0/1 integers but emits
# a DeprecationWarning for non-bool input. The mined itemsets are unchanged.
frequent_itemsets = apriori(
    df.astype(bool),
    min_support=0.5,
    use_colnames=True,
)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.666667,(Onion)
1,0.833333,(Potato)
2,0.666667,(Burger)
3,0.666667,(Mike)
4,0.666667,"(Onion, Potato)"
5,0.5,"(Burger, Onion)"
6,0.666667,"(Burger, Potato)"
7,0.5,"(Mike, Potato)"
8,0.5,"(Burger, Onion, Potato)"


### 2.关联规则：由频繁项集生成，并满足最小置信度或最小提升度等阈值的推荐规则

In [37]:
# Generate association rules from the frequent itemsets, using lift as the
# selection metric with a minimum threshold of 1.
rules = association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=1,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
1,(Potato),(Onion),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667,1.0
2,(Burger),(Onion),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
3,(Onion),(Burger),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
4,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
5,(Potato),(Burger),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667,1.0
6,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf,0.333333
7,"(Burger, Potato)",(Onion),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
8,"(Onion, Potato)",(Burger),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
9,(Burger),"(Onion, Potato)",0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333


In [35]:
# Keep only the rules with lift above 1.125 and confidence above 0.8.
rules.loc[rules['lift'].gt(1.125) & rules['confidence'].gt(0.8)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
4,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
6,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf,0.333333


关联规则结论：
- （Onion和Potato）、（Burger和Potato）可以搭配着卖：买了Onion或Burger的顾客都买了Potato（置信度1.0，提升度1.2）
- 如果Burger和Onion都在购物篮中，顾客买Potato的可能性较高（置信度1.0，提升度1.2）

## 实例二

In [40]:
# Raw retail transactions: each basket ID is paired with the list of items
# bought in that basket.
retail_shopping_basket = dict(
    ID=[1, 2, 3, 4, 5, 6],
    Basket=[
        ['Beer', 'Diaper', 'Chips', 'Aspirin'],
        ['Diaper', 'Beer', 'Chips', 'Lotion', 'Juice', 'Babyfood', 'Mike'],
        ['Soda', 'Beer', 'Diaper', 'Mike', 'Icecream'],
        ['Soda', 'Coffee', 'Mike', 'Bread'],
        ['Soda', 'Chips', 'Mike'],
        ['Beer', 'Chips'],
    ],
)
retail = pd.DataFrame(retail_shopping_basket)
retail

Unnamed: 0,ID,Basket
0,1,"[Beer, Diaper, Chips, Aspirin]"
1,2,"[Diaper, Beer, Chips, Lotion, Juice, Babyfood,..."
2,3,"[Soda, Beer, Diaper, Mike, Icecream]"
3,4,"[Soda, Coffee, Mike, Bread]"
4,5,"[Soda, Chips, Mike]"
5,6,"[Beer, Chips]"


In [41]:
# Set the ID column aside (everything except Basket) so it can be re-attached
# to the one-hot encoded items later.
retail_id = retail.drop(columns=['Basket'])
retail_id

Unnamed: 0,ID
0,1
1,2
2,3
3,4
4,5
5,6


### 1.数据转换成one-hot编码：str.get_dummies()

In [45]:
# Concatenate each basket's item list into a single comma-separated string.
retail_Basket = retail['Basket'].str.join(',')
retail_Basket

0                       Beer,Diaper,Chips,Aspirin
1    Diaper,Beer,Chips,Lotion,Juice,Babyfood,Mike
2                  Soda,Beer,Diaper,Mike,Icecream
3                          Soda,Coffee,Mike,Bread
4                                 Soda,Chips,Mike
5                                      Beer,Chips
Name: Basket, dtype: object

In [46]:
# Split the comma-separated strings into one 0/1 indicator column per item.
retail_Basket = retail_Basket.str.get_dummies(sep=',')
retail_Basket

Unnamed: 0,Aspirin,Babyfood,Beer,Bread,Chips,Coffee,Diaper,Icecream,Juice,Lotion,Mike,Soda
0,1,0,1,0,1,0,1,0,0,0,0,0
1,0,1,1,0,1,0,1,0,1,1,1,0
2,0,0,1,0,0,0,1,1,0,0,1,1
3,0,0,0,1,0,1,0,0,0,0,1,1
4,0,0,0,0,1,0,0,0,0,0,1,1
5,0,0,1,0,1,0,0,0,0,0,0,0


In [47]:
# Re-attach the ID column to the one-hot item columns, aligning on the index.
retail = retail_id.join(retail_Basket, how='left')
retail

Unnamed: 0,ID,Aspirin,Babyfood,Beer,Bread,Chips,Coffee,Diaper,Icecream,Juice,Lotion,Mike,Soda
0,1,1,0,1,0,1,0,1,0,0,0,0,0
1,2,0,1,1,0,1,0,1,0,1,1,1,0
2,3,0,0,1,0,0,0,1,1,0,0,1,1
3,4,0,0,0,1,0,1,0,0,0,0,1,1
4,5,0,0,0,0,1,0,0,0,0,0,1,1
5,6,0,0,1,0,1,0,0,0,0,0,0,0


### 2.获取频繁项集

In [48]:
# Mine itemsets whose support meets the 0.5 minimum, labelled by column name.
# - The ID column is dropped by keyword: the positional-axis form
#   drop(['ID'], 1) raises TypeError on pandas >= 2.0.
# - The 0/1 indicators are cast to bool because mlxtend emits a
#   DeprecationWarning for non-bool input; the mined itemsets are unchanged.
frequent_itemsets_retail = apriori(
    retail.drop(columns=['ID']).astype(bool),
    min_support=0.5,
    use_colnames=True,
)
frequent_itemsets_retail



Unnamed: 0,support,itemsets
0,0.666667,(Beer)
1,0.666667,(Chips)
2,0.5,(Diaper)
3,0.666667,(Mike)
4,0.5,(Soda)
5,0.5,"(Chips, Beer)"
6,0.5,"(Diaper, Beer)"
7,0.5,"(Mike, Soda)"


(Chips, Beer)和(Diaper, Beer)都很频繁，哪一种组合关联性更强？

### 3.查看关联规则

In [50]:
# Rank the retail rules by lift. The threshold is set explicitly to 1, as in
# the first example: the library default of 0.8 is tuned for confidence and
# would also admit rules with lift below 1 (negatively associated items).
association_rules(frequent_itemsets_retail, metric='lift', min_threshold=1)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Chips),(Beer),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
1,(Beer),(Chips),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
2,(Diaper),(Beer),0.5,0.666667,0.5,1.0,1.5,0.166667,inf,0.666667
3,(Beer),(Diaper),0.666667,0.5,0.5,0.75,1.5,0.166667,2.0,1.0
4,(Mike),(Soda),0.666667,0.5,0.5,0.75,1.5,0.166667,2.0,1.0
5,(Soda),(Mike),0.5,0.666667,0.5,1.0,1.5,0.166667,inf,0.666667


显然(Diaper, Beer)更相关一些：其提升度为1.5，高于(Chips, Beer)的1.125；并且Diaper→Beer的置信度为1.0，即买了Diaper的顾客都买了Beer