# 关联规则
- 一种在大规模数据集中寻找有趣关系的任务
- 若两个或多个变量之间存在某种规律性，我们就称之为关联规则
- 由k个元素组成的项集称为k项集
- 关联规则的形式为 X -> Y，其中 X 为前件、Y 为后件，且 X 与 Y 不相交
- 我们只关心物品是否被购买，而不关心购买的数量或其它商品信息
- 支持度：数据集中包含该项集的记录所占的比例
    - s(X) = count(X) / N，其中 count(X) 为包含 X 的记录数，N 为记录总数
- 置信度：在包含X的记录中，也包含Y的概率
    - c(X->Y) = P(Y|X) = P(XY)/P(X)
- 提升度：规则的置信度与后件支持度的比值
    - lift(X->Y) = c(X->Y) / s(Y) = P(XY) / (P(X)P(Y))；大于1表示正相关，等于1表示相互独立，小于1表示负相关

## 使用mlxtend库

In [3]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import warnings
warnings.filterwarnings('ignore')
# np.random.seed(42)

### 自定义数据

In [4]:
# Toy transaction table: one entry per transaction ID, with a 0/1 flag per
# item telling whether that item appears in the transaction.
data = dict(
    ID=[1, 2, 3, 4, 5, 6],
    Onion=[1, 0, 0, 1, 1, 1],
    Potato=[1, 1, 0, 1, 1, 1],
    Burger=[1, 1, 0, 0, 1, 1],
    Milk=[0, 1, 1, 1, 0, 1],
    Beer=[0, 0, 1, 0, 1, 0],
)

In [5]:
# Build the transaction table; each dict key becomes a column.
df = pd.DataFrame(data=data)

In [6]:
# Pin the column order explicitly: ID first, then the item flags.
df = df.loc[:, ['ID', 'Onion', 'Potato', 'Burger', 'Milk', 'Beer']]

In [7]:
df

Unnamed: 0,ID,Onion,Potato,Burger,Milk,Beer
0,1,1,1,1,0,0
1,2,0,1,1,1,0
2,3,0,0,0,1,1
3,4,1,1,0,1,0
4,5,1,1,1,0,1
5,6,1,1,1,1,0


#### 设置支持度阈值
- 最小为50%
- apriori(df, min_support=0.5, use_colnames=True)

In [8]:
# Mine itemsets that occur in at least 50% of the transactions.
# The ID column is excluded because it is not an item. The 0/1 flags are cast
# to bool because mlxtend emits a DeprecationWarning for non-bool input;
# the resulting supports are unchanged.
frequent_itemsets = apriori(
    df[['Onion', 'Potato', 'Burger', 'Milk', 'Beer']].astype(bool),
    min_support=0.5,
    use_colnames=True,
)

In [9]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(Onion)
1,0.833333,(Potato)
2,0.666667,(Burger)
3,0.666667,(Milk)
4,0.666667,"(Potato, Onion)"
5,0.5,"(Onion, Burger)"
6,0.666667,"(Potato, Burger)"
7,0.5,"(Milk, Potato)"
8,0.5,"(Potato, Onion, Burger)"


In [10]:
# Derive rules from the frequent itemsets, keeping those whose confidence
# is at least 0.6.
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.6,
    metric='confidence',
)

In [11]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Potato),(Onion),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667,1.0
1,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
2,(Onion),(Burger),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
3,(Burger),(Onion),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
4,(Potato),(Burger),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667,1.0
5,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
6,(Milk),(Potato),0.666667,0.833333,0.5,0.75,0.9,-0.055556,0.666667,-0.25
7,(Potato),(Milk),0.833333,0.666667,0.5,0.6,0.9,-0.055556,0.833333,-0.4
8,"(Potato, Onion)",(Burger),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333
9,"(Potato, Burger)",(Onion),0.666667,0.666667,0.5,0.75,1.125,0.055556,1.333333,0.333333


In [12]:
# Re-derive the rules, this time keeping those whose lift is at least 1.2.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1.2,
    metric='lift',
)

In [13]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Potato),(Onion),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667,1.0
1,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
2,(Potato),(Burger),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667,1.0
3,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
4,"(Onion, Burger)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf,0.333333
5,(Potato),"(Onion, Burger)",0.833333,0.5,0.5,0.6,1.2,0.083333,1.25,1.0


In [14]:
# Keep only the rules that are both highly confident and positively associated.
rules.loc[(rules['confidence'] > 0.8) & (rules['lift'] > 1.125)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
3,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
4,"(Onion, Burger)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf,0.333333


# 这几条结果比较有价值
- 可以将这些商品搭配销售

apriori 要求输入为 one-hot 编码的 DataFrame，因此购物篮列表形式的数据需要先转成 one-hot 编码

In [15]:
# Raw shopping baskets: one list of purchased item names per transaction ID.
retail_shopping_basket = {
    'ID': list(range(1, 7)),
    'Basket': [
        ['Beer', 'Diaper', 'Pretzels', 'Chips', 'Aspirin'],
        ['Diaper', 'Beer', 'Chips', 'Lotion', 'Juice', 'BabyFood', 'Milk'],
        ['Soda', 'Chips', 'Milk'],
        ['Soup', 'Beer', 'Diaper', 'Milk', 'IceCream'],
        ['Soda', 'Chips', 'Milk', 'Beer'],
        ['Beer', 'Chips'],
    ],
}


In [16]:
# Build the basket table; each dict key becomes a column.
retail = pd.DataFrame(data=retail_shopping_basket)

In [17]:
# Pin the column order explicitly: ID first, then the basket contents.
retail = retail.loc[:, ['ID', 'Basket']]

In [18]:
# Widen displayed cells so the longer basket lists are not truncated.
pd.set_option('display.max_colwidth', 100)

In [19]:
retail

Unnamed: 0,ID,Basket
0,1,"[Beer, Diaper, Pretzels, Chips, Aspirin]"
1,2,"[Diaper, Beer, Chips, Lotion, Juice, BabyFood, Milk]"
2,3,"[Soda, Chips, Milk]"
3,4,"[Soup, Beer, Diaper, Milk, IceCream]"
4,5,"[Soda, Chips, Milk, Beer]"
5,6,"[Beer, Chips]"


In [20]:
# Keep everything except the raw basket lists; the encoded item columns
# are joined back onto this frame later.
retail_id = retail.drop(columns=['Basket'])

In [21]:
retail_id

Unnamed: 0,ID
0,1
1,2
2,3
3,4
4,5
5,6


In [22]:
# Collapse each basket list into a single comma-separated string.
retail_Basket = retail['Basket'].str.join(',')
retail_Basket

0              Beer,Diaper,Pretzels,Chips,Aspirin
1    Diaper,Beer,Chips,Lotion,Juice,BabyFood,Milk
2                                 Soda,Chips,Milk
3                  Soup,Beer,Diaper,Milk,IceCream
4                            Soda,Chips,Milk,Beer
5                                      Beer,Chips
Name: Basket, dtype: object

In [23]:
# Expand the comma-separated strings into one 0/1 indicator column per item.
retail_Basket = retail_Basket.str.get_dummies(',')
retail_Basket

Unnamed: 0,Aspirin,BabyFood,Beer,Chips,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
0,1,0,1,1,1,0,0,0,0,1,0,0
1,0,1,1,1,1,0,1,1,1,0,0,0
2,0,0,0,1,0,0,0,0,1,0,1,0
3,0,0,1,0,1,1,0,0,1,0,0,1
4,0,0,1,1,0,0,0,0,1,0,1,0
5,0,0,1,1,0,0,0,0,0,0,0,0


In [24]:
# Attach the item indicator columns to the ID column, aligned on the row index.
retail = retail_id.join(other=retail_Basket)

In [25]:
# Mine itemsets that occur in at least 50% of the baskets.
# The ID column is dropped because it is not an item. The 0/1 flags are cast
# to bool because mlxtend emits a DeprecationWarning for non-bool input;
# the resulting supports are unchanged.
frequent_itemsets_2 = apriori(
    retail.drop(columns=['ID']).astype(bool),
    min_support=0.5,
    use_colnames=True,
)

In [26]:
frequent_itemsets_2

Unnamed: 0,support,itemsets
0,0.833333,(Beer)
1,0.833333,(Chips)
2,0.5,(Diaper)
3,0.666667,(Milk)
4,0.666667,"(Beer, Chips)"
5,0.5,"(Diaper, Beer)"
6,0.5,"(Milk, Beer)"
7,0.5,"(Milk, Chips)"


In [27]:
# Rank basket rules by lift; 0.8 is mlxtend's default min_threshold,
# spelled out here so the cut-off is visible.
association_rules(frequent_itemsets_2, metric='lift', min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Beer),(Chips),0.833333,0.833333,0.666667,0.8,0.96,-0.027778,0.833333,-0.2
1,(Chips),(Beer),0.833333,0.833333,0.666667,0.8,0.96,-0.027778,0.833333,-0.2
2,(Diaper),(Beer),0.5,0.833333,0.5,1.0,1.2,0.083333,inf,0.333333
3,(Beer),(Diaper),0.833333,0.5,0.5,0.6,1.2,0.083333,1.25,1.0
4,(Milk),(Beer),0.666667,0.833333,0.5,0.75,0.9,-0.055556,0.666667,-0.25
5,(Beer),(Milk),0.833333,0.666667,0.5,0.6,0.9,-0.055556,0.833333,-0.4
6,(Milk),(Chips),0.666667,0.833333,0.5,0.75,0.9,-0.055556,0.666667,-0.25
7,(Chips),(Milk),0.833333,0.666667,0.5,0.6,0.9,-0.055556,0.833333,-0.4


# 电影数据
- 数据集: https://grouplens.org/datasets/movielens/

In [28]:
# Load the MovieLens "ml-latest-small" movie list.
# NOTE(review): the path is a machine-specific absolute Windows path;
# adjust it to wherever the dataset was extracted.
movies = pd.read_csv(
    filepath_or_buffer=r'E:\Project\data\ml-latest-small\movies.csv',
)


In [29]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


- 数据格式不符合要求
- 需要将数据转换成one hot

In [30]:
# One-hot encode the pipe-separated genres and attach the indicator columns
# to the remaining movie columns, aligned on the row index.
movies_ohe = movies.drop(columns=['genres']).join(
    movies['genres'].str.get_dummies('|')
)

In [31]:
# Show up to 100 columns so the wide one-hot frame is displayed in full.
pd.set_option('display.max_columns', 100)

In [32]:
movies_ohe.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
movies_ohe.shape

(9742, 22)

In [34]:
# Use movieId as the row index so it is not treated as a data column.
# Reassigning the result replaces inplace=True: it avoids mutating the frame
# in place and follows the usual pandas idiom.
movies_ohe = movies_ohe.set_index('movieId')

In [35]:
movies_ohe.head()

Unnamed: 0_level_0,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
# Mine genre combinations that appear in at least 2.5% of the movies.
# The title column is dropped because it is not an item. The 0/1 genre flags
# are cast to bool because mlxtend emits a DeprecationWarning for non-bool
# input; the resulting supports are unchanged.
frequent_itemsets_movies = apriori(
    movies_ohe.drop(columns=['title']).astype(bool),
    min_support=0.025,
    use_colnames=True,
)

In [37]:
frequent_itemsets_movies

Unnamed: 0,support,itemsets
0,0.187641,(Action)
1,0.129645,(Adventure)
2,0.062718,(Animation)
3,0.068158,(Children)
4,0.385547,(Comedy)
5,0.123075,(Crime)
6,0.045165,(Documentary)
7,0.447649,(Drama)
8,0.079963,(Fantasy)
9,0.10039,(Horror)


In [38]:
# Derive genre rules, keeping those whose lift is at least 1.25.
rules_movies = association_rules(
    frequent_itemsets_movies,
    min_threshold=1.25,
    metric='lift',
)

In [39]:
rules_movies

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Action),(Adventure),0.187641,0.129645,0.062615,0.333698,2.57394,0.038289,1.306247,0.752735
1,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.57394,0.038289,1.571224,0.702576
2,(Action),(Crime),0.187641,0.123075,0.042907,0.228665,1.857929,0.019813,1.136892,0.568426
3,(Crime),(Action),0.123075,0.187641,0.042907,0.348624,1.857929,0.019813,1.247142,0.526575
4,(Action),(Sci-Fi),0.187641,0.100595,0.046294,0.246718,2.452576,0.027419,1.193981,0.729069
5,(Sci-Fi),(Action),0.100595,0.187641,0.046294,0.460204,2.452576,0.027419,1.504937,0.658508
6,(Thriller),(Action),0.194416,0.187641,0.067235,0.345829,1.843034,0.030754,1.241814,0.567807
7,(Action),(Thriller),0.187641,0.194416,0.067235,0.358315,1.843034,0.030754,1.25542,0.563072
8,(Animation),(Adventure),0.062718,0.129645,0.025354,0.404255,3.118175,0.017223,1.460953,0.724755
9,(Adventure),(Animation),0.129645,0.062718,0.025354,0.195566,3.118175,0.017223,1.165145,0.780486


In [40]:
# Show the strongest genre associations (lift above 4), highest lift first.
rules_movies.loc[rules_movies['lift'] > 4].sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
16,(Children),(Animation),0.068158,0.062718,0.031,0.454819,7.251799,0.026725,1.719213,0.925161
17,(Animation),(Children),0.062718,0.068158,0.031,0.494272,7.251799,0.026725,1.842573,0.919791


- 电影类型之间的关联规则：查看同时属于 Children 与 Animation 类型的电影

In [41]:
# List the movies whose genre string mentions both Children and Animation.
movies.loc[
    movies['genres'].str.contains('Children')
    & movies['genres'].str.contains('Animation')
]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
12,13,Balto (1995),Adventure|Animation|Children
44,48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
205,239,"Goofy Movie, A (1995)",Animation|Children|Comedy|Romance
272,313,"Swan Princess, The (1994)",Animation|Children
...,...,...,...
9629,178827,Paddington 2 (2017),Adventure|Animation|Children|Comedy
9657,180987,Ferdinand (2017),Animation|Children|Comedy
9664,182293,Hare-um Scare-um (1939),Animation|Children|Comedy
9666,182299,Porky's Hare Hunt (1938),Animation|Children|Comedy


Reference:
- Mining Association Rules: https://www-users.cs.umn.edu/~kumar001/dmbook/ch6.pdf