In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori as m_apriori
from mlxtend.frequent_patterns import association_rules

pd.set_option('max_columns', None)

#### 读取数据

In [2]:
raw_data = pd.read_csv('./Market_Basket_Optimisation.csv', header=None)
raw_data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


### 一、使用mlxtend进行频繁项集与关联规则挖掘

#### 1. 独热编码处理

In [3]:
data = raw_data.apply(lambda se: ','.join(se[se.notna()].tolist()), axis=1)
data = data.str.lower()
data.head(3)

0    shrimp,almonds,avocado,vegetables mix,green gr...
1                               burgers,meatballs,eggs
2                                              chutney
dtype: object

In [4]:
item_hot_coded = data.str.get_dummies(',')
item_hot_coded.head(3)

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,bramble,brownies,bug spray,burger sauce,burgers,butter,cake,candy bars,carrots,cauliflower,cereals,champagne,chicken,chili,chocolate,chocolate bread,chutney,cider,clothes accessories,cookies,cooking oil,corn,cottage cheese,cream,dessert wine,eggplant,eggs,energy bar,energy drink,escalope,extra dark chocolate,flax seed,french fries,french wine,fresh bread,fresh tuna,fromage blanc,frozen smoothie,frozen vegetables,gluten free bar,grated cheese,green beans,green grapes,green tea,ground beef,gums,ham,hand protein bar,herb & pepper,honey,hot dogs,ketchup,light cream,light mayo,low fat yogurt,magazines,mashed potato,mayonnaise,meatballs,melons,milk,mineral water,mint,mint green tea,muffins,mushroom cream sauce,napkins,nonfat milk,oatmeal,oil,olive oil,pancakes,parmesan cheese,pasta,pepper,pet food,pickles,protein bar,red wine,rice,salad,salmon,salt,sandwich,shallot,shampoo,shrimp,soda,soup,spaghetti,sparkling water,spinach,strawberries,strong cheese,tea,tomato juice,tomato sauce,tomatoes,toothpaste,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### 2. 生成频繁项集

In [5]:
item_sets = m_apriori(item_hot_coded, use_colnames=True, min_support=0.1)
print(item_sets.shape)
item_sets.head()

(7, 2)


Unnamed: 0,support,itemsets
0,0.163845,(chocolate)
1,0.179709,(eggs)
2,0.170911,(french fries)
3,0.132116,(green tea)
4,0.129583,(milk)


In [6]:
item_sets = m_apriori(item_hot_coded, use_colnames=True, min_support=0.05)
print(item_sets.shape)
item_sets.head()

(28, 2)


Unnamed: 0,support,itemsets
0,0.087188,(burgers)
1,0.081056,(cake)
2,0.059992,(chicken)
3,0.163845,(chocolate)
4,0.080389,(cookies)


In [7]:
item_sets = item_sets.sort_values('support', ascending=False).reset_index(drop=True)
item_sets.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
support,0.238368,0.179709,0.17411,0.170911,0.163845,0.132116,0.129583,0.0982536,0.0953206,0.095054,0.0871884,0.0810559,0.0803893,0.0793228,0.0765231,0.0714571,0.0683909,0.0658579,0.0633249,0.062525,0.059992,0.0597254,0.0585255,0.0526596,0.052393,0.0510599,0.0509265,0.0505266
itemsets,(mineral water),(eggs),(spaghetti),(french fries),(chocolate),(green tea),(milk),(ground beef),(frozen vegetables),(pancakes),(burgers),(cake),(cookies),(escalope),(low fat yogurt),(shrimp),(tomatoes),(olive oil),(frozen smoothie),(turkey),(chicken),"(mineral water, spaghetti)",(whole wheat rice),"(mineral water, chocolate)",(grated cheese),(cooking oil),"(mineral water, eggs)",(soup)


#### 3. 挖掘关联规则

In [8]:
# 根据最小置信度产生
ass_rules = association_rules(item_sets, metric='confidence', min_threshold=0.2)
ass_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(mineral water),(spaghetti),0.238368,0.17411,0.059725,0.250559,1.439085,0.018223,1.102008
1,(spaghetti),(mineral water),0.17411,0.238368,0.059725,0.343032,1.439085,0.018223,1.159314
2,(mineral water),(chocolate),0.238368,0.163845,0.05266,0.220917,1.348332,0.013604,1.073256
3,(chocolate),(mineral water),0.163845,0.238368,0.05266,0.3214,1.348332,0.013604,1.122357
4,(mineral water),(eggs),0.238368,0.179709,0.050927,0.213647,1.188845,0.00809,1.043158
5,(eggs),(mineral water),0.179709,0.238368,0.050927,0.283383,1.188845,0.00809,1.062815


In [50]:
# 根据最小提升度产生
ass_rules = association_rules(item_sets, metric='lift', min_threshold=1.2)
ass_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(spaghetti),(mineral water),0.17411,0.238368,0.059725,0.343032,1.439085,0.018223,1.159314
1,(mineral water),(spaghetti),0.238368,0.17411,0.059725,0.250559,1.439085,0.018223,1.102008
2,(chocolate),(mineral water),0.163845,0.238368,0.05266,0.3214,1.348332,0.013604,1.122357
3,(mineral water),(chocolate),0.238368,0.163845,0.05266,0.220917,1.348332,0.013604,1.073256


### 二、使用efficient_apriori进行频繁项集与关联规则挖掘

In [9]:
from efficient_apriori import apriori as e_apriori

In [10]:
raw_data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,


需转换成[List[tuple]]的格式

In [11]:
ls_items = raw_data.apply(lambda se: tuple(se[se.notna()]), axis=1).tolist()
ls_items[:5]

[('shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil'),
 ('burgers', 'meatballs', 'eggs'),
 ('chutney',),
 ('turkey', 'avocado'),
 ('mineral water', 'milk', 'energy bar', 'whole wheat rice', 'green tea')]

#### 不同的参数verbosity，结果的详细程度不同

In [23]:
itemsets, rules = e_apriori(ls_items, min_support=0.05, min_confidence=0.2, verbosity=0)
dict_itemsets = {}
for d in itemsets.values():
    dict_itemsets.update(d)
dict_itemsets

{('shrimp',): 536,
 ('low fat yogurt',): 574,
 ('frozen smoothie',): 475,
 ('green tea',): 991,
 ('mineral water',): 1788,
 ('olive oil',): 494,
 ('burgers',): 654,
 ('eggs',): 1348,
 ('turkey',): 469,
 ('whole wheat rice',): 439,
 ('milk',): 972,
 ('french fries',): 1282,
 ('soup',): 379,
 ('frozen vegetables',): 715,
 ('spaghetti',): 1306,
 ('cookies',): 603,
 ('cooking oil',): 383,
 ('chocolate',): 1229,
 ('chicken',): 450,
 ('tomatoes',): 513,
 ('pancakes',): 713,
 ('grated cheese',): 393,
 ('ground beef',): 737,
 ('escalope',): 595,
 ('cake',): 608,
 ('eggs', 'mineral water'): 382,
 ('mineral water', 'spaghetti'): 448,
 ('chocolate', 'mineral water'): 395}

In [24]:
rules

[{mineral water} -> {eggs},
 {eggs} -> {mineral water},
 {spaghetti} -> {mineral water},
 {mineral water} -> {spaghetti},
 {mineral water} -> {chocolate},
 {chocolate} -> {mineral water}]

In [64]:
e_apriori(ls_items, min_support=0.05, min_confidence=0.2, verbosity=1)

Generating itemsets.
 Counting itemsets of length 1.
  Found 120 candidate itemsets of length 1.
  Found 25 large itemsets of length 1.
 Counting itemsets of length 2.
  Found 300 candidate itemsets of length 2.
  Found 3 large itemsets of length 2.
 Counting itemsets of length 3.
  Found 0 candidate itemsets of length 3.
Itemset generation terminated.

Generating rules from itemsets.
 Generating rules of size 2.
Rule generation terminated.



({1: {('shrimp',): 536,
   ('frozen smoothie',): 475,
   ('olive oil',): 494,
   ('mineral water',): 1788,
   ('low fat yogurt',): 574,
   ('green tea',): 991,
   ('eggs',): 1348,
   ('burgers',): 654,
   ('turkey',): 469,
   ('milk',): 972,
   ('whole wheat rice',): 439,
   ('french fries',): 1282,
   ('soup',): 379,
   ('frozen vegetables',): 715,
   ('spaghetti',): 1306,
   ('cookies',): 603,
   ('cooking oil',): 383,
   ('chicken',): 450,
   ('chocolate',): 1229,
   ('tomatoes',): 513,
   ('pancakes',): 713,
   ('grated cheese',): 393,
   ('escalope',): 595,
   ('ground beef',): 737,
   ('cake',): 608},
  2: {('eggs', 'mineral water'): 382,
   ('mineral water', 'spaghetti'): 448,
   ('chocolate', 'mineral water'): 395}},
 [{mineral water} -> {eggs},
  {eggs} -> {mineral water},
  {spaghetti} -> {mineral water},
  {mineral water} -> {spaghetti},
  {mineral water} -> {chocolate},
  {chocolate} -> {mineral water}])

In [65]:
e_apriori(ls_items, min_support=0.05, min_confidence=0.2, verbosity=2)

Generating itemsets.
 Counting itemsets of length 1.
  Found 120 candidate itemsets of length 1.
  Found 25 large itemsets of length 1.
    [('shrimp',), ('frozen smoothie',), ('olive oil',), ('mineral water',), ('low fat yogurt',), ('green tea',), ('eggs',), ('burgers',), ('turkey',), ('milk',), ('whole wheat rice',), ('french fries',), ('soup',), ('frozen vegetables',), ('spaghetti',), ('cookies',), ('cooking oil',), ('chicken',), ('chocolate',), ('tomatoes',), ('pancakes',), ('grated cheese',), ('escalope',), ('ground beef',), ('cake',)]
 Counting itemsets of length 2.
  Found 300 candidate itemsets of length 2.
   [('burgers', 'cake'), ('burgers', 'chicken'), ('burgers', 'chocolate'), ('burgers', 'cookies'), ('burgers', 'cooking oil'), ('burgers', 'eggs'), ('burgers', 'escalope'), ('burgers', 'french fries'), ('burgers', 'frozen smoothie'), ('burgers', 'frozen vegetables'), ('burgers', 'grated cheese'), ('burgers', 'green tea'), ('burgers', 'ground beef'), ('burgers', 'low fat yogu

({1: {('shrimp',): 536,
   ('frozen smoothie',): 475,
   ('olive oil',): 494,
   ('mineral water',): 1788,
   ('low fat yogurt',): 574,
   ('green tea',): 991,
   ('eggs',): 1348,
   ('burgers',): 654,
   ('turkey',): 469,
   ('milk',): 972,
   ('whole wheat rice',): 439,
   ('french fries',): 1282,
   ('soup',): 379,
   ('frozen vegetables',): 715,
   ('spaghetti',): 1306,
   ('cookies',): 603,
   ('cooking oil',): 383,
   ('chicken',): 450,
   ('chocolate',): 1229,
   ('tomatoes',): 513,
   ('pancakes',): 713,
   ('grated cheese',): 393,
   ('escalope',): 595,
   ('ground beef',): 737,
   ('cake',): 608},
  2: {('eggs', 'mineral water'): 382,
   ('mineral water', 'spaghetti'): 448,
   ('chocolate', 'mineral water'): 395}},
 [{mineral water} -> {eggs},
  {eggs} -> {mineral water},
  {spaghetti} -> {mineral water},
  {mineral water} -> {spaghetti},
  {mineral water} -> {chocolate},
  {chocolate} -> {mineral water}])

### 三、使用fptools进行频繁项集与关联规则挖掘
测试了一下，不懂咋用，放弃..= =

In [27]:
import fptools

In [33]:
raw_data.values

array([['shrimp', 'almonds', 'avocado', ..., 'frozen smoothie',
        'spinach', 'olive oil'],
       ['burgers', 'meatballs', 'eggs', ..., nan, nan, nan],
       ['chutney', nan, nan, ..., nan, nan, nan],
       ...,
       ['chicken', nan, nan, ..., nan, nan, nan],
       ['escalope', 'green tea', nan, ..., nan, nan, nan],
       ['eggs', 'frozen smoothie', 'yogurt cake', ..., nan, nan, nan]],
      dtype=object)

In [35]:
fptools.frequent_itemsets(raw_data.values, minsup=0.05)

<generator object frequent_itemsets at 0x00000203B6A42BA0>

In [37]:
fp_tree, _ = fptools.build_tree(raw_data.values, minsup=0.05)

In [None]:
fptools.itertools()

In [40]:
fptools.fpgrowth(fp_tree, 1)

<generator object fpgrowth at 0x00000203B4D7C4A0>

In [57]:
fpg_gen = fptools.fpgrowth(fp_tree, 5)

In [62]:
next(fpg_gen)

['mineral water', nan, nan, nan, nan]