> # 亚马逊购物数据

In [2]:
#加载相应的库
import numpy as np
import pandas as pd
import networkx as nx
from itertools import combinations  
from collections import defaultdict, Counter
from mlxtend.preprocessing import TransactionEncoder  
from mlxtend.frequent_patterns import apriori,fpgrowth, association_rules

## 数据预处理
给定的数据格式为（物品1，物品2），因此构建一个购物有向图

In [2]:
#给出的数据格式是（物品1，物品2），因此构建一个购物图
G = nx.DiGraph() 
count = 0
filename = 'C:/Users/pipi_s/Desktop/data_mining/amazon/Amazon0302.txt'
with open(filename, 'r') as file: 
    # 跳过前n-1行  
    for _ in range(4):  
        next(file) 
    for line in file:
        # 从第n行开始读取数据  
        nodes = line.strip().split()
        # 添加边到图中  
        G.add_edge(nodes[0], nodes[1])
        count = count + 1

In [None]:
# 将长度为k的路径上的节点视为一次购物事务
k = 3
# 获取所有节点对之间的路径  
with open('C:/Users/pipi_s/Desktop/data_mining/amazon/shopping.txt', 'w', encoding='utf-8') as f:
    for source in G.nodes():
        for target in G.nodes():
            if source != target:
                for path in nx.all_simple_paths(G,source=source,target=target, cutoff=k):
                    path_as_string = ', '.join(path) 
                    f.write(path_as_string + '\n')
print('done!')

In [3]:
with open("C:/Users/pipi_s/Desktop/data_mining/amazon/shopping.txt", 'r') as f:
    dataset = [line.strip().split() for line in f]
print(len(dataset))

518464


## 关联规则挖掘

In [9]:
# 使用mlxtend的TransactionEncoder进行编码  
te = TransactionEncoder()  
te_ary = te.fit(dataset).transform(dataset)  
df = pd.DataFrame(te_ary, columns=te.columns_)  

# 使用FP-Growth算法找到频繁项集  
frequent_itemsets = fpgrowth(df, min_support=0.0003, use_colnames=True)  
if frequent_itemsets.empty:  
    print("没有找到频繁项集，请检查数据集或调整min_support的值。")  
else:  
    # 从频繁项集中生成关联规则  
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)[['antecedents', 'consequents', 'support', 'confidence', 'lift']]  
      
    # 打印关联规则  
    print(rules)

   antecedents consequents   support  confidence        lift
0         (24)        (8,)  0.001254    0.863214   98.643440
1        (142)       (23,)  0.000640    0.864583  197.382357
2         (50)       (30,)  0.000615    0.952239  245.256602
3        (175)       (99,)  0.000600    0.987302  228.009056
4        (177)       (99,)  0.000596    0.990385  228.721055
5        (430)      (100,)  0.000320    0.813725  299.635918
6        (283)       (76,)  0.000312    0.857143  317.426939
7         (96)       (56,)  0.000378    0.855895  360.187376
8        (451)      (158,)  0.000309    0.874317  355.809936
9        (457)      (167,)  0.000309    0.935673  352.809102
10       (364)      (303,)  0.000374    0.866071  281.168978
11       (365)      (303,)  0.000374    0.866071  281.168978
12      (1663)      (449,)  0.000365    0.825328  304.773933
13      (1672)      (481,)  0.000916    0.944334  120.177511
14      (1963)     (1241,)  0.000415    0.860000  258.780638
15       (393)      (368

In [10]:
rules_sorted = rules.sort_values(by=['confidence', 'lift'], ascending=[False, False])
print(rules_sorted)

   antecedents consequents   support  confidence        lift
4        (177)       (99,)  0.000596    0.990385  228.721055
3        (175)       (99,)  0.000600    0.987302  228.009056
17      (2647)      (976,)  0.000363    0.954315  311.376858
2         (50)       (30,)  0.000615    0.952239  245.256602
13      (1672)      (481,)  0.000916    0.944334  120.177511
9        (457)      (167,)  0.000309    0.935673  352.809102
16      (2646)      (976,)  0.000359    0.902913  294.605217
8        (451)      (158,)  0.000309    0.874317  355.809936
10       (364)      (303,)  0.000374    0.866071  281.168978
11       (365)      (303,)  0.000374    0.866071  281.168978
1        (142)       (23,)  0.000640    0.864583  197.382357
0         (24)        (8,)  0.001254    0.863214   98.643440
14      (1963)     (1241,)  0.000415    0.860000  258.780638
6        (283)       (76,)  0.000312    0.857143  317.426939
7         (96)       (56,)  0.000378    0.855895  360.187376
15       (393)      (368

## 挖掘结果分析

### 说明
由于购物图中节点数和边数有点大，节点间的路径需要很大的内存存储，同时需要很久的时间，因此选用部分路径进行关联规则分析。因为路径数据不完整，导致支持度不能选择太高，因此从置信度和支持度进行分析。


### 分析
置信度前5的关联规则如下
* \[177\]->\[99\]，置信度为0.990385，提升度为228.721025
* \[175\]->\[99\]，置信度为0.987302，提升度为228.009056
* \[2647\]->\[976\]，置信度为0.954315，提升度为311.376851
* \[50\]->\[30\]，置信度为0.952239，提升度为245.256602
* \[1672\]->\[481\]，置信度为0.944334，提升度为120.177511

因此，在用户浏览商品\[177,175,2647,50,1672\]时，可以在商品相关页面推荐以下商品\[99,99,976,30,481\]