In [2]:
import numpy as np
import pandas as pd

# Load the Mooloolaba wave export and clean it in a single chained expression.
df = (
    pd.read_csv('data/Coastal Data System - Waves (Mooloolaba) 01-2017 to 06 - 2019.csv')
    # -99.90 is mapped to NaN (presumably the dataset's missing-value sentinel — confirm).
    .replace(-99.90, np.nan)
    # The timestamp column is not used by the cells shown in this notebook.
    .drop(columns='Date/Time')
    # Keep only rows where every remaining column has a value.
    .dropna()
    # Renumber rows 0..n-1 after rows were dropped.
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Hs,Hmax,Tz,Tp,Peak Direction,SST
0,0.763,1.15,4.52,5.513,49.0,25.65
1,0.77,1.41,4.582,5.647,75.0,25.5
2,0.747,1.16,4.515,5.083,91.0,25.45
3,0.718,1.61,4.614,6.181,68.0,25.45
4,0.707,1.34,4.568,4.705,73.0,25.5


In [2]:
# Build one "transaction" per distinct Hmax value: the list of SST readings
# recorded at that Hmax. The result is a Series indexed by Hmax (named "SST").
#
# The SST column is selected *before* aggregating so that apply() receives a
# plain Series per group. Calling DataFrameGroupBy.apply on the whole frame
# operates on the grouping column too, which is deprecated since pandas 2.2,
# and it also builds a sub-DataFrame per group for no benefit.
#
# NOTE(review): Hmax and SST are both continuous measurements, so many groups
# in the displayed output hold a single reading — confirm this is the intended
# transaction definition for association-rule mining.
transactions = df.groupby("Hmax")["SST"].apply(list)
transactions

Hmax
0.510                                     [20.8]
0.520                                     [20.8]
0.530    [23.7, 23.7, 20.85, 20.75, 21.15, 22.6]
0.540                              [23.7, 20.85]
0.550                 [20.95, 20.75, 22.7, 21.5]
                          ...                   
7.037                                     [21.3]
7.091                                    [21.15]
7.262                                     [21.1]
7.327                                     [26.8]
7.906                                    [26.85]
Length: 2338, dtype: object

In [3]:
from apyori import apriori

# apyori's `apriori` mines frequent itemsets and the association rules derived
# from them. Its main arguments are:
#
# 1. transactions: the dataset to mine, an iterable of transactions (each an
#    iterable of items).
# 2. min_support: minimum support threshold for an itemset to count as frequent.
#    Support = (transactions containing the itemset) / (total transactions).
# 3. min_confidence: minimum confidence threshold for a rule.
#    Confidence = support(antecedent and consequent together) / support(antecedent),
#    i.e. the conditional probability of the consequent given the antecedent.
# 4. min_lift: minimum lift threshold for a rule.
#    Lift = confidence / support(consequent); a lift of 1 means the two sides
#    are independent, above 1 means a positive association.
# 5. max_length: upper bound on the size of the itemsets (not set here).
#
# Tune these to the application at hand to get a useful set of rules.
# NOTE(review): min_lift = 0.1 is far below 1, so it filters out almost nothing —
# confirm that is intended.

min_supp = 0.1
min_conf = 0.1
min_lift = 0.1
result = list(
    apriori(
        transactions=transactions,
        min_support=min_supp,
        min_confidence=min_conf,
        min_lift=min_lift,
    )
)
result

[RelationRecord(items=frozenset({21.9}), support=0.11804961505560307, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({21.9}), confidence=0.11804961505560307, lift=1.0)]),
 RelationRecord(items=frozenset({23.85}), support=0.10778443113772455, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({23.85}), confidence=0.10778443113772455, lift=1.0)]),
 RelationRecord(items=frozenset({24.05}), support=0.10949529512403763, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({24.05}), confidence=0.10949529512403763, lift=1.0)]),
 RelationRecord(items=frozenset({25.9}), support=0.10778443113772455, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({25.9}), confidence=0.10778443113772455, lift=1.0)]),
 RelationRecord(items=frozenset({25.95}), support=0.10564585115483319, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({25.95}), confidence=0.1

In [4]:
# Flatten the apriori output into one row per (itemset record, rule statistic)
# pair: each RelationRecord carries its support plus a list of ordered
# statistics, and every statistic becomes its own row.
rule_pairs = [
    (record, stat)
    for record in result
    for stat in record.ordered_statistics
]

# Column lists, kept under their own names in case other cells use them.
supports = [record.support for record, _ in rule_pairs]
confidence = [stat.confidence for _, stat in rule_pairs]
lifts = [stat.lift for _, stat in rule_pairs]
# frozensets are converted to plain lists for display in the table.
bases = [list(stat.items_base) for _, stat in rule_pairs]
adds = [list(stat.items_add) for _, stat in rule_pairs]

resultShow = pd.DataFrame({
    'support': supports,
    'confidence': confidence,
    'lift': lifts,
    'base': bases,
    'add': adds,
})
resultShow

Unnamed: 0,support,confidence,lift,base,add
0,0.11805,0.11805,1.0,[],[21.9]
1,0.107784,0.107784,1.0,[],[23.85]
2,0.109495,0.109495,1.0,[],[24.05]
3,0.107784,0.107784,1.0,[],[25.9]
4,0.105646,0.105646,1.0,[],[25.95]
5,0.122327,0.122327,1.0,[],[26.05]
6,0.109068,0.109068,1.0,[],[26.15]
7,0.100941,0.100941,1.0,[],[26.2]
8,0.107357,0.107357,1.0,[],[26.55]
9,0.101369,0.101369,1.0,[],[26.8]
