In [1]:
import matplotlib.pyplot
import pandas as pd
from apyori import apriori

In [2]:
retail_data = pd.read_excel("../../data_sets/online_retail_II.xlsx")

In [3]:
retail_data.head(5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [4]:
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB


In [5]:
canceled_data = retail_data[retail_data["Invoice"].str.startswith('C', na=False)]
canceled_data.head(5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
178,C489449,22087,PAPER BUNTING WHITE LACE,-12,2009-12-01 10:33:00,2.95,16321.0,Australia
179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,2009-12-01 10:33:00,1.65,16321.0,Australia
180,C489449,21895,POTTING SHED SOW 'N' GROW SET,-4,2009-12-01 10:33:00,4.25,16321.0,Australia
181,C489449,21896,POTTING SHED TWINE,-6,2009-12-01 10:33:00,2.1,16321.0,Australia
182,C489449,22083,PAPER CHAIN KIT RETRO SPOT,-12,2009-12-01 10:33:00,2.95,16321.0,Australia


In [6]:
cleaned_data = retail_data[~ retail_data["Invoice"].str.startswith('C', na=False)]
cleaned_data = cleaned_data[~ cleaned_data['Description'].isna()]
cleaned_data['Description'] = cleaned_data['Description'].astype(str)

In [7]:
grouped_retail_data = cleaned_data.groupby(["Invoice"])["Description"].apply(list).to_frame().reset_index()
grouped_retail_data

Unnamed: 0,Invoice,Description
0,489434,"[15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHE..."
1,489435,"[CAT BOWL , DOG BOWL , CHASING BALL DESIGN, HE..."
2,489436,"[DOOR MAT BLACK FLOCK , LOVE BUILDING BLOCK WO..."
3,489437,"[CHRISTMAS CRAFT HEART DECORATIONS, CHRISTMAS ..."
4,489438,"[DINOSAURS WRITING SET , SET OF MEADOW FLOWE..."
...,...,...
21291,538170,"[ASSORTED COLOUR BIRD ORNAMENT, HAND WARMER BA..."
21292,538171,"[3 TIER SWEETHEART GARDEN SHELF, FIRST AID TIN..."
21293,A506401,[Adjust bad debt]
21294,A516228,[Adjust bad debt]


In [8]:
transactions = grouped_retail_data["Description"].to_list()

### Построение модели

In [13]:
association_rules = apriori(transactions,
                            min_support=0.02,
                            min_confidence=0.2,
                            min_lift=3,
                            min_length=3)

In [14]:
rules = list(association_rules)

In [15]:
print(f"Количество правил: {len(rules)}")

Количество правил: 17


### Отображение правил

In [16]:
n = 1

for rule in rules:
    items = [x for x in rule[0]]
    print("==============================================")
    print(f"Правило {n}:")
    print(f"Предшествующий: {items[0]} ---> Последующий: {items[1]}")
    print(f"Поддержка (support): {rule[1]:.4f}")
    print(f"Достоверность (confidence): {rule[2][0][2]:.4f}")
    print(f"Зависимость (lift): {rule[2][0][3]:.4f}")
    print("==============================================\n")

    n += 1

Правило 1:
Предшествующий: 60 TEATIME FAIRY CAKE CASES ---> Последующий: 72 SWEETHEART FAIRY CAKE CASES
Поддержка (support): 0.0222
Достоверность (confidence): 0.3533
Зависимость (lift): 8.6380

Правило 2:
Предшествующий: PACK OF 60 PINK PAISLEY CAKE CASES ---> Последующий: 60 TEATIME FAIRY CAKE CASES
Поддержка (support): 0.0278
Достоверность (confidence): 0.4439
Зависимость (lift): 8.1840

Правило 3:
Предшествующий: PACK OF 72 RETRO SPOT CAKE CASES ---> Последующий: 60 TEATIME FAIRY CAKE CASES
Поддержка (support): 0.0283
Достоверность (confidence): 0.4513
Зависимость (lift): 6.8169

Правило 4:
Предшествующий: CHOCOLATE HOT WATER BOTTLE ---> Последующий: HOT WATER BOTTLE TEA AND SYMPATHY
Поддержка (support): 0.0225
Достоверность (confidence): 0.5162
Зависимость (lift): 10.8191

Правило 5:
Предшествующий: HEART OF WICKER SMALL ---> Последующий: HEART OF WICKER LARGE
Поддержка (support): 0.0259
Достоверность (confidence): 0.5027
Зависимость (lift): 10.4046

Правило 6:
Предшествующий: HOM