## Referans: https://github.com/mineebasol/Apriori-Association-Rule

# Veriyi Anlama (Data Understanding)

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [66]:
# Read the basket file. Explicit column names are supplied, so the first line
# of the file is treated as data rather than as a header row.
df = pd.read_csv(
    "GroceryStoreDataSet.csv",
    header=None,
    names=["products"],
)


In [67]:
# Print a summary of the frame: index range, column names, non-null counts
# and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   products  20 non-null     object
dtypes: object(1)
memory usage: 288.0+ bytes


In [68]:
# Show the raw rows as a NumPy array; each row holds one basket as a single
# comma-separated string.
df.values

array([['MILK,BREAD,BISCUIT'],
       ['BREAD,MILK,BISCUIT,CORNFLAKES'],
       ['BREAD,TEA,BOURNVITA'],
       ['JAM,MAGGI,BREAD,MILK'],
       ['MAGGI,TEA,BISCUIT'],
       ['BREAD,TEA,BOURNVITA'],
       ['MAGGI,TEA,CORNFLAKES'],
       ['MAGGI,BREAD,TEA,BISCUIT'],
       ['JAM,MAGGI,BREAD,TEA'],
       ['BREAD,MILK'],
       ['COFFEE,COCK,BISCUIT,CORNFLAKES'],
       ['COFFEE,COCK,BISCUIT,CORNFLAKES'],
       ['COFFEE,SUGER,BOURNVITA'],
       ['BREAD,COFFEE,COCK'],
       ['BREAD,SUGER,BISCUIT'],
       ['COFFEE,SUGER,CORNFLAKES'],
       ['BREAD,SUGER,BOURNVITA'],
       ['BREAD,COFFEE,SUGER'],
       ['BREAD,COFFEE,SUGER'],
       ['TEA,MILK,COFFEE,CORNFLAKES']], dtype=object)

In [69]:
# Turn every comma-separated basket string into a list of product names,
# giving one list per transaction.
data = [basket.split(",") for basket in df["products"]]
data

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

# Veri Ön İşleme (Data Preprocessing)

In [70]:
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the baskets: one boolean column per distinct product, True
# where that product occurs in the transaction.
te = TransactionEncoder()
te_data = te.fit(data).transform(data)
df_te = pd.DataFrame(data=te_data, columns=te.columns_)
df_te


Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,True,False,True,False,False,False,False,False,True,False,False
1,True,False,True,False,False,True,False,False,True,False,False
2,False,True,True,False,False,False,False,False,False,False,True
3,False,False,True,False,False,False,True,True,True,False,False
4,True,False,False,False,False,False,False,True,False,False,True
5,False,True,True,False,False,False,False,False,False,False,True
6,False,False,False,False,False,True,False,True,False,False,True
7,True,False,True,False,False,False,False,True,False,False,True
8,False,False,True,False,False,False,True,True,False,False,True
9,False,False,True,False,False,False,False,False,True,False,False


# Veri Birliktelik Kuralları (Data Association Rules)

In [71]:
# df = a True/False array, or a DataFrame that has been one-hot encoded.
# min_support = we do not want the support of every possible combination, so
# only itemsets whose support reaches this threshold are returned.
# use_colnames = True, show the column (product) names in the itemsets.
# verbose = 1, report the number of combinations being processed.
#
# NOTE(review): the data set has 20 transactions, so the smallest non-zero
# support is 1/20 = 0.05. A threshold of 0.01 therefore filters nothing here;
# raise it if the intent is really to prune rare itemsets.

from mlxtend.frequent_patterns import apriori

df_apriori = apriori(df_te, min_support=0.01, use_colnames=True, verbose=1)
df_apriori

# How to read the result: BISCUIT appears in 35% of all baskets,
# BREAD appears in 65% of all baskets, and
# TEA, MAGGI, BREAD and BISCUIT are bought together in 5% of all baskets.

Processing 30 combinations | Sampling itemset size 54


Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.20,(BOURNVITA)
2,0.65,(BREAD)
3,0.15,(COCK)
4,0.40,(COFFEE)
...,...,...
78,0.05,"(TEA, BREAD, BISCUIT, MAGGI)"
79,0.10,"(COFFEE, CORNFLAKES, BISCUIT, COCK)"
80,0.05,"(MILK, JAM, BREAD, MAGGI)"
81,0.05,"(TEA, JAM, BREAD, MAGGI)"


In [72]:
# List the frequent itemsets from the highest support to the lowest.
df_apriori.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
2,0.65,(BREAD)
4,0.40,(COFFEE)
0,0.35,(BISCUIT)
10,0.35,(TEA)
5,0.30,(CORNFLAKES)
...,...,...
55,0.05,"(MILK, CORNFLAKES, BISCUIT)"
57,0.05,"(SUGER, BREAD, BOURNVITA)"
17,0.05,"(BISCUIT, SUGER)"
37,0.05,"(CORNFLAKES, MAGGI)"


In [73]:
# antecedent support = probability of the first itemset (A) occurring on its own,
# consequent support = probability of the second itemset (B) occurring on its own,
# support = probability of A and B occurring together,
# confidence = probability that B is sold given that A is sold,
# lift = how many times more likely B is to be sold when A is sold
# (confidence / consequent support; e.g. 0.666667 / 0.35 = 1.904762).

# Conviction is based on how often A occurs without B:
# (1 - consequent support) / (1 - confidence). A conviction of 1 means A and B
# are independent; the further it is from 1, the stronger the case for a rule.
# It is reported as inf when confidence is 1.
# Leverage measures how much more often A and B are sold together than would be
# expected from their separate sales: support - antecedent support * consequent support.

from mlxtend.frequent_patterns import association_rules
df_association = association_rules(df_apriori, metric="confidence",
                                   min_threshold=0.6)
df_association


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(COCK),(BISCUIT),0.15,0.35,0.10,0.666667,1.904762,0.0475,1.950
1,(BOURNVITA),(BREAD),0.20,0.65,0.15,0.750000,1.153846,0.0200,1.400
2,(JAM),(BREAD),0.10,0.65,0.10,1.000000,1.538462,0.0350,inf
3,(MAGGI),(BREAD),0.25,0.65,0.15,0.600000,0.923077,-0.0125,0.875
4,(MILK),(BREAD),0.25,0.65,0.20,0.800000,1.230769,0.0375,1.750
...,...,...,...,...,...,...,...,...,...
81,"(MILK, CORNFLAKES, COFFEE)",(TEA),0.05,0.35,0.05,1.000000,2.857143,0.0325,inf
82,"(COFFEE, TEA, CORNFLAKES)",(MILK),0.05,0.25,0.05,1.000000,4.000000,0.0375,inf
83,"(MILK, TEA)","(COFFEE, CORNFLAKES)",0.05,0.20,0.05,1.000000,5.000000,0.0400,inf
84,"(MILK, COFFEE)","(TEA, CORNFLAKES)",0.05,0.10,0.05,1.000000,10.000000,0.0450,inf


In [74]:
# Rebuild the confidence-based rule table, then keep only the rules that are
# both reliable (confidence above 0.6) and reasonably common (support of at
# least 0.2).
# NOTE(review): min_threshold=0.6 admits rules with confidence equal to 0.6,
# while the strict comparison below drops them again - confirm this is intended.
rules = association_rules(
    df_apriori,
    metric="confidence",
    min_threshold=0.6,
)
rules[rules["confidence"].gt(0.6) & rules["support"].ge(0.2)]


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75
5,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05
8,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
9,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
11,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25
