## Regras de Associação - Algoritmo Apriori

* Nome do dataset: **Groceries Dataset**
* Link: https://www.kaggle.com/heeraldedhia/groceries-dataset
* Objetivo: Minerar Regras de Associação utilizando o Algoritmo Apriori


#### Importando as bibliotecas

In [1]:
import numpy as np
import pandas as pd

#### Importando o dataset

In [2]:
# Load the basket file; in the preview below each row is one transaction
# and the numbered columns hold the items bought.
data = pd.read_csv(filepath_or_buffer='basket.csv')

In [3]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,whole milk,pastry,salty snack,,,,,,,,
1,sausage,whole milk,semi-finished bread,yogurt,,,,,,,
2,soda,pickled vegetables,,,,,,,,,
3,canned beer,misc. beverages,,,,,,,,,
4,sausage,hygiene articles,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
14958,butter milk,whipped/sour cream,,,,,,,,,
14959,bottled water,herbs,,,,,,,,,
14960,fruit/vegetable juice,onions,,,,,,,,,
14961,bottled beer,other vegetables,,,,,,,,,


#### Tratando valores NaN utilizando a função replace

In [4]:
# Swap every NaN for the integer 0, modifying `data` itself, so that the
# empty item slots can be filtered out by value later on.
data.replace(to_replace=np.nan, value=0, inplace=True)

In [5]:
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,whole milk,pastry,salty snack,0,0,0,0,0,0,0,0
1,sausage,whole milk,semi-finished bread,yogurt,0,0,0,0,0,0,0
2,soda,pickled vegetables,0,0,0,0,0,0,0,0,0
3,canned beer,misc. beverages,0,0,0,0,0,0,0,0,0
4,sausage,hygiene articles,0,0,0,0,0,0,0,0,0
5,sausage,whole milk,rolls/buns,0,0,0,0,0,0,0,0
6,whole milk,soda,0,0,0,0,0,0,0,0,0
7,frankfurter,soda,whipped/sour cream,0,0,0,0,0,0,0,0
8,frankfurter,curd,0,0,0,0,0,0,0,0,0
9,beef,white bread,0,0,0,0,0,0,0,0,0


#### Criando a função para remover os zeros

In [6]:
def removeAllZerosFromList(l):
    """Return a new list with the elements of *l* that are not equal to 0.

    The comparison is by value (``!= 0``), so ``0``, ``0.0`` and ``False``
    are all dropped, while strings such as ``'0'`` are kept. The input
    list is not modified and the order of the kept elements is preserved.
    """
    return [item for item in l if item != 0]

#### Transformando tudo em lista

In [7]:
# Build one list per DataFrame row: take the row's values as a plain
# Python list and strip the 0 placeholders, leaving only the items.
lista_todas_transacoes = [
    removeAllZerosFromList(linha.values.tolist())
    for _, linha in data.iterrows()
]

# Preview the first ten transactions.
lista_todas_transacoes[0:10]

[['whole milk', 'pastry', 'salty snack'],
 ['sausage', 'whole milk', 'semi-finished bread', 'yogurt'],
 ['soda', 'pickled vegetables'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['sausage', 'whole milk', 'rolls/buns'],
 ['whole milk', 'soda'],
 ['frankfurter', 'soda', 'whipped/sour cream'],
 ['frankfurter', 'curd'],
 ['beef', 'white bread']]

#### Instalando a biblioteca MLxtend

* http://rasbt.github.io/mlxtend/

In [8]:
!pip install mlxtend



### Importando a classe TransactionEncoder 

* A classe TransactionEncoder é utilizada para trabalhar com transações de dados em listas de Python.
* Codifica transações de conjuntos de dados, fornecidas como listas de listas em Python, em um array NumPy booleano
* Viabiliza a técnica one-hot-encoding -> representa uma variável categórica de maneira binária (0 e 1) ou (Falso e Verdadeiro)

In [9]:
# preprocessing é um submodulo da biblioteca mlxtend
from mlxtend.preprocessing import TransactionEncoder

#### Aplicando a técnica one-hot-encoding

* Transformando variáveis categóricas em booleanas

In [10]:
te = TransactionEncoder()

# First learn the item vocabulary from the transactions, then encode the
# same transactions into the boolean array shown below.
te.fit(lista_todas_transacoes)
data_te = te.transform(lista_todas_transacoes)
data_te

array([[False, False, False, ...,  True, False, False],
       [False, False, False, ...,  True,  True, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

### Encontrando o nome das colunas

In [11]:
te.columns_

['Instant food products',
 'UHT-milk',
 'abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer',
 'bottled water',
 'brandy',
 'brown bread',
 'butter',
 'butter milk',
 'cake bar',
 'candles',
 'candy',
 'canned beer',
 'canned fish',
 'canned fruit',
 'canned vegetables',
 'cat food',
 'cereals',
 'chewing gum',
 'chicken',
 'chocolate',
 'chocolate marshmallow',
 'citrus fruit',
 'cleaner',
 'cling film/bags',
 'cocoa drinks',
 'coffee',
 'condensed milk',
 'cooking chocolate',
 'cookware',
 'cream',
 'cream cheese ',
 'curd',
 'curd cheese',
 'decalcifier',
 'dental care',
 'dessert',
 'detergent',
 'dish cleaner',
 'dishes',
 'dog food',
 'domestic eggs',
 'female sanitary products',
 'finished products',
 'fish',
 'flour',
 'flower (seeds)',
 'flower soil/fertilizer',
 'frankfurter',
 'frozen chicken',
 'frozen dessert',
 'frozen fish',
 'frozen fruits',
 'frozen meals',
 'froze

### Construindo o DataFrame dos dados transformados

In [12]:
#Usando os produtos como nomes das colunas
df = pd.DataFrame(data_te, columns=te.columns_)
df

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14958,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
14959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14960,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14961,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Importando o apriori

In [13]:
from mlxtend.frequent_patterns import apriori

### Calculando os itemsets mais frequentes

In [14]:
# Mine the itemsets with support of at least 0.003; use_colnames=True
# reports them by product name rather than by column index.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.003)

# ascending=False -> list the itemsets from highest to lowest support.
frequent_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
108,0.157923,(whole milk)
67,0.122101,(other vegetables)
82,0.110005,(rolls/buns)
91,0.097106,(soda)
109,0.085879,(yogurt)
...,...,...
155,0.003074,"(whole milk, hamburger meat)"
160,0.003007,"(rolls/buns, newspapers)"
50,0.003007,(house keeping products)
157,0.003007,"(rolls/buns, margarine)"


### Importando o association_rules

In [15]:
from mlxtend.frequent_patterns import association_rules

### Estabelecendo as regras associativas

In [20]:
# Derive rules from the frequent itemsets, keeping those whose confidence
# is at least 0.001 (a very permissive cut-off).
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.001)

# Hide the metrics not discussed here and show the 15 rules with the
# highest lift.
(
    rules
    .drop(columns=['antecedent support', 'consequent support', 'leverage', 'conviction'])
    .sort_values(by='lift', ascending=False)
    .head(15)
)

# Reading lift: a value above 1 means antecedent and consequent appear
# together more often than expected if they were independent; a value
# close to 1 suggests independence; a value below 1 means they appear
# together less often than expected (negative association).

Unnamed: 0,antecedents,consequents,support,confidence,lift
6,(bottled beer),(sausage),0.003342,0.073746,1.222
7,(sausage),(bottled beer),0.003342,0.055371,1.222
72,(frankfurter),(other vegetables),0.005146,0.136283,1.11615
73,(other vegetables),(frankfurter),0.005146,0.042146,1.11615
184,(yogurt),(sausage),0.005748,0.066926,1.108986
185,(sausage),(yogurt),0.005748,0.095238,1.108986
128,(pastry),(sausage),0.003208,0.062016,1.027617
129,(sausage),(pastry),0.003208,0.053156,1.027617
178,(soda),(sausage),0.005948,0.061253,1.014975
179,(sausage),(soda),0.005948,0.09856,1.014975
