# Regras de associação usando o algoritmo Apriori

## Importação das bibliotecas

In [19]:
import pandas as pd
from apyori import apriori

## Importação dos dados

In [20]:
# Load the census CSV (path is relative to the notebook's working directory)
# and preview the first rows.
dataset = pd.read_csv('../data/raw_data/census.csv')
dataset.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Transformando variáveis numéricas em categóricas

In [21]:
# Discretize the numeric `age` column into five labelled ranges so it can be
# treated as a categorical item. With pd.cut's default right-closed intervals
# the ranges are (0, 17], (17, 25], (25, 40], (40, 60], (60, 90]; any value
# outside them becomes NaN.
# NOTE: this overwrites `age` in place, so the cell is meant to run once per
# fresh load of `dataset`.
limites_idade = [0, 17, 25, 40, 60, 90]
rotulos_idade = [f'faixa{k}' for k in range(1, len(limites_idade))]
dataset['age'] = pd.cut(dataset['age'], bins=limites_idade, labels=rotulos_idade)

In [22]:
# Check that `age` now holds the range labels instead of numbers.
dataset.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,faixa3,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,faixa4,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,faixa3,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,faixa4,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,faixa3,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Selecionando variáveis categóricas

In [23]:
# Keep only the columns that will be used as items in the transactions
# (the binned `age` plus the selected categorical attributes), then preview.
colunas_categoricas = [
    'age', 'workclass', 'education', 'marital-status', 'relationship',
    'occupation', 'sex', 'native-country', 'income',
]
dataset_apriori = dataset[colunas_categoricas]
dataset_apriori.head()

Unnamed: 0,age,workclass,education,marital-status,relationship,occupation,sex,native-country,income
0,faixa3,State-gov,Bachelors,Never-married,Not-in-family,Adm-clerical,Male,United-States,<=50K
1,faixa4,Self-emp-not-inc,Bachelors,Married-civ-spouse,Husband,Exec-managerial,Male,United-States,<=50K
2,faixa3,Private,HS-grad,Divorced,Not-in-family,Handlers-cleaners,Male,United-States,<=50K
3,faixa4,Private,11th,Married-civ-spouse,Husband,Handlers-cleaners,Male,United-States,<=50K
4,faixa3,Private,Bachelors,Married-civ-spouse,Wife,Prof-specialty,Female,Cuba,<=50K


In [24]:
# (rows, columns) of the full selection, before sampling.
dataset_apriori.shape

(32561, 9)

## Selecionando uma amostra

In [25]:
# Work on a random sample of 1000 rows to keep the rule mining fast.
# The fixed `random_state` makes the sample reproducible, so the support and
# confidence values computed from it are the same on every
# "Restart Kernel -> Run All".
dataset_apriori = dataset_apriori.sample(n=1000, random_state=42)
dataset_apriori.shape

(1000, 9)

In [26]:
# Turn every row into a transaction: a list with one string item per column.
# The underlying array is extracted once and iterated row by row; indexing
# `dataset_apriori.values[i, j]` inside the loop would ask pandas for the
# whole array again for every single cell.
transacoes = [[str(item) for item in linha] for linha in dataset_apriori.values]

In [27]:
# One transaction per sampled row is expected.
len(transacoes)

1000

In [28]:
# Inspect the first two transactions. Note that the text items keep the
# leading space present in the loaded data (e.g. ' Private').
transacoes[:2]

[['faixa4',
  ' Private',
  ' Assoc-voc',
  ' Married-civ-spouse',
  ' Husband',
  ' Exec-managerial',
  ' Male',
  ' United-States',
  ' >50K'],
 ['faixa3',
  ' Private',
  ' Bachelors',
  ' Never-married',
  ' Own-child',
  ' Sales',
  ' Female',
  ' United-States',
  ' <=50K']]

In [29]:
# Mine association rules from the transactions.
# min_support = 0.3    -> keep itemsets present in at least 30% of transactions
# min_confidence = 0.2 -> keep rules with confidence of at least 20%
regras = apriori(transacoes, min_support = 0.3, min_confidence = 0.2)
# Materialize the iterable returned by apriori so it can be measured and
# inspected more than once.
resultados = list(regras)

In [30]:
# Number of relation records that satisfied the thresholds above.
len(resultados)

38

In [31]:
# Display every relation record (itemset, support, and its ordered statistics
# with confidence and lift).
resultados

[RelationRecord(items=frozenset({' <=50K'}), support=0.753, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' <=50K'}), confidence=0.753, lift=1.0)]),
 RelationRecord(items=frozenset({' Female'}), support=0.323, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' Female'}), confidence=0.323, lift=1.0)]),
 RelationRecord(items=frozenset({' HS-grad'}), support=0.348, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' HS-grad'}), confidence=0.348, lift=1.0)]),
 RelationRecord(items=frozenset({' Husband'}), support=0.417, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' Husband'}), confidence=0.417, lift=1.0)]),
 RelationRecord(items=frozenset({' Male'}), support=0.677, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' Male'}), confidence=0.677, lift=1.0)]),
 RelationRecord(items=frozenset({' Married-civ-spouse'}), support=0