# Distribuição de Frequência
## Regras de Associação

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('CSVs/census.csv')
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
def calc_classes(classes):
    return np.insert(classes, 0, classes[0] - (classes[1]-classes[0]))

In [4]:
freq, classes = np.histogram(df['age'], bins='sturges')
freq, classes, len(classes)

(array([3130, 4066, 3376, 4353, 3399, 3876, 2722, 2716, 1924, 1225,  917,
         389,  273,   96,   51,   48], dtype=int64),
 array([17.    , 21.5625, 26.125 , 30.6875, 35.25  , 39.8125, 44.375 ,
        48.9375, 53.5   , 58.0625, 62.625 , 67.1875, 71.75  , 76.3125,
        80.875 , 85.4375, 90.    ]),
 17)

In [5]:
classes

array([17.    , 21.5625, 26.125 , 30.6875, 35.25  , 39.8125, 44.375 ,
       48.9375, 53.5   , 58.0625, 62.625 , 67.1875, 71.75  , 76.3125,
       80.875 , 85.4375, 90.    ])

In [6]:
classes = calc_classes(classes)
labels = [f'AG_{classes[i]}-{classes[i + 1]}' for i in range(len(classes)-1)]
df['age'] = pd.cut(df['age'], bins=classes,
                  labels=labels)
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,AG_35.25-39.8125,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,AG_48.9375-53.5,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,AG_35.25-39.8125,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,AG_48.9375-53.5,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,AG_26.125-30.6875,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
freq, classes = np.histogram(df['final-weight'], bins='sturges')
freq, classes, len(classes)

(array([ 6244, 13292,  8090,  3328,  1106,   306,   101,    54,    15,
            9,     3,     5,     3,     2,     1,     2], dtype=int64),
 array([  12285.  ,  104311.25,  196337.5 ,  288363.75,  380390.  ,
         472416.25,  564442.5 ,  656468.75,  748495.  ,  840521.25,
         932547.5 , 1024573.75, 1116600.  , 1208626.25, 1300652.5 ,
        1392678.75, 1484705.  ]),
 17)

In [8]:
classes = calc_classes(classes)
labels = [f'FW_{classes[i]}-{classes[i + 1]}' for i in range(len(classes)-1)]
df['final-weight'] = pd.cut(df['final-weight'], bins=classes,
                  labels=labels)
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,AG_35.25-39.8125,State-gov,FW_12285.0-104311.25,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,AG_48.9375-53.5,Self-emp-not-inc,FW_12285.0-104311.25,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,AG_35.25-39.8125,Private,FW_196337.5-288363.75,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,AG_48.9375-53.5,Private,FW_196337.5-288363.75,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,AG_26.125-30.6875,Private,FW_288363.75-380390.0,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
freq, classes = np.histogram(df['education-num'], bins='sturges')
freq, classes, len(classes)

(array([   51,   168,   333,   646,   514,   933,  1175,   433, 10501,
         7291,  1382,  1067,  5355,  1723,   576,   413], dtype=int64),
 array([ 1.    ,  1.9375,  2.875 ,  3.8125,  4.75  ,  5.6875,  6.625 ,
         7.5625,  8.5   ,  9.4375, 10.375 , 11.3125, 12.25  , 13.1875,
        14.125 , 15.0625, 16.    ]),
 17)

In [10]:
classes = calc_classes(classes)
labels = [f'EN_{classes[i]}-{classes[i + 1]}' for i in range(len(classes)-1)]
df['education-num'] = pd.cut(df['education-num'], bins=classes,
                  labels=labels)
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,AG_35.25-39.8125,State-gov,FW_12285.0-104311.25,Bachelors,EN_12.25-13.1875,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,AG_48.9375-53.5,Self-emp-not-inc,FW_12285.0-104311.25,Bachelors,EN_12.25-13.1875,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,AG_35.25-39.8125,Private,FW_196337.5-288363.75,HS-grad,EN_8.5-9.4375,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,AG_48.9375-53.5,Private,FW_196337.5-288363.75,11th,EN_6.625-7.5625,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,AG_26.125-30.6875,Private,FW_288363.75-380390.0,Bachelors,EN_12.25-13.1875,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [11]:
freq, classes = np.histogram(df['capital-gain'], bins='sturges')
freq, classes, len(classes)

(array([31100,   754,   454,    38,    49,     5,     2,     0,     0,
            0,     0,     0,     0,     0,     0,   159], dtype=int64),
 array([    0.    ,  6249.9375, 12499.875 , 18749.8125, 24999.75  ,
        31249.6875, 37499.625 , 43749.5625, 49999.5   , 56249.4375,
        62499.375 , 68749.3125, 74999.25  , 81249.1875, 87499.125 ,
        93749.0625, 99999.    ]),
 17)

In [12]:
classes = calc_classes(classes)
labels = [f'CG_{classes[i]}-{classes[i + 1]}' for i in range(len(classes)-1)]
df['capital-gain'] = pd.cut(df['capital-gain'], bins=classes,
                  labels=labels)
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,AG_35.25-39.8125,State-gov,FW_12285.0-104311.25,Bachelors,EN_12.25-13.1875,Never-married,Adm-clerical,Not-in-family,White,Male,CG_0.0-6249.9375,0,40,United-States,<=50K
1,AG_48.9375-53.5,Self-emp-not-inc,FW_12285.0-104311.25,Bachelors,EN_12.25-13.1875,Married-civ-spouse,Exec-managerial,Husband,White,Male,CG_-6249.9375-0.0,0,13,United-States,<=50K
2,AG_35.25-39.8125,Private,FW_196337.5-288363.75,HS-grad,EN_8.5-9.4375,Divorced,Handlers-cleaners,Not-in-family,White,Male,CG_-6249.9375-0.0,0,40,United-States,<=50K
3,AG_48.9375-53.5,Private,FW_196337.5-288363.75,11th,EN_6.625-7.5625,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,CG_-6249.9375-0.0,0,40,United-States,<=50K
4,AG_26.125-30.6875,Private,FW_288363.75-380390.0,Bachelors,EN_12.25-13.1875,Married-civ-spouse,Prof-specialty,Wife,Black,Female,CG_-6249.9375-0.0,0,40,Cuba,<=50K


In [13]:
freq, classes = np.histogram(df['capital-loos'], bins='sturges')
freq, classes, len(classes)

(array([31047,     6,    17,     8,    20,   269,   659,   305,   180,
           27,    12,     2,     0,     4,     2,     3], dtype=int64),
 array([   0.  ,  272.25,  544.5 ,  816.75, 1089.  , 1361.25, 1633.5 ,
        1905.75, 2178.  , 2450.25, 2722.5 , 2994.75, 3267.  , 3539.25,
        3811.5 , 4083.75, 4356.  ]),
 17)

In [14]:
classes = calc_classes(classes)
labels = [f'CL_{classes[i]}-{classes[i + 1]}' for i in range(len(classes)-1)]
df['capital-loos'] = pd.cut(df['capital-loos'], bins=classes,
                  labels=labels)
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,AG_35.25-39.8125,State-gov,FW_12285.0-104311.25,Bachelors,EN_12.25-13.1875,Never-married,Adm-clerical,Not-in-family,White,Male,CG_0.0-6249.9375,CL_-272.25-0.0,40,United-States,<=50K
1,AG_48.9375-53.5,Self-emp-not-inc,FW_12285.0-104311.25,Bachelors,EN_12.25-13.1875,Married-civ-spouse,Exec-managerial,Husband,White,Male,CG_-6249.9375-0.0,CL_-272.25-0.0,13,United-States,<=50K
2,AG_35.25-39.8125,Private,FW_196337.5-288363.75,HS-grad,EN_8.5-9.4375,Divorced,Handlers-cleaners,Not-in-family,White,Male,CG_-6249.9375-0.0,CL_-272.25-0.0,40,United-States,<=50K
3,AG_48.9375-53.5,Private,FW_196337.5-288363.75,11th,EN_6.625-7.5625,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,CG_-6249.9375-0.0,CL_-272.25-0.0,40,United-States,<=50K
4,AG_26.125-30.6875,Private,FW_288363.75-380390.0,Bachelors,EN_12.25-13.1875,Married-civ-spouse,Prof-specialty,Wife,Black,Female,CG_-6249.9375-0.0,CL_-272.25-0.0,40,Cuba,<=50K


In [15]:
freq, classes = np.histogram(df['hour-per-week'], bins='sturges')
freq, classes, len(classes)

(array([  295,   648,   761,  2239,  1307,  1999, 16137,  2713,  3827,
         1545,   301,   365,   216,    64,    38,   106], dtype=int64),
 array([ 1.   ,  7.125, 13.25 , 19.375, 25.5  , 31.625, 37.75 , 43.875,
        50.   , 56.125, 62.25 , 68.375, 74.5  , 80.625, 86.75 , 92.875,
        99.   ]),
 17)

In [16]:
classes = calc_classes(classes)
labels = [f'HW_{classes[i]}-{classes[i + 1]}' for i in range(len(classes)-1)]
df['hour-per-week'] = pd.cut(df['hour-per-week'], bins=classes,
                  labels=labels)
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,AG_35.25-39.8125,State-gov,FW_12285.0-104311.25,Bachelors,EN_12.25-13.1875,Never-married,Adm-clerical,Not-in-family,White,Male,CG_0.0-6249.9375,CL_-272.25-0.0,HW_37.75-43.875,United-States,<=50K
1,AG_48.9375-53.5,Self-emp-not-inc,FW_12285.0-104311.25,Bachelors,EN_12.25-13.1875,Married-civ-spouse,Exec-managerial,Husband,White,Male,CG_-6249.9375-0.0,CL_-272.25-0.0,HW_7.125-13.25,United-States,<=50K
2,AG_35.25-39.8125,Private,FW_196337.5-288363.75,HS-grad,EN_8.5-9.4375,Divorced,Handlers-cleaners,Not-in-family,White,Male,CG_-6249.9375-0.0,CL_-272.25-0.0,HW_37.75-43.875,United-States,<=50K
3,AG_48.9375-53.5,Private,FW_196337.5-288363.75,11th,EN_6.625-7.5625,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,CG_-6249.9375-0.0,CL_-272.25-0.0,HW_37.75-43.875,United-States,<=50K
4,AG_26.125-30.6875,Private,FW_288363.75-380390.0,Bachelors,EN_12.25-13.1875,Married-civ-spouse,Prof-specialty,Wife,Black,Female,CG_-6249.9375-0.0,CL_-272.25-0.0,HW_37.75-43.875,Cuba,<=50K


In [17]:
df_apriori = df[['age','workclass','final-weight','education','education-num',
                 'marital-status','relationship','race','capital-gain','capital-loos',
                 'hour-per-week','occupation', 'native-country','income']]

In [18]:
df_apriori.shape

(32561, 14)

In [19]:
df_apriori = df_apriori.sample(n = 1000)
df_apriori.shape

(1000, 14)

In [20]:
transacoes = []
for i in range(df_apriori.shape[0]):
    transacoes.append([str(df_apriori.values[i, j]) for j in range(df_apriori.shape[1])])

In [21]:
len(transacoes)

1000

In [22]:
transacoes[:2]

[['AG_62.625-67.1875',
  ' ?',
  'FW_12285.0-104311.25',
  ' HS-grad',
  'EN_8.5-9.4375',
  ' Widowed',
  ' Not-in-family',
  ' White',
  'CG_-6249.9375-0.0',
  'CL_-272.25-0.0',
  'HW_37.75-43.875',
  ' ?',
  ' United-States',
  ' <=50K'],
 ['AG_17.0-21.5625',
  ' Local-gov',
  'FW_12285.0-104311.25',
  ' Some-college',
  'EN_9.4375-10.375',
  ' Never-married',
  ' Not-in-family',
  ' White',
  'CG_-6249.9375-0.0',
  'CL_1905.75-2178.0',
  'HW_37.75-43.875',
  ' Other-service',
  ' United-States',
  ' <=50K']]

In [23]:
!pip install apyori



You should consider upgrading via the 'C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [24]:
from apyori import apriori

In [25]:
rules = apriori(transacoes, min_support = 0.3, min_confidence = 0.5, min_lift=2)
result = list(rules)
len(result)

13

In [26]:
print(*result, sep='\n\n')

RelationRecord(items=frozenset({'EN_8.5-9.4375', ' HS-grad'}), support=0.308, ordered_statistics=[OrderedStatistic(items_base=frozenset({' HS-grad'}), items_add=frozenset({'EN_8.5-9.4375'}), confidence=1.0, lift=3.2467532467532467), OrderedStatistic(items_base=frozenset({'EN_8.5-9.4375'}), items_add=frozenset({' HS-grad'}), confidence=1.0, lift=3.2467532467532467)])

RelationRecord(items=frozenset({' Husband', ' Married-civ-spouse'}), support=0.398, ordered_statistics=[OrderedStatistic(items_base=frozenset({' Husband'}), items_add=frozenset({' Married-civ-spouse'}), confidence=1.0, lift=2.1739130434782608), OrderedStatistic(items_base=frozenset({' Married-civ-spouse'}), items_add=frozenset({' Husband'}), confidence=0.8652173913043478, lift=2.1739130434782608)])

RelationRecord(items=frozenset({' Husband', ' United-States', ' Married-civ-spouse'}), support=0.357, ordered_statistics=[OrderedStatistic(items_base=frozenset({' Husband'}), items_add=frozenset({' United-States', ' Married-civ

In [27]:
df_apriori.drop(['final-weight','education-num','marital-status','relationship',
                 'capital-gain','capital-loos', 'native-country'], inplace=True, axis=1)

In [28]:
transacoes = []
for i in range(df_apriori.shape[0]):
    transacoes.append([str(df_apriori.values[i, j]) for j in range(df_apriori.shape[1])])

In [35]:
rules = apriori(transacoes, min_support = 0.2, min_confidence = 0.2, min_lift=0.9)
result = list(rules)
len(result)

23

In [36]:
print(*result, sep='\n\n')

RelationRecord(items=frozenset({' <=50K'}), support=0.756, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' <=50K'}), confidence=0.756, lift=1.0)])

RelationRecord(items=frozenset({' >50K'}), support=0.244, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' >50K'}), confidence=0.244, lift=1.0)])

RelationRecord(items=frozenset({' HS-grad'}), support=0.308, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' HS-grad'}), confidence=0.308, lift=1.0)])

RelationRecord(items=frozenset({' Private'}), support=0.689, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' Private'}), confidence=0.689, lift=1.0)])

RelationRecord(items=frozenset({' Some-college'}), support=0.222, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({' Some-college'}), confidence=0.222, lift=1.0)])

RelationRecord(items=frozenset({' White'}), support=0.842, o