In [1]:
# Adult.ipynb
import os
from dataclasses import asdict

import pandas as pd

# Switch to the repository root before the project imports below; the
# relative 'data/' path used when loading the dataset also depends on it.
# NOTE(review): presumably pattern_mining is only importable from that
# directory — confirm. Set the NATELIB_ROOT environment variable to run on
# another machine; when it is unset the original hard-coded path is used.
os.chdir(os.environ.get('NATELIB_ROOT', '/Users/nateaugust/Repos/natelib/'))

from pattern_mining.FPGrowth import FPTree
from pattern_mining.apriori import apriori
from pattern_mining.association_rules import mine_association_rules



In [2]:
# Column names for the Adult data file, listed in file order.
cols = [
    'age', 'workclass', 'fnlwgt',
    'education', 'education-num',
    'marital-status', 'occupation', 'relationship',
    'race', 'sex',
    'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country',
    'income',
]

In [3]:
# Load the raw Adult file. No row is treated as a header; the column
# names are supplied from `cols` instead.
adult = pd.read_csv(
    'data/adult.data',
    header=None,
    names=cols,
)

In [4]:
# Preview the first five rows to check that names and values line up.
adult.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Remove, in place, every column whose dtype is int64.
int_columns = adult.dtypes[adult.dtypes == 'int64'].index.to_list()
adult.drop(columns=int_columns, inplace=True)

In [6]:
# Tag every value with the name of its column ("<column>:<value>"), so the
# same raw value appearing in two different columns yields two distinct items.
adult = pd.DataFrame(
    {name: f'{name}:' + adult[name] for name in adult.columns}
)

## Check to ensure our algorithms return the same frequent patterns

In [7]:
# One row per record, as a 2-D NumPy array of the prefixed item strings.
transactions = adult.to_numpy()

In [10]:
# min_sup is passed to apriori() as 30% of the number of transactions.
apriori_min_count = len(transactions) * 0.3
apriori_patterns = apriori(
    transactions,
    min_sup=apriori_min_count,
    prune_infrequent=True,
)

In [11]:
# Mine the same transactions with FP-Growth so the result can be compared
# against the Apriori patterns.
# NOTE(review): min_sup is given as the fraction 0.3 here, while apriori()
# receives len(transactions)*0.3; presumably FPTree.fit scales the fraction
# by the number of transactions — confirm against the FPTree implementation.
fpt = FPTree()
fpt.fit(transactions, min_sup=0.3)
fpg_patterns = fpt.mine()

In [12]:
# Both miners must report the same number of frequent patterns.
min_count = len(transactions) * 0.3
assert len(apriori_patterns) == len(fpg_patterns)

for itemset, support in fpg_patterns.items():
    # Apriori results are looked up by the sorted tuple of the itemset;
    # the pattern must be present there with an identical support.
    apriori_key = tuple(sorted(itemset))
    assert support == apriori_patterns[apriori_key]

    # The support must also exceed the minimum-support threshold.
    assert support > min_count

## Mine association rules

In [None]:
# Mine frequent patterns again with FP-Growth, this time with min_sup=0.05
# (lower than the 0.3 used for the Apriori comparison). The result feeds
# the association-rule mining step.
fptree = FPTree()
fptree.fit(transactions, min_sup=0.05)
frequent_patterns = fptree.mine()

In [28]:
# Derive association rules from the mined patterns.
# NOTE(review): presumably rules below 0.3 confidence are discarded — confirm.
association_rules = mine_association_rules(
    frequent_patterns,
    min_confidence=0.3,
)

In [29]:
# Flatten each rule dataclass into a plain dict and build the frame with the
# DataFrame constructor. DataFrame.from_dict is documented for dict input;
# passing a list of records only worked because it falls through to the
# constructor. Rows are sorted ascending by support, then by confidence.
rule_records = [asdict(rule) for rule in association_rules]
df = pd.DataFrame(rule_records).sort_values(by=['support', 'confidence'])

# Make support a ratio by dividing it by the number of transactions.
# NOTE(review): assumes the mined support is an absolute count — confirm.
df['support'] = df['support'] / len(transactions)

In [None]:
# Strongest association rules: the frame is sorted ascending by support and
# confidence, so they are its last five rows.
strongest_rules = df.tail(5)
print(strongest_rules.to_markdown())

|     | A                                  | B                                  |   support |   confidence |
|----:|:-----------------------------------|:-----------------------------------|----------:|-------------:|
| 280 | ('income: <=50K',)                 | ('race: White',)                   |  0.635699 |     0.837338 |
| 289 | ('native-country: United-States',) | ('income: <=50K',)                 |  0.675624 |     0.754165 |
| 288 | ('income: <=50K',)                 | ('native-country: United-States',) |  0.675624 |     0.889927 |
| 290 | ('native-country: United-States',) | ('race: White',)                   |  0.786862 |     0.878334 |
| 291 | ('race: White',)                   | ('native-country: United-States',) |  0.786862 |     0.921089 |


In [48]:
# race --> income: <=50K
# Keep the rules whose antecedent A is a single item of the 'race' attribute.
# Items are built as '<column>:<value>', so a prefix test on 'race:' selects
# exactly that attribute; a substring test could also match an item of
# another attribute that merely contains the text "race". The length is
# checked first so an empty antecedent is never indexed.
f1 = df[df.A.apply(lambda x: len(x) == 1 and x[0].startswith('race:'))]
print(f1[f1.B == ('income: <=50K',)].to_markdown())

|       | A                | B                  |   support |   confidence |
|------:|:-----------------|:-------------------|----------:|-------------:|
|    68 | ('race: Black',) | ('income: <=50K',) | 0.0840576 |      0.87612 |
| 10027 | ('race: White',) | ('income: <=50K',) | 0.635699  |      0.74414 |
