# Association rule discovery

To complete the exercise we will need the `mlextend` library since `scikit-learn` does not provide any tools for frequent itemset or association rule discovery.

In [1]:
!pip install mlxtend



Our first step is to download a piece of text from Wikipedia and to parse paragraphs.

In [2]:
import pandas as pd

In [3]:
# with open("./data/RC_2009-10", 'r') as file:
#     content = file.read().replace("\n", ",")
#     content = "["+content[:-1]+"]"
#     print(content[:1000])
#     with open("./data/RC_2009-10.json", "w") as f2:
#         f2.write(content)

In [5]:
comments = pd.read_json("./data/RC_2009-10.json")

In [4]:
# counts = comments[['author', 'subreddit', 'id']].groupby(['author', 'subreddit']).count()
counts = comments[['author', 'subreddit']].groupby(['author', 'subreddit']).size().unstack(fill_value=0)
counts.to_csv("./data/counts.csv")
counts.head()

In [None]:
counts_matrix = pd.read_json("./data/counts_matrix.json")
threshold = 5
df = counts_matrix.mask(counts_matrix <threshold,0)
df.head()

In [None]:
counts_matrix.to_csv("./data/counts_matrix.json")

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [24]:
frequent_itemsets = apriori(df, min_support=0.04, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.414313,(AskReddit)
1,0.053732,(DoesAnybodyElse)
2,0.205532,(IAmA)
3,0.050214,(Music)
4,0.205650,(WTF)
...,...,...
123,0.067942,"(funny, WTF, pics, reddit.com)"
124,0.054901,"(pics, WTF, politics, reddit.com)"
125,0.054611,"(pics, WTF, science, reddit.com)"
126,0.050117,"(funny, pics, science, reddit.com)"


In [26]:
from mlxtend.frequent_patterns import association_rules

association_rules(frequent_itemsets, 
                  metric='confidence', 
                  min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(worldnews),(reddit.com),0.101264,0.429232,0.071846,0.709490,1.652929,0.028380,1.964708
1,"(WTF, IAmA)",(AskReddit),0.084094,0.414313,0.072350,0.860349,2.076567,0.037509,4.193944
2,"(funny, IAmA)",(AskReddit),0.071599,0.414313,0.062237,0.869233,2.098008,0.032572,4.478858
3,"(pics, IAmA)",(AskReddit),0.088588,0.414313,0.076297,0.861259,2.078762,0.039594,4.221441
4,"(IAmA, politics)",(AskReddit),0.059630,0.414313,0.051426,0.862410,2.081540,0.026720,4.256755
...,...,...,...,...,...,...,...,...,...
113,"(funny, WTF, AskReddit, reddit.com)",(pics),0.063041,0.235369,0.053571,0.849779,3.610418,0.038733,5.090038
114,"(funny, pics, AskReddit, reddit.com)",(WTF),0.066419,0.205650,0.053571,0.806556,3.921985,0.039912,4.106353
115,"(funny, WTF, pics, reddit.com)",(AskReddit),0.067942,0.414313,0.053571,0.788477,1.903092,0.025421,2.768899
116,"(pics, WTF, AskReddit, reddit.com)",(funny),0.073712,0.174430,0.053571,0.726757,4.166474,0.040713,3.021376


In [27]:
association_rules(frequent_itemsets, metric='lift', min_threshold=5.0)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(funny, science)","(pics, reddit.com)",0.069583,0.143574,0.050117,0.720253,5.01659,0.040127,3.061427
1,"(funny, reddit.com)","(pics, science)",0.114799,0.083847,0.050117,0.436566,5.206681,0.040492,1.626015
2,"(pics, science)","(funny, reddit.com)",0.083847,0.114799,0.050117,0.597723,5.206681,0.040492,2.200477
3,"(pics, reddit.com)","(funny, science)",0.143574,0.069583,0.050117,0.34907,5.01659,0.040127,1.429365
4,"(funny, WTF, AskReddit)","(pics, reddit.com)",0.073841,0.143574,0.053571,0.72549,5.053069,0.042969,3.119837
5,"(funny, pics, AskReddit)","(WTF, reddit.com)",0.078624,0.132699,0.053571,0.681353,5.134571,0.043137,2.721825
6,"(funny, AskReddit, reddit.com)","(pics, WTF)",0.084512,0.119958,0.053571,0.633883,5.284212,0.043433,2.40372
7,"(pics, WTF, AskReddit)","(funny, reddit.com)",0.088223,0.114799,0.053571,0.607221,5.289415,0.043443,2.253687
8,"(WTF, AskReddit, reddit.com)","(funny, pics)",0.09794,0.107657,0.053571,0.546978,5.080767,0.043027,1.969756
9,"(pics, AskReddit, reddit.com)","(funny, WTF)",0.103506,0.09913,0.053571,0.517563,5.221042,0.04331,1.867331


Both frequent itemsets and association rules (antecedens and consequents) are returned as `frozenset`s, so we can use [standard API calls](https://docs.python.org/3/library/stdtypes.html#set-types-set-frozenset) to find subsets, supersets, etc.

In [28]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

capital_idx =  rules['antecedents'].apply(lambda x: x.issuperset({'capital'}))
rules[capital_idx]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
