# Association rule discovery

To complete the exercise we will need the `mlextend` library since `scikit-learn` does not provide any tools for frequent itemset or association rule discovery.

In [1]:
!pip install mlxtend



Our first step is to download a piece of text from Wikipedia and to parse paragraphs.

In [1]:
import pandas as pd

In [3]:
# with open("./data/RC_2009-10", 'r') as file:
#     content = file.read().replace("\n", ",")
#     content = "["+content[:-1]+"]"
#     print(content[:1000])
#     with open("./data/RC_2009-10.json", "w") as f2:
#         f2.write(content)

In [4]:
comments = pd.read_json("./data/RC_2009-10.json")

In [5]:
# counts = comments[['author', 'subreddit', 'id']].groupby(['author', 'subreddit']).count()
counts = comments[['author', 'subreddit']].groupby(['author', 'subreddit']).size().unstack(fill_value=0)
counts.to_csv("./data/counts.csv")
counts.head()

subreddit,1000words,2012,2100science,3DMA,3rb,40Plus,411onsoaps,4WheelsNews,4chan,538,...,yoshiler,yospos,yourfav,youthpassion,youtube,yuen999,zelda,zen,zh,zombies
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---eeZurr---,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
---sniff---,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
--cough--,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-13-,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
counts_matrix = pd.read_csv("./data/counts.csv", index_col = 0)
threshold = 5
counts_matrix.head()
df = counts_matrix.mask(counts_matrix < threshold,0).astype(bool)

Unnamed: 0_level_0,1000words,2012,2100science,3DMA,3rb,40Plus,411onsoaps,4WheelsNews,4chan,538,...,yoshiler,yospos,yourfav,youthpassion,youtube,yuen999,zelda,zen,zh,zombies
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---eeZurr---,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
---sniff---,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
--cough--,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
-11,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
-13-,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [3]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [11]:
frequent_itemsets = apriori(df, min_support=0.002, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.142598,(AskReddit)
1,0.002778,(Christianity)
2,0.014425,(DoesAnybodyElse)
3,0.007422,(Economics)
4,0.064092,(IAmA)
...,...,...
1340,0.002274,"(worldnews, pics, AskReddit, politics, reddit...."
1341,0.002059,"(worldnews, pics, AskReddit, reddit.com, scien..."
1342,0.002048,"(worldnews, pics, AskReddit, politics, science..."
1343,0.002091,"(funny, pics, politics, technology, science, W..."


In [13]:
from mlxtend.frequent_patterns import association_rules

association_rules(frequent_itemsets, 
                  metric='confidence', 
                  min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(DoesAnybodyElse),(AskReddit),0.014425,0.142598,0.011047,0.765799,5.370329,0.008990,3.660970
1,(self),(AskReddit),0.005845,0.142598,0.004633,0.792661,5.558699,0.003800,4.135256
2,(todayilearned),(AskReddit),0.005041,0.142598,0.003818,0.757447,5.311755,0.003099,3.534902
3,(offbeat),(WTF),0.005899,0.044465,0.004247,0.720000,16.192359,0.003985,3.412624
4,(business),(reddit.com),0.004494,0.089371,0.003196,0.711217,7.958070,0.002794,3.153337
...,...,...,...,...,...,...,...,...,...
2323,"(worldnews, politics, science, WTF, funny)","(reddit.com, pics)",0.002649,0.031542,0.002338,0.882591,27.981529,0.002254,8.248591
2324,"(worldnews, politics, reddit.com, science, funny)","(WTF, pics)",0.002906,0.024088,0.002338,0.804428,33.395225,0.002268,4.990040
2325,"(worldnews, funny, science, WTF, reddit.com)","(politics, pics)",0.003314,0.015251,0.002338,0.705502,46.259969,0.002287,3.343819
2326,"(worldnews, pics, politics, science, WTF)","(reddit.com, funny)",0.003175,0.019380,0.002338,0.736486,38.002621,0.002277,3.721328


In [14]:
association_rules(frequent_itemsets, metric='lift', min_threshold=5.0)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(AskReddit),(DoesAnybodyElse),0.142598,0.014425,0.011047,0.077467,5.370329,0.008990,1.068336
1,(DoesAnybodyElse),(AskReddit),0.014425,0.142598,0.011047,0.765799,5.370329,0.008990,3.660970
2,(AskReddit),(self),0.142598,0.005845,0.004633,0.032491,5.558699,0.003800,1.027541
3,(self),(AskReddit),0.005845,0.142598,0.004633,0.792661,5.558699,0.003800,4.135256
4,(AskReddit),(todayilearned),0.142598,0.005041,0.003818,0.026775,5.311755,0.003099,1.022332
...,...,...,...,...,...,...,...,...,...
21889,(pics),"(worldnews, politics, reddit.com, science, WTF...",0.052649,0.002563,0.002338,0.044408,17.324970,0.002203,1.043790
21890,(politics),"(worldnews, pics, reddit.com, science, WTF, fu...",0.039082,0.002982,0.002338,0.059824,20.065050,0.002222,1.060460
21891,(science),"(worldnews, pics, politics, reddit.com, WTF, f...",0.026212,0.003153,0.002338,0.089198,28.288823,0.002255,1.094472
21892,(WTF),"(worldnews, pics, politics, reddit.com, scienc...",0.044465,0.002595,0.002338,0.052581,20.259035,0.002223,1.052760


In [28]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

capital_idx =  rules['antecedents'].apply(lambda x: x.issuperset({'capital'}))
rules[capital_idx]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [15]:
import sqlite3

In [16]:
con = sqlite3.connect('example.db')
cur = con.cursor()

In [None]:
cur.execute('''
CREATE TABLE IF NOT EXISTS Subredit (
    id integer PRIMARY KEY,
    name text NOT NULL
)
''')

cur.execute('''
CREATE TABLE IF NOT EXISTS Rule (
    id INTEGER PRIMARY KEY,
    support REAL NOT NULL DEFAULT 0,
    consequence REAL NOT NULL DEFAULT 0
)
''')


cur.execute('''
CREATE TABLE IF NOT EXISTS Ancendents (
    subredit_id integer not null,
    rule_id integer not null,
    FOREIGN KEY(subredit_id) REFERENCES subredit(id),
    FOREIGN KEY(rule_id) REFERENCES rule(id),
    PRIMARY KEY(subredit_id, rule_id)
)
''')