In [1]:
import pandas as pd

In [2]:
# Load the mined association rules and turn both item-list columns into
# hashable frozensets so they behave like sets downstream.
rules = pd.read_json("../data/arules.json").assign(
    antecedents=lambda frame: frame["antecedents"].apply(frozenset),
    consequents=lambda frame: frame["consequents"].apply(frozenset),
)

In [3]:
# Display the loaded rules frame (pandas elides the middle rows of a large frame).
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bitcoin),(CryptoCurrency),0.012683,0.035937,0.003876,0.305556,8.502633,0.003420,1.388251
1,(SHIBArmy),(CryptoCurrency),0.003641,0.035937,0.001174,0.322581,8.976386,0.001044,1.423141
2,(ethtrader),(CryptoCurrency),0.004932,0.035937,0.002231,0.452381,12.588313,0.002054,1.760464
3,(SatoshiStreetBets),(CryptoCurrency),0.004228,0.035937,0.001762,0.416667,11.594499,0.001610,1.652680
4,(CryptoMoonShots),(CryptoCurrency),0.003523,0.035937,0.001057,0.300000,8.348039,0.000930,1.377233
...,...,...,...,...,...,...,...,...,...
1016443,"(Showerthoughts, Wellthatsucks, PS5)","(SelfAwarewolves, assholedesign, videos, movie...",0.000352,0.000235,0.000235,0.666667,2838.333333,0.000235,2.999295
1016444,"(Showerthoughts, science, Wellthatsucks)","(SelfAwarewolves, PS5, assholedesign, videos, ...",0.000587,0.000235,0.000235,0.400000,1703.000000,0.000235,1.666275
1016445,"(science, Wellthatsucks, PS5)","(SelfAwarewolves, Showerthoughts, assholedesig...",0.000235,0.000235,0.000235,1.000000,4257.500000,0.000235,
1016446,"(SelfAwarewolves, PS5)","(Showerthoughts, assholedesign, Wellthatsucks,...",0.000235,0.000235,0.000235,1.000000,4257.500000,0.000235,


In [4]:
import sqlite3
from itertools import chain

In [5]:
# Connection to the SQLite file that stores the rules; reused by every cell below.
con = sqlite3.connect('../data/example.db')

In [6]:
def create_db(conn, create_indexes=False):
    """Create the rule-storage schema on ``conn`` if it does not exist yet.

    Tables:
        Subreddits   -- one row per subreddit name.
        Rules        -- support / confidence / lift of one association rule.
        Antecedents  -- (rule_id, subreddit_id) links for the IF side of a rule.
        Consequents  -- (rule_id, subreddit_id) links for the THEN side of a rule.

    Args:
        conn: an open ``sqlite3`` connection. The schema is created on this
            connection (not on any module-level one) and then committed.
        create_indexes: when true, also create the secondary indexes that
            speed up lookups by subreddit name and by ``subreddit_id``.
            Defaults to ``False`` so existing callers get the same schema
            as before.

    Every statement uses ``IF NOT EXISTS``, so the call is safe to repeat.
    """
    cur = conn.cursor()
    cur.execute('''
    CREATE TABLE IF NOT EXISTS Subreddits (
        id integer PRIMARY KEY,
        name text NOT NULL
    )
    ''')

    cur.execute('''
    CREATE TABLE IF NOT EXISTS Rules (
        id INTEGER PRIMARY KEY,
        support REAL NOT NULL DEFAULT 0,
        confidence REAL NOT NULL DEFAULT 0,
        lift REAL NOT NULL DEFAULT 0
    )
    ''')

    # Both link tables share the same shape; the foreign keys must name the
    # real parent table ``Subreddits``.
    cur.execute('''
    CREATE TABLE IF NOT EXISTS Antecedents (
        rule_id integer not null,
        subreddit_id integer not null,
        FOREIGN KEY(subreddit_id) REFERENCES Subreddits(id),
        FOREIGN KEY(rule_id) REFERENCES Rules(id),
        PRIMARY KEY(rule_id, subreddit_id)
    )
    ''')

    cur.execute('''
    CREATE TABLE IF NOT EXISTS Consequents (
        rule_id integer not null,
        subreddit_id integer not null,
        FOREIGN KEY(subreddit_id) REFERENCES Subreddits(id),
        FOREIGN KEY(rule_id) REFERENCES Rules(id),
        PRIMARY KEY(rule_id, subreddit_id)
    )
    ''')

    if create_indexes:
        # Only indexes that the primary keys do not already provide:
        # name lookups on Subreddits and subreddit_id lookups on the link
        # tables (their composite primary key leads with rule_id).
        cur.execute('''
        CREATE UNIQUE INDEX IF NOT EXISTS subreddits_name_index
        ON Subreddits (name)''')
        cur.execute('''
        CREATE INDEX IF NOT EXISTS Antecedents_subreddit_id_index
        ON Antecedents (subreddit_id)''')
        cur.execute('''
        CREATE INDEX IF NOT EXISTS Consequents_subreddit_id_index
        ON Consequents (subreddit_id)''')

    conn.commit()

In [7]:
def add_rule(rule, cur):
    """Insert one rule's metrics into ``Rules`` and return the new row id.

    ``rule`` is indexed by the keys "support", "confidence" and "lift";
    ``cur`` is the cursor the INSERT runs on. Nothing is committed here.
    """
    metrics = tuple(rule[key] for key in ("support", "confidence", "lift"))
    cur.execute(
        '''INSERT INTO Rules (support,confidence, lift) VALUES(?,?,?)''',
        metrics,
    )
    return cur.lastrowid

def _insert_rule_members(table, rule_id, subreddit_names, cur):
    """Link ``rule_id`` to each named subreddit in ``table`` with one INSERT.

    ``table`` is only ever one of the fixed names passed by the two public
    wrappers below, so formatting it into the statement is safe; the
    subreddit names themselves are always bound as parameters.
    """
    names = list(subreddit_names)
    if not names:
        # "INSERT ... VALUES" with no tuples is a syntax error, so an empty
        # item set is simply a no-op.
        return
    value_row = "(?, (SELECT id from Subreddits WHERE name = ?))"
    sql = "INSERT INTO {}(rule_id, subreddit_id) VALUES ".format(table)
    sql += ",".join([value_row] * len(names))
    # Parameters alternate rule_id, name, rule_id, name, ... to match the rows.
    params = [value for name in names for value in (rule_id, name)]
    cur.execute(sql, params)


def add_antecedents(rule_id, antecedents, cur):
    """Store the IF-side subreddits of rule ``rule_id`` in ``Antecedents``.

    ``antecedents`` is any iterable of subreddit names; each name is resolved
    to its id through the ``Subreddits`` table. An empty iterable inserts
    nothing. Nothing is committed here.
    """
    _insert_rule_members("Antecedents", rule_id, antecedents, cur)


def add_consequents(rule_id, consequents, cur):
    """Store the THEN-side subreddits of rule ``rule_id`` in ``Consequents``.

    ``consequents`` is any iterable of subreddit names; each name is resolved
    to its id through the ``Subreddits`` table. An empty iterable inserts
    nothing. Nothing is committed here.
    """
    _insert_rule_members("Consequents", rule_id, consequents, cur)
    
    
def add_subreddits(subreddits, cur):
    """Insert every subreddit name that is not yet in ``Subreddits``.

    Args:
        subreddits: iterable of subreddit names (duplicates are ignored).
        cur: cursor used for the lookup and the inserts; nothing is
            committed here.

    New names are inserted in sorted order so the ids they receive do not
    depend on set iteration order, and with ``executemany`` so the number of
    names is not capped by SQLite's per-statement bound-variable limit.
    """
    cur.execute('''SELECT name FROM Subreddits''')
    already_added = {row[0] for row in cur.fetchall()}
    to_add = sorted(set(subreddits) - already_added)
    if not to_add:
        return
    cur.executemany(
        'INSERT INTO subreddits(name) values (?)',
        [(name,) for name in to_add],
    )
        
        


In [8]:
# Create the tables on the connection opened above (no-op for tables that already exist).
create_db(con)

In [21]:
# Every subreddit name that occurs on either side of any rule.
subreddits = set(chain.from_iterable(rules["antecedents"]))
subreddits |= set(chain.from_iterable(rules["consequents"]))
print(len(subreddits))
subreddits

921


{'196',
 '2007scape',
 '2meirl4meirl',
 '3DS',
 '3Dprinting',
 '40kLore',
 '49ers',
 '4chan',
 '90DayFiance',
 '90dayfianceuncensored',
 'ABoringDystopia',
 'ADHD',
 'AEWOfficial',
 'AFL',
 'AMA',
 'AMCSTOCKS',
 'ANormalDayInRussia',
 'ARK',
 'ASX_Bets',
 'ATBGE',
 'AbandonedPorn',
 'AbruptChaos',
 'AbsoluteUnits',
 'Accounting',
 'ActualPublicFreakouts',
 'Advice',
 'AdviceAnimals',
 'AirForce',
 'AmItheAsshole',
 'Amd',
 'AnalogCommunity',
 'AnarchyChess',
 'Android',
 'AnimalCrossing',
 'AnimalsBeingBros',
 'AnimalsBeingDerps',
 'AnimalsBeingJerks',
 'Animemes',
 'Animesuggest',
 'ApexOutlands',
 'Aquariums',
 'AreTheStraightsOK',
 'Art',
 'AskALiberal',
 'AskAnAmerican',
 'AskConservatives',
 'AskCulinary',
 'AskEurope',
 'AskHistorians',
 'AskLosAngeles',
 'AskMen',
 'AskMenOver30',
 'AskOldPeople',
 'AskOuija',
 'AskRedditAfterDark',
 'AskScienceFiction',
 'AskUK',
 'AskWomen',
 'Astros',
 'AusFinance',
 'Austin',
 'BB_Stock',
 'Bad_Cop_No_Donut',
 'Baking',
 'BanVideoGames',
 'B

In [10]:
# Register all subreddit names before loading rules: the rule-link inserts
# resolve names to ids through the Subreddits table.
cur=con.cursor()
add_subreddits(subreddits, cur)

In [11]:
# Bulk-load every rule and its antecedent/consequent links, committing once
# at the end so the whole load is a single transaction.
cur = con.cursor()
for index, row in rules.iterrows():
    # Progress marker every 100,000 rows.
    if not index % 100000:
        print(index)
    rule_id = add_rule(row, cur)
    add_antecedents(rule_id, row["antecedents"], cur)
    add_consequents(rule_id, row["consequents"], cur)

con.commit()

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000


In [12]:
def get_antecendents(rule_id, cur):
    """Return the IF-side subreddit names of rule ``rule_id``.

    The result is the raw ``fetchall()`` list of 1-tuples ``(name,)``;
    it is empty when the rule does not exist or has no antecedents.
    """
    query = '''
    SELECT subreddits.name
    FROM rules
    INNER JOIN antecedents ON rules.id = Antecedents.rule_id
    INNER JOIN subreddits ON Antecedents.subreddit_id = subreddits.id
    WHERE rules.id = ?
    '''
    cur.execute(query, (rule_id,))
    return cur.fetchall()

def get_consequents(rule_id, cur):
    """Return the THEN-side subreddit names of rule ``rule_id``.

    The result is the raw ``fetchall()`` list of 1-tuples ``(name,)``;
    it is empty when the rule does not exist or has no consequents.
    """
    query = '''
    SELECT subreddits.name
    FROM rules
    INNER JOIN Consequents ON rules.id = Consequents.rule_id
    INNER JOIN subreddits ON Consequents.subreddit_id = subreddits.id
    WHERE rules.id = ?
    '''
    cur.execute(query, (rule_id,))
    return cur.fetchall()

def get_rule_data(rule_id, cur):
    """Return the ``Rules`` row with id ``rule_id`` as a tuple, or ``None``.

    The tuple holds every column of the row (``SELECT *``).
    """
    query = '''
    SELECT *
    FROM rules
    WHERE rules.id=?'''
    cur.execute(query, (rule_id,))
    matches = cur.fetchall()
    return matches[0] if matches else None

def get_rule(rule_id,cur):
    """Print rule ``rule_id``: its metrics row, then its IF and THEN sides.

    All three lookups run before anything is printed. Returns ``None``.
    """
    metrics_row = get_rule_data(rule_id, cur)
    if_side = get_antecendents(rule_id, cur)
    then_side = get_consequents(rule_id, cur)
    for line in ((metrics_row,), ("IF:", if_side), ("THEN", then_side)):
        print(*line)


In [13]:
# Spot-check one stored rule: prints its metrics row and both sides.
get_rule(95, cur)

(95, 0.0003523194, 0.30000000000000004, 4.0291798107)
IF: [('NYYankees',)]
THEN [('nottheonion',)]


In [14]:
def get_subreddit_ids(subreddits, cur):
    """Return the set of ``Subreddits.id`` values for the given names.

    ``subreddits`` is a sized collection of names. Names with no matching
    row simply contribute nothing to the result.
    """
    placeholders = ','.join('?' for _ in range(len(subreddits)))
    query = 'SELECT id FROM Subreddits WHERE name IN (' + placeholders + ")"
    cur.execute(query, tuple(subreddits))
    return {row[0] for row in cur.fetchall()}

def get_appropriate_rule_ids(subreddit_ids, cur):
    """Return ids of the rules that apply to a user's subreddit ids.

    A rule applies when
      * every one of its antecedents is in ``subreddit_ids``, and
      * at least one of its consequents is NOT in ``subreddit_ids``
        (otherwise it has nothing new to recommend).

    Args:
        subreddit_ids: collection of ``Subreddits.id`` values the user
            already follows.
        cur: cursor to run the query on.

    Returns:
        A list of rule ids; empty when ``subreddit_ids`` is empty.
    """
    ids = tuple(subreddit_ids)
    if not ids:
        # With no known subreddits no antecedent can match, so no rule applies.
        return []

    placeholders = ','.join(["?"] * len(ids))
    # Joining antecedents with consequents yields one row per
    # (antecedent, consequent) pair, so count(*) would be inflated for rules
    # with several new consequents. Count DISTINCT antecedents instead and
    # compare that with how many of the rule's antecedents the user has.
    sql = '''
    SELECT R.id
    FROM rules as R
    INNER JOIN antecedents as A ON R.id = A.rule_id
    INNER JOIN Consequents as C ON R.id = C.rule_id
    WHERE
    C.subreddit_id NOT IN (''' + placeholders + ''')
    GROUP BY R.id
    HAVING
    count(DISTINCT A.subreddit_id) = (SELECT count(*)
                FROM antecedents
                WHERE rule_id = R.id AND
                antecedents.subreddit_id IN (''' + placeholders + '''))
    '''

    cur.execute(sql, ids + ids)
    return [row[0] for row in cur.fetchall()]


def get_recomendations(subreddits, cur):
    """Recommend subreddits for a user who follows ``subreddits``.

    Looks up the ids of the given names, finds the rules that apply to them,
    and returns the consequents of those rules that the user does not follow
    yet, best first. A subreddit's score is the highest
    ``confidence + support`` among the applicable rules that recommend it.

    Args:
        subreddits: collection of subreddit names.
        cur: cursor to run the queries on.

    Returns:
        A list of 1-tuples ``(name,)`` ordered by descending score; empty
        when no rule applies.
    """
    subreddit_ids = get_subreddit_ids(subreddits, cur)
    rule_candidates = list(get_appropriate_rule_ids(subreddit_ids, cur))
    if not rule_candidates:
        return []

    excluded = tuple(subreddit_ids)
    # Each candidate rule id is a bound variable and SQLite caps the number
    # of variables per statement, so query the candidates in batches and
    # merge the per-subreddit maxima in Python.
    max_sql_variables = 900
    batch_size = max(1, max_sql_variables - len(excluded))
    excluded_placeholders = ','.join(["?"] * len(excluded))

    best = {}  # subreddit_id -> (name, best score seen so far)
    for start in range(0, len(rule_candidates), batch_size):
        batch = tuple(rule_candidates[start:start + batch_size])
        sql = '''
        SELECT C.subreddit_id, S.name, MAX(R.confidence + R.support)
        FROM consequents AS C
        INNER JOIN Rules as R ON C.rule_id = R.id
        INNER JOIN subreddits as S ON C.subreddit_id = S.id
        WHERE
        C.rule_id IN (''' + ','.join(["?"] * len(batch)) + ''') AND
        C.subreddit_id NOT IN (''' + excluded_placeholders + ''')
        GROUP BY C.subreddit_id
        '''
        cur.execute(sql, batch + excluded)
        for subreddit_id, name, score in cur.fetchall():
            if subreddit_id not in best or score > best[subreddit_id][1]:
                best[subreddit_id] = (name, score)

    ranked = sorted(best.values(), key=lambda entry: entry[1], reverse=True)
    return [(name,) for name, _score in ranked]
    

In [15]:
# Resolve a few names to ids; names that are not in the table are simply absent from the result.
get_subreddit_ids({'funny', 'WTF', 'math', 'fffffffuuuuuuuuuuuu', 'bestof'}, cur)

{183, 436}

In [16]:
# ids = get_appropriate_rule_ids(get_subreddit_ids({'funny', 'WTF', 'math', 'fffffffuuuuuuuuuuuu', 'bestof'},cur), cur)
# for idx in [1,2,3]:
#     get_rule(idx, cur)

In [20]:
# Example recommendation request for a crypto-leaning set of subreddits.
get_recomendations({ 'Bitcoin', 'Ethereum', 'SHIBArmy', 'ethtrader', 'Balls'}, cur)

[('CryptoCurrency',), ('cardano',)]

In [18]:
# Time one recommendation request (5 loops per run).
%timeit -n 5 get_recomendations({ 'Bitcoin', 'Ethereum', 'SHIBArmy', 'ethtrader'}, cur)

2.83 s ± 77 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)


In [19]:
# Sketch of a simpler rule-selection query (pseudo-SQL, not runnable: SQLite has no "ALL IN" operator)
# sql = '''
#     SELECT R.id
#     FROM rules as R
#     INNER JOIN antecedents as A ON R.id = A.rule_id
#     INNER JOIN Consequents as C ON R.id = C.rule_id
#     WHERE 
#     C.subreddit_id NOT IN ('''+','.join(["?"]*len(subreddit_ids))+''')
#     GROUP BY R.id
#     HAVING 
#     A.subreddit_id ALL IN ('''+','.join(["?"]*len(subreddit_ids))+''')
#     '''