In [9]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from sklearn.metrics import cohen_kappa_score, jaccard_score
import scipy.stats as stats

In [10]:
data = pd.read_csv("../data/aggregated.csv")
data.dropna(inplace=True)
data.set_index("Date", inplace=True)
data.head()

companies = ['INTC', 'NVDA', 'AMZN', 'IBM', 'GOOG', 'META', 'MSFT', 'TSLA', 'ORCL', 'AAPL']

In [11]:
df_binary = data.applymap(lambda x : 1 if x > 0 else 0)
df_binary.head()

Unnamed: 0_level_0,AAPL,AMZN,GOOG,IBM,INTC,META,ORCL,NVDA,MSFT,TSLA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6/8/2021,1,1,1,1,0,0,1,0,0,0
6/9/2021,0,1,0,1,0,0,0,0,0,0
6/10/2021,0,1,1,0,1,1,0,1,1,1
6/11/2021,1,0,0,1,1,0,1,1,0,0
6/14/2021,1,1,1,0,1,1,0,1,1,1


In [12]:
frequent_itemsets = apriori(df_binary, min_support=0.01, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

frequent_itemsets = frequent_itemsets[frequent_itemsets['length'] >= 2]
frequent_itemsets.sort_values(by='support', ascending=False, inplace=True)
frequent_itemsets.head(10)



Unnamed: 0,support,itemsets,length
17,0.440476,"(AAPL, MSFT)",2
11,0.424603,"(AAPL, GOOG)",2
16,0.410714,"(AAPL, NVDA)",2
32,0.404762,"(MSFT, GOOG)",2
14,0.402778,"(AAPL, META)",2
29,0.400794,"(META, GOOG)",2
10,0.39881,"(AAPL, AMZN)",2
47,0.392857,"(META, MSFT)",2
52,0.392857,"(NVDA, MSFT)",2
46,0.388889,"(META, NVDA)",2


In [13]:
# Generate association rules with support_only=True
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.1, support_only=True)

# Sort rules by confidence
rules = rules.sort_values('confidence', ascending=False)

# Display the top 5 and bottom 5 rules
print('Top 5 rules by confidence:')
print(rules.head(5))
print('\nBottom 5 rules by confidence:')
print(rules.tail(5))

Top 5 rules by confidence:
  antecedents consequents  antecedent support  consequent support   support  \
0      (AAPL)      (MSFT)                 NaN                 NaN  0.440476   
1      (MSFT)      (AAPL)                 NaN                 NaN  0.440476   
2      (AAPL)      (GOOG)                 NaN                 NaN  0.424603   
3      (GOOG)      (AAPL)                 NaN                 NaN  0.424603   
4      (AAPL)      (NVDA)                 NaN                 NaN  0.410714   

   confidence  lift  leverage  conviction  zhangs_metric  
0         NaN   NaN       NaN         NaN            NaN  
1         NaN   NaN       NaN         NaN            NaN  
2         NaN   NaN       NaN         NaN            NaN  
3         NaN   NaN       NaN         NaN            NaN  
4         NaN   NaN       NaN         NaN            NaN  

Bottom 5 rules by confidence:
      antecedents                                        consequents  \
56997      (MSFT)  (AAPL, IBM, ORCL, AMZN

In [14]:
# Compute the support of each itemset
support = df_binary[companies].mean()

# Generate association rules manually
rules = []
for index, row in frequent_itemsets.iterrows():
    itemset = list(row['itemsets'])
    for i in range(len(itemset)):
        antecedent = itemset[i]
        consequent = itemset[:i] + itemset[i+1:]
        support_antecedent = support[antecedent]
        support_itemset = row['support']
        confidence = support_itemset / support_antecedent
        rules.append((antecedent, consequent, confidence))

# Sort rules by confidence
rules.sort(key=lambda x: x[2], reverse=True)

# Display the top 5 and bottom 5 rules
print('Top 5 rules by confidence:')
for rule in rules[:5]:
    print(f'{rule[0]} -> {rule[1]}: {rule[2]}')
print('\nBottom 5 rules by confidence:')
for rule in rules[-5:]:
    print(f'{rule[0]} -> {rule[1]}: {rule[2]}')

Top 5 rules by confidence:
MSFT -> ['AAPL']: 0.850574712643678
AMZN -> ['AAPL']: 0.8271604938271605
GOOG -> ['AAPL']: 0.8075471698113207
AAPL -> ['MSFT']: 0.8043478260869564
NVDA -> ['AAPL']: 0.796153846153846

Bottom 5 rules by confidence:
ORCL -> ['AAPL', 'IBM', 'AMZN', 'INTC', 'MSFT', 'NVDA', 'GOOG', 'META', 'TSLA']: 0.19696969696969696
GOOG -> ['AAPL', 'IBM', 'ORCL', 'AMZN', 'INTC', 'MSFT', 'NVDA', 'META', 'TSLA']: 0.1962264150943396
META -> ['AAPL', 'IBM', 'ORCL', 'AMZN', 'INTC', 'MSFT', 'NVDA', 'GOOG', 'TSLA']: 0.1962264150943396
AAPL -> ['IBM', 'ORCL', 'AMZN', 'INTC', 'NVDA', 'GOOG', 'META', 'TSLA']: 0.19202898550724637
AAPL -> ['IBM', 'ORCL', 'AMZN', 'INTC', 'MSFT', 'NVDA', 'GOOG', 'META', 'TSLA']: 0.18840579710144925


In [15]:
# Compute measures of interest for the top 5 rules
measures = []
for rule in rules[:5]:
    antecedent, consequent, confidence = rule
    support_antecedent = support[antecedent]
    support_consequent = support[consequent[0]] if len(consequent) == 1 else 0
    support_itemset = frequent_itemsets[frequent_itemsets['itemsets'] == set([antecedent] + consequent)]['support'].values[0]
    lift = support_itemset / (support_antecedent * support_consequent) if support_consequent > 0 else 0
    leverage = support_itemset - support_antecedent * support_consequent
    conviction = (1 - support_consequent) / (1 - confidence) if confidence < 1 else 0
    measures.append((antecedent, consequent, support_itemset, confidence, lift, leverage, conviction))

# Display the measures of interest
for measure in measures:
    print(f'{measure[0]} -> {measure[1]}: support = {measure[2]}, confidence = {measure[3]}, lift = {measure[4]}, leverage = {measure[5]}, conviction = {measure[6]}')

MSFT -> ['AAPL']: support = 0.44047619047619047, confidence = 0.850574712643678, lift = 1.5532233883058466, leverage = 0.15688775510204073, conviction = 3.0274725274725243
AMZN -> ['AAPL']: support = 0.39880952380952384, confidence = 0.8271604938271605, lift = 1.5104669887278581, leverage = 0.13477891156462585, conviction = 2.61734693877551
GOOG -> ['AAPL']: support = 0.4246031746031746, confidence = 0.8075471698113207, lift = 1.4746513535684984, leverage = 0.1366685563114134, conviction = 2.3506069094304376
AAPL -> ['MSFT']: support = 0.44047619047619047, confidence = 0.8043478260869564, lift = 1.5532233883058466, leverage = 0.15688775510204073, conviction = 2.464285714285713
NVDA -> ['AAPL']: support = 0.4107142857142857, confidence = 0.796153846153846, lift = 1.4538461538461536, leverage = 0.12821239606953888, conviction = 2.2192273135669343


In [16]:
# Compute measures of interest for the top 5 rules
measures = []
for rule in rules[:5]:
    antecedent, consequent, _ = rule

    # Compute correlation
    correlation = df_binary[antecedent].corr(df_binary[consequent[0]])

    # Compute odds ratio
    contingency_table = pd.crosstab(df_binary[antecedent], df_binary[consequent[0]])
    odds_ratio, _ = stats.fisher_exact(contingency_table)

    # Compute kappa
    kappa = cohen_kappa_score(df_binary[antecedent], df_binary[consequent[0]])

    # Compute interest
    interest = support_itemset / (support_antecedent * support_consequent)

    # Compute Jaccard index
    jaccard = jaccard_score(df_binary[antecedent], df_binary[consequent[0]])

    measures.append((antecedent, consequent, correlation, odds_ratio, kappa, interest, jaccard))

# Display the measures of interest
for measure in measures:
    print(f'{measure[0]} -> {measure[1]}: correlation = {measure[2]}, odds ratio = {measure[3]}, kappa = {measure[4]}, interest = {measure[5]}, Jaccard index = {measure[6]}')

MSFT -> ['AAPL']: correlation = 0.6308190014901631, odds ratio = 19.923076923076923, kappa = 0.6296928327645052, interest = 1.4538461538461536, Jaccard index = 0.7047619047619048
AMZN -> ['AAPL']: correlation = 0.5419230988411418, odds ratio = 11.868571428571428, kappa = 0.5372881355932203, interest = 1.4538461538461536, Jaccard index = 0.6320754716981132
GOOG -> ['AAPL']: correlation = 0.5499026691868946, odds ratio = 11.979127134724857, kappa = 0.5493733383972654, interest = 1.4538461538461536, Jaccard index = 0.654434250764526
AAPL -> ['MSFT']: correlation = 0.6308190014901631, odds ratio = 19.923076923076923, kappa = 0.6296928327645052, interest = 1.4538461538461536, Jaccard index = 0.7047619047619048
NVDA -> ['AAPL']: correlation = 0.5154511794191912, odds ratio = 9.90566037735849, kappa = 0.5144048521607278, interest = 1.4538461538461536, Jaccard index = 0.6291793313069909
