In [1]:
import pandas as pd
# Categorical columns kept from both fund files.
select_columns = [
    'fund_category',
    'fund_family',
    'exchange_code',
    'investment_type',
    'size_type',
]

# ETFs: read the whole file, then keep only the selected columns.
df = pd.read_csv('ETFs.csv')
ETFS = pd.DataFrame(data=df, columns=select_columns)

# Mutual funds: same treatment; `df` is left bound to the raw mutual-funds frame.
df = pd.read_csv('MutualFunds.csv')
MutualFunds = pd.DataFrame(data=df, columns=select_columns)

In [2]:
ETFS.shape

(2310, 5)

In [3]:
MutualFunds.shape

(23783, 5)

In [4]:
import logging

# Let the root logger emit everything from DEBUG upwards.
logging.basicConfig(level=logging.DEBUG)

try:
    # Placeholder for an operation that may fail; dividing by zero always raises.
    result = 10 / 0
except Exception as err:
    # One-line summary at ERROR level ...
    logging.error("An error occurred: %s", err)
    # ... followed by the full traceback at DEBUG level.
    logging.debug("Exception details:", exc_info=True)

ERROR:root:An error occurred: division by zero
DEBUG:root:Exception details:
Traceback (most recent call last):
  File "C:\Users\venus\AppData\Local\Temp\ipykernel_612\4044794201.py", line 7, in <module>
    result = 10 / 0
             ~~~^~~
ZeroDivisionError: division by zero


In [5]:
import pandas as pd
from collections import defaultdict
from itertools import combinations
from datetime import datetime
import logging
from tqdm import tqdm

# Set up basic configuration for logging.
# basicConfig() silently does nothing when the root logger already has a
# handler (for example from an earlier basicConfig call in this kernel), so
# force=True is needed for the timestamped format below to actually apply.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

class Apriori:
    """Mine frequent itemsets from selected columns of a CSV file.

    Every CSV row becomes one transaction: the set of non-missing values found
    in the selected columns. Because a transaction is a plain set of values,
    the same value appearing in two columns of one row counts as one item.

    Attributes:
        file: Input passed unchanged to ``pandas.read_csv``.
        columns: Names of the columns whose values form the transactions.
        transactions: List of sets, one per CSV row; filled by ``LoadData``.
        k_itemset: Occurrence count of each single item, keyed by a
            one-element ``frozenset``; rebuilt by ``LoadData``.
        GlobalFreqItem: Result of the most recent ``apriori`` run, keyed by
            itemset size (see ``apriori`` for the value layout).
    """

    # Columns used when the caller does not pass its own list.
    DEFAULT_COLUMNS = ('fund_category', 'fund_family', 'exchange_code', 'investment_type', 'size_type')

    def __init__(self, filename, columns=None):
        """Remember the input file and the columns to mine.

        Args:
            filename: CSV file to read when ``LoadData`` runs.
            columns: Optional iterable of column names. Defaults to
                ``DEFAULT_COLUMNS``.
        """
        self.file = filename
        self.columns = list(self.DEFAULT_COLUMNS) if columns is None else list(columns)
        self.transactions = list()
        self.k_itemset = defaultdict(int)
        self.GlobalFreqItem = {}

    def LoadData(self):
        """Read the CSV and rebuild ``transactions`` and the single-item counts."""
        df = pd.read_csv(self.file)
        filtered_df = pd.DataFrame(data=df, columns=self.columns)

        # One set per row, skipping missing values. itertuples yields plain
        # tuples and avoids constructing a Series for every row.
        self.transactions = [
            {value for value in row if pd.notna(value)}
            for row in filtered_df.itertuples(index=False, name=None)
        ]

        # Start from an empty counter so that loading again (e.g. a second
        # apriori() call on this instance) does not add to the old counts.
        self.k_itemset = defaultdict(int)
        for transaction in self.transactions:
            for item in transaction:
                self.k_itemset[frozenset([item])] += 1
        logging.info(f"Loaded and processed {len(self.transactions)} transactions.")

    def Count_Support(self, candidates, k):
        """Count in how many transactions each candidate k-itemset occurs.

        Args:
            candidates: Collection of ``frozenset`` candidates supporting ``in``.
            k: Size of the itemsets being counted.

        Returns:
            ``defaultdict(int)`` mapping each candidate seen at least once to
            its number of occurrences.
        """
        k_support = defaultdict(int)
        for transaction in tqdm(self.transactions, desc=f"Counting support for {k}-itemsets"):
            # A transaction with fewer than k items cannot contain a k-itemset.
            if len(transaction) >= k:
                for subset in combinations(transaction, k):
                    subset = frozenset(subset)
                    if subset in candidates:
                        k_support[subset] += 1
        return k_support

    def freq_item_dict(self, item_dict, support):
        """Return the entries of ``item_dict`` whose count is at least ``support``."""
        return {k: v for k, v in item_dict.items() if v >= support}

    def apriori(self, minFreq):
        """Load the data and compute all frequent itemsets.

        Args:
            minFreq: Minimum fraction of transactions an itemset must occur in.

        Returns:
            Tuple ``(GlobalFreqItem, time_elapsed)``. ``GlobalFreqItem[1]`` is
            a dict ``{frozenset: count}``; every level ``K >= 2`` is a list of
            ``(frozenset, count)`` pairs sorted by descending count. The last
            level produced is always empty. ``time_elapsed`` is a
            ``datetime.timedelta`` covering loading and mining.
        """
        time_start = datetime.now()
        self.LoadData()
        # Drop levels left over from an earlier run; a run that stops at a
        # lower K must not report the previous run's larger itemsets.
        self.GlobalFreqItem = {}

        minSup = minFreq * len(self.transactions)
        Lk = self.freq_item_dict(self.k_itemset, minSup)
        K = 1
        self.GlobalFreqItem[K] = Lk
        logging.info(f"Initial frequent itemsets: {len(Lk)} found.")
        while len(Lk) > 0:
            K += 1
            # Candidates are unions of two frequent (K-1)-itemsets that differ
            # in exactly one item, i.e. whose union has K items.
            candidates = set()
            itemsets = list(Lk.keys())
            for i in tqdm(range(len(itemsets)), desc=f"Generating candidates for {K}-itemsets"):
                for j in range(i + 1, len(itemsets)):
                    union_set = itemsets[i].union(itemsets[j])
                    if len(union_set) == K:
                        candidates.add(union_set)

            C_k = self.Count_Support(candidates, K)
            Lk = self.freq_item_dict(C_k, minSup)
            self.GlobalFreqItem[K] = sorted(Lk.items(), key=lambda x: x[1], reverse=True)
            logging.info(f"{K}-itemsets: {len(Lk)} frequent itemsets found.")

        time_elapsed = datetime.now() - time_start
        logging.info(f"Apriori algorithm completed in {time_elapsed}.")
        return self.GlobalFreqItem, time_elapsed
    
    
    

In [6]:
# NOTE(review): `time` is assigned here but not read in any visible cell — confirm it is still needed.
time=0
# Mine frequent itemsets from ETFs.csv; an itemset must occur in at least 10% of the rows.
ETFs_model = Apriori('ETFs.csv')
# apriori() returns (frequent itemsets keyed by itemset size, elapsed time).
ETFsItems,t = ETFs_model.apriori(minFreq=0.1)

INFO:root:Loaded and processed 2310 transactions.
INFO:root:Initial frequent itemsets: 10 found.
Generating candidates for 2-itemsets: 100%|██████████| 10/10 [00:00<?, ?it/s]
Counting support for 2-itemsets: 100%|██████████| 2310/2310 [00:00<00:00, 362823.63it/s]
INFO:root:2-itemsets: 8 frequent itemsets found.
Generating candidates for 3-itemsets: 100%|██████████| 8/8 [00:00<?, ?it/s]
Counting support for 3-itemsets: 100%|██████████| 2310/2310 [00:00<00:00, 371291.14it/s]
INFO:root:3-itemsets: 1 frequent itemsets found.
Generating candidates for 4-itemsets: 100%|██████████| 1/1 [00:00<?, ?it/s]
Counting support for 4-itemsets: 100%|██████████| 2310/2310 [00:00<00:00, 842727.86it/s]
INFO:root:4-itemsets: 0 frequent itemsets found.
INFO:root:Apriori algorithm completed in 0:00:00.371272.


In [7]:
# NOTE(review): `time` is assigned here but not read in any visible cell — confirm it is still needed.
time=0
# Mine frequent itemsets from MutualFunds.csv; an itemset must occur in at least 10% of the rows.
MutualFunds_model = Apriori('MutualFunds.csv')
# apriori() returns (frequent itemsets keyed by itemset size, elapsed time); `t` is rebound here.
MutualFundsItems,t = MutualFunds_model.apriori(minFreq=0.1)

INFO:root:Loaded and processed 23783 transactions.
INFO:root:Initial frequent itemsets: 7 found.
Generating candidates for 2-itemsets: 100%|██████████| 7/7 [00:00<?, ?it/s]
Counting support for 2-itemsets: 100%|██████████| 23783/23783 [00:00<00:00, 293325.14it/s]
INFO:root:2-itemsets: 10 frequent itemsets found.
Generating candidates for 3-itemsets: 100%|██████████| 10/10 [00:00<?, ?it/s]
Counting support for 3-itemsets: 100%|██████████| 23783/23783 [00:00<00:00, 352236.88it/s]
INFO:root:3-itemsets: 4 frequent itemsets found.
Generating candidates for 4-itemsets: 100%|██████████| 4/4 [00:00<?, ?it/s]
Counting support for 4-itemsets: 100%|██████████| 23783/23783 [00:00<00:00, 597588.94it/s]
INFO:root:4-itemsets: 0 frequent itemsets found.
INFO:root:Apriori algorithm completed in 0:00:03.287406.


## Frequent Itemsets for ETFs.csv

In [8]:
ETFsItems[3]

[(frozenset({'Blend', 'Large', 'PCX'}), 293)]

In [9]:
ETFsItems[2]

[(frozenset({'Large', 'PCX'}), 592),
 (frozenset({'Blend', 'Large'}), 545),
 (frozenset({'Blend', 'PCX'}), 448),
 (frozenset({'PCX', 'Value'}), 332),
 (frozenset({'American Century Investments', 'PCX'}), 281),
 (frozenset({'Medium', 'PCX'}), 275),
 (frozenset({'BTS', 'Large'}), 270),
 (frozenset({'Large', 'Value'}), 264)]

In [10]:
## Support

In [11]:
# Relative support of the top frequent 3-itemset: occurrences / number of
# transactions, taken from the model instead of a hard-coded row count.
ETFsItems[3][0][1] / len(ETFs_model.transactions)

0.12683982683982684

In [12]:
# Relative support of every frequent 2-itemset. Iterating over the result
# itself (rather than a fixed range) and dividing by the model's transaction
# count keeps this cell correct if the data or the threshold changes.
for _itemset, count in ETFsItems[2]:
    print(count / len(ETFs_model.transactions))

0.25627705627705627
0.23593073593073594
0.19393939393939394
0.14372294372294372
0.12164502164502164
0.11904761904761904
0.11688311688311688
0.11428571428571428


## Frequent Itemsets for MutualFunds.csv

In [13]:
MutualFundsItems[3]

[(frozenset({'Blend', 'Large', 'NAS'}), 7457),
 (frozenset({'Large', 'NAS', 'Value'}), 2876),
 (frozenset({'Growth', 'Large', 'NAS'}), 2866),
 (frozenset({'Blend', 'Medium', 'NAS'}), 2423)]

In [14]:
MutualFundsItems[2]

[(frozenset({'Large', 'NAS'}), 13199),
 (frozenset({'Blend', 'NAS'}), 10902),
 (frozenset({'Blend', 'Large'}), 7457),
 (frozenset({'NAS', 'Value'}), 6847),
 (frozenset({'Medium', 'NAS'}), 5582),
 (frozenset({'Growth', 'NAS'}), 4925),
 (frozenset({'NAS', 'Small'}), 3893),
 (frozenset({'Large', 'Value'}), 2876),
 (frozenset({'Growth', 'Large'}), 2866),
 (frozenset({'Blend', 'Medium'}), 2423)]

In [15]:
## Support

In [16]:
# Relative support of every frequent 3-itemset. Iterating over the result
# itself (rather than a fixed range) and dividing by the model's transaction
# count keeps this cell correct if the data or the threshold changes.
for _itemset, count in MutualFundsItems[3]:
    print(count / len(MutualFunds_model.transactions))

0.3135432872219653
0.12092671235756633
0.12050624395576673
0.10187949375604423


In [17]:
# Relative support of every frequent 2-itemset. Iterating over the result
# itself (rather than a fixed range) and dividing by the model's transaction
# count keeps this cell correct if the data or the threshold changes.
for _itemset, count in MutualFundsItems[2]:
    print(count / len(MutualFunds_model.transactions))

0.5549762435352983
0.4583946516419291
0.3135432872219653
0.28789471471218936
0.2347054618845394
0.20708068788630535
0.16368834882058614
0.12092671235756633
0.12050624395576673
0.10187949375604423


In [18]:
# For each ETF column, the distinct values ordered from most to least frequent.
list_ETFS = [ETFS[column].value_counts().index for column in ETFS.columns]

In [19]:
# For each mutual-fund column, the distinct values ordered from most to least frequent.
list_MutualFunds = [
    MutualFunds[column].value_counts().index for column in MutualFunds.columns
]

In [20]:
ETFS

Unnamed: 0,fund_category,fund_family,exchange_code,investment_type,size_type
0,,DWS,PCX,,
1,Foreign Large Growth,Virtus,NGM,Blend,Large
2,Pacific/Asia ex-Japan Stk,American Century Investments,NGM,Blend,Large
3,Large Value,Thrivent Funds,PCX,Value,Large
4,Miscellaneous Sector,American Century Investments,PCX,Growth,Medium
...,...,...,...,...,...
2305,Pacific/Asia ex-Japan Stk,CBOE Vest,PCX,Blend,Large
2306,Miscellaneous Region,Buffalo,PCX,Value,Large
2307,Miscellaneous Region,Buffalo,PCX,Blend,Large
2308,Miscellaneous Region,Buffalo,PCX,Blend,Large


In [21]:
list_ETFS[0]

Index(['Large Blend', 'Large Value', 'Trading--Leveraged Equity',
       'Trading--Inverse Equity', 'Diversified Emerging Mkts', 'Large Growth',
       'Foreign Large Blend', 'Technology', 'Small Blend',
       'Miscellaneous Region', 'High Yield Bond', 'Mid-Cap Blend',
       'China Region', 'Health', 'Foreign Large Value', 'Ultrashort Bond',
       'Natural Resources', 'Corporate Bond', 'Mid-Cap Value',
       'Intermediate-Term Bond', 'Short-Term Bond', 'Mid-Cap Growth',
       'Real Estate', 'Miscellaneous Sector', 'Consumer Cyclical',
       'Europe Stock', 'Financial', 'Energy Limited Partnership',
       'Small Value', 'Emerging Markets Bond', 'Commodities Broad Basket',
       'Muni National Interm', 'Industrials', 'Small Growth',
       'Trading--Miscellaneous', 'Foreign Large Growth', 'Multisector Bond',
       'Equity Energy', 'Preferred Stock', 'Inflation-Protected Bond',
       'Intermediate Government', 'Long-Short Equity', 'Japan Stock',
       'Tactical Allocation', 'Lo

In [22]:
list_ETFS[1]

Index(['American Century Investments', 'BlackRock', 'AllianceBernstein',
       'Invesco', 'American Funds', 'Virtus', 'American Beacon', 'Alger',
       'MFS', 'Baillie Gifford Funds',
       ...
       'Argent', 'Berkshire', 'Biondo Investment Advisor', 'BFS',
       'American Trust Company', 'Barrett', 'Soundwatch Capital',
       'Alpha Fiduciary', 'Beech Hill', 'Adirondack Funds'],
      dtype='object', name='fund_family', length=150)

In [23]:
list_ETFS[2]

Index(['PCX', 'BTS', 'NGM', 'PNK'], dtype='object', name='exchange_code')

In [24]:
list_ETFS[3]

Index(['Blend', 'Value', 'Growth'], dtype='object', name='investment_type')

In [25]:
list_ETFS[4]

Index(['Large', 'Medium', 'Small'], dtype='object', name='size_type')