In [1]:
import pandas as pd
import numpy as np
import statistics as st
import matplotlib.pyplot as plt
from scipy.stats import norm
df=pd.DataFrame()
df['tid']=['t1','t2','t3','t4','t5','t6','t7','t8','t9','t10']
df['itemset'] = ['CE','CEF','BE','ABCDF','AC','ACEF','ABC','AEF','ABCDE','DEF']
data=df['itemset']

 1.1 

In [2]:
def printSet(itemSets,minSupport):
    if itemSets != {}:
        df=pd.DataFrame()
        df['itemset']=[list(item) for item in itemSets.keys()]
        df['support']=itemSets.values()
        df['length'] = df['itemset'].apply(lambda x: len(x))
        df[''] = [False if val/10<minSupport else True for val in itemSets.values()]
        print(df)


In [3]:
from collections import defaultdict


def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet):
    combItem = {}
    for item in itemSet:
        for transaction in transactionList:
            if item.issubset(transaction):
                freqSet[item] += 1
                combItem.setdefault(item, 0)
                combItem[item] += 1
    printSet(combItem,minSupport)
    num_items=10
    combItem = {item:val for item,val in combItem.items() if val/num_items >= minSupport}
    return combItem


def joinSet(itemSet, length):
    return set([i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length])


def getItemSetTransactionList(dataset):
    transactionList = list( frozenset(transaction) for transaction in dataset)
    itemSet = set(frozenset([item]) for transaction in dataset for item in transaction)
    return itemSet, transactionList


def runApriori(data_iter, minSupport):
    itemSet, transactionList = getItemSetTransactionList(data_iter)
    print('Unique elements:',[list(item) for item in itemSet])
    freqSet = defaultdict(int)
    resultSet = {}
    print('\nStep 1')
    currentLSet = returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet)
    resultSet.update(currentLSet)
    currentLSet = set(currentLSet.keys())
    k = 2
    while currentLSet != set([]): 
        currentLSet = joinSet(currentLSet, k)
        print('\nStep %d'% k)
        currentCSet = returnItemsWithMinSupport(currentLSet, transactionList, minSupport, freqSet)
        resultSet.update(currentCSet)
        currentLSet = set(currentCSet)
        k = k + 1
    return resultSet

In [4]:
print('Data:', list(df['itemset']))
print('Minsup ratio:',5/10)
print('Minsup value:',5)
result = runApriori(df['itemset'],0.5)
print('\nResult:')
printSet(result,0.5)

Data: ['CE', 'CEF', 'BE', 'ABCDF', 'AC', 'ACEF', 'ABC', 'AEF', 'ABCDE', 'DEF']
Minsup ratio: 0.5
Minsup value: 5
Unique elements: [['E'], ['B'], ['D'], ['A'], ['C'], ['F']]

Step 1
  itemset  support  length       
0     [E]        7       1   True
1     [B]        4       1  False
2     [D]        3       1  False
3     [A]        6       1   True
4     [C]        7       1   True
5     [F]        5       1   True

Step 2
  itemset  support  length       
0  [F, A]        3       2  False
1  [F, C]        3       2  False
2  [E, C]        4       2  False
3  [F, E]        4       2  False
4  [A, C]        5       2   True
5  [E, A]        3       2  False

Step 3

Result:
  itemset  support  length      
0     [E]        7       1  True
1     [A]        6       1  True
2     [C]        7       1  True
3     [F]        5       1  True
4  [A, C]        5       2  True


1.2.

In [5]:
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(df['itemset']).transform(df['itemset'])
df = pd.DataFrame(te_ary, columns=te.columns_)

In [6]:
from mlxtend.frequent_patterns import apriori
results = apriori(df, min_support=0.5, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))
print(results)

   support itemsets  length
0      0.6      (A)       1
1      0.7      (C)       1
2      0.7      (E)       1
3      0.5      (F)       1
4      0.5   (A, C)       2


1.3

In [7]:
def getSortingUnique(dataset):
    itemSet = set(item for transaction in dataset for item in transaction)
    combItem = {}
    for item in itemSet:
        for transaction in dataset:
            if transaction.find(item)>-1:
                combItem.setdefault(item, 0)
                combItem[item] += 1
    sorted_dict={}
    sorted_keys=sorted(combItem, key = combItem.get,reverse=True)
    for key in sorted_keys:
        sorted_dict[key]=combItem[key]
    return sorted_dict

In [8]:
def getSortingData(dataset,sorted_dict):
    sort_data=[]
    for transaction in dataset: 
        sort_trans=[]
        for item in sorted_dict.keys():
             if transaction.find(item)>-1:
                    sort_trans.append(item)
        sort_data.append(sort_trans)
    return sort_data
           

In [9]:
min_sup=0.3
from collections import namedtuple
#path=namedtuple('path','start end')
class FPNode(object):
    def __init__(self,names,frequency=1):
        self.name=names
        self.frequency=frequency
        self.ancestors=None
        self.children={}
        self.new=True
    def changeFreq(self):
        self.frequency+=1
    def addNode(self, node):
        if not node.name in node.children:
            self.children[node.name] = node
            node.ancestors = self
    def findNode(self, item):
        try:
            return self.children[item]
        except KeyError:
            return None
    def disp(self, ind=1):
        if self.name != None:
            if self.ancestors and self.ancestors.new==True:
                print(" (", end="")
                self.ancestors.new=False
            print(self.name, end="")
            print(":", end="")
            print(self.frequency, end="")
            print(":", end="")
            parent = self.ancestors
            p = ''
            while parent != None and parent.name != None:
                p = p + parent.name
                parent = parent.ancestors
            print(p[::-1], end="")
            if self.children == {}:
                print("(", end="")
        for child in self.children.values():
            child.disp(ind+1)
            print("),", end="")
class FPTree:
    def __init__(self):
        self.head = FPNode(None)
        self.uniq={}
    def addNode(self, transaction):
        t_node = self.head
       # head=self.head
        for item in transaction:
            n_node = t_node.findNode(item)
            if n_node:
                n_node.changeFreq()
            else:
                n_node = FPNode(item)
                t_node.addNode(n_node)
                #p=path(head,n_node)
                self.uniq.setdefault(item,[])
                self.uniq[item].append(n_node)
            t_node = n_node   
    def findNode(s_node):
        for node in self:
            if node.name == s_node:
                node.frequency+=1
    def getAncestors(self):
        for current_node in self:
            pass
        if current_node!=self.head:
            return current_node.ancestors+current_node.name
        return ''
    def disp(self):
        self.head.disp()
    def findAncetsetors(self,item):
        if self.name==item:
            return 
    def findSuffixPrefix(self):
        for item,arr_path in self.uniq.items():
            prefix={}
            for path in arr_path:
                node = path
                item=node.name
                frequency=node.frequency
                name_path=''
                node=node.ancestors
                prefix.setdefault(item,[])
                while node is not None and node.name is not None:
                    name_path+=node.name
                    node=node.ancestors
                if name_path!='':
                    prefix[item].append((name_path[::-1],frequency))
            print('sufix: %s, prefix: %s, empty prefix list: %s'%(item,prefix[item],True if prefix[item]== [] else False))
            makeCPTree(prefix[item],item)
    def findPrefixWithMinsup(self,frequency):
        for item,node in self.uniq.items():
            support=node[0].frequency
            if support/10>=min_sup:
                return(item,support)
             
def makeCPTree(prefix_list,suffix):
    tree=FPTree()
    node_sup={}
    for prefix in prefix_list: 
        path=prefix[0]
        frequency=prefix[1]
        for i in range(frequency):    
            tree.addNode(path)
    node_sup.setdefault(suffix,[])
    node_sup[suffix].append(tree.findPrefixWithMinsup(frequency))
    print(node_sup)
    tree.disp()
    print('\n')
        

In [10]:
llist.findSuffixPrefix()

NameError: name 'llist' is not defined

In [None]:
llist=FPTree()
for tr in b:
    llist.addNode(tr)
llist.disp()

In [None]:
a = getSortingUnique(data)
a

In [None]:
b=getSortingData(data,a)
b

1.4

In [None]:
from mlxtend.frequent_patterns import fpgrowth
results = fpgrowth(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))
print(results)

### Задание 2

In [None]:
dataset=['CDHJ','ABDEFHJ','AEGJ','ACEFG','CFGI','CDEHIJ','BEFGJ','ADHI','ABF','DHI']

In [None]:
print('Data:', dataset)
print('Minsup ratio:',3/10)
print('Minsup value:',3)

2.1

In [None]:
itemSet,_ = getItemSetTransactionList(dataset)
print('Simple set count:',len(itemSet))

2.2

In [None]:
symbol_map={'A':'A','B':'B','C':'L','K':'L','G':'L','D':'K','E':'K','F':'K','H':'N','M':'N','I':'M','J':'M'}
levels=3
while(levels>1):
    new_data=[]
    for transaction in dataset:
        str=''
        for item, val in symbol_map.items():
            if transaction.find(item)>-1:
                str=str+val
        new_data.append(str)
    levels-=1
    dataset=new_data  
result = runApriori(new_data,0.3)

In [None]:
print('Symbol map:',symbol_map)
print('R data:',new_data)
print('\nResult:')
printSet(result,0.3)