In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy 
import pandas 
import collections
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

In [3]:
# Load Training.csv into a DataFrame.
# NOTE(review): `df` is not referenced by any later cell visible in this notebook — confirm it is still needed.
df = pandas.read_csv('Training.csv')

In [8]:
class FungusClassifier(object):
    """Infers a hidden variable and uses Bayesian classification to predict whether a fungus is
    edible or poisonous.

    Training rows are greedily grouped into clusters (the hidden variable). The
    per-cluster value counts, with add-one smoothing, give P(value | cluster) for
    every column, including the 'class' column. Calling the fitted object with
    observed features returns a probability for each value of 'class'.

    Attributes set by __init__:
        probabilities   -- DataFrame indexed by (column, value) with one column per
                           cluster; for each column name, every cluster's entries sum to 1
        prior           -- Series over clusters, proportional to the smoothed counts
                           held by each cluster
        edibility_prior -- Series over the values of the 'class' column, proportional
                           to their smoothed counts summed over all clusters
    """

    def __init__(self, filename, threshold=0.5):
        """Fits the classifier to the CSV data in filename.

        filename  -- path or file-like object accepted by pandas.read_csv; the data
                     must contain a 'class' column
        threshold -- a row joins an existing cluster only if its mean similarity to
                     that cluster exceeds this value (default 0.5)
        """
        data = pandas.read_csv(filename, index_col=False)
        clusters = self._find_clusters(data, threshold)

        # Every (column, value) pair seen in the data, grouped by column.
        index = []
        for column in data.columns:
            index.extend([(column, value) for value in data[column].unique()])

        # The +1.0 is add-one smoothing so that no probability is exactly zero.
        self.probabilities = pandas.DataFrame(
            {(key, value): [cluster[key].get(value, 0.0) + 1.0 for cluster in clusters]
             for (key, value) in index}).T

        self.prior = self.probabilities.sum(axis=0)
        self.prior /= self.prior.sum()
        self.edibility_prior = self.probabilities.loc['class'].sum(axis=1)
        self.edibility_prior /= self.edibility_prior.sum()

        # Normalise within each column name so every cluster holds P(value | cluster).
        # transform('sum') keeps the original (column, value) index on every pandas
        # version; groupby(...).apply() prepends the group key as an extra index
        # level on pandas >= 2.0, which breaks the .loc lookups in __call__.
        totals = self.probabilities.groupby(level=0).transform('sum')
        self.probabilities = self.probabilities / totals

    @staticmethod
    def _find_clusters(data, threshold=0.5):
        """Greedily assigns each row of data to its most similar cluster.

        Similarity is the mean, over all columns, of the fraction of the cluster's
        rows that share the row's value in that column. A row whose best similarity
        does not exceed threshold starts a new cluster. Returns a list of clusters,
        each mapping column -> value -> count (as floats).
        """
        clusters = []
        n_columns = data.shape[1]
        for (i, row) in data.iterrows():
            best = None
            sim = threshold
            for cluster in clusters:
                # .get avoids inserting zero counts into the cluster while scoring it.
                x = sum(cluster[key].get(value, 0.0) / sum(cluster[key].values())
                        for (key, value) in row.items()) / n_columns
                if x > sim:
                    best = cluster
                    sim = x
            if best is None:
                best = collections.defaultdict(lambda: collections.defaultdict(float))
                clusters.append(best)
                print(i+1,'rows analysed',len(clusters),'clusters found')
            for (key, value) in row.items():
                best[key][value] += 1.0
        return clusters

    def __call__(self, **kwargs):
        """Estimates the probability that a fungus is edible given the features in kwargs.

        kwargs maps column names to observed values. Returns a Series indexed by the
        values of the 'class' column that sums to 1. Raises KeyError if a
        (column, value) pair was not present in the training data.
        """
        # Posterior over clusters, updated one observed feature at a time.
        category = self.prior.copy()
        for (key, value) in kwargs.items():
            category *= self.probabilities.loc[(key, value)]
            category /= category.sum()
        # NOTE(review): P(class | cluster) already reflects the class frequencies, so
        # multiplying by edibility_prior weights the class prior a second time —
        # confirm this is intended. Behaviour is kept as in the original.
        result = self.edibility_prior * ((self.probabilities.loc['class'] * category).sum(axis=1))
        return result / result.sum()

    def test(self, filename):
        """Returns the estimated class probabilities for every row of the CSV in filename.

        The result has one column per value of 'class' plus a 'class' column holding
        the label read from the file, so predictions can be compared with the truth.
        """
        data = pandas.read_csv(filename, index_col=False)
        observables = [column for column in data.columns if column != 'class']
        results = pandas.DataFrame([self(**row.to_dict())
                                    for (i, row) in data[observables].iterrows()])
        results.loc[:, 'class'] = data['class']
        return results

In [9]:
# Fit the classifier on mushrooms.csv; the constructor prints one progress line
# each time a row starts a new cluster.
BBN=FungusClassifier('mushrooms.csv')

1 rows analysed 1 clusters found
15 rows analysed 2 clusters found
1817 rows analysed 3 clusters found
3985 rows analysed 4 clusters found
4024 rows analysed 5 clusters found
4077 rows analysed 6 clusters found
4195 rows analysed 7 clusters found
6039 rows analysed 8 clusters found
6416 rows analysed 9 clusters found
6913 rows analysed 10 clusters found
6968 rows analysed 11 clusters found


In [11]:
# Standalone run of the greedy clustering step (the same procedure that
# FungusClassifier.__init__ performs) so the raw per-cluster counts can be inspected.
data=pandas.read_csv('mushrooms.csv',index_col=False)
clusters=[]
for (i,row) in data.iterrows():
    best=-1    # index of the most similar existing cluster, -1 if none qualifies
    sim=0.5    # a row must beat this mean similarity to join an existing cluster
    for (j,cluster) in enumerate(clusters):
        # Mean, over all columns, of the fraction of the cluster's rows sharing this
        # row's value. Series.items() replaces iteritems(), removed in pandas 2.0.
        x=sum(cluster[key][value]/sum(cluster[key].values()) for (key,value) in row.items())/(data.shape[1])
        if x>sim:
            best=j
            sim=x
    if best==-1:
        # No cluster is similar enough: start a new one and point `best` at it
        # explicitly instead of relying on -1 meaning "last element".
        clusters.append(collections.defaultdict(lambda: collections.defaultdict(float)))
        best=len(clusters)-1
        print(i+1,'rows analysed',len(clusters),'clusters found')
    for (key,value) in row.items():
        clusters[best][key][value]+=1.0

1 rows analysed 1 clusters found
15 rows analysed 2 clusters found
1817 rows analysed 3 clusters found
3985 rows analysed 4 clusters found
4024 rows analysed 5 clusters found
4077 rows analysed 6 clusters found
4195 rows analysed 7 clusters found
6039 rows analysed 8 clusters found
6416 rows analysed 9 clusters found
6913 rows analysed 10 clusters found
6968 rows analysed 11 clusters found


In [12]:
# Display the raw cluster tables: each entry maps column -> value -> count.
# Zero counts appear for values that were looked up while scoring a row but never added.
clusters

[defaultdict(<function __main__.<lambda>()>,
             {'class': defaultdict(float, {'p': 426.0, 'e': 1576.0}),
              'cap-shape': defaultdict(float,
                          {'x': 1085.0,
                           'b': 257.0,
                           's': 1.0,
                           'f': 659.0,
                           'k': 0.0,
                           'c': 0.0}),
              'cap-surface': defaultdict(float,
                          {'s': 488.0, 'y': 1441.0, 'f': 73.0, 'g': 0.0}),
              'cap-color': defaultdict(float,
                          {'n': 512.0,
                           'y': 357.0,
                           'w': 447.0,
                           'g': 337.0,
                           'e': 289.0,
                           'p': 58.0,
                           'b': 2.0,
                           'u': 0.0,
                           'c': 0.0,
                           'r': 0.0}),
              'bruises': defaultdict(float, {'t': 1836.0

In [13]:
import tensorflow as tf

In [14]:
# Show the installed TensorFlow version.
# NOTE(review): tensorflow is not used by any other cell visible here — confirm the import is needed.
tf.__version__

'2.3.2'