### Week 2 task by Denis Khryashchev

In [1]:
# Importing the Apriori implementation of mlxtend library
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import OnehotTransactions
import numpy as np
import pandas as pd
import csv

# Let pandas show up to 150 characters per column so long itemsets are not truncated in the output
pd.options.display.max_colwidth = 150

In [2]:
# Loading the dataset: every CSV row becomes one list of string values in `mushrooms`
# The data were accessed from https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/
#filename = "agaricus-lepiota.data"
filename = "expanded.csv"
# 'rb' is the mode the Python 2 csv module expects for the file it reads
with open(filename, 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # Materialise all rows while the file is still open
    mushrooms = list(reader)

In [3]:
# Field names from https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names
# The position of a name in this list is the position of its value in each record.
colnames = [
    'classes',
    # cap
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises?', 'odor',
    # gill
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    # stalk
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    # veil and ring
    'veil-type', 'veil-color', 'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

In [4]:
# Pairing example: each field name next to the matching value of the first record
list(zip(colnames, mushrooms[0]))

[('classes', 'EDIBLE'),
 ('cap-shape', 'CONVEX'),
 ('cap-surface', 'SMOOTH'),
 ('cap-color', 'WHITE'),
 ('bruises?', 'BRUISES'),
 ('odor', 'ALMOND'),
 ('gill-attachment', 'FREE'),
 ('gill-spacing', 'CROWDED'),
 ('gill-size', 'NARROW'),
 ('gill-color', 'WHITE'),
 ('stalk-shape', 'TAPERING'),
 ('stalk-root', 'BULBOUS'),
 ('stalk-surface-above-ring', 'SMOOTH'),
 ('stalk-surface-below-ring', 'SMOOTH'),
 ('stalk-color-above-ring', 'WHITE'),
 ('stalk-color-below-ring', 'WHITE'),
 ('veil-type', 'PARTIAL'),
 ('veil-color', 'WHITE'),
 ('ring-number', 'ONE'),
 ('ring-type', 'PENDANT'),
 ('spore-print-color', 'PURPLE'),
 ('population', 'SEVERAL'),
 ('habitat', 'WOODS')]

In [5]:
# Prefix every value with the index of its column, so that equal values coming from
# different columns (e.g. WHITE in several color columns) stay distinct items.
# NOTE: this rebinds `mushrooms`; re-running this cell without re-running the load
# cell first would prefix the already-prefixed values a second time.
mushrooms = list(
    [str(position) + ':' + value for position, value in enumerate(record)]
    for record in mushrooms
)

In [6]:
# First 7 encoded mushrooms, one record per line
for mushroom in mushrooms[:7]:
    print(mushroom)

['0:EDIBLE', '1:CONVEX', '2:SMOOTH', '3:WHITE', '4:BRUISES', '5:ALMOND', '6:FREE', '7:CROWDED', '8:NARROW', '9:WHITE', '10:TAPERING', '11:BULBOUS', '12:SMOOTH', '13:SMOOTH', '14:WHITE', '15:WHITE', '16:PARTIAL', '17:WHITE', '18:ONE', '19:PENDANT', '20:PURPLE', '21:SEVERAL', '22:WOODS']
['0:EDIBLE', '1:CONVEX', '2:SMOOTH', '3:WHITE', '4:BRUISES', '5:ALMOND', '6:FREE', '7:CROWDED', '8:NARROW', '9:WHITE', '10:TAPERING', '11:BULBOUS', '12:SMOOTH', '13:SMOOTH', '14:WHITE', '15:WHITE', '16:PARTIAL', '17:WHITE', '18:ONE', '19:PENDANT', '20:BROWN', '21:SEVERAL', '22:WOODS']
['0:EDIBLE', '1:CONVEX', '2:SMOOTH', '3:WHITE', '4:BRUISES', '5:ALMOND', '6:FREE', '7:CROWDED', '8:NARROW', '9:PINK', '10:TAPERING', '11:BULBOUS', '12:SMOOTH', '13:SMOOTH', '14:WHITE', '15:WHITE', '16:PARTIAL', '17:WHITE', '18:ONE', '19:PENDANT', '20:PURPLE', '21:SEVERAL', '22:WOODS']
['0:EDIBLE', '1:CONVEX', '2:SMOOTH', '3:WHITE', '4:BRUISES', '5:ALMOND', '6:FREE', '7:CROWDED', '8:NARROW', '9:PINK', '10:TAPERING', '11:BULB

In [7]:
# One-hot encoding: fit the encoder on the records, then turn them into an array
# with one column per distinct "index:VALUE" item
oht = OnehotTransactions()
oht.fit(mushrooms)
oht_ary = oht.transform(mushrooms)
# Wrap the array in a DataFrame labelled with the item names found by the encoder
encoded_shrooms = pd.DataFrame(data=oht_ary, columns=oht.columns_)
encoded_shrooms.head(7)

Unnamed: 0,0:EDIBLE,0:POISONOUS,10:ENLARGING,10:TAPERING,11:?,11:BULBOUS,11:CLUB,11:EQUAL,11:ROOTED,12:FIBROUS,...,9:BUFF,9:CHOCOLATE,9:GRAY,9:GREEN,9:ORANGE,9:PINK,9:PURPLE,9:RED,9:WHITE,9:YELLOW
0,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
# Listing the one-hot item columns ("index:VALUE") produced by the encoding
print(encoded_shrooms.columns)

Index([u'0:EDIBLE', u'0:POISONOUS', u'10:ENLARGING', u'10:TAPERING', u'11:?',
       u'11:BULBOUS', u'11:CLUB', u'11:EQUAL', u'11:ROOTED', u'12:FIBROUS',
       ...
       u'9:BUFF', u'9:CHOCOLATE', u'9:GRAY', u'9:GREEN', u'9:ORANGE',
       u'9:PINK', u'9:PURPLE', u'9:RED', u'9:WHITE', u'9:YELLOW'],
      dtype='object', length=119)


In [9]:
# Executing the apriori algorithm with a minimum support of 0.27 and frequent itemsets
# of at most 9 items. The thresholds are named so the description and the call stay in sync.
MIN_SUPPORT = 0.27
MAX_ITEMSET_LEN = 9
frequent = apriori(encoded_shrooms, min_support=MIN_SUPPORT, max_len=MAX_ITEMSET_LEN, use_colnames=True)
# Helper columns for filtering: number of items in the itemset, and whether it contains the EDIBLE class item
frequent['size'] = frequent['itemsets'].apply(len)
frequent['isEdible'] = frequent['itemsets'].apply(lambda items: '0:EDIBLE' in items)
frequent.head(10)

Unnamed: 0,support,itemsets,size,isEdible
0,0.53327,[0:EDIBLE],1,True
1,0.46673,[0:POISONOUS],1,False
2,0.422053,[10:ENLARGING],1,False
3,0.577947,[10:TAPERING],1,False
4,0.294677,[11:?],1,False
5,0.451521,[11:BULBOUS],1,False
6,0.28327,[12:SILKY],1,False
7,0.631654,[12:SMOOTH],1,False
8,0.273764,[13:SILKY],1,False
9,0.603137,[13:SMOOTH],1,False


### 1. Some itemsets of size 9 that include EDIBLE mushrooms

In [13]:
# Frequent itemsets that have exactly 9 items and contain the EDIBLE class item
frequent[(frequent['size'] == 9) & frequent['isEdible']]

Unnamed: 0,support,itemsets,size,isEdible
3717,0.277567,"[0:EDIBLE, 12:SMOOTH, 13:SMOOTH, 16:PARTIAL, 17:WHITE, 18:ONE, 19:PENDANT, 4:BRUISES, 6:FREE]",9,True
3718,0.277567,"[0:EDIBLE, 12:SMOOTH, 13:SMOOTH, 16:PARTIAL, 17:WHITE, 18:ONE, 19:PENDANT, 6:FREE, 7:CLOSE]",9,True
3719,0.292776,"[0:EDIBLE, 12:SMOOTH, 13:SMOOTH, 16:PARTIAL, 17:WHITE, 4:BRUISES, 6:FREE, 7:CLOSE, 8:BROAD]",9,True
3720,0.288973,"[0:EDIBLE, 12:SMOOTH, 16:PARTIAL, 17:WHITE, 18:ONE, 19:PENDANT, 4:BRUISES, 6:FREE, 7:CLOSE]",9,True
3721,0.288973,"[0:EDIBLE, 12:SMOOTH, 16:PARTIAL, 17:WHITE, 18:ONE, 19:PENDANT, 4:BRUISES, 6:FREE, 8:BROAD]",9,True
3722,0.288973,"[0:EDIBLE, 12:SMOOTH, 16:PARTIAL, 17:WHITE, 18:ONE, 19:PENDANT, 4:BRUISES, 7:CLOSE, 8:BROAD]",9,True
3723,0.288973,"[0:EDIBLE, 12:SMOOTH, 16:PARTIAL, 17:WHITE, 18:ONE, 19:PENDANT, 6:FREE, 7:CLOSE, 8:BROAD]",9,True
3724,0.288973,"[0:EDIBLE, 12:SMOOTH, 16:PARTIAL, 17:WHITE, 18:ONE, 4:BRUISES, 6:FREE, 7:CLOSE, 8:BROAD]",9,True
3725,0.292776,"[0:EDIBLE, 12:SMOOTH, 16:PARTIAL, 17:WHITE, 19:PENDANT, 4:BRUISES, 6:FREE, 7:CLOSE, 8:BROAD]",9,True
3726,0.288973,"[0:EDIBLE, 12:SMOOTH, 16:PARTIAL, 18:ONE, 19:PENDANT, 4:BRUISES, 6:FREE, 7:CLOSE, 8:BROAD]",9,True


In [11]:
# Collecting indices for queries: the position of a field in `colnames` is the numeric
# prefix its values carry in the encoded items (e.g. 'veil-type' -> the 16 in '16:PARTIAL').
zipped = list(enumerate(colnames))  # (index, field name) pairs
# list.index returns the position of the first match, replacing a manual scan per field
veiltype = colnames.index('veil-type')
veilcolor = colnames.index('veil-color')
gillsize = colnames.index('gill-size')
gillspacing = colnames.index('gill-spacing')

In [12]:
# Keep only the frequent itemsets that contain the EDIBLE class item
edibles = frequent[frequent['isEdible']]
edibles.head(7)

Unnamed: 0,support,itemsets,size,isEdible
0,0.53327,[0:EDIBLE],1,True
34,0.338403,"[0:EDIBLE, 10:TAPERING]",2,True
35,0.449144,"[0:EDIBLE, 12:SMOOTH]",2,True
36,0.420627,"[0:EDIBLE, 13:SMOOTH]",2,True
37,0.360266,"[0:EDIBLE, 14:WHITE]",2,True
38,0.351711,"[0:EDIBLE, 15:WHITE]",2,True
39,0.53327,"[0:EDIBLE, 16:PARTIAL]",2,True


### 2. Checking if certain mushrooms are edible
#### a. A partial white-veiled mushroom with broad, closely spaced gills.

#### b. A partial white-veiled mushroom with closely spaced gills, a single ring and a silky surface below the ring.

#### c. A mushroom with smooth surface above the ring.