In [158]:
'''1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d'''

columns = ['edible', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor', 'gill-attachment', 
           'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 
           'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 
           'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


In [159]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold
import sklearn.metrics as metrics

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

In [160]:
#read data
ds = pd.read_csv('data', sep=',', names=columns)
ds[:3]

Unnamed: 0,edible,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


In [161]:
#prepare not categorical data
ds = ds.replace({'edible':'e'},1)
ds = ds.replace({'edible':'p'},0)

ds = ds.replace({'ring-number':'n'},0)
ds = ds.replace({'ring-number':'o'},1)
ds = ds.replace({'ring-number':'t'},2)

ds = ds.replace({'bruises?':'t'},1)
ds = ds.replace({'bruises?':'f'},0)

In [162]:
# Vectorize categorical data
ds_cat = ds.drop(['edible', 'ring-number', 'bruises?'],axis=1) #drop not-cat columns
ds_cat = ds_cat.T.to_dict().values() #prepare dictionary
vectorizer = DictVectorizer(sparse = False)
ds_cat_vec = vectorizer.fit_transform(ds_cat)
ds_cat_vec.shape

(8124, 112)

In [163]:
# Prepare and export a final digital only dataset 
ds_vectorized_features = pd.DataFrame(data=ds_cat_vec, columns=vectorizer.get_feature_names())
ds_numerical_features = ds_cat = ds[['edible', 'ring-number', 'bruises?']]

ds_final = pd.concat([ds_numerical_features,ds_vectorized_features], axis=1).astype(int)
ds_final.to_csv('data_cleaned_vectorized.csv' )
ds_final[:3]

Unnamed: 0,edible,ring-number,bruises?,cap-color=b,cap-color=c,cap-color=e,cap-color=g,cap-color=n,cap-color=p,cap-color=r,...,stalk-surface-above-ring=y,stalk-surface-below-ring=f,stalk-surface-below-ring=k,stalk-surface-below-ring=s,stalk-surface-below-ring=y,veil-color=n,veil-color=o,veil-color=w,veil-color=y,veil-type=p
0,0,1,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1


In [164]:
ds[:3]

Unnamed: 0,edible,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,x,s,n,1,p,f,c,n,k,...,s,w,w,p,w,1,p,k,s,u
1,1,x,s,y,1,a,f,c,b,k,...,s,w,w,p,w,1,p,n,n,g
2,1,b,s,w,1,l,f,c,b,n,...,s,w,w,p,w,1,p,n,n,m


In [165]:
# Split dataset into train/set

X = ds_final.drop('edible', axis=1)
y = ds_final.edible

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('X_train shape is {}'.format(X_train.shape))
print('y_train shape is {}'.format(y_train.shape))
print('X_test shape is {}'.format(X_test.shape))
print('y_test shape is {}'.format(y_test.shape))

X_train shape is (6499, 114)
y_train shape is (6499,)
X_test shape is (1625, 114)
y_test shape is (1625,)


In [166]:
#Build baseline NN model in Keras
def make_baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(114, input_dim=114))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# make sklearn classifier
clf = KerasClassifier(build_fn=make_baseline_model, nb_epoch=5, batch_size=5, verbose=0)

In [None]:
# evaluate model via stratified K-fold cross validation on a train subset
skf = StratifiedKFold(y=np.array(y), n_folds=10, shuffle=True)
results = cross_val_score(clf, np.array(X), np.array(y), cv=skf)
print("Results: %.2f%% ({.2f%%})".format(results.mean()*100, results.std()*100))

In [183]:
# train model in a simple way train/split
clf.fit(np.array(X_train), np.array(y_train))

<keras.callbacks.History at 0x114d0cc18>

In [185]:
y_pred = clf.predict(np.array(X_test))
cf = metrics.confusion_matrix(y_test, y_pred)

print('scorrect score is {}'.format((cf[1][1]+cf[0][0])/(cf[1][0]+cf[0][1]) ))
print('Confusiton matrix is ')
cf


scorrect score is []
Confusiton matrix is 


array([[789,   0],
       [  0, 836]])