In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_confusion_matrix

In [2]:
# Load the mushroom CSV, then swap every missing entry for the placeholder
# category 'Unknown' so no NaN values remain in the frame.
path = "mushroom_data_all.csv"
data = pd.read_csv(path)
data_filled = data.fillna(value="Unknown")

data_filled

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [3]:
# Target vector: the 'class_edible' column of the filled frame.
y = data_filled["class_edible"]

In [4]:
#Encoding
# Every column except the target is a feature.
features = data_filled.columns.drop( 'class_edible' )
X = data_filled[features]

# OrdinalEncoder is the encoder intended for feature matrices: a single fit
# learns one category list per column and keeps all of them in `categories_`,
# so the mapping can be inspected, inverted, or reapplied to new rows.
# LabelEncoder is meant for the target vector, and refitting one instance on
# each column retains only the last column's mapping.
# dtype=int keeps the codes as integers rather than the default floats.
encoder = preprocessing.OrdinalEncoder(dtype=int)
X_encoded = pd.DataFrame(
    encoder.fit_transform(X),
    columns=features,
    index=X.index,  # keep row labels aligned with y for the later split
)
X_encoded

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,3,2,4,0,5,0,0,0,11,0,...,2,5,5,0,1,1,4,0,1,2
8120,5,2,4,0,5,0,0,0,11,0,...,2,5,5,0,0,1,4,0,4,2
8121,2,2,4,0,5,0,0,0,5,0,...,2,5,5,0,1,1,4,0,1,2
8122,3,3,4,0,8,1,0,1,0,1,...,1,7,7,0,2,1,0,7,4,2


In [5]:
#Train/Test Split
# Hold out 20% of the rows for testing. The shuffle seed is fixed so that the
# split, and every score computed from it, is identical on each
# Restart & Run All.
train_X, test_X, train_y, test_y = train_test_split(
    X_encoded, y, test_size = 0.2, random_state = 0
)

In [6]:
#Feature Selection
# Fit a strongly regularised (C = 0.0015) L2 logistic regression on the
# training split only, then let SelectFromModel pick features based on the
# fitted coefficients.
logistic = LogisticRegression(C = 0.0015, penalty = "l2", max_iter = 100).fit(train_X, train_y)
model = SelectFromModel(logistic, prefit = True)

# Read the boolean selection mask directly from the selector. This replaces a
# transform -> inverse_transform -> "variance != 0" round trip, which would
# silently drop a selected column that happens to be constant in the training
# split. Column names come from train_X itself, so labels always match the
# data the selector was fitted on.
selected_columns = train_X.columns[model.get_support()]

# NOTE: train_X / test_X are narrowed in place, so re-running this cell selects
# again from the already-reduced frames; re-run the split cell first to start
# from the full feature set.
train_X = train_X[selected_columns]
test_X = test_X[selected_columns]


In [7]:
# Preview the training features that remain after the column selection above.
train_X

Unnamed: 0,bruises,odor,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,ring-type,population,habitat
2328,1,5,0,0,10,1,1,2,4,4,0
4321,0,2,0,0,7,0,1,1,2,5,1
3965,0,2,0,0,3,0,1,1,2,5,1
6293,0,7,0,1,0,1,0,1,0,4,2
3942,1,5,0,0,7,1,1,2,4,5,0
...,...,...,...,...,...,...,...,...,...,...,...
1253,1,6,0,1,7,0,3,2,4,4,1
3288,1,5,0,0,7,1,1,2,4,5,0
844,1,0,0,0,2,0,2,2,4,2,1
6034,0,7,0,1,0,1,0,1,0,4,0


In [8]:
# Preview the test features; they are restricted to the same selected columns
# as the training frame.
test_X

Unnamed: 0,bruises,odor,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,ring-type,population,habitat
4414,0,2,0,0,7,0,1,1,2,5,1
895,1,3,0,0,7,0,4,2,4,3,1
5422,0,2,0,0,2,0,1,1,2,4,0
2225,1,5,0,0,7,1,1,2,4,4,0
1952,1,5,0,0,7,1,1,2,4,5,0
...,...,...,...,...,...,...,...,...,...,...,...
827,1,0,1,1,10,1,1,2,4,4,0
7730,0,2,0,1,0,1,0,2,0,4,4
3794,0,1,1,1,2,0,1,2,4,3,0
6807,0,8,0,1,0,1,0,2,0,4,2


In [9]:
# Fit a decision tree (fixed seed) on the training split and report its
# accuracy on that same split as a percentage.
tree = DecisionTreeClassifier(random_state=0).fit(train_X, train_y)
train_accuracy_pct = tree.score(train_X, train_y) * 100
print("Training set accuracy: {:.2f}%".format(train_accuracy_pct))

Training set accuracy: 100.00%


In [10]:
# Predict a class label for every row of the held-out test split.
test_predictions = tree.predict(test_X)
print("Test set predictions:", test_predictions)

Test set predictions: ['p' 'e' 'p' ... 'p' 'p' 'e']


In [11]:
# Accuracy of the fitted tree on the held-out test split, as a percentage.
test_accuracy_pct = tree.score(test_X, test_y) * 100
print("Test set accuracy: {:.2f}%".format(test_accuracy_pct))

Test set accuracy: 100.00%
