In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

try:
    # Legacy helper: deprecated in scikit-learn 1.0 and removed in 1.2.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

## Dataset retrieval ##

In [None]:
# Load the mushroom dataset from the CSV file in the working directory
mushroom_data = pd.read_csv(filepath_or_buffer='mushroom.csv')

In [None]:
# X is bound to the same DataFrame object as mushroom_data (no copy is made),
# so in-place column assignments on X are also visible through mushroom_data.
# NOTE(review): at this point X still contains the class_edible column that is
# used as the target y -- confirm it is excluded before fitting any model.
X = mushroom_data

In [None]:
# Target vector: the class_edible column of the dataset
y = mushroom_data["class_edible"]

## Dataset preprocessing using sklearn LabelEncoder ##

sklearn provides `LabelEncoder`, a preprocessing tool that converts non-numeric features into numeric ones.

In [None]:
# Boolean mask over the columns of X: True where the column dtype is object
categorical_feature_mask = (X.dtypes == object)
# Names of the masked columns, as a plain list, for the encoding step
categorical_cols = list(X.columns[categorical_feature_mask])

In [None]:
# Single LabelEncoder instance; the next cell re-fits it on each categorical
# column in turn, so after encoding it only holds the fit of the last column.
le = LabelEncoder()

In [None]:
# Apply LabelEncoder le on categorical feature columns, this turns non-numeric data into numeric
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

In [None]:
# Display the encoded categorical columns
X.loc[:, categorical_cols]

## Feature selection for best accuracy ##

sklearn offers a series of tools capable of finding the best features for a Random Forest Classifier, which is in general a suitable algorithm for tree-based classification. The goal here is to determine the best features to use for classification.

The `train_test_split` function is called with `test_size=0.7`, so 70% of the dataset is held out and the feature selector is fitted on the remaining 30%. In this scenario, a best feature is one that increases the accuracy of the model's predictions.


In [None]:
# Fit the feature selector on predictors only. X still contains the target
# column (class_edible); leaving it in would let the forest "predict" the label
# from itself, so it would be selected as a feature and leak into the final model.
# test_size=0.7 holds out 70% of the rows; the selector sees the remaining 30%.
X_features = X.drop(columns=["class_edible"])
X_train, X_test, y_train, y_test = train_test_split(X_features, y, random_state=42, test_size=0.7)

In [None]:
# Seed the selection forest so the set of selected features is reproducible
# across runs (same seed as the final classifier). n_jobs=-1 builds the 5000
# trees in parallel on all available cores without changing the result.
selector = SelectFromModel(
    RandomForestClassifier(n_estimators=5000, random_state=42, n_jobs=-1)
)
selector.fit(X_train, y_train)

In [None]:
# Boolean mask of the columns kept by the selector, applied to the column index
support_mask = selector.get_support()
selected_features = X_train.columns[support_mask]

## Model Training using selected features ##

The classifier used is a Random Forest, a general-purpose classifier with good performance and a low tendency to overfit.

In [None]:
# Rebind X to only the selected feature columns of the dataset
X = mushroom_data.loc[:, selected_features]

In [None]:
# Split for the final model: 30% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Final classifier: 100 trees, each limited to depth 10, with a fixed seed
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)

In [None]:
# Train the classifier on the training split
clf.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split
test_accuracy = clf.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_accuracy))

In [None]:
# Plot the confusion matrix of the fitted classifier on the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test, cmap=plt.cm.YlGn)