In [None]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo 
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
  
df = fetch_ucirepo(id=73) 
# data (as pandas dataframes) 
features = df.data.features 
targets = df.data.targets 

features.dropna(inplace=True)

targets = targets[targets.index.isin(features.index)]

trainingdata, testdata, traininglabels, testlabels = train_test_split(features, targets, test_size=0.2, random_state=42)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.dropna(inplace=True)


Individual Category Features

Model #1: All Mushroom Cap Features

In [2]:
# Optimal K value usually found is the square root of N, where N is the total number of samples
k = int(len(targets.index)**.5)
#Pipeline to standarize data 
pipeline = make_pipeline(
    OneHotEncoder(),
    KNeighborsClassifier(n_neighbors=k)
)

#Map 'e' and 'p' to 0 and 1
y_test = traininglabels['poisonous'].map({'e':1, 'p':0})
Y_test = testlabels['poisonous'].map({'e': 1, 'p': 0})


#Train Model
pipeline.fit(X=trainingdata[["cap-shape", "cap-surface", 'cap-color']], y=y_test)
#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[['cap-shape', 'cap-surface', 'cap-color']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[['cap-shape', 'cap-surface', 'cap-color']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[['cap-shape', 'cap-surface', 'cap-color']]))))

R2 Score: 0.7360496014171833
F1 Score: 0.7936288088642659
Log Loss: 9.513736678438361


Model #2: All Gill Features

In [3]:
#Train Model
pipeline.fit(X=trainingdata[["gill-attachment", "gill-spacing", 'gill-size', 'gill-color']], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["gill-attachment", "gill-spacing", 'gill-size', 'gill-color']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["gill-attachment", "gill-spacing", 'gill-size', 'gill-color']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["gill-attachment", "gill-spacing", 'gill-size', 'gill-color']]))))

R2 Score: 0.8210806023029229
F1 Score: 0.8426791277258567
Log Loss: 6.448908755183051


Model 3: All Stalk Features

In [4]:
#Train Model
pipeline.fit(X=trainingdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring']], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring']]))))

R2 Score: 0.9433126660761736
F1 Score: 0.9536903039073806
Log Loss: 2.043218615503541


Model 4: Veil Features

In [5]:
#Train Model
pipeline.fit(X=trainingdata[["veil-type", "veil-color"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["veil-type", "veil-color"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["veil-type", "veil-color"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["veil-type", "veil-color"]]))))

R2 Score: 0.6244464127546502
F1 Score: 0.7688113413304253
Log Loss: 13.53632332771096


Model 5: Ring Features

In [6]:
#Train Model
pipeline.fit(X=trainingdata[["ring-number", "ring-type"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["ring-number", "ring-type"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["ring-number", "ring-type"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["ring-number", "ring-type"]]))))

R2 Score: 0.8547387068201948
F1 Score: 0.8947368421052632
Log Loss: 5.235747702227824


Model 6: Odor

In [7]:
#Train Model
pipeline.fit(X=trainingdata[["odor"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["odor"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["odor"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["odor"]]))))

R2 Score: 0.983170947741364


F1 Score: 0.986703988803359
Log Loss: 0.606580526477614


Model 7: Bruises

In [8]:
#Train Model
pipeline.fit(X=trainingdata[["bruises"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["bruises"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["bruises"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["bruises"]]))))

R2 Score: 0.70859167404783


F1 Score: 0.753558052434457
Log Loss: 10.503420695322891


Model 8: Spore Color

In [9]:
#Train Model
pipeline.fit(X=trainingdata[["spore-print-color"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["spore-print-color"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["spore-print-color"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["spore-print-color"]]))))

R2 Score: 0.9069973427812223


F1 Score: 0.9306930693069307
Log Loss: 3.352155541060497


Model 9: Population

In [10]:
#Train Model
pipeline.fit(X=trainingdata[["population"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["population"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["population"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["population"]]))))

R2 Score: 0.6324180690876883
F1 Score: 0.6999276934201012
Log Loss: 13.248995709905774


Model 10: Habitat

In [11]:
#Train Model
pipeline.fit(X=trainingdata[["habitat"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["habitat"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["habitat"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["habitat"]]))))

R2 Score: 0.6944198405668733
F1 Score: 0.7912885662431942
Log Loss: 11.014225349198776


Combination of Features

Model 11: Cap and Gill Features

In [12]:
#Train Model
pipeline.fit(X=trainingdata[["cap-shape", "cap-surface", 'cap-color', "gill-attachment", "gill-spacing", 'gill-size', 'gill-color']], y=y_test)
#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["cap-shape", "cap-surface", 'cap-color', "gill-attachment", "gill-spacing", 'gill-size', 'gill-color']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["cap-shape", "cap-surface", 'cap-color', "gill-attachment", "gill-spacing", 'gill-size', 'gill-color']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["cap-shape", "cap-surface", 'cap-color', "gill-attachment", "gill-spacing", 'gill-size', 'gill-color']]))))

R2 Score: 0.9229406554472985
F1 Score: 0.9370021723388848
Log Loss: 2.777500305450126


Model 12: Gill and Stalk Features

In [13]:
#Train Model
pipeline.fit(X=trainingdata[["gill-attachment", "gill-spacing", 'gill-size', 'gill-color', "stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring']], y=y_test)
#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["gill-attachment", "gill-spacing", 'gill-size', 'gill-color', "stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["gill-attachment", "gill-spacing", 'gill-size', 'gill-color', "stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["gill-attachment", "gill-spacing", 'gill-size', 'gill-color', "stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring']]))))

R2 Score: 0.9459698848538529
F1 Score: 0.9569513055751588
Log Loss: 1.9474427429018129


Model 13: Stalk and Cap Features

In [14]:
#Train Model
pipeline.fit(X=trainingdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', "cap-shape", "cap-surface", 'cap-color']], y=y_test)
#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', "cap-shape", "cap-surface", 'cap-color']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', "cap-shape", "cap-surface", 'cap-color']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', "cap-shape", "cap-surface", 'cap-color']]))))

R2 Score: 0.978742249778565
F1 Score: 0.9828571428571429
Log Loss: 0.766206980813828


Model 14: Veil and Ring Features

In [15]:
pipeline.fit(X=trainingdata[["veil-type", "veil-color", "ring-number", "ring-type"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["veil-type", "veil-color", "ring-number", "ring-type"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["veil-type", "veil-color", "ring-number", "ring-type"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["veil-type", "veil-color", "ring-number", "ring-type"]]))))

R2 Score: 0.8547387068201948
F1 Score: 0.8947368421052632
Log Loss: 5.235747702227824


Model 15: Top 3 Best Individual Features

In [16]:
pipeline.fit(X=trainingdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', "spore-print-color", 'odor']], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', "spore-print-color", 'odor']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', "spore-print-color", 'odor']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["stalk-shape", "stalk-root", 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', "spore-print-color", 'odor']]))))

R2 Score: 0.995571302037201
F1 Score: 0.99644128113879
Log Loss: 0.15962645433621434


Model 16: Physical Mushroom Characteristics

In [17]:
pipeline.fit(X=trainingdata[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type']], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type']]))))

R2 Score: 0.9946855624446412
F1 Score: 0.9957386363636364
Log Loss: 0.19155174520345716


Model 17: Non Physical Mushroom Characteristics

In [18]:
#Train Model
pipeline.fit(X=trainingdata[["spore-print-color", 'population', "habitat"]], y=y_test)

#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[["spore-print-color", 'population', "habitat"]], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[["spore-print-color", 'population', "habitat"]]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[["spore-print-color", 'population', "habitat"]]))))

R2 Score: 0.941541186891054
F1 Score: 0.9547945205479452
Log Loss: 2.107069197238027


Model 18: All Features

In [19]:
#Train Model
pipeline.fit(X=trainingdata[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']], y=y_test)
#Test Model
print('R2 Score: ' + str(pipeline.score(X=testdata[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']], y=Y_test)))
print('F1 Score: ' + str(f1_score(Y_test, pipeline.predict(testdata[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']]))))
print('Log Loss: ' + str(log_loss(Y_test, pipeline.predict(testdata[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']]))))

R2 Score: 0.995571302037201
F1 Score: 0.9964513839602555
Log Loss: 0.15962645433621434
