In [51]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as ttsplit

%matplotlib inline

In [52]:
#loading only three columns of the headerless UCI mushroom file;
#the mapping pairs each file column position with the readable name it gets
MUSHROOM_COLUMNS = {
    0: "Mushroom_Safe",
    5: "Odor",
    15: "Mushroom_Stalk_Color_Below_Ring",
}
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data',
    sep=',',
    header=None,
    usecols=list(MUSHROOM_COLUMNS),
    names=list(MUSHROOM_COLUMNS.values()),
)
data.head()

Unnamed: 0,Mushroom_Safe,Odor,Mushroom_Stalk_Color_Below_Ring
0,p,p,w
1,e,a,w
2,e,l,w
3,p,p,w
4,e,n,w


In [53]:
#converting mushroom safety, odor and mushroom stalk color below ring labels to
#numerical values
#NOTE: despite the column name, Mushroom_Safe == 1 marks 'p' (poisonous) and 0 marks 'e' (edible)
label_codes = {
    "Mushroom_Safe": {'p': 1, 'e': 0},
    "Odor": {'a': 0, 'l': 1, 'c': 2, 'y': 3, 'f': 4, 'm': 5, 'n': 6, 'p': 7, 's': 8},
    "Mushroom_Stalk_Color_Below_Ring": {'n': 0, 'b': 1, 'c': 2, 'g': 3, 'o': 4,
                                        'p': 5, 'u': 6, 'e': 7, 'w': 8, 'y': 9},
}

#the nested dict keeps every replacement scoped to its own column, so the letter 'p'
#gets a different code in each of the three columns
#no inplace=True: the encoded frame is rebound to `data`, and re-running the cell is
#harmless because already-encoded integers match none of the letter keys
#the explicit int64 cast stops the result depending on replace() silently downcasting
#object columns (deprecated in pandas), and raises if a label had no code in the
#tables above instead of letting a stray string through
data = (
    data
    .replace(to_replace=label_codes)
    .astype({column: "int64" for column in label_codes})
)
data.head()

Unnamed: 0,Mushroom_Safe,Odor,Mushroom_Stalk_Color_Below_Ring
0,1,7,8
1,0,0,8
2,0,1,8
3,1,7,8
4,0,6,8


In [54]:
#class balance: how many rows carry each Mushroom_Safe code (1 = 'p', 0 = 'e')
count = data.loc[:, 'Mushroom_Safe'].value_counts()
count

0    4208
1    3916
Name: Mushroom_Safe, dtype: int64

In [55]:
#frequency of each odor code, most common first
count = data.loc[:, 'Odor'].value_counts()
count

6    3528
4    2160
3     576
8     576
0     400
1     400
7     256
2     192
5      36
Name: Odor, dtype: int64

In [56]:
#frequency of each stalk-color-below-ring code, most common first
count = data.loc[:, 'Mushroom_Stalk_Color_Below_Ring'].value_counts()
count

8    4384
5    1872
3     576
0     512
1     432
4     192
7      96
2      36
9      24
Name: Mushroom_Stalk_Color_Below_Ring, dtype: int64

In [57]:
#one-hot encoding the two categorical predictors (one indicator column per code)
data = pd.get_dummies(
    data,
    columns=['Odor', 'Mushroom_Stalk_Color_Below_Ring'],
)

#target is the Mushroom_Safe column; every remaining column is a feature
y = data['Mushroom_Safe']
X = data.drop(columns=['Mushroom_Safe'])

#holding out 20% of the rows for testing, with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.2, random_state=42)

#shapes of the four pieces, in the order they were returned
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(6499, 18) (1625, 18) (6499,) (1625,)


In [60]:
#fitting one logistic regression per predictor and comparing their test-set accuracy
def fit_and_score(prefix, X_train, X_test, y_train, y_test):
    """Fit a LogisticRegression on the dummy columns whose name starts with `prefix`.

    The model is trained on those columns of X_train with y_train and then
    evaluated on the same columns of X_test against y_test.

    Returns a tuple (fitted model, test-set predictions, test-set accuracy).
    Raises ValueError when no column of X_train starts with `prefix`.
    """
    #selecting by prefix instead of a hard-coded first:last label slice keeps working
    #when an endpoint category is absent from the data or the columns are reordered
    columns = [name for name in X_train.columns if name.startswith(prefix)]
    if not columns:
        raise ValueError("no feature columns start with %r" % prefix)

    model = LogisticRegression().fit(X_train[columns], y_train)
    pred = model.predict(X_test[columns])
    return model, pred, metrics.accuracy_score(y_test, pred)


#same train/test split for both models, so the two accuracies are directly comparable
odor_model, odor_pred, odor_acc = fit_and_score(
    'Odor_', X_train, X_test, y_train, y_test)
stalk_color_model, stalk_color_pred, stalk_color_acc = fit_and_score(
    'Mushroom_Stalk_Color_Below_Ring_', X_train, X_test, y_train, y_test)

print("Accuracy of Mushroom odor model:", odor_acc)
print("Accuracy of Mushroom stalk color below ring model:", stalk_color_acc)


Accuracy of Mushroom odor model: 0.9846153846153847
Accuracy of Mushroom stalk color below ring model: 0.7058461538461539


Based on the above analysis, the Odor column is a much better predictor of whether a mushroom is poisonous than the Mushroom_Stalk_Color_Below_Ring column: on the held-out test set the odor-only model reaches about 98.5% accuracy, versus about 70.6% for the stalk-color-only model. However, this analysis considers only two predictor variables and a single train/test split, so there may be other variables that are better predictors.