In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv(r"D:\data_for_analysis\Mushroom\mush.csv")
df.head()  

Unnamed: 0,p,x,s,n,t,p.1,f,c,n.1,k,...,s.2,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


Dataset seems to have no column names so we add some.

In [4]:
df.columns = ["Classifier","Cap_shape","Cap_surface","Cap_color","Bruises","Odor","gill-attachment","gill-spacing","gill-size","gill-color","stalk-shape","stalk-root","stalk-surface-above-ring","stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring","veil-type","veil-color","ring-number","ring-type","spore-print-color","population","habitat"]

Unfortunately a separate name file was not provided so the column names had to be entered manually. Most column names have hyphens and are lower cased, we should fix that.

In [5]:
df.columns = df.columns.str.replace("-","_")

In [6]:
df.columns

Index(['Classifier', 'Cap_shape', 'Cap_surface', 'Cap_color', 'Bruises',
       'Odor', 'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [7]:
df.columns = [i.capitalize() for i in df.columns]

In [8]:
df.columns

Index(['Classifier', 'Cap_shape', 'Cap_surface', 'Cap_color', 'Bruises',
       'Odor', 'Gill_attachment', 'Gill_spacing', 'Gill_size', 'Gill_color',
       'Stalk_shape', 'Stalk_root', 'Stalk_surface_above_ring',
       'Stalk_surface_below_ring', 'Stalk_color_above_ring',
       'Stalk_color_below_ring', 'Veil_type', 'Veil_color', 'Ring_number',
       'Ring_type', 'Spore_print_color', 'Population', 'Habitat'],
      dtype='object')

In [9]:
df.head(3)

Unnamed: 0,Classifier,Cap_shape,Cap_surface,Cap_color,Bruises,Odor,Gill_attachment,Gill_spacing,Gill_size,Gill_color,...,Stalk_surface_below_ring,Stalk_color_above_ring,Stalk_color_below_ring,Veil_type,Veil_color,Ring_number,Ring_type,Spore_print_color,Population,Habitat
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u


Now that the column names are fixed, lets take a look at the data. Let's start by looking at the NaN values in the dataset.

In [10]:
df.isna().sum()

Classifier                  0
Cap_shape                   0
Cap_surface                 0
Cap_color                   0
Bruises                     0
Odor                        0
Gill_attachment             0
Gill_spacing                0
Gill_size                   0
Gill_color                  0
Stalk_shape                 0
Stalk_root                  0
Stalk_surface_above_ring    0
Stalk_surface_below_ring    0
Stalk_color_above_ring      0
Stalk_color_below_ring      0
Veil_type                   0
Veil_color                  0
Ring_number                 0
Ring_type                   0
Spore_print_color           0
Population                  0
Habitat                     0
dtype: int64

It seems there are no NaN values present in the dataset. However there are missing values, the dataset comes with a warning of missing values being replaced with a "?" instead of being left blank. Let us look at those occurences throughout the dataset.

In [11]:
for i in df.columns:
    print("{}: {}".format(i, len(df[df[i]=="?"])))

Classifier: 0
Cap_shape: 0
Cap_surface: 0
Cap_color: 0
Bruises: 0
Odor: 0
Gill_attachment: 0
Gill_spacing: 0
Gill_size: 0
Gill_color: 0
Stalk_shape: 0
Stalk_root: 2480
Stalk_surface_above_ring: 0
Stalk_surface_below_ring: 0
Stalk_color_above_ring: 0
Stalk_color_below_ring: 0
Veil_type: 0
Veil_color: 0
Ring_number: 0
Ring_type: 0
Spore_print_color: 0
Population: 0
Habitat: 0


Looking at the data it seems all the missing values are present in a single column. Let us quickly look at how that translates to a percentage value.

In [12]:
(2480/len(df))*100

30.530592145758956

The "Stalk_root" column is missing 30.5 % of its data. It also happens to be the only dataset with any missing values. But Stalk-root can play an important role in mushroom identification, and the column still possesses 70% of its values. So we'll just replace all the "?" values with "unknown"

In [13]:
df["Stalk_root"].replace("?","unknown", inplace=True)

In [14]:
df["Stalk_root"].value_counts()

b          3776
unknown    2480
e          1119
c           556
r           192
Name: Stalk_root, dtype: int64

The goal here is to try and create the most accurate model from all the parameters present in the regression function. Therefore there will be multiple models created in order to choose the best one available

Before splitting our dataset into train and test, we need to encode our values into numerical terms, since our dataset is entirely composed of letters not conductive to logistic regression.

In [15]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df["Classifier"] = encoder.fit_transform(df["Classifier"])
df.head(10)

Unnamed: 0,Classifier,Cap_shape,Cap_surface,Cap_color,Bruises,Odor,Gill_attachment,Gill_spacing,Gill_size,Gill_color,...,Stalk_surface_below_ring,Stalk_color_above_ring,Stalk_color_below_ring,Veil_type,Veil_color,Ring_number,Ring_type,Spore_print_color,Population,Habitat
0,0,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,0,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,1,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,0,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,0,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
5,0,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
6,0,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
7,1,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
8,0,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m
9,0,x,y,y,t,l,f,c,b,g,...,s,w,w,p,w,o,p,n,n,g


All we've done above is convert our "Classifier" column, which is also our target variable, to numerical terms with the help of the LabelEncoder function. Let us check the values contained in the "Classifier" column.

In [16]:
df["Classifier"].value_counts()

0    4208
1    3915
Name: Classifier, dtype: int64

We can see that the "e"(edible) values have been converted into a 0 and the "p"(poisonous) values have been converted into a 1. 

We could've converted every column into a numerical value using the LabelEncoder, but in doing so we run into a problem. If the LabelEncoder is used on every column the model will believe that higher values, like 2, have a higher weight to them than lower values, 1 or 0. Thus the model sees relationships between variables that aren't there, like 2>1>0, skewing the dataset. This is obviously not true as these values are simply placeholders to their alphabetical counterparts, therefore it is better to LabelEncode the Classifiers before One Hot Encoding the entire dataset. 

In [17]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,Classifier,Cap_shape_b,Cap_shape_c,Cap_shape_f,Cap_shape_k,Cap_shape_s,Cap_shape_x,Cap_surface_f,Cap_surface_g,Cap_surface_s,...,Population_s,Population_v,Population_y,Habitat_d,Habitat_g,Habitat_l,Habitat_m,Habitat_p,Habitat_u,Habitat_w
0,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [18]:
df.shape

(8123, 118)

One Hot Encoding creates several new columns for each value in the Classifier. For example, in the first column of the dataset, the "Cap_shape" was "x". One Hot Encoding created a new column for every type of "Cap_shape", then entered one for the "Cap_shape" that was associated with the Classifier. This is done for every column in the dataset, leading to the creation of 118 columns from the original 22. With this we can be sure that the model will not interpret the data fed into it correctly.

In [19]:
y = df["Classifier"].values.reshape(-1,1)
X = df
X = df.drop("Classifier",1)
X.head()

Unnamed: 0,Cap_shape_b,Cap_shape_c,Cap_shape_f,Cap_shape_k,Cap_shape_s,Cap_shape_x,Cap_surface_f,Cap_surface_g,Cap_surface_s,Cap_surface_y,...,Population_s,Population_v,Population_y,Habitat_d,Habitat_g,Habitat_l,Habitat_m,Habitat_p,Habitat_u,Habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [21]:
X_train.shape

(6498, 117)

In [22]:
X_test.shape

(1625, 117)

In [23]:
print(y_train.shape, y_test.shape)

(6498, 1) (1625, 1)


Now we can finally begin building out models.

In [24]:
model_1 = LogisticRegression()
model_1.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
model_1.score(X_train, y_train)

1.0

This is a suspicious value. Let's test it out on the test set.

In [26]:
y_pred_1 = model_1.predict(X_test)

In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred_1)

In [28]:
confusion_matrix

array([[838,   0],
       [  0, 787]], dtype=int64)

It seems the pre-processing itself was enough to build a "perfect" classifier. The false_positive and false_negative values for the test set are 0, showing that the logistic model built with the default values was enough to build a very accurate model. It seems we dont get to tinker with the rest of the parameters after all. This speaks more for pre-processing than the actual implementation of the machine learning method. The majority of the success is owed to the LabelEncoding followed by One Hot Encoding. I've seen models that worked with a data pre-processed only using the LabelEncoded values perform in the low 90 percent range, showcasing that simple but correct pre-processing of data can help build extremely accurate models. 