# Case Study on Probability for Data Science

### To make a suitable machine learning algorithm to predict if the mushroom is edible or poisonous (e or p) using the given dataset.(Along with other ML algorithms, Naïve Bayes’ Classifier should be applied)Also, if some data pre-processing is necessary do that as well.

In [11]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
#importing data
data= pd.read_csv("mushrooms.csv")

In [13]:
#viewing data
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [14]:
#checking null values
data.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

## Preprocessing

### Encoding

In [15]:
#label Encoding
import sklearn
from sklearn.preprocessing import LabelEncoder
label_en=LabelEncoder()
data["class"]= label_en.fit_transform(data["class"])

In [16]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,0,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,0,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,1,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,0,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [17]:
#one hot encoding
data=pd.get_dummies(data)

In [49]:
data.head()

Unnamed: 0,class,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Columns: 118 entries, class to habitat_w
dtypes: int32(1), uint8(117)
memory usage: 960.1 KB


## Modelling

### Splitting data into train and test

In [20]:
from sklearn.model_selection import train_test_split

In [23]:
x=data.drop("class",axis=1)
y=data["class"]

In [28]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

### Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)

In [30]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score

In [34]:
print("Accuracy score is ",accuracy_score(y_test,y_pred))
print("precision_score is ",precision_score(y_test,y_pred))
print("recall_score is ",recall_score(y_test,y_pred))
print("f1_score is ",f1_score(y_test,y_pred))

Accuracy score is  1.0
precision_score is  1.0
recall_score is  1.0
f1_score is  1.0


In [35]:
print(confusion_matrix(y_test,y_pred))

[[1055    0]
 [   0  976]]


### KNN

In [57]:
from sklearn.neighbors import KNeighborsClassifier
ac_values=[]
neighbors=np.arange(3,10)
for k in neighbors:
    knn=KNeighborsClassifier(n_neighbors=k,metric="minkowski")
    knn.fit(x_train,y_train)
    y_pred=knn.predict(x_test)
    ac=accuracy_score(y_test,y_pred)
    ac_values.append(ac)

In [58]:
ac

1.0

In [59]:
print("Accuracy score is ",accuracy_score(y_test,y_pred))

Accuracy score is  1.0


### Random Forest Classifier

In [40]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=lr.predict(x_test)

In [42]:
print("Accuracy score is ",accuracy_score(y_test,y_pred))
print("precision_score is ",precision_score(y_test,y_pred,average="macro"))
print("recall_score is ",recall_score(y_test,y_pred,average='macro'))
print("f1_score is ",f1_score(y_test,y_pred,average='macro'))

Accuracy score is  1.0
precision_score is  1.0
recall_score is  1.0
f1_score is  1.0


### Gaussian Naive Bayes

In [43]:
from sklearn.naive_bayes import GaussianNB
g_classifier=GaussianNB()
g_classifier.fit(x_train,y_train)
y_pred=g_classifier.predict(x_test)

In [44]:
print("Accuracy score is ",accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

Accuracy score is  0.9556868537666174
[[965  90]
 [  0 976]]


### Bernoulli Naive Bayes

In [47]:
from sklearn.naive_bayes import BernoulliNB
b_classifier=BernoulliNB()
b_classifier.fit(x_train,y_train)
y_pred=b_classifier.predict(x_test)

In [48]:
print("Accuracy score is ",accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

Accuracy score is  0.9458394879369768
[[1042   13]
 [  97  879]]


## Insight

>Most of the models give full accuracy other than naive bayes 

>Naive bayes also gives high Accuracy