In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the mushroom dataset from a CSV path relative to the working directory,
# then display the resulting frame as the cell output.
data=pd.read_csv('mushrooms.csv')
data

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Per-column summary statistics as produced by DataFrame.describe().
data.describe()

In [None]:
# Number of missing entries in each column.
data.isna().sum()

### Target Variable

In [None]:
# How many rows carry each label of the target column.
data['class'].value_counts()

In [None]:
# Histogram of the target labels to visualise the class balance.
data['class'].hist(grid=False)
plt.title('edible vs poisonous')
# Call show(); the bare attribute `plt.show` only echoes the function object
# as the cell output and never renders/flushes the figure explicitly.
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance that the following cells re-fit on one column after
# another; after each fit it only holds the mapping of the latest column.
LE=LabelEncoder()

In [None]:
# Overwrite the target column in place with LabelEncoder's integer codes,
# then show the class counts after encoding.
data["class"]=LE.fit_transform(data["class"])
data["class"].value_counts()

In [None]:
# Label-encode every feature column in place, re-fitting the shared encoder on
# each column in turn. The list order is the encoding order, so `LE` is left
# fitted on "habitat" once the loop finishes.
# NOTE(review): integer codes impose an arbitrary ordering on nominal
# categories, which a linear model will treat as meaningful — confirm this is
# acceptable or consider one-hot encoding.
feature_columns = [
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat",
]
for column in feature_columns:
    data[column] = LE.fit_transform(data[column])


In [None]:
# Inspect the frame after the label-encoding cells above.
data

### Checking Correlation

In [None]:
# Pairwise correlation of all columns, then each column's correlation with the
# target, listed from the largest value to the smallest.
corre_matrix=data.corr()
corre_matrix['class'].sort_values(ascending=False)

### Splitting the dataset into independent and target variables

In [None]:
# Features: every column except the target. Target: the encoded 'class' column.
x = data.drop(columns='class')
y = data['class']

In [None]:
# Feature matrix (all columns except 'class').
x

In [None]:
# Target vector (the encoded 'class' column).
y

### Finding best random state

In [None]:
# Try every split seed from 1 to 199: split 75/25, fit a default
# LogisticRegression, and remember the seed with the highest test accuracy.
# NOTE(review): picking the seed by its test-set accuracy selects the most
# favourable split, so the reported score is an optimistic estimate — confirm
# this is intended, or prefer a fixed seed plus cross-validation.
maxAccu, maxRS = 0, 0  # best accuracy so far / seed that produced it
for i in range(1, 200):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=i
    )
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    pred = LR.predict(x_test)
    acc = accuracy_score(y_test, pred)
    # Strict comparison: on ties the earliest seed is kept.
    if acc > maxAccu:
        maxAccu, maxRS = acc, i
print('Max Accuracy is',maxAccu,'on Random State ',maxRS)

### Creating the Training and testing data

In [None]:
# Final 75/25 train/test split used by every model below.
# NOTE(review): 21 is presumably the seed reported by the search cell above —
# confirm it still matches that cell's printed result.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=21,
)





In [None]:
# Shape of the training features.
x_train.shape

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
# Shape of the test features.
x_test.shape

In [None]:
# Shape of the test target.
y_test.shape

### Logistic Regression

In [None]:
# Fit a default LogisticRegression on the training split (fit() returns the
# estimator itself) and evaluate it on the held-out test split.
LR = LogisticRegression().fit(x_train, y_train)
pred = LR.predict(x_test)
print('Accuracy ', accuracy_score(y_test, pred) * 100)  # shown as a percentage
# Confusion matrix followed by the per-class precision/recall/F1 report.
print(confusion_matrix(y_test, pred), classification_report(y_test, pred), sep='\n')

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split and report test-set metrics.
# random_state is fixed (same value as the train/test split seed) because tree
# construction breaks ties between equally good splits at random; without a
# seed the scores here — and in the later cells that reuse `DT` for
# cross-validation and grid search — can differ from run to run.
DT=DecisionTreeClassifier(random_state=21)
DT.fit(x_train,y_train)
pred=DT.predict(x_test)
print('Accuracy ',accuracy_score(y_test,pred)*100)  # shown as a percentage
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split and report test-set metrics.
# random_state is fixed (same value as the train/test split seed) because the
# forest draws bootstrap samples and feature subsets at random; without a seed
# the scores here and in the later cross-validation cell change on every run.
RF=RandomForestClassifier(random_state=21)
RF.fit(x_train,y_train)
pred=RF.predict(x_test)
print('Accuracy ',accuracy_score(y_test,pred)*100)  # shown as a percentage
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

### Cross Validation


In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the logistic regression on the full dataset;
# the printed value is the mean of the per-fold scores.
cvs=cross_val_score(LR,x,y,cv=5)
print("Cross Validation score for Logistic Regression ",cvs.mean())

In [None]:
# 5-fold cross-validation of the decision tree on the full dataset;
# the printed value is the mean of the per-fold scores.
cvs=cross_val_score(DT,x,y,cv=5)
print("Cross Validation score for Decision Tree Classifier ",cvs.mean())

In [None]:
# 5-fold cross-validation of the random forest on the full dataset;
# the printed value is the mean of the per-fold scores.
cvs=cross_val_score(RF,x,y,cv=5)
print("Cross Validation score for Random Forest Classifier ",cvs.mean())

The Decision Tree Classifier performs best among the three models, so we will continue with the Decision Tree Classifier.

### Hyper Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyper-parameters for the decision tree
# (2 * 5 * 2 * 5 * 7 = 700 combinations, each evaluated with 5-fold CV).
# NOTE(review): with max_leaf_nodes capped at 6-7 a tree cannot be deeper than
# 6 levels, so max_depth 15-19 never binds; likewise min_samples_leaf >= 15
# means a node needs >= 30 samples to split, so min_samples_split 3-9 never
# binds. Most of these candidates are therefore equivalent — confirm the
# intended ranges.
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(15, 20),
    'max_leaf_nodes': range(6, 8),
    'min_samples_leaf': range(15, 20),
    'min_samples_split': range(3, 10),
}
# Exhaustive search over the grid, parallelised across all available cores.
grid_search = GridSearchCV(
    estimator=DT,
    param_grid=grid_param,
    cv=5,
    n_jobs=-1,
)


In [None]:
grid_search.fit(x_train,y_train) # run the cross-validated grid search on the training split only

In [None]:
# Parameter combination with the best mean cross-validation score.
best_parameters=grid_search.best_params_  
print(best_parameters)  # show the best parameters found by GridSearchCV

In [None]:
# Score the best estimator found by the grid search on the held-out test split.
GCV_pred=grid_search.best_estimator_.predict(x_test) # predictions from the tuned tree
accuracy_score(y_test,GCV_pred)

### ROC AUC plot

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the tuned decision tree on the held-out test split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `RocCurveDisplay.from_estimator` is its replacement (needs scikit-learn >= 1.0).
RocCurveDisplay.from_estimator(grid_search.best_estimator_,x_test,y_test)
plt.title('ROC AUC CURVE')
# Call show(); the bare attribute `plt.show` only echoes the function object.
plt.show()