In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

# fetch dataset
mushroom = fetch_ucirepo(id=73)

# data (as pandas dataframes)
X = mushroom.data.features
y = mushroom.data.targets

# metadata
print(mushroom.metadata)
print(mushroom.variables)

In [None]:
import missingno as msno
# Visual completeness check of the feature table X: missingno's bar chart
# is used here to spot feature columns that contain missing entries.
msno.bar(X)

In [None]:
# Print, for every feature column of X, how many entries are missing.
# Columns with at least one missing entry are framed with '-----' so they
# stand out in the listing.
#
# The count is taken with isna() on the whole column instead of probing each
# cell with X.iloc[label, j]: the previous form passed index *labels* to the
# positional indexer (only correct for a default RangeIndex) and used
# "type(value) == float" as a stand-in for "is NaN".
# NOTE(review): for numeric float columns this now counts real NaNs, where the
# per-cell type check counted none; for string columns the result is the same.
for column_name in X.columns:
    count = int(X[column_name].isna().sum())
    indicator = '-----' if count > 0 else ''
    print(indicator, column_name, count, indicator)

In [None]:
# List the distinct values observed in each feature column.
for column_name in X.columns:
    print(column_name, '\t\t\t', X[column_name].unique())

# Work on a copy of the raw features so X itself stays untouched, and leave
# out the 'veil-type' column.
# NOTE(review): presumably dropped because the listing above shows it carries
# no information - confirm against the printed unique values.
x = X.copy().drop(columns=['veil-type'])

In [None]:
# Class balance of the target.
print(y.value_counts())

# value_counts() orders its result by descending frequency, so the bar labels
# must come from the counts' own index; hard-coding ['p', 'e'] attaches the
# wrong label to each bar whenever that is not the frequency order.
# y is the targets table fetched above; its first (only) column holds the label.
class_counts = y.iloc[:, 0].value_counts()
plt.bar(class_counts.index.astype(str), class_counts.values)
plt.title('Class Label')
plt.show()

In [None]:
# One bar chart per feature column of x, showing how often each value occurs.
# Titles are numbered from 1 in column order.
for position, feature in enumerate(x.columns, start=1):
    frequencies = x[feature].value_counts()
    plt.bar(frequencies.index, frequencies.values)
    plt.title(f'{position}. {feature}')
    plt.show()

In [None]:
# Hand-written integer codes for the features encoded explicitly here; the
# remaining object columns are handled by the label-encoding step.
MANUAL_ENCODINGS = {
    'bruises': {'t': 1, 'f': 0},
    'gill-attachment': {'a': 1, 'f': 0},
    'gill-spacing': {'c': 1, 'w': 0},
    'gill-size': {'n': 1, 'b': 0},
    'stalk-shape': {'e': 1, 't': 0},
    'ring-number': {'n': 0, 'o': 1, 't': 2},
}

# Remove 'stalk-root'. errors='ignore' keeps the cell re-runnable: on a second
# execution the column is already gone and drop() would otherwise raise.
x = x.drop(columns=['stalk-root'], errors='ignore')

# Assign the replaced column back instead of calling
# x[col].replace(..., inplace=True): that form mutates a temporary Series
# obtained through chained indexing, which pandas warns about and which leaves
# x unchanged once Copy-on-Write is active.
for column_name, mapping in MANUAL_ENCODINGS.items():
    x[column_name] = x[column_name].replace(mapping)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every feature column that is still of object dtype.
# The same encoder instance is refitted for each column in turn.
label_encoder = LabelEncoder()
object_columns = [name for name in x.columns if x[name].dtypes == 'object']
for name in object_columns:
    x[name] = label_encoder.fit_transform(x[name])

print(x.dtypes)

# Target: map 'p' to 1 and 'e' to 0, then convert to a NumPy array.
y = y.replace({'p': 1, 'e': 0}).to_numpy()
y.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Single seed shared by the split and by the estimators that draw random
# numbers, so a Restart & Run All reproduces the same reports.
RANDOM_STATE = 42

# Hold out 25% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=RANDOM_STATE)
# Flatten the targets to 1-D, the shape the estimators' fit() expects.
y_train = y_train.ravel()
y_test = y_test.ravel()

# Models compared in this notebook. The decision trees are seeded explicitly:
# without random_state their tie-breaking between equally good splits is
# random, which makes the feature importances read from them later vary from
# run to run.
classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Decision Tree (Entropy)': DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Fit each model on the training split and print its confusion matrix and
# classification report for the held-out split.
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f'Results for {name}:')
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('\nClassification Report:')
    print(classification_report(y_test, y_pred))
    print('=' * 50)

In [None]:
# Plot one confusion matrix per classifier.
# The models were already fitted when `classifiers` was built and evaluated;
# they are reused here rather than refitted, so the plots show exactly the
# models whose reports were printed (a refit of an unseeded tree can yield a
# different model) and no training time is spent twice.
for name, clf in classifiers.items():
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # Targets were encoded as 'e' -> 0 and 'p' -> 1, and confusion_matrix
    # orders classes by sorted label, hence this label order.
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['e (0)', 'p (1)'])
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix for {name}')
    plt.show()

In [None]:
# Feature importances of the fitted Gini decision tree, labelled by feature
# name and ordered from most to least important.
gini_tree = classifiers['Decision Tree']
feature_importances1 = pd.Series(
    gini_tree.feature_importances_,
    index=x.columns,
).sort_values(ascending=False)

gini_ax = feature_importances1.plot(kind='bar', figsize=(10, 6))
gini_ax.set_title('Feature Importance from Decision Tree (Gini)')
gini_ax.set_ylabel('Importance')
gini_ax.set_xlabel('Features')
plt.show()

In [None]:
# Feature importances of the fitted entropy decision tree, labelled by feature
# name and ordered from most to least important.
entropy_tree = classifiers['Decision Tree (Entropy)']
feature_importances2 = pd.Series(
    entropy_tree.feature_importances_,
    index=x.columns,
).sort_values(ascending=False)

entropy_ax = feature_importances2.plot(kind='bar', figsize=(10, 6))
entropy_ax.set_title('Feature Importance from Decision Tree (Entropy)')
entropy_ax.set_ylabel('Importance')
entropy_ax.set_xlabel('Features')
plt.show()

In [None]:
# Put the two importance rankings side by side (rows aligned on feature name)
# and write them to 'combined_df.csv' in the working directory.
df1 = feature_importances1.to_frame(name='Gini Importance')
df2 = feature_importances2.to_frame(name='Entropy Importance')
combined_df = pd.concat([df1, df2], axis=1)
combined_df.to_csv('combined_df.csv', index=True)

# Offer the file as a browser download when running on Google Colab.
# Elsewhere the google.colab package does not exist; the CSV has already been
# written, so skip the download instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print("Not running on Google Colab; 'combined_df.csv' was saved to the working directory.")
else:
    files.download('combined_df.csv')