<a href="https://colab.research.google.com/github/0zzge/mushroom-classification-ml/blob/main/mushroom_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#necessary libraries
import pandas as pd
import numpy as np
import math
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the pre-split train / test / validation sets from CSV files
# located in the current working directory.
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')
val_df=pd.read_csv('validation.csv')


In [None]:
# Normalise the column names: strip leading/trailing whitespace from every
# header so the same name (e.g. 'target') can be used across all three frames.
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()
val_df.columns = val_df.columns.str.strip()

In [None]:
# Split every frame into its feature matrix (all columns except 'target')
# and its label vector (the 'target' column).
X_train, y_train = train_df.drop(columns=['target']), train_df['target']
X_test, y_test = test_df.drop(columns=['target']), test_df['target']
X_val, y_val = val_df.drop(columns=['target']), val_df['target']

In [None]:
# Choose the 10 most informative features using mutual information with the target.
# random_state pins the estimator's internal randomness so the ranking is reproducible.
# NOTE(review): the features look binary; passing discrete_features=True may be
# more appropriate than the default 'auto' - confirm against the dataset.
mut_info = mutual_info_classif(X_train, y_train, random_state=42)
mut_info = pd.Series(mut_info, index=X_train.columns)
# sort_values returns a NEW Series; without re-assigning it, head(10) would
# simply return the first 10 columns in file order instead of the top 10 scores.
mut_info = mut_info.sort_values(ascending=False)
selected_feature = mut_info.head(10).index.values
mut_info.head(10)

Unnamed: 0,0
dangerous_shape,0.011644
irregular_surface,0.008619
dark_cap_color,0.0
has_bruises,0.135172
strong_odor,0.455955
non_free_gills,0.005703
dense_gills,0.008794
narrow_gills,0.161262
dark_gill_color,0.0
tapering_stalk,0.005185


In [None]:
# 1st model: decision tree, trained and evaluated on the selected features only.
# The seed is fixed so repeated runs build the same tree.
model = DecisionTreeClassifier(random_state=42).fit(X_train[selected_feature], y_train)
dt_preds = model.predict(X_test[selected_feature])

In [None]:
#2nd Model KNN
class KNN:
    """k-nearest-neighbours classifier using Hamming distance and majority vote.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.
    distance_metric : str
        Stored on the instance for reference only; prediction always uses
        the Hamming distance (number of differing feature positions).
    """

    def __init__(self, k=5, distance_metric='hamming'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training samples and labels.

        Both inputs are converted to NumPy arrays. Keeping a DataFrame here
        would make row iteration yield column *names*, and keeping a Series
        would make integer indexing label-based instead of positional.

        Raises
        ------
        ValueError
            If X and y do not contain the same number of samples.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                f"X has {len(self.X_train)} samples but y has {len(self.y_train)}"
            )

    def hamming_distance(self, x1, x2):
        """Return the number of positions at which x1 and x2 differ.

        Raises
        ------
        ValueError
            If the two sequences have different lengths.
        """
        if len(x1) != len(x2):
            raise ValueError(
                f"length mismatch: {len(x1)} != {len(x2)}"
            )
        return sum(1 for a, b in zip(x1, x2) if a != b)

    def predict(self, X):
        """Predict a label for every row of X (DataFrame or 2-D array).

        For each row, the Hamming distance to all training rows is computed
        in one vectorised comparison (equivalent to calling hamming_distance
        per training row), the k closest rows are selected, and the most
        frequent label among them is returned. Ties are resolved in favour
        of the label of the nearest neighbour encountered first.
        """
        X = np.asarray(X)
        preds = []
        for x in X:
            # Count differing features against every training row at once.
            distances = np.count_nonzero(self.X_train != x, axis=1)
            # Stable sort keeps equal-distance neighbours in training order.
            k_indices = np.argsort(distances, kind="stable")[:self.k]
            k_labels = self.y_train[k_indices]
            majority_vote = Counter(k_labels.tolist()).most_common(1)[0][0]
            preds.append(majority_vote)
        return np.array(preds)


# Fit the custom KNN on the selected features and predict the test set.
# The metric name matches the one the KNN class itself uses ('hamming').
knn = KNN(k=5, distance_metric="hamming")
knn.fit(X_train[selected_feature], y_train)
pred_knn = knn.predict(X_test[selected_feature])
# Show how many test samples were assigned to each class.
print("Predictions:", Counter(pred_knn))

Predictions: Counter({np.int64(0): 1625})


In [None]:
def load_data(train_path='train.csv', val_path='validation.csv',
              test_path='test.csv', target='target'):
    """Load the train / validation / test CSV files as NumPy arrays.

    Column names are stripped of surrounding whitespace before the target
    column is looked up, matching the cleanup applied to the DataFrames
    elsewhere in this notebook.

    Parameters
    ----------
    train_path, val_path, test_path : str or path-like
        Locations of the three CSV files; the defaults are the file names
        used throughout this notebook.
    target : str
        Name of the label column.

    Returns
    -------
    tuple
        ((X_train, y_train), (X_val, y_val), (X_test, y_test)), where each X
        is the feature matrix and each y the label vector.

    Raises
    ------
    KeyError
        If a file has no column named `target` (after stripping).
    """
    # Assuming binary features already encoded as 0/1
    # and target column named 'target' (0=edible, 1=poisonous)
    def _split(path):
        # Read one file and separate the features from the label column.
        frame = pd.read_csv(path)
        frame.columns = frame.columns.str.strip()
        return frame.drop(columns=[target]).values, frame[target].values

    return _split(train_path), _split(val_path), _split(test_path)


def train_naive_bayes(X_train, y_train):
    """Fit a Bernoulli-style Naive Bayes model.

    For every distinct label in y_train this estimates the prior P(y) as the
    label's relative frequency, and P(x_i = 1 | y) for each feature column
    with add-one (Laplace) smoothing: (count of ones + 1) / (class size + 2).

    Returns a dict with keys 'classes', 'priors' and 'likelihoods'; the rows
    of 'likelihoods' follow the order of 'classes'.
    """
    labels = np.unique(y_train)
    feature_probs = np.zeros((len(labels), X_train.shape[1]))
    class_priors = []

    for row, label in enumerate(labels):
        members_mask = y_train == label
        # Prior: fraction of training samples carrying this label.
        class_priors.append(np.mean(members_mask))
        # Smoothed per-feature probability of a 1 within this class.
        members = X_train[members_mask]
        feature_probs[row] = (members.sum(axis=0) + 1) / (len(members) + 2)

    return {
        'classes': labels,
        'priors': np.array(class_priors),
        'likelihoods': feature_probs,
    }

def predict_naive_bayes(model, X):
    """Predict the most probable class for every row of X.

    `model` is the dict produced by train_naive_bayes. For each sample the
    log-posterior of every class is the log prior plus, per feature,
    log P(x_i=1|y) where the feature equals 1 and log(1 - P(x_i=1|y))
    otherwise. The class with the highest score is returned.
    """
    labels = model['classes']
    # These logs do not depend on the sample, so compute them once up front.
    log_prior = np.log(model['priors'])
    log_on = np.log(model['likelihoods'])
    log_off = np.log(1 - model['likelihoods'])

    chosen = []
    for sample in X:
        active = sample == 1
        scores = [
            log_prior[idx] + np.sum(np.where(active, log_on[idx], log_off[idx]))
            for idx in range(len(labels))
        ]
        chosen.append(labels[np.argmax(scores)])
    return np.array(chosen)

# 3rd model: Naive Bayes, trained on all feature columns as NumPy arrays.
# Load data
# NB-specific names are used so the DataFrame-based X_train / y_train / X_test /
# y_test / X_val / y_val and the fitted decision tree held in `model` are not
# overwritten; otherwise re-running the earlier model cells would fail.
(X_train_nb, y_train_nb), (X_val_nb, y_val_nb), (X_test_nb, y_test_nb) = load_data()

# Train model
nb_model = train_naive_bayes(X_train_nb, y_train_nb)

# Validate
y_val_pred = predict_naive_bayes(nb_model, X_val_nb)

# Test
nb_preds = predict_naive_bayes(nb_model, X_test_nb)


In [None]:
# Collect the test-set predictions of the three models (Naive Bayes, KNN,
# Decision Tree) and stack them so that each row holds one model's predictions.
classifier_predictions = [nb_preds,pred_knn,dt_preds]
all_preds = np.vstack(classifier_predictions)

In [None]:
#4th Model Hard Voting
def hard_vote(predictions):
    """Combine several classifiers' predictions by majority vote.

    `predictions` is a 2-D array with one row per classifier and one column
    per sample. For every column the most frequent value is chosen; when
    several values are tied, the one that appears first (top-most row) wins.
    Returns a 1-D array with one winning label per sample.
    """
    winners = []
    # Transposing lets us walk the array sample by sample.
    for sample_votes in predictions.T:
        tally = Counter(sample_votes)
        winners.append(max(tally, key=tally.get))
    return np.array(winners)

In [None]:
# Get the final ensemble predictions: majority vote across the stacked
# per-model predictions, one label per test sample.
voted_preds = hard_vote(all_preds)
print(voted_preds)

[1 1 0 ... 0 0 1]


In [None]:
# Measure how well a classification model performs.
def evaluate(true, pred):
    """Print the confusion matrix, F1, accuracy, precision and recall.

    `true` holds the ground-truth labels and `pred` the predicted labels.
    Nothing is returned; the metrics are only written to standard output.
    """
    cm = confusion_matrix(true, pred)
    # Computed in this order, then reported label by label below.
    scores = (
        ("F1 Score", f1_score(true, pred)),
        ("Accuracy", accuracy_score(true, pred)),
        ("Precision", precision_score(true, pred)),
        ("Recall", recall_score(true, pred)),
    )
    print(f"Confusion Matrix: \n{cm}")
    for label, value in scores:
        print(f"{label}: {value}")

In [None]:
# Test-set metrics for the decision tree predictions.
evaluate(y_test,dt_preds)

Confusion Matrix: 
[[842   0]
 [ 18 765]]
F1 Score: 0.9883720930232558
Accuracy: 0.9889230769230769
Precision: 1.0
Recall: 0.9770114942528736


In [None]:
# Test-set metrics for the custom KNN predictions.
evaluate(y_test,pred_knn)

Confusion Matrix: 
[[842   0]
 [783   0]]
F1 Score: 0.0
Accuracy: 0.5181538461538462
Precision: 0.0
Recall: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Test-set metrics for the Naive Bayes predictions.
evaluate(y_test,nb_preds)


Confusion Matrix: 
[[782  60]
 [ 17 766]]
F1 Score: 0.9521441889372281
Accuracy: 0.9526153846153846
Precision: 0.927360774818402
Recall: 0.9782886334610472


In [None]:
# Test-set metrics for the hard-voting ensemble predictions.
evaluate(y_test,voted_preds)

Confusion Matrix: 
[[842   0]
 [ 22 761]]
F1 Score: 0.9857512953367875
Accuracy: 0.9864615384615385
Precision: 1.0
Recall: 0.9719029374201787


**ANALYSIS OF MODELS**

In this project, we applied four different machine learning models to solve a classification problem and evaluated their performance using various metrics. The models we used were Decision Tree, K-Nearest Neighbors (KNN), Naive Bayes, and Hard Voting. To assess the performance of each model, we considered metrics such as accuracy, precision, recall, F1 score, and the confusion matrix.

Our first model, the Decision Tree, delivered the strongest results. It achieved an F1 score of 0.9884, accuracy of 98.89%, perfect precision (1.0), and recall of 97.70%. It made no false positive predictions and missed only 18 of the 783 poisonous mushrooms in the test set.

In contrast, the KNN model predicted only a single class (0) for every test sample. We observed that the model failed to distinguish between the two classes. The precision, recall, and F1 score were all 0.0, and the accuracy was 51.8%, which is simply the share of class 0 in the test set. A constant prediction of this kind suggests a problem in our KNN implementation rather than a limitation of the method itself, so these results indicate that the model, as implemented, was not suitable for our classification task.

The third model, Naive Bayes, also delivered strong results. It achieved an F1 score of 0.9521, accuracy of 95.26%, precision of 92.74%, and recall of 97.83%. We observed that Naive Bayes was able to effectively differentiate between both classes and performed particularly well in identifying poisonous mushrooms (it had the highest recall of all models), at the cost of 60 false positives.

Finally, the Hard Voting model combined the predictions of the three individual models. It achieved perfect precision (1.0), indicating that it made no false positive predictions. Additionally, we observed a recall of 97.19%, F1 score of 0.9858, and accuracy of 98.65%. This showed that the ensemble was both reliable and effective, although it scored slightly below the Decision Tree on its own, because one of its three voters (KNN) always predicted class 0.

To sum up, when comparing all four models, we concluded that the Decision Tree delivered the best overall performance, closely followed by the Hard Voting model. Both provided high accuracy and minimal misclassifications. While Naive Bayes also performed well, the KNN model was insufficient and failed to deliver acceptable results for this problem.