In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

In [124]:
# Load the dataset
df = pd.read_csv('mushroom_mixed_50000.csv')

# Identify the target column
target_column = 'class'  # Change this if the target column name is different

# Separate features and target variable
X = df.drop(target_column, axis=1)
y = df[target_column]

# Convert categorical target column to numeric labels (if necessary)
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Apply One-Hot Encoding to categorical columns
X = pd.get_dummies(X, columns=categorical_cols)


In [125]:
# Ensure y is a Pandas Series
y = pd.Series(y, name=target_column)

print("First 10 rows of Features (X):")
print(X.head(10))

print("\nFirst 10 rows of Target (y):")
print(y.head(10))


First 10 rows of Features (X):
   cap-diameter  stem-height  stem-width  cap-shape_b  cap-shape_c  \
0          9.39         9.04       16.26        False        False   
1         15.42         6.15       32.78        False        False   
2          6.07         6.80        6.53        False        False   
3          4.64         8.37        6.52        False        False   
4         17.87        19.03       18.39        False        False   
5          7.96         7.81       20.94        False        False   
6          5.64         4.10        6.34        False        False   
7          3.05         4.85        4.20         True        False   
8         15.67         6.43       30.60        False        False   
9          8.49         6.04       24.96        False        False   

   cap-shape_f  cap-shape_o  cap-shape_p  cap-shape_s  cap-shape_x  ...  \
0         True        False        False        False        False  ...   
1         True        False        False        

## Code for Decision Tree


In [118]:
def run_decision_tree(param_grid, folds=10):
    start_time = time.time()
    
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_accuracies = []
    best_params_overall = None
    
    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]  # Indexing directly for numpy array

        # Initialize DecisionTreeClassifier
        clf = DecisionTreeClassifier(random_state=0)
        
        # Use GridSearchCV to find the best parameters
        grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_params_overall = best_params
        
        print(f"Fold {fold}: Best parameters: {best_params}")
        
        y_pred = best_model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)
        print(f"Fold {fold}: Accuracy: {accuracy * 100:.2f}%")
        
        cm = confusion_matrix(y_test, y_pred)
        print(f"Fold {fold}: Confusion Matrix:\n{cm}\n")
        
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    
    final_model = grid_search.best_estimator_
    final_accuracy = accuracy_score(y, final_model.predict(X))
    end_time = time.time()
    
    print(f"Final mean accuracy over {folds} folds: {mean_accuracy * 100:.2f}%")
    print(f"Final standard deviation of accuracy over {folds} folds: {std_accuracy * 100:.2f}%")
    print(f"Final accuracy of the model on the entire dataset: {final_accuracy * 100:.2f}%")
    print(f"Total execution time: {end_time - start_time:.2f} seconds")
    print(f"\nBest parameters overall: {best_params_overall}")

## Code for Random Forest

In [121]:
def run_random_forest(param_grid, folds=10):
    start_time = time.time()
    
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_accuracies = []
    best_params_overall = None
    
    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]  # Ensuring correct indexing for Pandas DataFrame

        # Initialize RandomForestClassifier
        clf = RandomForestClassifier(random_state=42)
        
        # Perform GridSearchCV to find the best parameters
        grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_params_overall = best_params
        
        print(f"Fold {fold}: Best parameters: {best_params}")
        
        y_pred = best_model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)
        print(f"Fold {fold}: Accuracy: {accuracy * 100:.2f}%")
        
        cm = confusion_matrix(y_test, y_pred)
        print(f"Fold {fold}: Confusion Matrix:\n{cm}\n")
        
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    
    final_model = grid_search.best_estimator_
    final_accuracy = accuracy_score(y, final_model.predict(X))
    end_time = time.time()
    
    print(f"Final mean accuracy over {folds} folds: {mean_accuracy * 100:.2f}%")
    print(f"Final standard deviation of accuracy over {folds} folds: {std_accuracy * 100:.2f}%")
    print(f"Final accuracy of the model on the entire dataset: {final_accuracy * 100:.2f}%")
    print(f"Total execution time: {end_time - start_time:.2f} seconds")
    print(f"\nBest parameters overall: {best_params_overall}")


## Code for Nearest Neighbour Algorithm

In [130]:


# Function to run K-Nearest Neighbors with GridSearchCV
def run_knn(param_grid, folds=10):
    start_time = time.time()
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_accuracies = []
    best_params_overall = None

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Initialize KNN Classifier
        clf_knn = KNeighborsClassifier()

        # Use GridSearchCV to find the best parameters
        grid_search = GridSearchCV(clf_knn, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Get the best model and parameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_params_overall = best_params

        # Predict on the test set using the best model
        y_pred_knn = best_model.predict(X_test)

        # Compute accuracy for this fold
        accuracy = accuracy_score(y_test, y_pred_knn)
        fold_accuracies.append(accuracy)

    # Calculate mean accuracy across folds
    mean_accuracy = np.mean(fold_accuracies)
    return mean_accuracy * 100, best_params_overall

## Runs of Decision Tree

In [None]:
param_grid = {
    'criterion': ['entropy'],
    'max_depth': [3, 5, 10],  # Remove None to avoid deep trees
    'min_samples_split': [5, 10, 20],  # Increase min_samples_split to force generalization
    'min_samples_leaf': [2, 4, 10],  # Increase min_samples_leaf
    'ccp_alpha': [0.01, 0.05, 0.1]  # Apply pruning
}


print("Running Decision Tree with 10 folds:")
run_decision_tree(param_grid)

print("\nRunning Decision Tree with 20 folds:")
run_decision_tree(param_grid, folds=20)

In [None]:
param_grid_2 = {
    'criterion': ['entropy'],
    'max_depth': [7, 9, 12],  # Slightly deeper to regain accuracy
    'min_samples_split': [5, 10, 15],  # Allow splits at a reasonable level
    'min_samples_leaf': [2, 4, 6],  # Ensure smaller leaf nodes to capture patterns
    'splitter': ['best'],  # Deterministic behavior
    'ccp_alpha': [0.001, 0.005, 0.01]  # Moderate pruning for controlled growth
}

print("Running Decision Tree with 10 folds:")
run_decision_tree(param_grid_2)

print("\nRunning Decision Tree with 20 folds:")
run_decision_tree(param_grid_2, folds=20)

In [None]:


param_grid_3 = {
    'criterion': ['entropy'],
    'max_depth': [10, 12, 15],  # Reducing depth for better generalization
    'min_samples_split': [15, 20, 25],  # Larger splits to avoid small branches
    'min_samples_leaf': [6, 7, 8],  # Ensures minimum samples per leaf
    'splitter': ['best'],  # Keeping only deterministic behavior
    'ccp_alpha': [0.01, 0.05, 0.1]  # Stronger pruning for simplicity
}

print("Running Decision Tree with 10 folds:")
run_decision_tree(param_grid_3)

print("\nRunning Decision Tree with 20 folds:")
run_decision_tree(param_grid_3, folds=20)



In [None]:
param_grid_4 = {
    'criterion': ['entropy'],
    'max_depth': [9, 12, 15],  # Increased to allow deeper insights
    'min_samples_split': [2, 5, 10],  # Slightly lower split requirement
    'min_samples_leaf': [1, 2, 4],  # Allows for better granularity
    'splitter': ['best'],  # Deterministic behavior
    'ccp_alpha': [0.0005, 0.001, 0.005]  # Fine-tuned pruning
}

print("Running Decision Tree with 10 folds:")
run_decision_tree(param_grid_4)

print("\nRunning Decision Tree with 20 folds:")
run_decision_tree(param_grid_4, folds=20)


## Runs with Random Forest

In [None]:



param_grid_1 = {
    'n_estimators': [50, 100],  # Reduce tree count to prevent excessive fitting
    'criterion': ['entropy'],  
    'max_depth': [5, 10],  # Strict depth limit
    'min_samples_split': [10, 20],  # Require more samples to split
    'min_samples_leaf': [4, 6],  # Larger leaf size for regularization
    'ccp_alpha': [0.01, 0.05, 0.1],  # Stronger pruning
    'max_features': ['log2']  # Limit feature selection to prevent memorization
}
run_random_forest(param_grid_1)



In [None]:


param_grid_2 = {
    'n_estimators': [100],  
    'criterion': ['entropy'],
    'max_depth': [15],  
    'min_samples_split': [20],  
    'min_samples_leaf': [6],  
    'ccp_alpha': [0.01],  
    'max_features': ['log2']
}
run_random_forest(param_grid_2)



In [None]:
param_grid_3 = {
    'n_estimators': [10],  
    'criterion': ['entropy'],
    'max_depth': [5, 10],  
    'min_samples_split': [10, 20],  
    'min_samples_leaf': [4, 6],  
    'ccp_alpha': [0.01, 0.1],  
    'max_features': ['sqrt', 'log2']
}
run_random_forest(param_grid_3)

In [None]:

param_grid_4 = {
    'n_estimators': [50],  
    'criterion': ['entropy'],
    'max_depth': [10, 15],  
    'min_samples_split': [10, 20],  
    'min_samples_leaf': [4, 6],  
    'ccp_alpha': [0.01],  
    'max_features': ['sqrt', 'log2']
}
run_random_forest(param_grid_4)

## Runs of Nearest Neighbour

In [129]:
param_grid_knn = {
    'n_neighbors': [30, 33 ,39],  # Balanced number of neighbors
    'p': [1, 2],  # Manhattan (p=1) and Euclidean (p=2) distances
    'weights': ['distance']  # Avoid distance weighting to prevent overfitting
}

# Run KNN with 10-Fold and 20-Fold Cross-Validation
results = []
for folds in [3, 5]:
    accuracy, best_params = run_knn(param_grid_knn, folds=folds)
    results.append((f"{folds}-Fold CV", accuracy, best_params))

# Convert results to DataFrame and print
df_results = pd.DataFrame(results, columns=['Run', 'Accuracy (%)', 'Best Parameters'])
print(df_results)

         Run  Accuracy (%)                                    Best Parameters
0  3-Fold CV        99.968  {'n_neighbors': 30, 'p': 1, 'weights': 'distan...
1  5-Fold CV        99.968  {'n_neighbors': 30, 'p': 1, 'weights': 'distan...


In [131]:
param_grid_knn = {
    'n_neighbors': [15, 17, 19],  # Balanced number of neighbors
    'p': [1, 2],  # Manhattan (p=1) and Euclidean (p=2) distances
    'weights': ['distance']  # Avoid distance weighting to prevent overfitting
}

# Run KNN with 10-Fold and 20-Fold Cross-Validation
results = []
for folds in [3, 5]:
    accuracy, best_params = run_knn(param_grid_knn, folds=folds)
    results.append((f"{folds}-Fold CV", accuracy, best_params))

# Convert results to DataFrame and print
df_results = pd.DataFrame(results, columns=['Run', 'Accuracy (%)', 'Best Parameters'])
print(df_results)

         Run  Accuracy (%)                                    Best Parameters
0  3-Fold CV         99.98  {'n_neighbors': 15, 'p': 1, 'weights': 'distan...
1  5-Fold CV         99.98  {'n_neighbors': 15, 'p': 1, 'weights': 'distan...


In [132]:
param_grid_knn = {
    'n_neighbors': [9, 11, 13],  # Balanced number of neighbors
    'p': [1, 2],  # Manhattan (p=1) and Euclidean (p=2) distances
    'weights': ['distance']  # Avoid distance weighting to prevent overfitting
}

# Run KNN with 10-Fold and 20-Fold Cross-Validation
results = []
for folds in [3, 5]:
    accuracy, best_params = run_knn(param_grid_knn, folds=folds)
    results.append((f"{folds}-Fold CV", accuracy, best_params))

# Convert results to DataFrame and print
df_results = pd.DataFrame(results, columns=['Run', 'Accuracy (%)', 'Best Parameters'])
print(df_results)

         Run  Accuracy (%)                                    Best Parameters
0  3-Fold CV        99.986  {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
1  5-Fold CV        99.986  {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}


In [133]:
param_grid_knn = {
    'n_neighbors': [8, 11, 12],  # Balanced number of neighbors
    'p': [1, 2],  # Manhattan (p=1) and Euclidean (p=2) distances
    'weights': ['distance']  # Avoid distance weighting to prevent overfitting
}

# Run KNN with 10-Fold and 20-Fold Cross-Validation
results = []
for folds in [3, 5]:
    accuracy, best_params = run_knn(param_grid_knn, folds=folds)
    results.append((f"{folds}-Fold CV", accuracy, best_params))

# Convert results to DataFrame and print
df_results = pd.DataFrame(results, columns=['Run', 'Accuracy (%)', 'Best Parameters'])
print(df_results)

         Run  Accuracy (%)                                    Best Parameters
0  3-Fold CV        99.986  {'n_neighbors': 8, 'p': 1, 'weights': 'distance'}
1  5-Fold CV        99.986  {'n_neighbors': 8, 'p': 1, 'weights': 'distance'}


## Result
Judging from above runs, it is found that between Random forest and Decision tree, the Random forest classification is giving better results. Therefor, I will be choosing the best Accuracy result from Random forest runs for making the model.

In [138]:
import pickle

# Define hyperparameter grid
param_grid = {
    'criterion': ['entropy'],
    'max_depth': [10, 15, 20],  
    'min_samples_split': [5, 10, 15],  
    'min_samples_leaf': [2, 4, 6],  
    'ccp_alpha': [0.001, 0.005, 0.01]  
}

# Function to run Decision Tree with cross-validation and select the best model
def run_decision_tree(param_grid, folds=10):
    start_time = time.time()
    
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_accuracies = []
    best_model = None
    best_params_overall = None

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Initialize DecisionTreeClassifier
        clf = DecisionTreeClassifier(random_state=42)
        
        # Use GridSearchCV to find the best parameters
        grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_  # Store the best model from the final fold
        best_params_overall = best_params
        
        print(f"Fold {fold}: Best parameters: {best_params}")
        
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)
        print(f"Fold {fold}: Accuracy: {accuracy * 100:.2f}%")
        
        cm = confusion_matrix(y_test, y_pred)
        print(f"Fold {fold}: Confusion Matrix:\n{cm}\n")

    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    end_time = time.time()

    print("\n===== FINAL RESULTS =====")
    print(f"Mean Accuracy over {folds} folds: {mean_accuracy * 100:.2f}%")
    print(f"Standard Deviation: {std_accuracy * 100:.2f}%")
    print(f"Total Execution Time: {end_time - start_time:.2f} seconds")
    print(f"\nBest Parameters Overall: {best_params_overall}")

    return best_model

# Run the Decision Tree model and obtain the best model
best_model = run_decision_tree(param_grid, folds=10)

# Save the best Decision Tree model to a pickle file
model_filename = 'best_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(best_model, file)

print(f"Best Decision Tree model saved to {model_filename}")

Fold 1: Best parameters: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5}
Fold 1: Accuracy: 98.50%
Fold 1: Confusion Matrix:
[[2138   30]
 [  45 2787]]

Fold 2: Best parameters: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5}
Fold 2: Accuracy: 98.84%
Fold 2: Confusion Matrix:
[[2211   22]
 [  36 2731]]

Fold 3: Best parameters: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 15}
Fold 3: Accuracy: 98.82%
Fold 3: Confusion Matrix:
[[2164   31]
 [  28 2777]]

Fold 4: Best parameters: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5}
Fold 4: Accuracy: 99.10%
Fold 4: Confusion Matrix:
[[2149   19]
 [  26 2806]]

Fold 5: Best parameters: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5}
Fold 5: Accur

## Testing

In [None]:
import pickle


# Load the trained model
model_filename = "best_model.pkl"  # Ensure correct model file name
with open(model_filename, "rb") as file:
    loaded_model = pickle.load(file)

# Load the test dataset
test_file = "mushroom_mixed_50000.csv"
test_data = pd.read_csv(test_file)

# Set the correct target column
target_col = "class"  # Represents edible ('e') or poisonous ('p')

# Separate features and target variable
X_test = test_data.drop(columns=[target_col], errors="ignore")  # Features
y_test = test_data[target_col]  # Target variable

# Apply label encoding to match the trained model
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)  # Converts 'e'/'p' into 0/1

# Ensure categorical variables are encoded as they were during training
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_test_encoded = pd.DataFrame(encoder.fit_transform(X_test.select_dtypes(include=["object"])))

# Rename columns with original names
X_test_encoded.columns = encoder.get_feature_names_out(X_test.select_dtypes(include=["object"]).columns)

# Add back numerical columns
X_test_final = X_test_encoded.join(X_test.select_dtypes(exclude=["object"]).reset_index(drop=True))

# Ensure features are in the same order as training
expected_features = loaded_model.feature_names_in_
X_test_final = X_test_final.reindex(columns=expected_features, fill_value=0)  # Fill missing features with 0

# Make predictions
y_pred_test = loaded_model.predict(X_test_final)

# Compute accuracy
test_accuracy = accuracy_score(y_test_encoded, y_pred_test)  # Use encoded y_test
print(f"\n✅ Accuracy of the loaded model on the test set: {test_accuracy * 100:.2f}%")

# Compute confusion matrix
cm = confusion_matrix(y_test_encoded, y_pred_test)  # Compare encoded labels
print("\nConfusion Matrix on Test Data:")
print(cm)



✅ Accuracy of the loaded model on the test set: 98.48%

Confusion Matrix on Test Data:
[[22164    69]
 [  691 27076]]
