In [29]:
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the raw dataset from disk
df = pd.read_csv('mushroom_mixed_50000.csv')

# Name of the label column; adjust if the CSV uses a different header
target_column = 'class'

# Split the frame into the label vector and the feature matrix
y = df[target_column]
X = df.drop(columns=target_column)

# Object-typed (string) labels are mapped to integer codes; other dtypes are left as-is
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# One-hot encode every object-typed feature column
categorical_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_cols)

In [None]:
# Wrap the target in a named Series so it offers the same .head()/.iloc API as X
y = pd.Series(y, name=target_column)

# Quick visual sanity check of the encoded features and the target
print("First 10 rows of Features (X):", X.head(10), sep="\n")

print("\nFirst 10 rows of Target (y):", y.head(10), sep="\n")


## Code for Decision Tree


In [None]:
def run_decision_tree(param_grid, folds=10):
    """Tune and evaluate a DecisionTreeClassifier with K-fold cross-validation.

    For every outer fold, a 3-fold ``GridSearchCV`` over ``param_grid`` is
    fitted on the training part and its refitted best estimator is scored on
    the held-out part. Per-fold best parameters, accuracy and confusion matrix
    are printed, followed by a summary over all folds.

    Reads the module-level feature frame ``X`` and target ``y``.

    Args:
        param_grid: Parameter grid handed to ``GridSearchCV``.
        folds: Number of outer ``KFold`` splits.

    Returns:
        None. All results are printed.
    """
    start_time = time.time()

    # Positional view of the target: KFold yields positions, so index a plain
    # array instead of relying on label lookup. Works for a Series or ndarray.
    y_values = np.asarray(y)

    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_accuracies = []

    # "Best overall" = the fold whose tuned model scored highest on its
    # held-out data (previously this silently reported the last fold).
    best_params_overall = None
    best_accuracy_overall = -1.0
    final_model = None

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        # Initialize DecisionTreeClassifier
        clf = DecisionTreeClassifier(random_state=0)

        # Use GridSearchCV to find the best parameters
        grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        print(f"Fold {fold}: Best parameters: {best_params}")

        y_pred = best_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)
        print(f"Fold {fold}: Accuracy: {accuracy * 100:.2f}%")

        cm = confusion_matrix(y_test, y_pred)
        print(f"Fold {fold}: Confusion Matrix:\n{cm}\n")

        # Strict '>' keeps the earliest fold on ties
        if accuracy > best_accuracy_overall:
            best_accuracy_overall = accuracy
            best_params_overall = best_params
            final_model = best_model

    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)

    # This score includes rows the selected model was trained on, so it is
    # not an unbiased estimate; the per-fold mean above is.
    final_accuracy = accuracy_score(y_values, final_model.predict(X))
    end_time = time.time()

    print(f"Final mean accuracy over {folds} folds: {mean_accuracy * 100:.2f}%")
    print(f"Final standard deviation of accuracy over {folds} folds: {std_accuracy * 100:.2f}%")
    print(f"Final accuracy of the model on the entire dataset: {final_accuracy * 100:.2f}%")
    print(f"Total execution time: {end_time - start_time:.2f} seconds")
    print(f"\nBest parameters overall: {best_params_overall}")

## Code for Random Forest

In [30]:
def run_random_forest(param_grid, folds=10):
    """Tune and evaluate a RandomForestClassifier with K-fold cross-validation.

    For every outer fold, a 3-fold ``GridSearchCV`` over ``param_grid`` is
    fitted on the training part and its refitted best estimator is scored on
    the held-out part. Per-fold best parameters, accuracy and confusion matrix
    are printed, followed by a summary over all folds.

    Reads the module-level feature frame ``X`` and target Series ``y``.

    Args:
        param_grid: Parameter grid handed to ``GridSearchCV``.
        folds: Number of outer ``KFold`` splits.

    Returns:
        None. All results are printed.
    """
    start_time = time.time()

    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_accuracies = []

    # "Best overall" = the fold whose tuned model scored highest on its
    # held-out data (previously this silently reported the last fold).
    best_params_overall = None
    best_accuracy_overall = -1.0
    final_model = None

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # KFold yields positions, hence .iloc on the target Series
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Initialize RandomForestClassifier
        clf = RandomForestClassifier(random_state=42)

        # Perform GridSearchCV to find the best parameters
        grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        print(f"Fold {fold}: Best parameters: {best_params}")

        y_pred = best_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)
        print(f"Fold {fold}: Accuracy: {accuracy * 100:.2f}%")

        cm = confusion_matrix(y_test, y_pred)
        print(f"Fold {fold}: Confusion Matrix:\n{cm}\n")

        # Strict '>' keeps the earliest fold on ties
        if accuracy > best_accuracy_overall:
            best_accuracy_overall = accuracy
            best_params_overall = best_params
            final_model = best_model

    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)

    # This score includes rows the selected model was trained on, so it is
    # not an unbiased estimate; the per-fold mean above is.
    final_accuracy = accuracy_score(y, final_model.predict(X))
    end_time = time.time()

    print(f"Final mean accuracy over {folds} folds: {mean_accuracy * 100:.2f}%")
    print(f"Final standard deviation of accuracy over {folds} folds: {std_accuracy * 100:.2f}%")
    print(f"Final accuracy of the model on the entire dataset: {final_accuracy * 100:.2f}%")
    print(f"Total execution time: {end_time - start_time:.2f} seconds")
    print(f"\nBest parameters overall: {best_params_overall}")


## Code for Nearest Neighbour Algorithm

In [None]:
def run_knn(param_grid, folds=10):
    """Tune and evaluate a KNeighborsClassifier with K-fold cross-validation.

    For every outer fold, a 3-fold ``GridSearchCV`` over ``param_grid`` is
    fitted on the training part and its refitted best estimator is scored on
    the held-out part. Per-fold best parameters, accuracy and confusion matrix
    are printed, followed by a summary over all folds.

    Reads the module-level feature frame ``X`` and target Series ``y``.

    Args:
        param_grid: Parameter grid handed to ``GridSearchCV``.
        folds: Number of outer ``KFold`` splits.

    Returns:
        None. All results are printed.
    """
    start_time = time.time()

    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    fold_accuracies = []

    # "Best overall" = the fold whose tuned model scored highest on its
    # held-out data (previously this silently reported the last fold).
    best_params_overall = None
    best_accuracy_overall = -1.0
    final_model = None

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # KFold yields positions, hence .iloc on the target Series
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Initialize KNN classifier
        knn = KNeighborsClassifier()

        # Perform GridSearchCV to find the best parameters
        grid_search = GridSearchCV(knn, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        print(f"Fold {fold}: Best parameters: {best_params}")

        y_pred = best_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)
        print(f"Fold {fold}: Accuracy: {accuracy * 100:.2f}%")

        cm = confusion_matrix(y_test, y_pred)
        print(f"Fold {fold}: Confusion Matrix:\n{cm}\n")

        # Strict '>' keeps the earliest fold on ties
        if accuracy > best_accuracy_overall:
            best_accuracy_overall = accuracy
            best_params_overall = best_params
            final_model = best_model

    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)

    # This score includes rows the selected model was trained on, so it is
    # not an unbiased estimate; the per-fold mean above is.
    final_accuracy = accuracy_score(y, final_model.predict(X))
    end_time = time.time()

    print(f"Final mean accuracy over {folds} folds: {mean_accuracy * 100:.2f}%")
    print(f"Final standard deviation of accuracy over {folds} folds: {std_accuracy * 100:.2f}%")
    print(f"Final accuracy of the model on the entire dataset: {final_accuracy * 100:.2f}%")
    print(f"Total execution time: {end_time - start_time:.2f} seconds")
    print(f"\nBest parameters overall: {best_params_overall}")

# Sample KNN search space: neighbourhood size, vote weighting and distance metric
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

## Code for k-

## Runs of Decision Tree

In [None]:
# First decision-tree search space: shallow, strongly regularised trees
param_grid = dict(
    criterion=['entropy'],
    max_depth=[3, 5, 10],            # None left out so depth is always capped
    min_samples_split=[5, 10, 20],   # larger values, intended to push generalization
    min_samples_leaf=[2, 4, 10],     # larger minimum leaf size
    ccp_alpha=[0.01, 0.05, 0.1],     # cost-complexity pruning strengths
)

# Same grid evaluated with 10 and then 20 outer folds
print("Running Decision Tree with 10 folds:")
run_decision_tree(param_grid, folds=10)

print("\nRunning Decision Tree with 20 folds:")
run_decision_tree(param_grid, folds=20)

In [None]:
# Decision-tree search space: deeper trees, large split/leaf minimums, strong pruning
param_grid_4 = dict(
    criterion=['entropy'],
    max_depth=[10, 12, 15],           # capped depth, intended to help generalization
    min_samples_split=[15, 20, 25],   # larger splits, intended to avoid small branches
    min_samples_leaf=[6, 7, 8],       # minimum samples required per leaf
    splitter=['best'],                # only the deterministic splitter
    ccp_alpha=[0.01, 0.05, 0.1],      # cost-complexity pruning strengths
)

# Same grid evaluated with 10 and then 20 outer folds
print("Running Decision Tree with 10 folds:")
run_decision_tree(param_grid_4, folds=10)

print("\nRunning Decision Tree with 20 folds:")
run_decision_tree(param_grid_4, folds=20)


In [None]:
# Decision-tree search space: mid-depth trees with moderate pruning
param_grid_2 = dict(
    criterion=['entropy'],
    max_depth=[7, 9, 12],               # somewhat deeper, aiming to regain accuracy
    min_samples_split=[5, 10, 15],      # moderate split thresholds
    min_samples_leaf=[2, 4, 6],         # smaller leaves, intended to capture finer patterns
    splitter=['best'],                  # only the deterministic splitter
    ccp_alpha=[0.001, 0.005, 0.01],     # moderate cost-complexity pruning
)

# Same grid evaluated with 10 and then 20 outer folds
print("Running Decision Tree with 10 folds:")
run_decision_tree(param_grid_2, folds=10)

print("\nRunning Decision Tree with 20 folds:")
run_decision_tree(param_grid_2, folds=20)


In [None]:
# Decision-tree search space: deepest trees tried, with light pruning
param_grid_3 = dict(
    criterion=['entropy'],
    max_depth=[9, 12, 15],               # larger depth budget
    min_samples_split=[2, 5, 10],        # lower split requirement
    min_samples_leaf=[1, 2, 4],          # allows finer-grained leaves
    splitter=['best'],                   # only the deterministic splitter
    ccp_alpha=[0.0005, 0.001, 0.005],    # light cost-complexity pruning
)

# Same grid evaluated with 10 and then 20 outer folds
print("Running Decision Tree with 10 folds:")
run_decision_tree(param_grid_3, folds=10)

print("\nRunning Decision Tree with 20 folds:")
run_decision_tree(param_grid_3, folds=20)


## Runs with Random Forest

In [40]:
# Random-forest search space: small forest of 10 trees
param_grid_1 = dict(
    n_estimators=[10],
    criterion=['entropy'],
    max_depth=[5, 10],
    min_samples_split=[10, 20],
    min_samples_leaf=[4, 6],
    ccp_alpha=[0.01, 0.1],
    max_features=['sqrt', 'log2'],
)
run_random_forest(param_grid_1)


Fold 1: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 20, 'n_estimators': 10}
Fold 1: Accuracy: 85.28%
Fold 1: Confusion Matrix:
[[1764  404]
 [ 332 2500]]

Fold 2: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 10}
Fold 2: Accuracy: 84.96%
Fold 2: Confusion Matrix:
[[1761  472]
 [ 280 2487]]

Fold 3: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 10}
Fold 3: Accuracy: 85.16%
Fold 3: Confusion Matrix:
[[1675  520]
 [ 222 2583]]

Fold 4: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 10}
Fold 4: Accuracy: 82.82%
Fold 4: Confusion Matrix:


In [41]:
# Random-forest search space: 50 trees, deeper depth options, single pruning strength
param_grid_2 = dict(
    n_estimators=[50],
    criterion=['entropy'],
    max_depth=[10, 15],
    min_samples_split=[10, 20],
    min_samples_leaf=[4, 6],
    ccp_alpha=[0.01],
    max_features=['sqrt', 'log2'],
)
run_random_forest(param_grid_2)


Fold 1: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 50}
Fold 1: Accuracy: 89.50%
Fold 1: Confusion Matrix:
[[1807  361]
 [ 164 2668]]

Fold 2: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 10, 'n_estimators': 50}
Fold 2: Accuracy: 87.44%
Fold 2: Confusion Matrix:
[[1807  426]
 [ 202 2565]]

Fold 3: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 10, 'n_estimators': 50}
Fold 3: Accuracy: 87.60%
Fold 3: Confusion Matrix:
[[1740  455]
 [ 165 2640]]

Fold 4: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 50}
Fold 4: Accuracy: 87.70%
Fold 4: Confusion Matrix:


In [42]:
# Random-forest "grid" with one candidate: 100 trees at a single fixed configuration
param_grid_3 = dict(
    n_estimators=[100],
    criterion=['entropy'],
    max_depth=[15],
    min_samples_split=[20],
    min_samples_leaf=[6],
    ccp_alpha=[0.01],
    max_features=['log2'],
)
run_random_forest(param_grid_3)


Fold 1: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 100}
Fold 1: Accuracy: 84.80%
Fold 1: Confusion Matrix:
[[1596  572]
 [ 188 2644]]

Fold 2: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 100}
Fold 2: Accuracy: 84.22%
Fold 2: Confusion Matrix:
[[1645  588]
 [ 201 2566]]

Fold 3: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 100}
Fold 3: Accuracy: 82.92%
Fold 3: Confusion Matrix:
[[1579  616]
 [ 238 2567]]

Fold 4: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 100}
Fold 4: Accuracy: 83.16%
Fold 4: Confusion Matr

In [44]:
# Random-forest search space: regularised forests restricted to log2 feature sampling
param_grid = dict(
    n_estimators=[50, 100],          # forest sizes to compare
    criterion=['entropy'],
    max_depth=[5, 10],               # strict depth cap
    min_samples_split=[10, 20],      # more samples required before splitting
    min_samples_leaf=[4, 6],         # larger leaves as regularization
    ccp_alpha=[0.01, 0.05, 0.1],     # cost-complexity pruning strengths
    max_features=['log2'],           # restrict features considered per split
)
run_random_forest(param_grid)


Fold 1: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 20, 'n_estimators': 100}
Fold 1: Accuracy: 83.74%
Fold 1: Confusion Matrix:
[[1542  626]
 [ 187 2645]]

Fold 2: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 100}
Fold 2: Accuracy: 81.52%
Fold 2: Confusion Matrix:
[[1538  695]
 [ 229 2538]]

Fold 3: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 20, 'n_estimators': 100}
Fold 3: Accuracy: 82.88%
Fold 3: Confusion Matrix:
[[1516  679]
 [ 177 2628]]

Fold 4: Best parameters: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 10, 'n_estimators': 100}
Fold 4: Accuracy: 82.54%
Fold 4: Confusion Matr

## Result
Judging from the runs above, Random Forest gives better results than Decision Tree. Therefore, I will use the parameters from the most accurate Random Forest run to build the final model.

In [None]:
import pickle

# Fixed hyperparameters for the final forest (single values, not a search grid)
param_grid = dict(
    criterion='entropy',
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=4,
)

# Build the forest from the chosen settings and train it on the full X / y
rf_model = RandomForestClassifier(random_state=42, **param_grid)
rf_model.fit(X, y)

# Persist the fitted model so it can be reloaded later without retraining
model_filename = 'result_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(rf_model, file)

print(f"Model saved to {model_filename}")


## Testing

In [None]:
# Reload the persisted model.
# NOTE: pickle.load can execute arbitrary code; only load files created by this notebook.
model_filename = 'result_model.pkl'
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

test_data = pd.read_csv('test kunal.csv')

# NOTE(review): the training target column is 'class' (mushroom data) but this
# cell reads 'Has_diabetes' — confirm the test file and target column are correct.
X_test = test_data.drop('Has_diabetes', axis=1)  # Features
y_test = test_data['Has_diabetes']  # Target column

# The model was fitted on one-hot encoded features, so the test features must
# be encoded the same way and aligned to the exact training columns.
X_test = pd.get_dummies(X_test)
expected_columns = list(loaded_model.feature_names_in_)
if not set(expected_columns) & set(X_test.columns):
    # Fail loudly instead of predicting on an all-zero matrix
    raise ValueError(
        "Test data shares no feature columns with the trained model; "
        "check that the test file matches the training dataset."
    )
# Categories absent from the test set become all-zero columns; unseen ones are dropped
X_test = X_test.reindex(columns=expected_columns, fill_value=0)

# Mirror the training-time label encoding for string targets
if y_test.dtype == 'object':
    y_test = le.transform(y_test)

y_pred_test = loaded_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Accuracy of the loaded model on the test set: {test_accuracy * 100:.2f}%")

cm = confusion_matrix(y_test, y_pred_test)
print(f"Confusion Matrix on Test Data:\n{cm}")