In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pickle

def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument
            (for example a filesystem path or an open file-like object).

    Returns:
        pandas.DataFrame: The parsed table, using ``read_csv`` defaults.
    """
    return pd.read_csv(file_path)

def encode_categorical_columns(df, encoders_path='preprocessor.pkl'):
    """Label-encode every object-dtype column of ``df`` and pickle the encoders.

    Each column selected by ``select_dtypes(include=['object'])`` is
    overwritten with the integer codes produced by its own ``LabelEncoder``.
    The fitted encoders are collected in a ``{column name: encoder}`` dict
    and written to ``encoders_path`` with ``pickle``.

    Args:
        df: DataFrame to encode. NOTE: it is modified in place; pass a copy
            if the original values must be kept.
        encoders_path: Where the pickled encoder dict is written. Defaults to
            ``'preprocessor.pkl'`` (relative to the current working
            directory), which is the location the function always used
            before this parameter existed.

    Returns:
        The same ``df`` object, with its object-dtype columns encoded.
    """
    label_encoders = {}

    for column in df.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
        label_encoders[column] = encoder

    # The dict is written even when it is empty, so the file always exists
    # after a successful call.
    with open(encoders_path, 'wb') as f:
        pickle.dump(label_encoders, f)

    return df

def train_and_save_best_model(X_train, X_test, y_train, y_test, model_path='model.pkl'):
    """Fit the candidate classifiers, keep the most accurate one, and pickle it.

    A ``RandomForestClassifier``, an ``SVC`` and a ``KNeighborsClassifier``
    (all with default hyper-parameters) are fitted on the training split and
    scored with ``accuracy_score`` on the test split. The fitted estimator
    with the highest accuracy is written to ``model_path`` with ``pickle``.

    Args:
        X_train: Training features.
        X_test: Held-out features used for scoring.
        y_train: Training labels.
        y_test: Held-out labels used for scoring.
        model_path: Where the best fitted estimator is pickled. Defaults to
            ``'model.pkl'`` in the current working directory.

    Returns:
        tuple: ``(best_model_name, best_accuracy)`` - the display name of the
        winning model and its accuracy on the test split.
    """
    candidates = [
        ('Random Forest', RandomForestClassifier()),
        ('SVM', SVC()),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
    ]
    best_name = None
    best_estimator = None
    best_accuracy = 0.0

    for model_name, model in candidates:
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))

        # Strict '>' keeps the earlier candidate on ties. The None check
        # guarantees a model is still selected when every score is 0.0,
        # so None is never pickled.
        if best_estimator is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_name = model_name
            best_estimator = model

    # Persist the fitted estimator itself. Pickling only the display name
    # would leave a file that cannot be used for prediction.
    with open(model_path, 'wb') as f:
        pickle.dump(best_estimator, f)

    return best_name, best_accuracy

if __name__ == "__main__":
    # Read the raw mushroom dataset from its CSV file.
    dataset_path = '/workspaces/mushroom-classification/notebook/data/mushrooms.csv'
    df = load_data(dataset_path)

    # Turn every categorical column into integer codes (the label encoders
    # are saved as part of this step).
    df_encoded = encode_categorical_columns(df)

    # Separate the label column from the feature columns.
    target_column = 'class'
    y = df_encoded[target_column]
    X = df_encoded.drop(columns=target_column)

    # Hold out 20% of the rows for evaluation; the fixed seed makes the
    # split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Train the candidate models and report the best one.
    best_model_name, best_accuracy = train_and_save_best_model(
        X_train, X_test, y_train, y_test
    )

    print(
        f"The best model trained is: {best_model_name}",
        f"Accuracy of the best model: {best_accuracy:.2%}",
        sep="\n",
    )


The best model trained is: Random Forest
Accuracy of the best model: 100.00%
