In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [7]:
# Load the breast cancer dataset 
df = pd.read_csv('C:/Users/dimpa/OneDrive - stevens.edu/CS-513/HW_04_NB/breast-cancer-wisconsin.csv')

# Drop rows with missing values
df.dropna(inplace=True)

# Drop irrelevant columns if any
df.drop(columns=['Sample'], inplace=True)

# Convert F6 column to numeric and fill missing values with mean
df['F6'] = pd.to_numeric(df['F6'], errors='coerce').fillna(df['F6'].mean())

# Define features and target
X = df.drop('Class', axis=1)
y = df['Class']


TypeError: Could not convert 1102411010111113391111011071?17111111511111107?31011191183458856110232821211091121104211311112948101111111111610551313101019291083521032121010711011011110112111?11551?821101105311011?1010113?210111111101010111101111010181081810111171111010111105111108110105114111058101105110781101?1029102115121091?1101010810111810101010311010411011041?1117111010101010151011?10?105?110411011010113511111?1081510?11011101410811101011011101011110111181131011310471010331110101111111111111101111101121101111111191141111211?4110310121310111101211111181011111043211111101111016103111511141010111111111111011510131103411011051111111111154111111101011110115101111111011111111121111110115111511111111111101310510101121111111010111101311101011011111111110811101102101111?111211146511111311121111111111214111111110111111111158111111111101011111111151121345 to numeric

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

# Define categorical columns
categorical_columns = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9']

# Create preprocessor for categorical data and scaling for numerical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns),
        ('num', StandardScaler(), ~X.columns.isin(categorical_columns))
    ])

# Create a Categorical Naive Bayes model
cnb = CategoricalNB()

# Create a pipeline with preprocessing and model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', cnb)])

# Define hyperparameters for grid search
param_grid = {
    'classifier__alpha': [0.1, 0.5, 1.0, 2.0],
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print("Best Parameters:", grid_search.best_params_)

NameError: name 'X' is not defined