In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer


In [4]:
# Load dataset
column_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'class']
dataset = pd.read_csv('crx.data', header=None, names=column_names, na_values='?')

# Display the first few rows
dataset.head()


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
1,b,30.83,0,u,g,w,v,1.25,t,t,01,f,g,00202,0,+
2,a,58.67,4.46,u,g,q,h,3.04,t,t,06,f,g,00043,560,+
3,a,24.50,0.5,u,g,q,h,1.5,t,f,0,f,g,00280,824,+
4,b,27.83,1.54,u,g,w,v,3.75,t,t,05,t,g,00100,3,+


In [5]:
# Load dataset
column_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'class']
dataset = pd.read_csv('crx.data', header=None, names=column_names, na_values='?')

# Display the first few rows
dataset.head()

# Separate categorical and numerical columns
categorical_cols = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']
numerical_cols = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

# Convert numeric columns to float, coercing errors
for col in numerical_cols:
    dataset[col] = pd.to_numeric(dataset[col], errors='coerce')

# Impute missing values for numerical and categorical data
imputer_num = SimpleImputer(strategy='mean')
dataset[numerical_cols] = imputer_num.fit_transform(dataset[numerical_cols])

imputer_cat = SimpleImputer(strategy='most_frequent')
dataset[categorical_cols] = imputer_cat.fit_transform(dataset[categorical_cols])

# Encode categorical features to numerical values
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    dataset[col] = label_encoders[col].fit_transform(dataset[col])

# Convert class to binary (0/1)
dataset['class'] = dataset['class'].apply(lambda x: 1 if x == '+' else 0)

# Display dataset after preprocessing
dataset.head()


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,0,31.568171,4.758725,0,0,0,0,2.223406,0,0,2.4,0,0,184.014771,1017.385507,0
1,2,30.83,0.0,2,1,13,8,1.25,2,2,1.0,1,1,202.0,0.0,1
2,1,58.67,4.46,2,1,11,4,3.04,2,2,6.0,1,1,43.0,560.0,1
3,1,24.5,0.5,2,1,11,4,1.5,2,1,0.0,1,1,280.0,824.0,1
4,2,27.83,1.54,2,1,13,8,3.75,2,2,5.0,2,1,100.0,3.0,1


In [9]:
# Split the data into train and test sets
X = dataset.drop('class', axis=1)
y = dataset['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Naïve Bayes Model
model = GaussianNB()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy_no_preprocessing = accuracy_score(y_test, y_pred)
print(f"Accuracy without filter and discretization: {accuracy_no_preprocessing:.4f}")


Accuracy without filter and discretization: 0.7788


In [7]:
# Apply discretization (binning) on numerical features
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X[numerical_cols] = discretizer.fit_transform(X[numerical_cols])

# Split the data again
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Naïve Bayes Model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy_with_discretization = accuracy_score(y_test, y_pred)
print(f"Accuracy with filter and discretization: {accuracy_with_discretization:.4f}")


Accuracy with filter and discretization: 0.7788


In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Split the data into train and test sets
X = dataset.drop('class', axis=1)
y = dataset['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train initial model with all features
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
initial_accuracy = accuracy_score(y_test, y_pred)
print(f"Initial accuracy with all features: {initial_accuracy:.4f}")

# Backward elimination process
def backward_elimination(X_train, y_train, X_test, y_test, model):
    features = X_train.columns.tolist()
    best_accuracy = initial_accuracy
    best_features = features.copy()

    for i in range(len(features)):
        accuracies = []
        for feature in features:
            # Remove one feature at a time
            reduced_X_train = X_train.drop(feature, axis=1)
            reduced_X_test = X_test.drop(feature, axis=1)

            # Train the model with reduced features
            model.fit(reduced_X_train, y_train)
            y_pred = model.predict(reduced_X_test)
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append((feature, accuracy))

        # Find the feature whose removal has the least negative impact on accuracy
        worst_feature, worst_accuracy = min(accuracies, key=lambda x: x[1])

        # If removing this feature improves accuracy, continue eliminating it
        if worst_accuracy >= best_accuracy:
            best_accuracy = worst_accuracy
            best_features.remove(worst_feature)
            print(f"Removed feature {worst_feature}, new accuracy: {best_accuracy:.4f}")
        else:
            break  # Stop if accuracy decreases

    return best_features, best_accuracy

# Run backward elimination
best_features, best_accuracy = backward_elimination(X_train, y_train, X_test, y_test, model)

# Train the final model with the selected features
print(f"Best features: {best_features}")
print(f"Best accuracy after backward elimination: {best_accuracy:.4f}")


Initial accuracy with all features: 0.7788
Best features: ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15']
Best accuracy after backward elimination: 0.7788
