In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load dataset
df = pd.read_csv('cancer.csv')

# Binary target column: 1 when 'Level' is 'High' or 'Medium', 0 otherwise.
# isin() is the vectorised form of a per-row membership test; values that are
# not in the list (including missing ones) map to 0.
df['Cancer_Risk'] = df['Level'].isin(['High', 'Medium']).astype('int64')

# Two synthetic noise features drawn from N(0, 1). The fixed seed makes the
# draws reproducible from run to run.
np.random.seed(0)
n_rows = df.shape[0]
df['Synthetic_Noise1'] = np.random.normal(0, 1, n_rows)
df['Synthetic_Noise2'] = np.random.normal(0, 1, n_rows)

# Define features (X) by dropping the specified columns
X = df.drop(['Wheezing', 'Coughing of Blood', 'Smoking'], axis=1)
# NOTE(review): X still contains 'Cancer_Risk' (only 'Level' is removed further
# down). If the target `y` is derived from 'Level', this column leaks the
# label into the features -- confirm and drop it if so.

# Identify categorical and numerical columns in X, including synthetic features
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64', 'float']]

# Preprocessors
# NOTE: this ColumnTransformer is only constructed here; the model below is
# fitted on pd.get_dummies output and never goes through `preprocessor`.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = GaussianNB()

# NOTE(review): `y` is not assigned anywhere in this cell. It has to exist
# already (presumably from an earlier notebook cell) and be row-aligned with X.

# Split the dataset into training (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temp data into testing (50%) and unseen data (50% of the temp)
X_test, X_unseen, y_test, y_unseen = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Columns excluded from the model input; kept in one place so the three
# splits cannot drift apart.
NON_FEATURE_COLS = ['Patient Id', 'Gender', 'Level']
X_train_numeric = X_train.drop(NON_FEATURE_COLS, axis=1)
X_test_numeric = X_test.drop(NON_FEATURE_COLS, axis=1)
X_unseen_numeric = X_unseen.drop(NON_FEATURE_COLS, axis=1)

# Encode categorical variables.
# The training split defines the feature layout. The other splits are
# reindexed to that layout because get_dummies on each split independently
# can produce different columns (a category absent from one split), which
# would break or mis-align predict(). Columns missing from a split are filled
# with 0; columns unseen during training are discarded.
X_train_encoded = pd.get_dummies(X_train_numeric)
feature_columns = X_train_encoded.columns
X_test_encoded = pd.get_dummies(X_test_numeric).reindex(columns=feature_columns, fill_value=0)
X_unseen_encoded = pd.get_dummies(X_unseen_numeric).reindex(columns=feature_columns, fill_value=0)

# Fit the model on the preprocessed training data
model.fit(X_train_encoded, y_train)


# Print the shapes of each set
print("Training set:", X_train_encoded.shape, y_train.shape)
print("Testing set:", X_test_encoded.shape, y_test.shape)
print("Unseen data set:", X_unseen_encoded.shape, y_unseen.shape)

# NOTE: the test split is prepared above but not scored in this cell; only the
# unseen split is evaluated.

# Predict the target variable for the unseen data
y_unseen_pred = model.predict(X_unseen_encoded)

# Calculate the accuracy score
accuracy_unseen = accuracy_score(y_unseen, y_unseen_pred)

# Print the accuracy score of the unseen data
print("Accuracy Score on Unseen Data:", accuracy_unseen)



Training set: (700, 23) (700,)
Testing set: (150, 23) (150,)
Unseen data set: (150, 23) (150,)
Accuracy Score on Unseen Data: 0.7266666666666667
