In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load dataset
df = pd.read_csv('dataset_with_random.csv')

# Creating a binary target variable
df['Cancer_Risk'] = df['Level'].apply(lambda x: 1 if x in ['High', 'Medium'] else 0)

# Creating synthetic noisy features
np.random.seed(0)
df['Synthetic_Noise1'] = np.random.normal(0, 1, df.shape[0])
df['Synthetic_Noise2'] = np.random.normal(0, 1, df.shape[0])

# Define features (X) by dropping the specified columns
X = df.drop(['Wheezing', 'Coughing of Blood', 'Smoking'], axis=1)

# Define the target variable (y) as the 'Genetic Risk' column
y = df['Smoking']

# Identify categorical and numerical columns in X, including synthetic features
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64', 'float']]

# Preprocessors
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = GaussianNB()

# Create a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the test set into validation and unseen sets
X_val, X_unseen, y_val, y_unseen = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Fit the model on the training data
pipeline.fit(X_train, y_train)

# Predict and evaluate the model on the validation set
predictions_val = pipeline.predict(X_val)
accuracy_val = accuracy_score(y_val, predictions_val)
print(f"Validation Accuracy: {accuracy_val}")

# Predict and evaluate the model on the unseen set
predictions_unseen = pipeline.predict(X_unseen)
accuracy_unseen = accuracy_score(y_unseen, predictions_unseen)
print(f"Unseen Data Accuracy: {accuracy_unseen}")


Validation Accuracy: 0.65
Unseen Data Accuracy: 0.56


In [5]:
# Print the feature set
print("Feature Set:")
print(X_train.columns)


Feature Set:
Index(['Patient ID', 'Age', 'Gender', 'Air Pollution', 'Alcohol use',
       'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk',
       'chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Passive Smoker',
       'Chest Pain', 'Coughing of Blood', 'Fatigue', 'Weight Loss',
       'Shortness of Breath', 'Wheezing', 'Swallowing Difficulty',
       'Clubbing of Finger Nails', 'Frequent Cold', 'Dry Cough', 'Snoring',
       'Synthetic_Noise1', 'Synthetic_Noise2'],
      dtype='object')


In [10]:
# Generate random data
num_rows = len(df)  # Number of rows in your dataset
num_cols = 7  # Number of random columns to add

# Generate random values for each column
random_data = np.random.rand(num_rows, num_cols)

# Create DataFrame from random data
random_df = pd.DataFrame(random_data, columns=[f'Random_{i}' for i in range(1, num_cols + 1)])

# Concatenate the original DataFrame with the random DataFrame along columns
df_with_random = pd.concat([df, random_df], axis=1)

# Optionally, you can save the DataFrame with random columns to a new CSV file
df_with_random.to_csv('dataset_with_random.csv', index=False)