# Import necessary libraries

In [19]:
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer as ColumnTransformer_old
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from scipy.stats import skew
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Generate dataset using the given script

In [20]:
id_1 = 7361
id_2 = 7371
id_3 = 7501
random_seed = id_1 + id_2 + id_3
random.seed(random_seed)

# Script to generate dataset variant
data_path = "Data.csv"
output_path = "your_data.csv"

all_data = pd.read_csv(data_path, index_col=0)
all_columns = all_data.columns.tolist()

target_column = 'smoking'

selected_columns = random.sample(all_columns, 10)
selected_columns = np.append(selected_columns, target_column)
sample_df = all_data[selected_columns].copy()
sample_df.to_csv(output_path)

# Load and split the generated dataset

In [21]:
data_path = 'your_data.csv'
df = pd.read_csv(data_path, index_col=0)
df = df.dropna(subset=['smoking'])

# Set the target column
target_column = 'smoking'

# Split the data into training, validation, and test sets
X = df.drop(target_column, axis=1)
y = df[target_column]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# # Save the split datasets to separate CSV files
# train_path = 'train_data.csv'
# valid_path = 'valid_data.csv'
# test_path = 'test_data.csv'

y_train = y.loc[X_train.index]
y_valid = y.loc[X_valid.index]
y_test = y.loc[X_test.index]

# X_train.to_csv(train_path, index=False)
# X_valid.to_csv(valid_path, index=False)
# X_test.to_csv(test_path, index=False)

# Display the shapes of the split datasets
print("Train set shape:", X_train.shape)
print("Validation set shape:", X_valid.shape)
print("Test set shape:", X_test.shape)
print("Columns:" ,selected_columns)


Train set shape: (111479, 10)
Validation set shape: (23888, 10)
Test set shape: (23889, 10)
Columns: ['dental caries' 'Cholesterol' 'eyesight(right)' 'systolic' 'age'
 'Urine protein' 'hearing(right)' 'eyesight(left)' 'height(cm)' 'Gtp'
 'smoking']


# Feature engineering:

In [25]:
# Accuracy before feature engineering
model_before = RandomForestClassifier(random_state=42)
model_before.fit(X_train, y_train)
y_pred_before = model_before.predict(X_valid)
accuracy_before = accuracy_score(y_valid, y_pred_before)
print(f'Accuracy before feature engineering: {accuracy_before}')

# Feature Engineering (including handling outliers)
# Function to calculate skewness of numerical features
def calculate_skewness(df):
    skewness = df.apply(lambda x: skew(x))
    skewness = skewness[abs(skewness) > 0.5]
    return skewness.index

# Function for log transformation of skewed features
def log_transform_skewed_features(df):
    skewed_features = calculate_skewness(df)
    df[skewed_features] = np.log1p(df[skewed_features])
    return df

# Custom transformer to select numerical or categorical columns
class FeatureSelector:
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.feature_names]

# Custom transformer to handle missing values
class MissingValueHandler:
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.fillna(X.median())  # Replace missing values with median (you can customize this)

# Apply feature engineering to training, validation, and test datasets
feature_engineering_pipeline = Pipeline([
    ('features', FeatureSelector(X_train.columns)),
    ('log_transform', FunctionTransformer(log_transform_skewed_features)),
    ('impute', SimpleImputer(strategy='median')),  # Impute missing values
    ('scaler', RobustScaler())  # Use RobustScaler for normalization
])

X_train_processed = feature_engineering_pipeline.fit_transform(X_train)
X_valid_processed = feature_engineering_pipeline.transform(X_valid)
X_test_processed = feature_engineering_pipeline.transform(X_test)

# Accuracy after feature engineering
model_after = RandomForestClassifier(random_state=42)
model_after.fit(X_train_processed, y_train)
y_pred_after = model_after.predict(X_valid_processed)
accuracy_after = accuracy_score(y_valid, y_pred_after)
print(f'Accuracy after feature engineering: {accuracy_after}')


Accuracy before feature engineering: 0.735976222371065
Accuracy after feature engineering: 0.737399531145345


# Bagging

In [26]:
class BaggingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_estimator, n_estimators, random_state=None):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.models = []

    def fit(self, X, y, max_depth=None):
        self.models = []
        for i in range(self.n_estimators):
            model = self.base_estimator(random_state=self.random_state + i)
            self.models.append(model.fit(X, y))
        return self

    def predict(self, X):
        predictions = [model.predict(X) for model in self.models]
        return np.round(np.mean(predictions, axis=0))

    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

    def get_params(self, deep=True):
        return {
            'base_estimator': self.base_estimator,
            'n_estimators': self.n_estimators,
            'random_state': self.random_state
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

# Boosting (AdaBoost)

In [27]:
class AdaBoostClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_estimator, n_estimators, random_state=None):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.models = []
        self.alphas = []

    def fit(self, X, y, base_estimator_max_depth=None):
        m = X.shape[0]
        weights = np.ones(m) / m

        for i in range(self.n_estimators):
            model = self.base_estimator.__class__(max_depth=base_estimator_max_depth, random_state=self.random_state + i)
            model.fit(X, y, sample_weight=weights)
            predictions = model.predict(X)

            error = np.sum(weights * (predictions != y)) / np.sum(weights)
            alpha = 0.5 * np.log((1 - error) / error)
            weights = weights * np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)

            self.models.append((model, alpha))

        return self

    def predict(self, X):
        predictions = [alpha * model.predict(X) for model, alpha in self.models]
        return np.round(np.sum(predictions, axis=0) / np.sum([alpha for _, alpha in self.models]))

    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

    def get_params(self, deep=True):
        return {
            'base_estimator': self.base_estimator,
            'n_estimators': self.n_estimators,
            'random_state': self.random_state
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self


# Random Forest

In [28]:
class RandomForestClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_estimators=100, max_features=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = random_state
        self.models = []

    def fit(self, X, y):
        X_array = np.array(X)
        if X_array.ndim > 2:
            raise ValueError("Input data with more than two dimensions is not supported.")

        for i in range(self.n_estimators):
            if self.max_features is not None:
                selected_features = np.random.choice(X_array.shape[1], self.max_features, replace=True)
                X_subset = X_array[:, selected_features]
            else:
                X_subset = X_array

            model = DecisionTreeClassifier(random_state=self.random_state + i)
            model.fit(X_subset, y)
            self.models.append((model, selected_features if self.max_features is not None else None))

        return self

    def predict(self, X):
        X_array = np.array(X)
        if X_array.ndim > 2:
            raise ValueError("Input data with more than two dimensions is not supported.")

        predictions = [model.predict(X_array[:, features]) for model, features in self.models]
        return np.round(np.mean(predictions, axis=0))

    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_features': self.max_features,
            'random_state': self.random_state
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

# HyperParameters Tuning

**Using GridSearchCV**

In [34]:
#Bagging
bagging_param_grid = {
    'n_estimators': [5, 10, 20],
    'base_estimator__max_depth': [None, 5, 10]  # Specify within the base estimator
}

bagging_base_estimator = DecisionTreeClassifier()
bagging_model = BaggingClassifier(base_estimator=bagging_base_estimator, n_estimators=10, random_state=42)

bagging_grid_search = GridSearchCV(bagging_model, bagging_param_grid, cv=3, scoring='accuracy')
bagging_grid_search.fit(X_train, y_train)

best_bagging_model = bagging_grid_search.best_estimator_

# Evaluate on the validation set
bagging_valid_predictions = best_bagging_model.predict(X_valid)
bagging_valid_accuracy = accuracy_score(y_valid, bagging_valid_predictions)
print(f'Best Bagging Model Accuracy on Validation Set: {bagging_valid_accuracy}')

# Evaluate on the test set
bagging_test_predictions = best_bagging_model.predict(X_test)
bagging_test_accuracy = accuracy_score(y_test, bagging_test_predictions)
print(f'Best Bagging Model Accuracy on Test Set: {bagging_test_accuracy}')
# print(f'Best Bagging Model Hyperparameters: {best_bagging_model.get_params()}')

# AdaBoost
adaboost_param_grid = {
    'n_estimators': [20, 50, 100],
    'base_estimator__max_depth': [None, 5, 10]
}

adaboost_base_estimator = DecisionTreeClassifier()  # Instantiate the base estimator here
adaboost_model = AdaBoostClassifier(base_estimator=adaboost_base_estimator, n_estimators=50, random_state=42)

adaboost_grid_search = GridSearchCV(adaboost_model, adaboost_param_grid, cv=3, scoring='accuracy')
adaboost_grid_search.fit(X_train, y_train)

best_adaboost_model = adaboost_grid_search.best_estimator_

# Evaluate on the validation set
adaboost_valid_predictions = best_adaboost_model.predict(X_valid)
adaboost_valid_accuracy = accuracy_score(y_valid, adaboost_valid_predictions)
print(f'Best AdaBoost Model Accuracy on Validation Set: {adaboost_valid_accuracy}')

# Evaluate on the test set
adaboost_test_predictions = best_adaboost_model.predict(X_test)
adaboost_test_accuracy = accuracy_score(y_test, adaboost_test_predictions)
print(f'Best AdaBoost Model Accuracy on Test Set: {adaboost_test_accuracy}')
# print(f'Best AdaBoost Model Hyperparameters: {best_adaboost_model.get_params()}')

# Random Forest
random_forest_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': [None, 'sqrt', 'log2']
}

random_forest_model = RandomForestClassifier(random_state=42)

random_forest_grid_search = GridSearchCV(random_forest_model, random_forest_param_grid, cv=3, scoring='accuracy')
random_forest_grid_search.fit(X_train, y_train)

best_random_forest_model = random_forest_grid_search.best_estimator_

# Evaluate on the validation set
random_forest_valid_predictions = best_random_forest_model.predict(X_valid)
random_forest_valid_accuracy = accuracy_score(y_valid, random_forest_valid_predictions)
print(f'Best Random Forest Model Accuracy on Validation Set: {random_forest_valid_accuracy}')

# Evaluate on the test set
random_forest_test_predictions = best_random_forest_model.predict(X_test)
random_forest_test_accuracy = accuracy_score(y_test, random_forest_test_predictions)
print(f'Best Random Forest Model Accuracy on Test Set: {random_forest_test_accuracy}')
# print(f'Best Random Forest Model Hyperparameters: {best_random_forest_model.get_params()}')


Best Bagging Model Accuracy on Validation Set: 0.7458556597454788
Best Bagging Model Accuracy on Test Set: 0.7489639583071707
Best AdaBoost Model Accuracy on Validation Set: 0.7413764233087743
Best AdaBoost Model Accuracy on Test Set: 0.7441918874795931
Best Random Forest Model Accuracy on Validation Set: 0.7366041527126591
Best Random Forest Model Accuracy on Test Set: 0.7371593620494789


**Using Randomized Search:**

In [None]:
#Bagging
bagging_param_grid = {
    'n_estimators': [5, 10, 20],
    'base_estimator__max_depth': [None, 5, 10]  # Specify within the base estimator
}

bagging_base_estimator = DecisionTreeClassifier()
bagging_model = BaggingClassifier(base_estimator=bagging_base_estimator, n_estimators=10, random_state=42)

bagging_grid_search = GridSearchCV(bagging_model, bagging_param_grid, cv=3, scoring='accuracy')
bagging_grid_search.fit(X_train, y_train)

best_bagging_model = bagging_grid_search.best_estimator_

# Evaluate on the validation set
bagging_valid_predictions = best_bagging_model.predict(X_valid)
bagging_valid_accuracy = accuracy_score(y_valid, bagging_valid_predictions)
print(f'Best Bagging Model Accuracy on Validation Set: {bagging_valid_accuracy}')

# Evaluate on the test set
bagging_test_predictions = best_bagging_model.predict(X_test)
bagging_test_accuracy = accuracy_score(y_test, bagging_test_predictions)
print(f'Best Bagging Model Accuracy on Test Set: {bagging_test_accuracy}')
# print(f'Best Bagging Model Hyperparameters: {best_bagging_model.get_params()}')

# AdaBoost
adaboost_param_grid = {
    'n_estimators': [5, 10, 20],
    'base_estimator__max_depth': [None, 5, 10]
}

adaboost_base_estimator = DecisionTreeClassifier()  # Instantiate the base estimator here
adaboost_model = AdaBoostClassifier(base_estimator=adaboost_base_estimator, n_estimators=50, random_state=42)

adaboost_grid_search = GridSearchCV(adaboost_model, adaboost_param_grid, cv=3, scoring='accuracy')
adaboost_grid_search.fit(X_train, y_train)

best_adaboost_model = adaboost_grid_search.best_estimator_

# Evaluate on the validation set
adaboost_valid_predictions = best_adaboost_model.predict(X_valid)
adaboost_valid_accuracy = accuracy_score(y_valid, adaboost_valid_predictions)
print(f'Best AdaBoost Model Accuracy on Validation Set: {adaboost_valid_accuracy}')

# Evaluate on the test set
adaboost_test_predictions = best_adaboost_model.predict(X_test)
adaboost_test_accuracy = accuracy_score(y_test, adaboost_test_predictions)
print(f'Best AdaBoost Model Accuracy on Test Set: {adaboost_test_accuracy}')
# print(f'Best AdaBoost Model Hyperparameters: {best_adaboost_model.get_params()}')

# Random Forest
random_forest_param_grid = {
    'n_estimators': [5, 10, 20],
    'max_features': [None, 'sqrt', 'log2']
}

random_forest_model = RandomForestClassifier(random_state=42)

random_forest_grid_search = GridSearchCV(random_forest_model, random_forest_param_grid, cv=3, scoring='accuracy')
random_forest_grid_search.fit(X_train, y_train)

best_random_forest_model = random_forest_grid_search.best_estimator_

# Evaluate on the validation set
random_forest_valid_predictions = best_random_forest_model.predict(X_valid)
random_forest_valid_accuracy = accuracy_score(y_valid, random_forest_valid_predictions)
print(f'Best Random Forest Model Accuracy on Validation Set: {random_forest_valid_accuracy}')

# Evaluate on the test set
random_forest_test_predictions = best_random_forest_model.predict(X_test)
random_forest_test_accuracy = accuracy_score(y_test, random_forest_test_predictions)
print(f'Best Random Forest Model Accuracy on Test Set: {random_forest_test_accuracy}')
# print(f'Best Random Forest Model Hyperparameters: {best_random_forest_model.get_params()}')


Best Bagging Model Accuracy on Validation Set: 0.7458556597454788
Best Bagging Model Accuracy on Test Set: 0.7489639583071707
Best AdaBoost Model Accuracy on Validation Set: 0.7429253181513731
Best AdaBoost Model Accuracy on Test Set: 0.7445686299133493
Best Random Forest Model Accuracy on Validation Set: 0.7242967180174146
Best Random Forest Model Accuracy on Test Set: 0.7241826782201013


**Using Bayesian methods:**

In [38]:
from skopt import BayesSearchCV
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Bagging
bagging_param_dist = {
    'n_estimators': (5, 20),
    'base_estimator__max_depth': (1, 10)
}

bagging_base_estimator = DecisionTreeClassifier()
bagging_model = BaggingClassifier(base_estimator=bagging_base_estimator, n_estimators=10, random_state=42)

bagging_bayes_search = BayesSearchCV(bagging_model, bagging_param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)
bagging_bayes_search.fit(X_train, y_train)

best_bagging_model = bagging_bayes_search.best_estimator_

# Evaluate on the validation set
bagging_valid_predictions = best_bagging_model.predict(X_valid)
bagging_valid_accuracy = accuracy_score(y_valid, bagging_valid_predictions)
print(f'Best Bagging Model Accuracy on Validation Set: {bagging_valid_accuracy}')

# Evaluate on the test set
bagging_test_predictions = best_bagging_model.predict(X_test)
bagging_test_accuracy = accuracy_score(y_test, bagging_test_predictions)
print(f'Best Bagging Model Accuracy on Test Set: {bagging_test_accuracy}')

# AdaBoost
adaboost_param_dist = {
    'n_estimators': (10, 30),
    'base_estimator__max_depth': (1, 10)
}

adaboost_base_estimator = DecisionTreeClassifier()  # Instantiate the base estimator here
adaboost_model = AdaBoostClassifier(base_estimator=adaboost_base_estimator, n_estimators=50, random_state=42)

adaboost_bayes_search = BayesSearchCV(adaboost_model, adaboost_param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)
adaboost_bayes_search.fit(X_train, y_train)

best_adaboost_model = adaboost_bayes_search.best_estimator_

# Evaluate on the validation set
adaboost_valid_predictions = best_adaboost_model.predict(X_valid)
adaboost_valid_accuracy = accuracy_score(y_valid, adaboost_valid_predictions)
print(f'Best AdaBoost Model Accuracy on Validation Set: {adaboost_valid_accuracy}')

# Evaluate on the test set
adaboost_test_predictions = best_adaboost_model.predict(X_test)
adaboost_test_accuracy = accuracy_score(y_test, adaboost_test_predictions)
print(f'Best AdaBoost Model Accuracy on Test Set: {adaboost_test_accuracy}')

# Random Forest
random_forest_param_dist = {
    'n_estimators': (20, 50),
    'max_features': (None, 'sqrt', 'log2')
}

random_forest_model = RandomForestClassifier(random_state=42)

random_forest_bayes_search = BayesSearchCV(random_forest_model, random_forest_param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)
random_forest_bayes_search.fit(X_train, y_train)

best_random_forest_model = random_forest_bayes_search.best_estimator_

# Evaluate on the validation set
random_forest_valid_predictions = best_random_forest_model.predict(X_valid)
random_forest_valid_accuracy = accuracy_score(y_valid, random_forest_valid_predictions)
print(f'Best Random Forest Model Accuracy on Validation Set: {random_forest_valid_accuracy}')

# Evaluate on the test set
random_forest_test_predictions = best_random_forest_model.predict(X_test)
random_forest_test_accuracy = accuracy_score(y_test, random_forest_test_predictions)
print(f'Best Random Forest Model Accuracy on Test Set: {random_forest_test_accuracy}')


Best Bagging Model Accuracy on Validation Set: 0.7461486939048895
Best Bagging Model Accuracy on Test Set: 0.7495500020930135
Best AdaBoost Model Accuracy on Validation Set: 0.7431346282652378
Best AdaBoost Model Accuracy on Test Set: 0.7450290928879401
Best Random Forest Model Accuracy on Validation Set: 0.7328365706630945
Best Random Forest Model Accuracy on Test Set: 0.7331407760894135


#Final System:

In [40]:
# Bagging
best_bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=best_bagging_model.base_estimator.max_depth),
                                       n_estimators=best_bagging_model.n_estimators,
                                       random_state=42)
best_bagging_model.fit(X_train, y_train)

# Evaluate on the test set
bagging_test_predictions = best_bagging_model.predict(X_test)
bagging_test_accuracy = accuracy_score(y_test, bagging_test_predictions)
print(f'Bagging Model Accuracy on Test Set: {bagging_test_accuracy}')

# AdaBoost
best_adaboost_model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=best_adaboost_model.base_estimator.max_depth),
                                         n_estimators=best_adaboost_model.n_estimators,
                                         random_state=42)
best_adaboost_model.fit(X_train, y_train)

# Evaluate on the test set
adaboost_test_predictions = best_adaboost_model.predict(X_test)
adaboost_test_accuracy = accuracy_score(y_test, adaboost_test_predictions)
print(f'AdaBoost Model Accuracy on Test Set: {adaboost_test_accuracy}')

# Random Forest
best_random_forest_model = RandomForestClassifier(n_estimators=best_random_forest_model.n_estimators,
                                                 max_features=best_random_forest_model.max_features,
                                                 random_state=42)
best_random_forest_model.fit(X_train, y_train)

# Evaluate on the test set
random_forest_test_predictions = best_random_forest_model.predict(X_test)
random_forest_test_accuracy = accuracy_score(y_test, random_forest_test_predictions)
print(f'Random Forest Model Accuracy on Test Set: {random_forest_test_accuracy}')

# Find and print the best accuracy
best_accuracy_model = max([(bagging_test_accuracy, 'Bagging'), (adaboost_test_accuracy, 'AdaBoost'), (random_forest_test_accuracy, 'Random Forest')])

print(f'\nBest Accuracy Model: {best_accuracy_model[1]} with Accuracy: {best_accuracy_model[0]}')

Bagging Model Accuracy on Test Set: 0.7495500020930135
AdaBoost Model Accuracy on Test Set: 0.7450290928879401
Random Forest Model Accuracy on Test Set: 0.7331407760894135

Best Accuracy Model: Bagging with Accuracy: 0.7495500020930135
