In [16]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import pointbiserialr
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
import pandas as pd
import numpy as np



In [17]:
!pip install scikit-optimize
from skopt import BayesSearchCV

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.9.7-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.9.7 scikit-optimize-0.9.0


In [10]:
data = pd.read_csv('/content/your_data.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,systolic,fasting blood sugar,AST,dental caries,Gtp,hemoglobin,serum creatinine,age,height(cm),triglyceride,smoking
0,0,135,94,22,0,27,16.5,1.0,55,165,300,1
1,1,146,147,27,1,37,16.2,1.1,70,165,55,0
2,2,118,79,27,0,53,17.4,0.8,20,170,197,1
3,3,131,91,20,1,30,15.9,1.0,35,180,203,0
4,4,121,91,19,0,17,15.4,0.8,30,165,87,1


In [11]:
# Assuming df is your DataFrame and 'target' is the column name of the target variable
correlation_matrix = data.corr()
correlation_with_target = correlation_matrix['smoking'].sort_values(ascending=False)

# Display the correlation with the target variable
print(correlation_with_target)

smoking                1.000000
hemoglobin             0.450679
height(cm)             0.447111
triglyceride           0.331975
Gtp                    0.305561
serum creatinine       0.272979
dental caries          0.106636
fasting blood sugar    0.096534
AST                    0.059394
systolic               0.058642
Unnamed: 0            -0.000632
age                   -0.206033
Name: smoking, dtype: float64


In [12]:
threshold = 0.3
# Get the features with correlation lower than the threshold
low_correlation_features = correlation_matrix[correlation_matrix['smoking'].abs() < threshold].index

# Remove the low-correlation features from the DataFrame
df_filtered = data.drop(low_correlation_features, axis=1)

# Display the updated DataFrame
df_filtered.head(5)

Unnamed: 0,Gtp,hemoglobin,height(cm),triglyceride,smoking
0,27,16.5,165,300,1
1,37,16.2,165,55,0
2,53,17.4,170,197,1
3,30,15.9,180,203,0
4,17,15.4,165,87,1


In [13]:
x = df_filtered.drop(columns = 'smoking')
y = df_filtered['smoking']
x.head(5)

Unnamed: 0,Gtp,hemoglobin,height(cm),triglyceride
0,27,16.5,165,300
1,37,16.2,165,55
2,53,17.4,170,197
3,30,15.9,180,203
4,17,15.4,165,87


In [14]:

# Split the data into training and testing sets
X_train, x_test_val, y_train, y_test_val = train_test_split(x, y, test_size=0.3, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(x_test_val, y_test_val, test_size=0.5, random_state=42)


#Bagging Model

In [29]:
# Define the parameter search space
param_space = {'max_depth': (1, 10),
               'min_samples_split': (2, 10)}

# Create a BayesSearchCV object
opt = BayesSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_space,
    n_iter=10,  # Number of evaluations
    cv=3,       # Number of cross-validation folds
    n_jobs=-1,  # Number of parallel jobs (-1 means using all processors)
    random_state=42
)

# Fit the BayesSearchCV object
opt.fit(X_train, y_train)

# Get the best hyperparameters
best_params = opt.best_params_

# Create a decision tree with the best hyperparameters
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']

# Use DecisionTreeClassifier with the best hyperparameters
best_tree_stump = DecisionTreeClassifier(max_depth=best_max_depth, min_samples_split=best_min_samples_split, random_state=42)
best_params_tree = opt.best_params_



##Bagging using Decision Tree

In [30]:
# Define the number of base models (n_estimators)
best_n_estimators = 10  # You can set the desired number of base models

# Initialize variables to store base models and predictions of base models
base_models = []
predictions_list = []

# Create base models and predictions
for i in range(best_n_estimators):
    # Create bootstrap sample
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_bootstrap = X_train.iloc[indices]
    y_bootstrap = y_train.iloc[indices]

    # Train base model (Decision Tree in this example)
    base_model = best_tree_stump  # Use the best_tree_stump as the base model
    base_model.fit(X_bootstrap, y_bootstrap)

    # Make predictions on the validation set
    predictions = base_model.predict(X_val)

    # Store the base model and its predictions
    base_models.append(base_model)
    predictions_list.append(predictions)

# Aggregate predictions using majority voting
ensemble_predictions = np.mean(predictions_list, axis=0).round().astype(int)

# Calculate accuracy
accuracy_ensemble = accuracy_score(y_val, ensemble_predictions)

# Print the results
print("Best hyperparameters for DecisionTreeClassifier (tree stump):", best_params_tree)
print("Number of base models for Ensemble:", best_n_estimators)
print("Accuracy with Ensemble:", accuracy_ensemble)

Best hyperparameters for DecisionTreeClassifier (tree stump): OrderedDict([('max_depth', 7), ('min_samples_split', 8)])
Number of base models for Ensemble: 10
Accuracy with Ensemble: 0.7518419290020094


# **Random Forest**

In [31]:
# Specify different options for n_estimators
n_estimators_options = [5, 10, 15, 20]

# Initialize variables to store the best parameters
best_accuracy = 0
best_n_estimators = 0
best_base_models = []
best_predictions_list = []

# Perform grid search
for n_estimators in n_estimators_options:
    # Lists to store base models and predictions of base models
    base_models = []
    predictions_list = []

    # Create bootstrap samples and train base models
    for i in range(n_estimators):
        # Create bootstrap sample
        indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
        X_bootstrap = X_train.iloc[indices, :]
        y_bootstrap = y_train.iloc[indices]

        # Randomly select features for each tree
        selected_features = np.random.choice(X_train.shape[1], size=2, replace=False)
        X_bootstrap = X_bootstrap.iloc[:, selected_features]

        # Train base model (Decision Tree in this example)
        base_model = DecisionTreeClassifier(random_state=42)
        base_model.fit(X_bootstrap, y_bootstrap)

        # Make predictions on the validation set
        X_val_subset = X_val.iloc[:, selected_features]
        predictions = base_model.predict(X_val_subset)

        # Store the base model and its predictions
        base_models.append(base_model)
        predictions_list.append(predictions)

    # Aggregate predictions using majority voting
    ensemble_predictions = np.mean(predictions_list, axis=0).round().astype(int)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, ensemble_predictions)

    # Update best parameters if the current model is better
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_n_estimators = n_estimators
        best_base_models = base_models
        best_predictions_list = predictions_list

# Use the best parameters to print the results
print("Best n_estimators:", best_n_estimators)
print("Best Accuracy:", best_accuracy)

Best n_estimators: 10
Best Accuracy: 0.7463580040187542


#Boosting Model

In [32]:
# Specify different options for n_estimators
n_estimators_options = [5, 10, 15, 20,25]

# Initialize variables to store the best parameters
best_accuracy = 0
best_n_estimators = 0
best_base_models = []
best_model_weights = []
best_predictions_list = []
best_feature_importances = None

# Perform grid search
for n_estimators in n_estimators_options:
    # Initialize weights for the samples
    sample_weights = np.ones(len(X_train)) / len(X_train)

    # Lists to store base models, their weights, and feature importances
    base_models = []
    model_weights = []
    feature_importances_list = []

    # Perform boosting
    for _ in range(n_estimators):
        # Train a base model (tree stump)
        base_model = DecisionTreeClassifier(max_depth=1)
        base_model.fit(X_train, y_train, sample_weight=sample_weights)

        # Make predictions on the training set
        predictions = base_model.predict(X_train)

        # Calculate weighted error
        weighted_error = np.sum(sample_weights * (predictions != y_train)) / np.sum(sample_weights)

        # Calculate the base model weight
        model_weight = 0.5 * np.log((1 - weighted_error) / weighted_error)

        # Update sample weights
        sample_weights *= np.exp(-model_weight * y_train * predictions)
        sample_weights /= np.sum(sample_weights)

        # Store the base model, its weight, and feature importances
        base_models.append(base_model)
        model_weights.append(model_weight)
        feature_importances_list.append(base_model.feature_importances_)

    # Make predictions on the validation set using the weighted sum of base models
    ensemble_predictions = np.zeros_like(y_val, dtype=float)
    for i in range(n_estimators):
        predictions = base_models[i].predict(X_val)
        ensemble_predictions += model_weights[i] * predictions

    # Convert the weighted sum to binary predictions
    ensemble_predictions = np.sign(ensemble_predictions)

    # Calculate accuracy on the validation set
    accuracy = accuracy_score(y_val, ensemble_predictions)

    # Update best parameters if the current model is better
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_n_estimators = n_estimators
        best_base_models = base_models
        best_model_weights = model_weights
        best_predictions_list = ensemble_predictions
        best_feature_importances = np.mean(feature_importances_list, axis=0)

# Use the best parameters to print the results
print("Best n_estimators:", best_n_estimators)
print("Best Accuracy:", best_accuracy)

# Display feature importances
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': best_feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importances:")
print(feature_importance_df)


Best n_estimators: 5
Best Accuracy: 0.6827277294038848

Feature Importances:
        Feature  Importance
2    height(cm)         0.8
1    hemoglobin         0.2
0           Gtp         0.0
3  triglyceride         0.0
