In [1]:

#ENSEMBLE LEARN

#Question 1 : Implement the Bagging based Ensemble Model using CART (Classification and Regression Trees) as base learners. 
#No. of base learners = 100. Use cross validation as the model estimation method.

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Define column names
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Load dataset from the local file
df = pd.read_csv("pima-indians-diabetes.csv", names=names)

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst Few Rows:")
print(df.head())

# Prepare features and target variable
X = df.iloc[:, :-1].values  # All columns except the target
Y = df.iloc[:, -1].values   # The target variable (class)

# Cross-validation setup (10-fold cross-validation)
Kfold = model_selection.KFold(n_splits=10, random_state=None)

# Create a Decision Tree classifier
cart = DecisionTreeClassifier()

# Create a Bagging classifier with Decision Trees as base estimator
num_trees = 100  # Number of trees in the ensemble
model = BaggingClassifier(estimator=cart, n_estimators=num_trees, random_state=7)

# Perform cross-validation
results = model_selection.cross_val_score(model, X, Y, cv=Kfold)

# Calculate the average accuracy
average_accuracy = results.mean()

# Print the average accuracy
print("\nAverage Accuracy (cross-validation):", average_accuracy)

# Train the model on the full dataset and evaluate on a separate test set
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=7)

# Train the Bagging model
model.fit(X_train, Y_train)

# Predict on the test set
predictions = model.predict(X_test)

# Evaluate accuracy on the test set
test_accuracy = accuracy_score(Y_test, predictions)
print("\nTest Accuracy:", test_accuracy)

# Optional: Train the Decision Tree classifier on the training data
cart.fit(X_train, Y_train)  # Fit the Decision Tree classifier
print("\nFeature importances from the Decision Tree classifier in Bagging model:")
print(cart.feature_importances_)


Dataset Shape: (768, 9)

First Few Rows:
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1

Average Accuracy (cross-validation): 0.7720437457279563

Test Accuracy: 0.7359307359307359

Feature importances from the Decision Tree classifier in Bagging model:
[0.04585041 0.37355264 0.10363033 0.04155956 0.00861213 0.17702874
 0.13604727 0.11371892]


In [2]:
#Question 2 : Implement the AdaBoost Ensemble model for classification using 10 base learners and cross validation.

import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Define column names
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Load dataset from the local file
df = pd.read_csv("pima-indians-diabetes.csv", names=names)

# Display dataset shape and first few rows
print("Dataset Shape:", df.shape)
print("\nFirst Few Rows:")
print(df.head())

# Prepare features and target variable
X = df.iloc[:, :-1].values  # All columns except the target
Y = df.iloc[:, -1].values   # The target variable (class)

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

# Create a Decision Tree classifier as the base learner
base_learner = DecisionTreeClassifier(max_depth=1)

# Create an AdaBoost classifier with KNN as the base learner
adaboost_model = AdaBoostClassifier(estimator=base_learner, n_estimators=10, random_state=7)

# Perform 10-fold cross-validation
cv_scores = cross_val_score(adaboost_model, X, Y, cv=10)

# Print cross-validation accuracy
print("\nCross-Validation Accuracy Scores:", cv_scores)
print("Average Cross-Validation Accuracy:", cv_scores.mean())

# Train the AdaBoost model on the training set
adaboost_model.fit(X_train, Y_train)

# Predict on the test set
predictions = adaboost_model.predict(X_test)

# Evaluate the test accuracy
test_accuracy = accuracy_score(Y_test, predictions)
print("\nTest Accuracy:", test_accuracy)


Dataset Shape: (768, 9)

First Few Rows:
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1

Cross-Validation Accuracy Scores: [0.68831169 0.77922078 0.7012987  0.64935065 0.68831169 0.79220779
 0.81818182 0.84415584 0.69736842 0.84210526]
Average Cross-Validation Accuracy: 0.7500512645249489





Test Accuracy: 0.7489177489177489


In [3]:

#Question 3: (Same as Question 1) Implement the Bagging based Ensemble Model using k-Nearest Neighbor Classifier as base learners. 
#No. of base learners = 100. Use cross validation as the model estimation method.


import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Define column names
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Load dataset from the local file
df = pd.read_csv("pima-indians-diabetes.csv", names=names)

# Display dataset shape and first few rows
print("Dataset Shape:", df.shape)
print("\nFirst Few Rows:")
print(df.head())

# Prepare features and target variable
X = df.iloc[:, :-1].values  # All columns except the target
Y = df.iloc[:, -1].values   # The target variable (class)

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

# Create a K-Nearest Neighbors classifier as the base learner
base_learner = KNeighborsClassifier(n_neighbors=3)

# Create a Bagging classifier with KNN as the base learner
bagging_model = BaggingClassifier(estimator=base_learner, n_estimators=10, random_state=7)

# Perform 10-fold cross-validation
cv_scores = cross_val_score(bagging_model, X, Y, cv=10)

# Print cross-validation accuracy
print("\nCross-Validation Accuracy Scores:", cv_scores)
print("Average Cross-Validation Accuracy:", cv_scores.mean())

# Train the Bagging model on the training set
bagging_model.fit(X_train, Y_train)

# Predict on the test set
predictions = bagging_model.predict(X_test)

# Evaluate the test accuracy
test_accuracy = accuracy_score(Y_test, predictions)
print("\nTest Accuracy:", test_accuracy)


Dataset Shape: (768, 9)

First Few Rows:
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1

Cross-Validation Accuracy Scores: [0.66233766 0.68831169 0.68831169 0.66233766 0.72727273 0.76623377
 0.75324675 0.75324675 0.69736842 0.71052632]
Average Cross-Validation Accuracy: 0.7109193438140806

Test Accuracy: 0.6796536796536796
