## Adult Dataset

In [1]:
import pandas as pd
import numpy as np

# Define column headers
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

# Load data and combine training/testing datasets for consistent processing
train_data = pd.read_csv("adult.data", header=None, names=columns, na_values=" ?", skipinitialspace=True)
test_data = pd.read_csv("adult.test", header=None, names=columns, na_values=" ?", skipinitialspace=True, skiprows=1)
adult = pd.concat([train_data, test_data], ignore_index=True)

# Drop rows with missing values
adult.dropna(axis=0, inplace=True)

# Strip whitespace and map income to binary values (-1 for '>50K', 1 for '<=50K')
adult['income'] = adult['income'].str.strip().replace({'>50K.': -1, '<=50K.': 1, '>50K': -1, '<=50K': 1})

# Process 'marital-status' into binary categories
couple_status = ["Married-civ-spouse", "Married-spouse-absent", "Married-AF-spouse"]
single_status = ["Never-married", "Divorced", "Separated", "Widowed"]
adult['marital-status'] = adult['marital-status'].apply(lambda x: 1 if x in couple_status else -1)

# One-hot encode categorical columns (excluding 'native-country')
categorical_columns = ['workclass', 'education', 'occupation', 'relationship', 'race', 'sex']
adult = pd.get_dummies(adult, columns=categorical_columns, prefix_sep='_', drop_first=True)

# Group 'native-country' into regions and one-hot encode
def map_country_to_region(country):
    region_mapping = {
        "United-States": "North_America", "Canada": "North_America", "Mexico": "North_America",
        "Puerto-Rico": "North_America", "Outlying-US(Guam-USVI-etc)": "North_America",
        "Cuba": "Central_South_America", "Jamaica": "Central_South_America",
        "Honduras": "Central_South_America", "Columbia": "Central_South_America",
        "Ecuador": "Central_South_America", "Dominican-Republic": "Central_South_America",
        "El-Salvador": "Central_South_America", "Guatemala": "Central_South_America",
        "Trinadad&Tobago": "Central_South_America", "Nicaragua": "Central_South_America",
        "Peru": "Central_South_America", "England": "Europe", "Germany": "Europe",
        "Italy": "Europe", "Poland": "Europe", "Portugal": "Europe", "France": "Europe",
        "Scotland": "Europe", "Greece": "Europe", "Ireland": "Europe", "Hungary": "Europe",
        "Holand-Netherlands": "Europe", "Yugoslavia": "Europe", "India": "Asia",
        "Iran": "Asia", "Philippines": "Asia", "Cambodia": "Asia", "Thailand": "Asia",
        "Laos": "Asia", "Taiwan": "Asia", "China": "Asia", "Japan": "Asia", "Vietnam": "Asia",
        "Hong": "Asia", "?": "Unknown", "South": "Unknown"
    }
    return region_mapping.get(country, "Other")

adult['native-country'] = adult['native-country'].apply(map_country_to_region)
adult = pd.get_dummies(adult, columns=['native-country'], drop_first=True)

# Convert binary columns from 0/1 to -1/1 for consistency
binary_columns = adult.select_dtypes(include=['int64', 'uint8']).columns
for col in binary_columns:
    if adult[col].nunique() == 2:  # Check for binary columns
        adult[col] = adult[col].apply(lambda x: 1 if x == 1 else -1)

for column in adult.columns:
    if adult[column].dtype == bool:  # Check if the column is boolean
        adult[column] = adult[column].replace({True: 1, False: -1})  # Replace True with 1 and False with -1
    elif adult[column].dtype == int and adult[column].nunique() == 2:  # Check if the column has 2 unique values (likely one-hot encoded)
        adult[column] = adult[column].replace({1: 1, 0: -1})  # Leave 1 as is, replace 0 with -1

adult


  adult['income'] = adult['income'].str.strip().replace({'>50K.': -1, '<=50K.': 1, '>50K': -1, '<=50K': 1})
  adult[column] = adult[column].replace({True: 1, False: -1})  # Replace True with 1 and False with -1


Unnamed: 0,age,fnlwgt,education-num,marital-status,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,...,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male,native-country_Central_South_America,native-country_Europe,native-country_North_America,native-country_Other,native-country_Unknown
0,39,77516,13,-1,2174,0,40,1,-1,-1,...,-1,-1,-1,1,1,-1,-1,1,-1,-1
1,50,83311,13,1,0,0,13,1,-1,-1,...,-1,-1,-1,1,1,-1,-1,1,-1,-1
2,38,215646,9,-1,0,0,40,1,-1,-1,...,-1,-1,-1,1,1,-1,-1,1,-1,-1
3,53,234721,7,1,0,0,40,1,-1,-1,...,-1,1,-1,-1,1,-1,-1,1,-1,-1
4,28,338409,13,1,0,0,40,1,-1,-1,...,-1,1,-1,-1,-1,1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,215419,13,-1,0,0,36,1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,1,-1,-1
48838,64,321403,9,-1,0,0,40,1,-1,-1,...,-1,1,-1,-1,1,-1,-1,1,-1,-1
48839,38,374983,13,1,0,0,50,1,-1,-1,...,-1,-1,-1,1,1,-1,-1,1,-1,-1
48840,44,83891,13,-1,5455,0,40,1,-1,-1,...,1,-1,-1,-1,1,-1,-1,1,-1,-1


In [2]:
#ensure no null values
print(adult.isna().sum())

age                                     0
fnlwgt                                  0
education-num                           0
marital-status                          0
capital-gain                            0
capital-loss                            0
hours-per-week                          0
income                                  0
workclass_Federal-gov                   0
workclass_Local-gov                     0
workclass_Never-worked                  0
workclass_Private                       0
workclass_Self-emp-inc                  0
workclass_Self-emp-not-inc              0
workclass_State-gov                     0
workclass_Without-pay                   0
education_11th                          0
education_12th                          0
education_1st-4th                       0
education_5th-6th                       0
education_7th-8th                       0
education_9th                           0
education_Assoc-acdm                    0
education_Assoc-voc               

Since the values in the numerical columns have significantly different ranges, it might be beneficial to standardise them, ensuring that the features are on the same scale and contribute equally to the model.

In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
adult[numerical_cols] = scaler.fit_transform(adult[numerical_cols])

adult

Unnamed: 0,age,fnlwgt,education-num,marital-status,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,...,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male,native-country_Central_South_America,native-country_Europe,native-country_North_America,native-country_Other,native-country_Unknown
0,0.025996,-1.061979,1.136512,-1,0.146932,-0.217127,-0.034087,1,-1,-1,...,-1,-1,-1,1,1,-1,-1,1,-1,-1
1,0.828308,-1.007104,1.136512,1,-0.144804,-0.217127,-2.213032,1,-1,-1,...,-1,-1,-1,1,1,-1,-1,1,-1,-1
2,-0.046942,0.246034,-0.419335,-1,-0.144804,-0.217127,-0.034087,1,-1,-1,...,-1,-1,-1,1,1,-1,-1,1,-1,-1
3,1.047121,0.426663,-1.197259,1,-0.144804,-0.217127,-0.034087,1,-1,-1,...,-1,1,-1,-1,1,-1,-1,1,-1,-1
4,-0.776316,1.408530,1.136512,1,-0.144804,-0.217127,-0.034087,1,-1,-1,...,-1,1,-1,-1,-1,1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.025996,0.243884,1.136512,-1,-0.144804,-0.217127,-0.356894,1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,1,-1,-1
48838,1.849433,1.247492,-0.419335,-1,-0.144804,-0.217127,-0.034087,1,-1,-1,...,-1,1,-1,-1,1,-1,-1,1,-1,-1
48839,-0.046942,1.754865,1.136512,1,-0.144804,-0.217127,0.772930,1,-1,-1,...,-1,-1,-1,1,1,-1,-1,1,-1,-1
48840,0.390683,-1.001612,1.136512,-1,0.587220,-0.217127,-0.034087,1,-1,-1,...,1,-1,-1,-1,1,-1,-1,1,-1,-1


## Mushroom Dataset

In [21]:
#primary_data = pd.read_csv('primary_data.csv', delimiter=';')

#for the purpose of this classification task, using the seconday data is sufficient 
# because we do not need descriptors such as name and family, secondary dataset has all the informaiton we need
mushroom_data = pd.read_csv('secondary_data.csv', delimiter=';')

#drop the columns with missing values
mushroom_data = mushroom_data.dropna(axis=1)

mushroom_data

# Map 'p' (poisonous) to 1 and 'e' (edible) to -1
mushroom_data['class'] = mushroom_data['class'].map({'p': 1, 'e': -1})

# Print the columns to see if any of them are missing or misspelled
print(mushroom_data.columns)

columns_to_encode = ['cap-diameter', 'cap-shape', 'cap-color',
       'does-bruise-or-bleed', 'gill-color', 'stem-height', 'stem-width',
       'stem-color', 'has-ring', 'habitat', 'season']

# Step 2: Apply one-hot encoding to the selected columns
mushroom_data_encoded = pd.get_dummies(mushroom_data[columns_to_encode], drop_first=False)

# Step 3: Reattach the numerical columns and 'class' column
mushroom_data_encoded = pd.concat([mushroom_data.drop(columns=columns_to_encode), mushroom_data_encoded], axis=1)

encoded_columns = mushroom_data_encoded.columns.difference(['class'])  # Exclude 'class' column
mushroom_data_encoded[encoded_columns] = mushroom_data_encoded[encoded_columns].replace({True: 1, False: -1})

mushroom_data = mushroom_data_encoded

mushroom_data

numerical_columns = ['cap-diameter', 'stem-height', 'stem-width']  # Replace with the names of your numerical columns

# Step 2: Standardize the numerical columns
scaler = StandardScaler()
mushroom_data[numerical_columns] = scaler.fit_transform(mushroom_data[numerical_columns])

mushroom_data

Index(['class', 'cap-diameter', 'cap-shape', 'cap-color',
       'does-bruise-or-bleed', 'gill-color', 'stem-height', 'stem-width',
       'stem-color', 'has-ring', 'habitat', 'season'],
      dtype='object')


  mushroom_data_encoded[encoded_columns] = mushroom_data_encoded[encoded_columns].replace({True: 1, False: -1})


Unnamed: 0,class,cap-diameter,stem-height,stem-width,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_o,cap-shape_p,cap-shape_s,...,habitat_h,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,season_a,season_s,season_u,season_w
0,1,1.619462,3.076705,0.492293,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
1,1,1.873982,3.385311,0.601900,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,1,-1
2,1,1.393432,3.328931,0.557061,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,1,1.412426,2.726555,0.381690,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
4,1,1.501699,2.952075,0.503254,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,1,-1.054903,-0.786809,-0.590822,-1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1
61065,1,-1.037808,-1.009362,-0.669539,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1
61066,1,-1.037808,-0.807581,-0.575875,-1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,1,-1
61067,1,-1.043506,-0.896602,-0.668543,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,1,-1


In [None]:
#separate the data into the features and the target (y)

X = adult.drop('income', axis=1)  # Features
y = adult['income']  # Target

In [None]:
from sklearn.model_selection import train_test_split

# 20/80 Split
X_train_20, X_test_80, y_train_20, y_test_80 = train_test_split(X, y, test_size=0.8, random_state=42)

# 50/50 Split
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y, test_size=0.5, random_state=42)

# 80/20 Split
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)

#to verify
print("20/80 Split: Train =", X_train_20.shape, "Test =", X_test_80.shape)
print("50/50 Split: Train =", X_train_50.shape, "Test =", X_test_50.shape)
print("80/20 Split: Train =", X_train_80.shape, "Test =", X_test_20.shape)

20/80 Split: Train = (9768, 59) Test = (39074, 59)
50/50 Split: Train = (24421, 59) Test = (24421, 59)
80/20 Split: Train = (39073, 59) Test = (9769, 59)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Outer loop for trials
for trial in range(3):
    print(f"Trial {trial + 1}")
    # Shuffle the dataset for each trial
    X, y = shuffle(X, y, random_state=trial)

    # Middle loop for partitions
    for partition in range(3):
        if partition == 0:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            print("Partition 80/20:")
        elif partition == 1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
            print("Partition 50/50:")
        elif partition == 2:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
            print("Partition 20/80:")

        # Scale the numerical features
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        # Inner loop for AdaBoost
        # Define AdaBoost Classifier and GridSearch parameters
        ada_classifier = AdaBoostClassifier(random_state=42)
        param_grid = {'n_estimators': [10, 50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5, 1]}
        grid = GridSearchCV(ada_classifier, param_grid, scoring='accuracy', cv=10)
        
        # Grid search to find the best hyperparameters
        grid.fit(X_train, y_train)
        
        # Train AdaBoost with the best parameters
        best_ada = AdaBoostClassifier(
            n_estimators=grid.best_params_['n_estimators'],
            learning_rate=grid.best_params_['learning_rate'],
            random_state=42
        )
        best_ada.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = best_ada.predict(X_test)
        
        # Print the classification report
        print("AdaBoost:")
        print(classification_report(y_test, y_pred))

Trial 1
Partition 80/20:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.92      4903
         1.0       0.78      0.65      0.71      1610

    accuracy                           0.87      6513
   macro avg       0.84      0.80      0.81      6513
weighted avg       0.86      0.87      0.86      6513

Partition 50/50:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91     12362
         1.0       0.76      0.65      0.70      3919

    accuracy                           0.87     16281
   macro avg       0.83      0.79      0.81     16281
weighted avg       0.86      0.87      0.86     16281

Partition 20/80:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.93      0.91     19772
         1.0       0.74      0.64      0.69      6277

    accuracy                           0.86     26049
   macro avg       0.82      0.79      0.80     26049
weighted avg       0.86      0.86      0.86     26049

Trial 2
Partition 80/20:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91      4907
         1.0       0.78      0.64      0.70      1606

    accuracy                           0.86      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.86      0.86      0.86      6513

Partition 50/50:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91     12326
         1.0       0.77      0.66      0.71      3955

    accuracy                           0.87     16281
   macro avg       0.83      0.80      0.81     16281
weighted avg       0.86      0.87      0.86     16281

Partition 20/80:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.93      0.91     19763
         1.0       0.75      0.64      0.69      6286

    accuracy                           0.86     26049
   macro avg       0.82      0.78      0.80     26049
weighted avg       0.86      0.86      0.86     26049

Trial 3
Partition 80/20:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.90      0.93      0.91      4939
         1.0       0.76      0.66      0.71      1574

    accuracy                           0.87      6513
   macro avg       0.83      0.80      0.81      6513
weighted avg       0.86      0.87      0.86      6513

Partition 50/50:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.92     12352
         1.0       0.77      0.65      0.70      3929

    accuracy                           0.87     16281
   macro avg       0.83      0.79      0.81     16281
weighted avg       0.86      0.87      0.86     16281

Partition 20/80:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91     19779
         1.0       0.76      0.63      0.69      6270

    accuracy                           0.86     26049
   macro avg       0.82      0.78      0.80     26049
weighted avg       0.86      0.86      0.86     26049



In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

# Outer loop for trials
for trial in range(3):
    print(f"Trial {trial + 1}")
    # Shuffle the dataset for each trial
    X, y = shuffle(X, y, random_state=trial)

    # Middle loop for partitions
    for partition in range(3):
        if partition == 0:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            print("Partition 80/20:")
        elif partition == 1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
            print("Partition 50/50:")
        elif partition == 2:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
            print("Partition 20/80:")

        # Scale the numerical features
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        # Define AdaBoost Classifier with SAMME algorithm
        ada_classifier = AdaBoostClassifier(algorithm='SAMME', random_state=42)

        # Perform GridSearchCV for hyperparameter tuning
        param_grid = {'n_estimators': [10, 50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5, 1]}
        grid = GridSearchCV(ada_classifier, param_grid, scoring='accuracy', cv=5)  # 5-fold cross-validation
        grid.fit(X_train, y_train)

        # Train AdaBoost with the best parameters from GridSearch
        best_ada = AdaBoostClassifier(
            algorithm='SAMME',
            n_estimators=grid.best_params_['n_estimators'],
            learning_rate=grid.best_params_['learning_rate'],
            random_state=42
        )
        best_ada.fit(X_train, y_train)

        # Cross-validation for evaluation
        cv_scores = cross_val_score(best_ada, X_train, y_train, cv=5, scoring='accuracy')
        print(f"Cross-Validation Accuracy Scores: {cv_scores}")
        print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")

        # Predict on the test set
        y_pred = best_ada.predict(X_test)

        # Print the classification report
        print("AdaBoost Classification Report:")
        print(classification_report(y_test, y_pred))


Trial 1
Partition 80/20:
Cross-Validation Accuracy Scores: [0.85988484 0.85988484 0.85585413 0.85217892 0.85697831]
Mean CV Accuracy: 0.8570
AdaBoost Classification Report:
              precision    recall  f1-score   support

        -1.0       0.88      0.94      0.91      4975
         1.0       0.75      0.59      0.66      1538

    accuracy                           0.86      6513
   macro avg       0.82      0.77      0.79      6513
weighted avg       0.85      0.86      0.85      6513

Partition 50/50:
Cross-Validation Accuracy Scores: [0.85534398 0.84766585 0.84920147 0.85872236 0.84981572]
Mean CV Accuracy: 0.8521
AdaBoost Classification Report:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91     12422
         1.0       0.75      0.61      0.68      3859

    accuracy                           0.86     16281
   macro avg       0.82      0.78      0.79     16281
weighted avg       0.85      0.86      0.86     16281

Partitio