In [8]:
import pandas as pd
import numpy as np

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

# Load training and test data
train_data = pd.read_csv("adult.data", header=None, names=columns, na_values=" ?", skipinitialspace=True)
test_data = pd.read_csv("adult.test", header=None, names=columns, na_values=" ?", skipinitialspace=True, skiprows=1)

# Combine training and test data for consistent preprocessing
adult = pd.concat([train_data, test_data])

# Categorical columns (need to be one-hot encoded)
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# Numerical columns (should not be one-hot encoded)
numerical_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Apply one-hot encoding to categorical columns
adult = pd.get_dummies(adult, columns=categorical_cols, drop_first=True)

adult['income'] = adult['income'].map({'<=50K': -1, '>50K': 1})

#adult = pd.get_dummies(adult, drop_first=True)
adult = adult.replace({0: -1, 1: 1})
adult = adult.replace({False: -1, True: 1})

adult = adult.dropna()  # Drop rows with missing values

adult

  adult = adult.replace({False: -1, True: 1})


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,2174,-1,40,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
1,50,83311,13,-1,-1,13,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
2,38,215646,9,-1,-1,40,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
3,53,234721,7,-1,-1,40,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
4,28,338409,13,-1,-1,40,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,-1,-1,38,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
32557,40,154374,9,-1,-1,40,1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
32558,58,151910,9,-1,-1,40,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
32559,22,201490,9,-1,-1,20,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1


In [10]:
#ensure no null values
print(adult.isna().sum())

age                               0
fnlwgt                            0
education-num                     0
capital-gain                      0
capital-loss                      0
                                 ..
native-country_Thailand           0
native-country_Trinadad&Tobago    0
native-country_United-States      0
native-country_Vietnam            0
native-country_Yugoslavia         0
Length: 101, dtype: int64


In [222]:
# Verify unique values in the 'income' column
print("Unique values before cleaning and mapping:", adult['income'].unique())

# Standardize the income column
adult['income'] = adult['income'].astype(str).str.strip()  # Remove extra spaces
adult['income'] = adult['income'].str.replace(r'[.\s]', '', regex=True)  # Remove periods and whitespace

# Define the mapping for all variations
income_map = {'<=50K': -1, '<=50K': -1, '>=50K': 1, '>50K': 1}

# Apply the mapping
adult['income'] = adult['income'].map(income_map)

# Check for unmapped or NaN values
print("Unique values after mapping:", adult['income'].unique())

# Optional: Drop rows with NaN if there are unmapped values
adult = adult.dropna(subset=['income'])

# Display the result
print(adult['income'].head())

adult

Unique values before cleaning and mapping: ['<=50K' '>50K' '<=50K.' '>50K.']
Unique values after mapping: [-1  1]
0   -1
1   -1
2   -1
3   -1
4   -1
Name: income, dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,-1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,-1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,-1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,-1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,-1
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,-1
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,-1
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,-1


In [223]:
#check for null values
print(adult.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


Since the values in the numerical columns have significantly different ranges, it might be beneficial to standardise them, ensuring that the features are on the same scale and contribute equally to the model.

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
adult[numerical_cols] = scaler.fit_transform(adult[numerical_cols])

adult

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.030671,-1.063611,1.134739,0.148574,-0.216664,-0.035429,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
1,0.837109,-1.008707,1.134739,-0.145929,-0.216664,-2.222153,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
2,-0.042642,0.245079,-0.420060,-0.145929,-0.216664,-0.035429,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
3,1.057047,0.425801,-1.197459,-0.145929,-0.216664,-0.035429,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
4,-0.775768,1.408176,1.134739,-0.145929,-0.216664,-0.035429,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145929,-0.216664,-0.197409,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
32557,0.103983,-0.335433,-0.420060,-0.145929,-0.216664,-0.035429,1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
32558,1.423610,-0.358777,-0.420060,-0.145929,-0.216664,-0.035429,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
32559,-1.215643,0.110960,-0.420060,-0.145929,-0.216664,-1.655225,-1.0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1


In [13]:
#separate the data into the features and the target (y)

X = adult.drop('income', axis=1)  # Features
y = adult['income']  # Target

In [None]:
from sklearn.model_selection import train_test_split

# 20/80 Split
X_train_20, X_test_80, y_train_20, y_test_80 = train_test_split(X, y, test_size=0.8, random_state=42)

# 50/50 Split
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y, test_size=0.5, random_state=42)

# 80/20 Split
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)

#to verify
print("20/80 Split: Train =", X_train_20.shape, "Test =", X_test_80.shape)
print("50/50 Split: Train =", X_train_50.shape, "Test =", X_test_50.shape)
print("80/20 Split: Train =", X_train_80.shape, "Test =", X_test_20.shape)

20/80 Split: Train = (6512, 100) Test = (26049, 100)
50/50 Split: Train = (16280, 100) Test = (16281, 100)
80/20 Split: Train = (26048, 100) Test = (6513, 100)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Outer loop for trials
for trial in range(3):
    print(f"Trial {trial + 1}")
    # Shuffle the dataset for each trial
    X, y = shuffle(X, y, random_state=trial)

    # Middle loop for partitions
    for partition in range(3):
        if partition == 0:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            print("Partition 80/20:")
        elif partition == 1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
            print("Partition 50/50:")
        elif partition == 2:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
            print("Partition 20/80:")

        # Scale the numerical features
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        # Inner loop for AdaBoost
        # Define AdaBoost Classifier and GridSearch parameters
        ada_classifier = AdaBoostClassifier(random_state=42)
        param_grid = {'n_estimators': [10, 50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5, 1]}
        grid = GridSearchCV(ada_classifier, param_grid, scoring='accuracy', cv=10)
        
        # Grid search to find the best hyperparameters
        grid.fit(X_train, y_train)
        
        # Train AdaBoost with the best parameters
        best_ada = AdaBoostClassifier(
            n_estimators=grid.best_params_['n_estimators'],
            learning_rate=grid.best_params_['learning_rate'],
            random_state=42
        )
        best_ada.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = best_ada.predict(X_test)
        
        # Print the classification report
        print("AdaBoost:")
        print(classification_report(y_test, y_pred))

Trial 1
Partition 80/20:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.92      4903
         1.0       0.78      0.65      0.71      1610

    accuracy                           0.87      6513
   macro avg       0.84      0.80      0.81      6513
weighted avg       0.86      0.87      0.86      6513

Partition 50/50:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91     12362
         1.0       0.76      0.65      0.70      3919

    accuracy                           0.87     16281
   macro avg       0.83      0.79      0.81     16281
weighted avg       0.86      0.87      0.86     16281

Partition 20/80:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.93      0.91     19772
         1.0       0.74      0.64      0.69      6277

    accuracy                           0.86     26049
   macro avg       0.82      0.79      0.80     26049
weighted avg       0.86      0.86      0.86     26049

Trial 2
Partition 80/20:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91      4907
         1.0       0.78      0.64      0.70      1606

    accuracy                           0.86      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.86      0.86      0.86      6513

Partition 50/50:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91     12326
         1.0       0.77      0.66      0.71      3955

    accuracy                           0.87     16281
   macro avg       0.83      0.80      0.81     16281
weighted avg       0.86      0.87      0.86     16281

Partition 20/80:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.93      0.91     19763
         1.0       0.75      0.64      0.69      6286

    accuracy                           0.86     26049
   macro avg       0.82      0.78      0.80     26049
weighted avg       0.86      0.86      0.86     26049

Trial 3
Partition 80/20:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.90      0.93      0.91      4939
         1.0       0.76      0.66      0.71      1574

    accuracy                           0.87      6513
   macro avg       0.83      0.80      0.81      6513
weighted avg       0.86      0.87      0.86      6513

Partition 50/50:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.92     12352
         1.0       0.77      0.65      0.70      3929

    accuracy                           0.87     16281
   macro avg       0.83      0.79      0.81     16281
weighted avg       0.86      0.87      0.86     16281

Partition 20/80:




AdaBoost:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91     19779
         1.0       0.76      0.63      0.69      6270

    accuracy                           0.86     26049
   macro avg       0.82      0.78      0.80     26049
weighted avg       0.86      0.86      0.86     26049



In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

# Outer loop for trials
for trial in range(3):
    print(f"Trial {trial + 1}")
    # Shuffle the dataset for each trial
    X, y = shuffle(X, y, random_state=trial)

    # Middle loop for partitions
    for partition in range(3):
        if partition == 0:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            print("Partition 80/20:")
        elif partition == 1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
            print("Partition 50/50:")
        elif partition == 2:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
            print("Partition 20/80:")

        # Scale the numerical features
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        # Define AdaBoost Classifier with SAMME algorithm
        ada_classifier = AdaBoostClassifier(algorithm='SAMME', random_state=42)

        # Perform GridSearchCV for hyperparameter tuning
        param_grid = {'n_estimators': [10, 50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5, 1]}
        grid = GridSearchCV(ada_classifier, param_grid, scoring='accuracy', cv=5)  # 5-fold cross-validation
        grid.fit(X_train, y_train)

        # Train AdaBoost with the best parameters from GridSearch
        best_ada = AdaBoostClassifier(
            algorithm='SAMME',
            n_estimators=grid.best_params_['n_estimators'],
            learning_rate=grid.best_params_['learning_rate'],
            random_state=42
        )
        best_ada.fit(X_train, y_train)

        # Cross-validation for evaluation
        cv_scores = cross_val_score(best_ada, X_train, y_train, cv=5, scoring='accuracy')
        print(f"Cross-Validation Accuracy Scores: {cv_scores}")
        print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")

        # Predict on the test set
        y_pred = best_ada.predict(X_test)

        # Print the classification report
        print("AdaBoost Classification Report:")
        print(classification_report(y_test, y_pred))


Trial 1
Partition 80/20:
Cross-Validation Accuracy Scores: [0.85988484 0.85988484 0.85585413 0.85217892 0.85697831]
Mean CV Accuracy: 0.8570
AdaBoost Classification Report:
              precision    recall  f1-score   support

        -1.0       0.88      0.94      0.91      4975
         1.0       0.75      0.59      0.66      1538

    accuracy                           0.86      6513
   macro avg       0.82      0.77      0.79      6513
weighted avg       0.85      0.86      0.85      6513

Partition 50/50:
Cross-Validation Accuracy Scores: [0.85534398 0.84766585 0.84920147 0.85872236 0.84981572]
Mean CV Accuracy: 0.8521
AdaBoost Classification Report:
              precision    recall  f1-score   support

        -1.0       0.89      0.94      0.91     12422
         1.0       0.75      0.61      0.68      3859

    accuracy                           0.86     16281
   macro avg       0.82      0.78      0.79     16281
weighted avg       0.85      0.86      0.86     16281

Partitio