The deadline for this homework is on **21.03.2025 18:29** (right before the practice session). After completing the exercises, you should

1. Download this file into your computer (`File` $\to$ `Download .ipynb`)

2. Name the file in the following way *HWx_NameSurname* (for example `HW3_NshanPotikyan.ipynb`)

4. Submit the file via the e-learning environment.

**Note** if you do not follow any of the above conditions, your homework will not be graded.

**Problem.** During the practice session we tried to build a binary classifier on the adult dataset which is highly imbalanced.

* In this homework, you need to take the same dataset but this time you need to

 * use more features from the original data
 * try different sampling techniques from [imblearn](https://imbalanced-learn.org/stable/references/index.html) to tackle the class imbalance problem
 * experiment with different ensemble methods (the ones that we have discussed so far) to beat the score we got during the practice session

* Evaluate the model performance in terms of the accuracy score.

* Use the best data processing method to train a final model and report the accuracy score on the test dataset.



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [1]:
!curl https://archive.ics.uci.edu/static/public/2/adult.zip -o adult.zip
!unzip adult.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  605k    0  605k    0     0   363k      0 --:--:--  0:00:01 --:--:--  363k
Archive:  adult.zip
  inflating: Index                   
  inflating: adult.data              
  inflating: adult.names             
  inflating: adult.test              
  inflating: old.adult.names         


In [16]:
data = pd.read_csv('adult.data', header=None, na_values=["NA", " ?", ""])
data.columns = ["age", "workclass", "not_needed1", "education1",
                "education", "marital_status","occupation",
                "relationship", "race", "sex", "capital_gain",
                "capital_loss", "hours_per_week", "country", "income"]
data.dropna(inplace=True)
data.drop(['not_needed1', 'education1'], axis=1, inplace=True)
data.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [17]:
data.isna().sum()

age               0
workclass         0
education         0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
country           0
income            0
dtype: int64

In [None]:
# edu_mapping = {
#     'Preschool': 0,
#     '1st-4th': 1,
#     '5th-6th': 2,
#     '7th-8th': 3,
#     '9th': 4,
#     '10th': 5,
#     '11th': 6,
#     '12th': 7,
#     'HS-grad': 8,
#     'Some-college': 9,
#     'Assoc-voc': 10,
#     'Assoc-acdm': 11,
#     'Bachelors': 12,
#     'Prof-school': 13,
#     'Masters': 14,
#     'Doctorate': 15
# }
# data['education_num'] = data['education'].map(edu_mapping)

In [34]:
def relationship_group(rel):
    if rel in [' Husband', ' Wife']:
        return 'Spouse'
    elif rel == ' Own-child':
        return 'Child'
    elif rel in [' Not-in-family', ' Unmarried', ' Other-relative']:
        return 'Other'
    else:
        return None

def country_group(x):
    x = x.strip()  # remove leading/trailing spaces
    if x == 'United-States':
        return 'United-States'
    elif x in ['Canada', 'Outlying-US(Guam-USVI-etc)']:
        return 'North America'
    elif x in ['Cuba', 'Jamaica', 'Mexico', 'Puerto-Rico', 'Honduras', 
               'Dominican-Republic', 'El-Salvador', 'Guatemala', 'Nicaragua',
               'Trinadad&Tobago', 'Haiti', 'Ecuador', 'Peru', 'Columbia']:
        return 'Latin America/Caribbean'
    elif x in ['India', 'Iran', 'Philippines', 'Cambodia', 'Thailand', 
               'Laos', 'Taiwan', 'China', 'South', 'Japan', 'Vietnam', 'Hong']:
        return 'Asia'
    elif x in ['England', 'Germany', 'France', 'Italy', 'Scotland', 'Poland',
               'Portugal', 'Yugoslavia', 'Greece', 'Ireland', 'Hungary',
               'Holand-Netherlands']:
        return 'Europe'
    else:
        return 'Other'

data['native_country_group'] = data['country'].apply(country_group)

# data['relationship_group'] = data['relationship'].apply(relationship_group)

In [None]:
data.drop(["not_needed1", "education1",
           "relationship"], axis = 1, inplace = True)

In [35]:
data

Unnamed: 0,age,workclass,education,marital_status,occupation,race,sex,capital_gain,capital_loss,hours_per_week,country,income,relationship_group,native_country_group
0,39,State-gov,13,Never-married,Adm-clerical,White,Male,2174,0,40,United-States,<=50K,Other,United-States
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,White,Male,0,0,13,United-States,<=50K,Spouse,United-States
2,38,Private,9,Divorced,Handlers-cleaners,White,Male,0,0,40,United-States,<=50K,Other,United-States
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Black,Male,0,0,40,United-States,<=50K,Spouse,United-States
4,28,Private,13,Married-civ-spouse,Prof-specialty,Black,Female,0,0,40,Cuba,<=50K,Spouse,Latin America/Caribbean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,12,Married-civ-spouse,Tech-support,White,Female,0,0,38,United-States,<=50K,Spouse,United-States
32557,40,Private,9,Married-civ-spouse,Machine-op-inspct,White,Male,0,0,40,United-States,>50K,Spouse,United-States
32558,58,Private,9,Widowed,Adm-clerical,White,Female,0,0,40,United-States,<=50K,Other,United-States
32559,22,Private,9,Never-married,Adm-clerical,White,Male,0,0,20,United-States,<=50K,Child,United-States


In [43]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(["income"], axis = 1),
                                                    data["income"],
                                                    test_size = 0.2,
                                                    random_state = 0)

In [44]:
print(y_train.value_counts())
print(y_test.value_counts())

income
<=50K    18122
>50K      6007
Name: count, dtype: int64
income
<=50K    4532
>50K     1501
Name: count, dtype: int64


In [45]:
X_train

Unnamed: 0,age,workclass,education,marital_status,occupation,race,sex,capital_gain,capital_loss,hours_per_week,country,relationship_group,native_country_group
30461,34,Private,10,Never-married,Adm-clerical,White,Male,0,0,40,United-States,Child,United-States
18186,51,Private,9,Divorced,Craft-repair,White,Male,0,0,40,United-States,Other,United-States
24974,50,Private,9,Widowed,Prof-specialty,Black,Female,0,0,40,United-States,Other,United-States
25659,52,Self-emp-not-inc,10,Widowed,Craft-repair,Black,Male,0,0,35,United-States,Other,United-States
13876,31,Private,9,Married-civ-spouse,Machine-op-inspct,White,Male,0,0,40,United-States,Spouse,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14169,66,Local-gov,16,Divorced,Prof-specialty,White,Female,3273,0,40,United-States,Other,United-States
21206,50,Private,14,Married-civ-spouse,Exec-managerial,White,Male,0,0,50,United-States,Spouse,United-States
10646,50,Private,13,Divorced,Sales,White,Male,0,0,45,United-States,Other,United-States
11687,49,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,White,Male,0,0,60,United-States,Spouse,United-States


In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


preprocessor = ColumnTransformer(
    transformers=[
        ('onehot',
         OneHotEncoder(handle_unknown='ignore'),
         ["workclass", "marital_status", "occupation", "race", "sex", "relationship_group", "native_country_group"])
    ])

preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [48]:
bagging_model = BaggingClassifier(estimator=DecisionTreeClassifier(),
                                  n_estimators=100,
                                  random_state=42)

random_forest_model = RandomForestClassifier(n_estimators=100,
                                             random_state=42)

weighted_voting_model = VotingClassifier(
    estimators=[('bagging', bagging_model),
                ('random_forest', random_forest_model)],
    voting='soft')

base_models = [('bagging', bagging_model),
               ('random_forest', random_forest_model)]
stacking_model = StackingClassifier(estimators=base_models,
                                    final_estimator=DecisionTreeClassifier())


In [49]:
# Train the models
bagging_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)
weighted_voting_model.fit(X_train, y_train)
stacking_model.fit(X_train, y_train)

# Evaluate the models
bagging_pred = bagging_model.predict(X_test)
random_forest_pred = random_forest_model.predict(X_test)
weighted_voting_pred = weighted_voting_model.predict(X_test)
stacking_pred = stacking_model.predict(X_test)

# Calculate and print the accuracy for each model
bagging_accuracy = accuracy_score(y_test, bagging_pred)
random_forest_accuracy = accuracy_score(y_test, random_forest_pred)
weighted_voting_accuracy = accuracy_score(y_test, weighted_voting_pred)
stacking_accuracy = accuracy_score(y_test, stacking_pred)

print("Bagging Accuracy:", bagging_accuracy)
print("Random Forest Accuracy:", random_forest_accuracy)
print("Weighted Voting Accuracy:", weighted_voting_accuracy)
print("Stacking Accuracy:", stacking_accuracy)

Bagging Accuracy: 0.8128625890933201
Random Forest Accuracy: 0.8138571191778551
Weighted Voting Accuracy: 0.8133598541355876
Stacking Accuracy: 0.7838554616277142


## Handling class Imbalance

In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the training data
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)