In [1]:
# Project helper that exposes the environment settings used by this notebook
# (its `raw_data_folder` attribute is read when the raw CSV is loaded).
from src.entities import EnvironmentConfiguration

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings that may be informative — confirm this is intended.
warnings.filterwarnings("ignore")
import os

# Single configuration instance shared by the cells that follow.
environment_configuration = EnvironmentConfiguration()
# Project preprocessing function applied to the de-duplicated raw data.
from src.preprocessing_pipelines.preprocessing_pipelines import common_data_preprocessing

# Basic

## Common Data Transformations

We are going to apply to our dataset the transformations that are common to every algorithm (encoding, etc.) and that do not cause data leakage (i.e., transformations that do not use global statistics such as the mean, median, min or max).

1. Drop the 'education' column (redundant since 'education.num' encodes the same information).
2. Encode nominal categorical features using one-hot encoding while avoiding multicollinearity 
    (i.e., using dummy encoding with drop_first=True).
3. Create a new feature 'profit' as the difference between 'capital.gain' and 'capital.loss'.
4. Create a new binary feature 'investor' indicating if the person has any capital gain 
    (1 if capital.gain > 0, else 0).
5. Create a new binary feature 'american' indicating if the person is a U.S. citizen 
    (1 if native.country equals 'United-States', else 0).
6. Map the 'income' column to binary labels (0 for '<=50K' and 1 for '>50K').
7. Map the 'sex' column to binary labels (0 for 'Female' and 1 for 'Male').
8. One-hot encode the categorical-nominal features: workclass, marital.status, occupation, 
    relationship, and race (excluding native.country, already encoded as 'american').

In [2]:
import pandas as pd

# Read the raw dataset from the location stored in the environment configuration.
raw_data = pd.read_csv(filepath_or_buffer=environment_configuration.raw_data_folder)

# Remove exact duplicate rows, then hand the result to the shared preprocessing function.
data_without_duplicates = raw_data.drop_duplicates()
common_data_preprocessed_data = data_without_duplicates.pipe(common_data_preprocessing)

In [8]:
# Preview the first five rows of the preprocessed frame.
common_data_preprocessed_data.head(n=5)

Unnamed: 0,age,fnlwgt,education.num,sex,capital.gain,capital.loss,hours.per.week,income,profit,investor,...,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,90,77053,9,0,0,4356,40,0,-4356,0,...,0,1,0,0,0,0,0,0,0,1
1,82,132870,9,0,0,4356,18,0,-4356,0,...,0,1,0,0,0,0,0,0,0,1
2,66,186061,10,0,0,4356,40,0,-4356,0,...,0,0,0,0,1,0,0,1,0,0
3,54,140359,4,0,0,3900,40,0,-3900,0,...,0,0,0,0,1,0,0,0,0,1
4,41,264663,10,0,0,3900,40,0,-3900,0,...,0,0,0,1,0,0,0,0,0,1


## Holdout Test (Validation)

In [3]:
from sklearn.model_selection import train_test_split

# Separate the 'income' target from the remaining feature columns.
y = common_data_preprocessed_data['income']
X = common_data_preprocessed_data.drop(columns='income')

# Stratified 80/20 holdout split with a fixed seed, so the class ratio of
# 'income' is kept in both partitions and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the dimensions of each partition.
print(f"Training data shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Test data shape: X_test={X_test.shape}, y_test={y_test.shape}")

Training data shape: X_train=(26029, 47), y_train=(26029,)
Test data shape: X_test=(6508, 47), y_test=(6508,)


# Baseline Performance

## Simple Heuristic

Predict the mode (the most frequent class) for every sample.

In [11]:
from sklearn.metrics import fbeta_score
import numpy as np

# Step 1: Identify the majority class from the TRAINING labels only, so that
# no information from the held-out test set feeds the baseline.
majority_class = y_train.mode()[0]  # most frequent class (0 or 1)

# Step 2: Make predictions (always predict the majority class)
y_pred_majority = np.full_like(y_test, majority_class)

# Step 3: Calculate F2 score.
# A constant prediction can leave precision undefined (no predicted positives);
# zero_division=0 makes the resulting score of 0 explicit instead of relying on
# a warning that this notebook silences globally.
f2_score = fbeta_score(y_test, y_pred_majority, beta=2, zero_division=0)

print(f"F2 Score for majority class heuristic: {f2_score}")

F2 Score for majority class heuristic: 0.0


## Strong Model - Out of the Box

We are going to test 2 robust models to see how skillful they are out of the box.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import fbeta_score

# Out-of-the-box gradient boosting baseline.
# NOTE: despite the `xgb` names and the printed label, this is scikit-learn's
# GradientBoostingClassifier with default hyperparameters, not the XGBoost library.
xgb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Score the holdout predictions with the F2 measure.
y_pred_xgb = xgb_model.predict(X_test)
xgb_f2_score = fbeta_score(y_true=y_test, y_pred=y_pred_xgb, beta=2)

print(f"XGBoost F2 Score: {xgb_f2_score}")

XGBoost F2 Score: 0.6167785234899329


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score

# Out-of-the-box random forest baseline (default hyperparameters, fixed seed),
# fitted on the training partition.
rfc_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the holdout predictions with the F2 measure.
y_pred_rfc = rfc_model.predict(X_test)
rfc_f2_score = fbeta_score(y_true=y_test, y_pred=y_pred_rfc, beta=2)

print(f"rfc F2 Score: {rfc_f2_score:.3f}")

rfc F2 Score: 0.634


## Conclusion

Out of the box, `RandomForestClassifier` achieved an `f2_score` of `0.634`.

Now we have a baseline to improve on by applying the correct `Imbalanced Dataset Framework`.

# Experiment 1

## 1. Objective

Apply the imbalanced framework to improve the performance, using heuristics based on what usually works best.

## 2. Description:

### 2.1 Preprocessing

* The common preprocessing steps applied in the `Common Data Transformations` section;
* Use the `RobustScaler()` to put all the data on the same scale (this scaler is robust to outliers / heuristic);
* Resample the data using `SMOTE and Edited Nearest Neighbors - Resampling` (this technique is one that usually performs the best / heuristic);

We should use `pipelines` to perform the transformations inside each cross validation fold (this avoids data-leakage)

### 2.2 Evaluation

#### 2.2.1 Cross Validation

We are going to use `RepeatedStratifiedKFold`, because it is the best choice for imbalanced datasets for the following reasons:

**2.2.1.1 Maintains Class Balance in Every Split**

* Regular k-fold cross-validation can lead to uneven class distributions in some folds, which is problematic for imbalanced datasets.
* RepeatedStratifiedKFold ensures each fold has the same class proportions as the full dataset, making training and evaluation more reliable.

**2.2.1.2 Reduces Variance in Model Evaluation** 

* A single stratified k-fold split may still introduce randomness in performance metrics.
* By repeating the process multiple times with different splits, the results are more stable and less dependent on how the data was initially partitioned.

**2.2.1.3 Maximizes Data Utilization**

* In small datasets, some samples may appear only in the test set, limiting training opportunities.
* RepeatedStratifiedKFold allows each sample to be used in training and testing multiple times, leading to better generalization.

**2.2.1.4 More Reliable Performance Estimates**

* Imbalanced datasets often suffer from misleading metrics due to the dominance of the majority class.
* Repeating stratified k-fold cross-validation helps in getting a more representative model evaluation, reducing the risk of overfitting to specific splits.

#### 2.2.2 Final Evaluation:

The final model will be evaluated on the `validation dataset` (the holdout test split), to ensure it is able to perform on real-world data.




* [2] Use some resampling technique to improve the model's performance;
* [5] There are some outliers, so we must apply a scaler that is robust to them. Do not forget to ignore the binary columns.

Use `RepeatedStratifiedKFold`

SMOTE and Edited Nearest Neighbors - Resampling

Avoid Data Leakage. Scale And Resample through every iteration (pipeline)

Try Cost-Sensitive Algorithms[@]

Try Advanced Algorithms

Try to ensemble Cost-Sensitive Algorithms[@]

Calibrate probabilities, and move threshold

Stacking and testing

## Cost Weighted Models

In [None]:
from numpy import mean
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import RobustScaler  # was used below without being imported
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# Scaling and resampling are pipeline steps, so they are re-fitted on the
# training portion of every cross-validation fold (no leakage between folds).
# The sampler and the model are seeded so repeated runs give the same scores.
steps = [('scaler', RobustScaler()),
        ('under', RandomUnderSampler(random_state=42)),
        ('model', DecisionTreeClassifier(random_state=42))] # Model should be dynamically changed

pipeline = Pipeline(steps=steps)

# Score with F2, the metric used for the baselines in this notebook.
# (`f1_micro` on a single-label binary target is equivalent to plain accuracy,
# which is misleading on an imbalanced dataset.)
f2_scorer = make_scorer(fbeta_score, beta=2)

# Evaluate the pipeline on the training partition only: the holdout test set
# must stay untouched until the final evaluation.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, scoring=f2_scorer, cv=cv, n_jobs=-1)
score = mean(scores)
print('F2-measure: %.3f' % score)