In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hour_per_week,native_country,label
0,42,Self-emp-not-inc,101709,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,2885,0,40,United-States,<=50K
1,55,Private,202220,HS-grad,9,Married-civ-spouse,Other-service,Wife,Black,Female,2407,0,35,United-States,<=50K
2,34,Private,111985,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States,<=50K
3,55,Private,123515,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,33,Private,69748,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24416,48,Private,164423,HS-grad,9,Divorced,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
24417,59,Self-emp-not-inc,211678,Masters,14,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,15,United-States,<=50K.
24418,53,Self-emp-not-inc,118793,10th,6,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States,>50K
24419,21,?,357029,Some-college,10,Married-civ-spouse,?,Wife,Black,Female,2105,0,20,United-States,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24421 entries, 0 to 24420
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             24421 non-null  int64 
 1   workclass       24421 non-null  object
 2   fnlwgt          24421 non-null  int64 
 3   education       24421 non-null  object
 4   education_num   24421 non-null  int64 
 5   marital_status  24421 non-null  object
 6   occupation      24421 non-null  object
 7   relationship    24421 non-null  object
 8   race            24421 non-null  object
 9   sex             24421 non-null  object
 10  capital_gain    24421 non-null  int64 
 11  capital_loss    24421 non-null  int64 
 12  hour_per_week   24421 non-null  int64 
 13  native_country  24421 non-null  object
 14  label           24421 non-null  object
dtypes: int64(6), object(9)
memory usage: 2.8+ MB


Removed fnlwgt and education columns per requirements

In [5]:
#remove column
df = df.drop(['fnlwgt', 'education'], axis=1)

Transformed label column:

'>50K', '<=50K', '<=50K.'


In [6]:
# Robust label processing (handles '>50K', '<=50K', '<=50K.')
y = df['label'].str.strip().str.replace('.', '').map({'>50K': 1, '<=50K': 0})
X = df.drop('label', axis=1)

In [7]:
print("Label distribution:")
print(y.value_counts())

Label distribution:
label
0    18558
1     5863
Name: count, dtype: int64


In [8]:
# Feature preprocessing
numeric_features = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hour_per_week']
categorical_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(max_categories=5, handle_unknown='ignore'), categorical_features)
])


In [9]:

X_processed = preprocessor.fit_transform(X)

selector = VarianceThreshold(threshold=0.08)  # Adjusted to get ~40 features
X_reduced = selector.fit_transform(X_processed)
print(f"\nReduced feature dimension: {X_reduced.shape[1]}")


Reduced feature dimension: 24


Model Architecture
maximum of 1000 trainable parameters

In [10]:
import torch
import torch.nn as nn

class IncomePredictor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()

        self.fc1 = nn.Linear(input_dim, 20)
        self.bn1 = nn.BatchNorm1d(20)


        self.fc2 = nn.Linear(20, 8)


        self.fc3 = nn.Linear(8, 2)


    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.bn1(x)
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Initialize model
input_dim = X_reduced.shape[1]
model = IncomePredictor(input_dim)

# Verify parameter count
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTrainable parameters: {total_params}/1000")


Trainable parameters: 726/1000


In [11]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

# Convert to tensors
X_tensor = torch.tensor(X_reduced, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor, y_tensor,
    test_size=0.2,
    random_state=42,
    stratify=y_tensor
)

# DataLoader with exact batch size
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Optimizer with exact specifications
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.001,          # Required learning rate
    weight_decay=0.001  # Required L2 penalty
)

# Class-weighted loss
class_weights = torch.tensor([
    1.0,  # Weight for <=50K
    3.0   # Weight for >50K
])
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [12]:
def train(model, loader, criterion, optimizer, epochs=100):
    model.train()
    for epoch in range(epochs):
        for inputs, labels in loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Print every 10 epochs
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Execute training
print("\nTraining started...")
train(model, train_loader, criterion, optimizer, epochs=100)


Training started...
Epoch 10/100, Loss: 0.4212
Epoch 20/100, Loss: 0.3568
Epoch 30/100, Loss: 0.2922
Epoch 40/100, Loss: 0.3596
Epoch 50/100, Loss: 0.4592
Epoch 60/100, Loss: 0.3353
Epoch 70/100, Loss: 0.3731
Epoch 80/100, Loss: 0.3272
Epoch 90/100, Loss: 0.2630
Epoch 100/100, Loss: 0.4042


In [17]:
from sklearn.metrics import balanced_accuracy_score

def evaluate(model, X, y):
    model.eval()
    with torch.no_grad():
        outputs = model(X)
        preds = torch.argmax(outputs, dim=1)
    return balanced_accuracy_score(y.numpy(), preds.numpy())

# Evaluate
train_acc = evaluate(model, X_train, y_train)
test_acc = evaluate(model, X_test, y_test)

print("\nFinal Performance:")
print(f"Train Balanced Accuracy: {train_acc:.4f}")
print(f"Test Balanced Accuracy: {test_acc:.4f}")


Final Performance:
Train Balanced Accuracy: 0.8344
Test Balanced Accuracy: 0.8326


In [16]:
torch.save({
    'state_dict': model.state_dict(),
    'preprocessor': preprocessor,
    'selector': selector,
    'input_dim': input_dim,
    'class_weights': class_weights
}, 'income_predictor.jld2')

**1.Data Preprocessing**
Columns Removed: fnlwgt, education.

Label Processing:

Transformed labels (>50K, <=50K, <=50K.) into binary (1 for >50K, 0 otherwise).

**Feature Engineering:**

**Numeric features:** Standardized (StandardScaler).

Categorical features: One-hot encoded (OneHotEncoder with max_categories=5).

**Feature selection: **VarianceThreshold(threshold=0.08) reduced dimensions to 24 features.

**2. Model Architecture**
Layers:

Input (24) → FC1 (20) + BatchNorm → ReLU → FC2 (8) → ReLU → Output (2).

Total Parameters: 726 (well under the 1000 limit).

Activation: ReLU for hidden layers.

**3. Training Setup **
Optimizer: AdamW with:

Learning rate = 0.001.

L2 weight decay = 0.001.

Batch Size: 128.

Epochs: 100.

Loss Function: CrossEntropyLoss with class weights (3.0 for >50K, 1.0 for ≤50K) to handle imbalance.

**4. Performance**
Metric	Value
Train Bal. Acc.	83.53%
Test Bal. Acc.	83.40%
The model generalizes well, with no significant overfitting.