<a href="https://colab.research.google.com/github/Alohadron/PyTorch-for-Deep-Learning-Bootcamp/blob/main/extras/self_exercices/adult_income_binary_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Workflow
1. Get data ready (turn into tensors).
2. Build or pick a pretrained model (to suit your problem).
    
    2.1 Pick a loss function and a optimizer.
    
    2.2 Build a training loop.
    
3. Fit the model to the data and make a prediction.
4. Evaluate the model.
5. Improve through experimentation
6. Save and reload your trained model

## 1. Get the data ready

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("wenruliu/adult-income-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/wenruliu/adult-income-dataset?dataset_version_number=2...


100%|██████████| 652k/652k [00:00<00:00, 28.2MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/wenruliu/adult-income-dataset/versions/2





In [None]:
import pandas as pd
import numpy as np
import os

# List all files in the dataset directory
files = os.listdir(path)
print("Files in dataset directory:", files)

# Assuming the dataset contains a .csv file, load it
csv_file = [f for f in files if f.endswith('.csv')][0]  # Get the first CSV file
data = pd.read_csv(os.path.join(path, csv_file))

# Display the first few rows
data

Files in dataset directory: ['adult.csv']


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# Cleaning data
data.replace("?", float("NaN"), inplace=True)

# Handle missing data (drop rows with missing values)
data.dropna(inplace=True)

# Encode catergorail columns usin one-hot encoding
# https://www.geeksforgeeks.org/python-pandas-get_dummies-method/
data = pd.get_dummies(data, drop_first=True)

# Split the data into features (X) and target (y)
X = data.drop('income_>50K', axis=1).values # Features
y = data["income_>50K"].values # Target

X[:5], y[:5]

(array([[25, 226802, 7, 0, 0, 40, False, True, False, False, False, False,
         True, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False, True,
         False, False, False, False, False, False, False, False, False,
         True, False, False, False, True, False, False, True, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         True, False, False]], dtype=object),
 array([False, False,  True,  True, False]))

In [None]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train), len(y_train), len(X_test), len(y_test)

(36177, 36177, 9045, 9045)

In [None]:
# Standardize the feature (scaling)
from sklearn.preprocessing import StandardScaler
"""https://benalexkeen.com/feature-scaling-with-scikit-learn/"""

print(f"Before scaling: X_train {X_train[:1]}, \nX_test{X_test[:1]}")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(f"After scaling: X_train {X_train[:1]}, \nX_test {X_test[:1]}")

Before scaling: X_train [[-0.72602641  0.70089758  1.12919651 -0.14734249 -0.21932151 -0.07858361
  -0.27281141 -1.67156421  5.13270603 -0.30059667 -0.21327772 -0.02231146
  -0.19267892 -0.11235595 -0.06665093 -0.09812372 -0.13334271 -0.12263303
  -0.18426483 -0.21135811  2.22423694 -0.11019182 -0.69797867 -0.24420287
  -0.03972497 -0.13280486 -0.5310448  -0.0257652  -0.93299646 -0.11273375
   1.44950437 -0.17766666 -0.17135946 -0.01743998 -0.39079551 -0.38957278
  -0.18289726 -0.21881439 -0.26518945 -0.34540301 -0.07342639 -0.39164096
  -0.14893003  2.71022669 -0.1799714  -0.23142132  1.69277201 -0.17758386
  -0.41348056 -0.34460038 -0.21985272 -0.17415944 -0.32181195 -0.08719868
   0.40385393  0.69452154 -0.06188041 -0.04938032 -0.04143354 -0.0531737
  -0.04527352 -0.02975438 -0.05720502 -0.05185046 -0.02832415 -0.0651703
  -0.03285112 -0.04465628 -0.04109747 -0.00525763 -0.02036664 -0.0257652
  -0.02168254 -0.05888305 -0.03489589 -0.02975438 -0.04678127 -0.04766322
  -0.04557902 -0.

In [None]:
# Convert data to PyTorch tensors
import torch

X_train = X_train.clone().detach().float()
X_test = X_test.clone().detach().float()
y_train = y_train.clone().detach().float()
y_test = y_test.clone().detach().float()

# I use .clone().detach instead of torch.tensor (colab recommended) because of "Better memory management" and "Avoids unnecessary computation graphs"

X_train.dtype, type(X_train)

(torch.float32, torch.Tensor)

In [None]:
# Create custom dataset
"""https://pytorch.org/tutorials/beginner/basics/data_tutorial.html"""
from torch.utils.data import DataLoader, Dataset

class AdultIncomeDataset(Dataset):
  def __init__(self, features, target):
    self.features = features
    self.target = target

  def __len__(self):
    return len(self.features)

  def __getitem__(self, idx):
    return self.features[idx], self.target[idx]

# Create DataLoader instances for training and testing
train_dataset = AdultIncomeDataset(X_train, y_train)
test_dataset = AdultIncomeDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

print(f"train_loader len: {len(train_loader)}")
print(f"test_loader len: {len(test_loader)}\n")
for train_features, train_target in train_loader:
  print(f"train_features[0]: {train_features[0]}")
  print(f"train_target[0]: {train_target[0]}\n")
  print(f"train_features shape: {train_features.shape}")
  print(f"train_target shape: {train_target.shape}")
  break

print("Each batch has 64 samples and each sample has 96 features (columns) of data")

train_loader len: 566
test_loader len: 142

train_features[0]: tensor([-1.3326e+00, -1.4240e+00,  7.3527e-01, -1.4734e-01, -2.1932e-01,
        -7.8584e-02, -2.7281e-01,  5.9824e-01, -1.9483e-01, -3.0060e-01,
        -2.1328e-01, -2.2311e-02, -1.9268e-01, -1.1236e-01, -6.6651e-02,
        -9.8124e-02, -1.3334e-01, -1.2263e-01,  5.4270e+00, -2.1136e-01,
        -4.4959e-01, -1.1019e-01, -6.9798e-01, -2.4420e-01, -3.9725e-02,
        -1.3280e-01, -5.3104e-01, -2.5765e-02, -9.3300e-01, -1.1273e-01,
         1.4495e+00, -1.7767e-01, -1.7136e-01, -1.7440e-02, -3.9080e-01,
        -3.8957e-01, -1.8290e-01,  4.5701e+00, -2.6519e-01, -3.4540e-01,
        -7.3426e-02, -3.9164e-01, -1.4893e-01, -3.6897e-01, -1.7997e-01,
        -2.3142e-01, -5.9075e-01, -1.7758e-01,  2.4185e+00, -3.4460e-01,
        -2.1985e-01, -1.7416e-01, -3.2181e-01, -8.7199e-02,  4.0385e-01,
         6.9452e-01, -6.1880e-02, -4.9380e-02, -4.1434e-02, -5.3174e-02,
        -4.5274e-02, -2.9754e-02, -5.7205e-02, -5.1850e-02, -

## Build the Model

In [None]:
import torch.nn as nn
import torch.optim as optim

class IncomePredictionModel(nn.Module):
  def __init__(self, input_size):
    super(IncomePredictionModel, self).__init__()
    # fc = fully connected layer
    self.fc1 = nn.Linear(input_size, 64)
    self.fc2 = nn.Linear(64, 32)
    self.fc3 = nn.Linear(32, 1)
    self.sigmoid = nn.Sigmoid()


  def forward(self, x):
    # Will use this method of typo for readability and for possible debugging
    x = torch.relu(self.fc1(x))
    x = torch.relu(self.fc2(x))
    x = self.fc3(x)
    x = self.sigmoid(x)
    return x

# Initialize the model
input_size = X_train.shape[1] # Number of features
model = IncomePredictionModel(input_size=input_size)
model

IncomePredictionModel(
  (fc1): Linear(in_features=96, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Define the loss fucntion and optimizer
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

## Testing & Training

In [None]:
from tqdm.auto import tqdm
torch.manual_seed(42)
epochs = 20

for epoch in tqdm(range(epochs)):
  ### Training
  model.train() # Set the model to training mode
  total_train_loss = 0

  for X_batch, y_batch in train_loader:
    # Zero gradients from previous step
    optimizer.zero_grad()

    # Forward pass
    y_pred = model(X_batch)

    # Calculate the loss
    train_loss = loss_fn(y_pred.squeeze(), y_batch) # Squeeze to match the target shape
    total_train_loss += train_loss.item()

    # Backward pass and optimizer step
    train_loss.backward()
    optimizer.step()

  # Calculate average train loss
  avg_train_loss = total_train_loss / len(train_loader)

  ### Testing
  model.eval() # Set the model to evaluating mode
  total_test_loss = 0
  all_preds = []
  all_labels = []

  with torch.inference_mode(): # Or torch.no_grad() disable gradient calculation during testing
    for X_batch, y_batch in test_loader:
      y_pred = model(X_batch)
      train_loss = loss_fn(y_pred.squeeze(), y_batch)
      total_test_loss += train_loss

      # Collecting predictions and labels for evaluating metrics
      all_preds.extend(torch.round(y_pred.squeeze()).cpu().numpy())
      all_labels.extend(y_batch.cpu().numpy())

  avg_test_loss = total_test_loss / len(train_loader)

  print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.5f}, Test Loss: {avg_test_loss:.5f}")

# Evaluate model accuracy
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(all_labels, all_preds)
print("Accuracy: ", accuracy)
print(classification_report(all_labels, all_preds))

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1/20, Train Loss: 0.27330, Test Loss: 0.10644
Epoch 2/20, Train Loss: 0.26675, Test Loss: 0.10266
Epoch 3/20, Train Loss: 0.26474, Test Loss: 0.10652
Epoch 4/20, Train Loss: 0.26532, Test Loss: 0.11499
Epoch 5/20, Train Loss: 0.26561, Test Loss: 0.10710
Epoch 6/20, Train Loss: 0.26839, Test Loss: 0.10602
Epoch 7/20, Train Loss: 0.26254, Test Loss: 0.10537
Epoch 8/20, Train Loss: 0.26316, Test Loss: 0.10580
Epoch 9/20, Train Loss: 0.26129, Test Loss: 0.11261
Epoch 10/20, Train Loss: 0.26253, Test Loss: 0.10688
Epoch 11/20, Train Loss: 0.26158, Test Loss: 0.12759
Epoch 12/20, Train Loss: 0.26650, Test Loss: 0.12385
Epoch 13/20, Train Loss: 0.26620, Test Loss: 0.13076
Epoch 14/20, Train Loss: 0.28125, Test Loss: 0.11471
Epoch 15/20, Train Loss: 0.26777, Test Loss: 0.12140
Epoch 16/20, Train Loss: 0.25999, Test Loss: 0.11195
Epoch 17/20, Train Loss: 0.25792, Test Loss: 0.12045
Epoch 18/20, Train Loss: 0.25674, Test Loss: 0.11575
Epoch 19/20, Train Loss: 0.25594, Test Loss: 0.11897
Ep

## Evaluating & Visualizating


In [None]:
# torch.manual_seed(42)
model.eval()
all_pred = []
all_label = []
with torch.inference_mode():
    for X_batch, y_batch in test_loader:
      y_pred = model(X_batch)
      all_pred.extend(torch.round(y_pred.squeeze()).cpu().numpy())
      all_label.extend(y_batch.cpu().numpy())
# len(all_pred), len(all_label), all_pred[:10], all_pred[:10]

corect = 0

for i in range(len(all_pred)):
  if all_pred[i] == all_label[i]:
    corect += 1
    print(f"\nPredicted: {all_pred[i]}, True: {all_label[i]}")
    print("\033[92mgreen\033[0m")
  else:
    print(f"\nPredicted: {all_pred[i]}, True: {all_label[i]}")
    print("\033[91mred\033[0m")

accuracy = (corect * 100)/len(all_pred)
accuracy

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 1.0, True: 1.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 1.0
[91mred[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 1.0
[91mred[0m

Predicted: 1.0, True: 0.0
[91mred[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, True: 0.0
[92mgreen[0m

Predicted: 0.0, T

84.76506357103372