In [5]:
import numpy as np
import pandas as pd

In [6]:
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income"
]


In [18]:
train_path = "adult/adult.data"

train_df = pd.read_csv(
    train_path,
    header=None,
    names=columns,
    sep=",",
    skipinitialspace=True
)
print(train_df['age'])

0        39
1        50
2        38
3        53
4        28
         ..
32556    27
32557    40
32558    58
32559    22
32560    52
Name: age, Length: 32561, dtype: int64


In [19]:
test_path = "adult/adult.test"

test_df = pd.read_csv(
    test_path,
    header=None,
    names=columns,
    sep=",",
    skipinitialspace=True,
    skiprows=1
)
print(test_df['age'])

0        25
1        38
2        28
3        44
4        18
         ..
16276    39
16277    64
16278    38
16279    44
16280    35
Name: age, Length: 16281, dtype: int64


In [20]:
train_df["income"] = train_df["income"].str.strip()
test_df["income"] = test_df["income"].str.strip().str.replace(".", "", regex=False)


In [21]:
print(test_df['income'])
print(train_df['income'])

0        <=50K
1        <=50K
2         >50K
3         >50K
4        <=50K
         ...  
16276    <=50K
16277    <=50K
16278    <=50K
16279    <=50K
16280     >50K
Name: income, Length: 16281, dtype: object
0        <=50K
1        <=50K
2        <=50K
3        <=50K
4        <=50K
         ...  
32556    <=50K
32557     >50K
32558    <=50K
32559    <=50K
32560     >50K
Name: income, Length: 32561, dtype: object


In [22]:
train_df = train_df.replace("?", np.nan).dropna()
test_df = test_df.replace("?", np.nan).dropna()


In [23]:
for df in [train_df, test_df]:
    df["income"] = (df["income"] == ">50K").astype(int)

In [24]:
print(test_df['income'])
print(train_df['income'])

0        0
1        0
2        1
3        1
5        0
        ..
16275    0
16276    0
16278    0
16279    0
16280    1
Name: income, Length: 15060, dtype: int64
0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: income, Length: 30162, dtype: int64


In [26]:
y_train = train_df["income"].values
X_train_df = train_df.drop("income", axis=1)

y_test = test_df["income"].values
X_test_df = test_df.drop("income", axis=1)


In [None]:
X_train_df = pd.get_dummies(X_train_df)
X_test_df = pd.get_dummies(X_test_df)

X_train_df, X_test_df = X_train_df.align(
    X_test_df, join="left", axis=1, fill_value=0
)


In [28]:
X_train = X_train_df.values.T
X_test = X_test_df.values.T

y_train = y_train.reshape(1, -1)
y_test = y_test.reshape(1, -1)


In [33]:
X_train = X_train.astype(float)
X_test = X_test.astype(float)

y_train = y_train.astype(float)
y_test = y_test.astype(float)


In [29]:
def relu(Z):
    return np.maximum(0, Z)

def relu_derivative(Z):
    return (Z > 0).astype(float)

def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))


In [30]:
np.random.seed(42)

input_size = X_train.shape[0]
hidden1 = 64
hidden2 = 32
output_size = 1

W1 = np.random.randn(hidden1, input_size) * np.sqrt(2 / input_size)
b1 = np.zeros((hidden1, 1))

W2 = np.random.randn(hidden2, hidden1) * np.sqrt(2 / hidden1)
b2 = np.zeros((hidden2, 1))

W3 = np.random.randn(output_size, hidden2) * np.sqrt(2 / hidden2)
b3 = np.zeros((output_size, 1))


In [41]:
learning_rate = 0.01
epochs = 30
batch_size = 64

num_samples = X_train.shape[1]


In [42]:
for epoch in range(epochs):

    for i in range(0, num_samples, batch_size):

        X_batch = X_train[:, i:i+batch_size]
        y_batch = y_train[:, i:i+batch_size]
        m = X_batch.shape[1]

        Z1 = np.dot(W1, X_batch) + b1
        A1 = relu(Z1)

        Z2 = np.dot(W2, A1) + b2
        A2 = relu(Z2)

        Z3 = np.dot(W3, A2) + b3
        y_hat = sigmoid(Z3)

        loss = -np.mean(
            y_batch * np.log(y_hat + 1e-8) +
            (1 - y_batch) * np.log(1 - y_hat + 1e-8)
        )
        dZ3 = y_hat - y_batch
        dW3 = np.dot(dZ3, A2.T) / m
        db3 = np.sum(dZ3, axis=1, keepdims=True) / m

        dA2 = np.dot(W3.T, dZ3)
        dZ2 = dA2 * relu_derivative(Z2)
        dW2 = np.dot(dZ2, A1.T) / m
        db2 = np.sum(dZ2, axis=1, keepdims=True) / m

        dA1 = np.dot(W2.T, dZ2)
        dZ1 = dA1 * relu_derivative(Z1)
        dW1 = np.dot(dZ1, X_batch.T) / m
        db1 = np.sum(dZ1, axis=1, keepdims=True) / m

        W3 -= learning_rate * dW3
        b3 -= learning_rate * db3

        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2

        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1


    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")


Epoch 1, Loss: 0.5317
Epoch 2, Loss: 0.5317
Epoch 3, Loss: 0.5317
Epoch 4, Loss: 0.5317
Epoch 5, Loss: 0.5317
Epoch 6, Loss: 0.5317
Epoch 7, Loss: 0.5317
Epoch 8, Loss: 0.5317
Epoch 9, Loss: 0.5317
Epoch 10, Loss: 0.5317
Epoch 11, Loss: 0.5317
Epoch 12, Loss: 0.5317
Epoch 13, Loss: 0.5317
Epoch 14, Loss: 0.5317
Epoch 15, Loss: 0.5317
Epoch 16, Loss: 0.5317
Epoch 17, Loss: 0.5317
Epoch 18, Loss: 0.5317
Epoch 19, Loss: 0.5317
Epoch 20, Loss: 0.5317
Epoch 21, Loss: 0.5317
Epoch 22, Loss: 0.5317
Epoch 23, Loss: 0.5317
Epoch 24, Loss: 0.5317
Epoch 25, Loss: 0.5317
Epoch 26, Loss: 0.5317
Epoch 27, Loss: 0.5317
Epoch 28, Loss: 0.5317
Epoch 29, Loss: 0.5317
Epoch 30, Loss: 0.5317


In [43]:

Z1 = np.dot(W1, X_test) + b1
A1 = relu(Z1)

Z2 = np.dot(W2, A1) + b2
A2 = relu(Z2)

Z3 = np.dot(W3, A2) + b3
y_pred = sigmoid(Z3)

y_pred_labels = (y_pred > 0.5).astype(int)

accuracy = np.mean(y_pred_labels == y_test)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.7549136786188579
