In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import joblib


In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
cols = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

df = pd.read_csv(url, header=None, names=cols, na_values=" ?")
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Drop missing values
df.dropna(inplace=True)

# Strip whitespace
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].str.strip()

# Convert target to binary
df['income'] = df['income'].replace({'<=50K': 0, '>50K': 1})

# Drop unneeded columns
df = df.drop(columns=['fnlwgt'])

X = df.drop('income', axis=1)
y = df['income']


  df['income'] = df['income'].replace({'<=50K': 0, '>50K': 1})


In [4]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)


In [6]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

X_train_t = preprocessor.fit_transform(X_train)
X_val_t = preprocessor.transform(X_val)
X_test_t = preprocessor.transform(X_test)

# Save the preprocessor
joblib.dump(preprocessor, "preprocessor.joblib")

['preprocessor.joblib']

In [7]:
input_dim = X_train_t.shape[1]

model = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.25),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()


In [8]:
cb_early = callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
cb_chk = callbacks.ModelCheckpoint("best_model.h5", monitor="val_loss", save_best_only=True)

history = model.fit(
    X_train_t, y_train,
    validation_data=(X_val_t, y_val),
    epochs=30,
    batch_size=128,
    callbacks=[cb_early, cb_chk],
    verbose=2
)


Epoch 1/30




177/177 - 4s - 21ms/step - accuracy: 0.8214 - loss: 0.3822 - val_accuracy: 0.8467 - val_loss: 0.3269
Epoch 2/30




177/177 - 1s - 5ms/step - accuracy: 0.8479 - loss: 0.3267 - val_accuracy: 0.8501 - val_loss: 0.3197
Epoch 3/30
177/177 - 1s - 5ms/step - accuracy: 0.8527 - loss: 0.3198 - val_accuracy: 0.8480 - val_loss: 0.3211
Epoch 4/30
177/177 - 1s - 5ms/step - accuracy: 0.8551 - loss: 0.3144 - val_accuracy: 0.8520 - val_loss: 0.3200
Epoch 5/30
177/177 - 1s - 5ms/step - accuracy: 0.8560 - loss: 0.3124 - val_accuracy: 0.8491 - val_loss: 0.3209
Epoch 6/30




177/177 - 1s - 5ms/step - accuracy: 0.8592 - loss: 0.3101 - val_accuracy: 0.8509 - val_loss: 0.3183
Epoch 7/30
177/177 - 1s - 5ms/step - accuracy: 0.8581 - loss: 0.3074 - val_accuracy: 0.8546 - val_loss: 0.3193
Epoch 8/30
177/177 - 1s - 5ms/step - accuracy: 0.8596 - loss: 0.3049 - val_accuracy: 0.8525 - val_loss: 0.3205
Epoch 9/30
177/177 - 1s - 5ms/step - accuracy: 0.8611 - loss: 0.3035 - val_accuracy: 0.8538 - val_loss: 0.3206
Epoch 10/30
177/177 - 1s - 5ms/step - accuracy: 0.8623 - loss: 0.3011 - val_accuracy: 0.8552 - val_loss: 0.3214
Epoch 11/30
177/177 - 1s - 5ms/step - accuracy: 0.8623 - loss: 0.2998 - val_accuracy: 0.8515 - val_loss: 0.3238


In [9]:
loss, acc = model.evaluate(X_test_t, y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")

y_pred_prob = model.predict(X_test_t).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))


Test Accuracy: 0.8520
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Confusion Matrix:
 [[2637  195]
 [ 363  576]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8790    0.9311    0.9043      2832
           1     0.7471    0.6134    0.6737       939

    accuracy                         0.8520      3771
   macro avg     0.8130    0.7723    0.7890      3771
weighted avg     0.8462    0.8520    0.8469      3771



In [11]:
model.save("saved_model/income_model.keras")
print("✅ Model saved successfully.")

FileNotFoundError: [Errno 2] No such file or directory: 'saved_model/income_model.keras'

In [12]:
import os

if not os.path.exists("saved_model"):
    os.makedirs("saved_model")

model.save("saved_model/income_model.keras")
print("✅ Model saved successfully.")

✅ Model saved successfully.
