### Deep Learning Practical Assignment — Adult Income Dataset

### Part 0 — Data Preparation

### Load the dataset, clean it (replace '?' with NaN and drop incomplete rows), split it 70/15/15, and preprocess it (scale numeric features, one-hot encode categorical features)

In [None]:
import numpy as np, pandas as pd, os, joblib, matplotlib.pyplot as plt, time
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers, callbacks

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# batch shuffling and dropout masks are repeatable after Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

print('Loading Adult dataset from OpenML (this requires internet)...')
adult = fetch_openml(name='adult', version=2, as_frame=True)
print('Raw shape:', adult.frame.shape)

# Treat '?' as missing and drop incomplete rows. A new frame is built instead
# of mutating in place, so re-running this cell always starts from the raw data.
df = adult.frame.replace('?', np.nan).dropna().reset_index(drop=True)
print('After dropna shape:', df.shape)

# Prepare features and binary target (1 = income above 50K, 0 = otherwise)
X = df.drop(columns='class')
y = df['class'].astype(str).str.strip().str.startswith('>50K').astype(int)

# Train/val/test splits (70% / 15% / 15%), stratified on the target.
# The test split is carved out first; the validation fraction is then expressed
# relative to the remaining 85% so that it ends up as 15% of the full data.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=SEED, stratify=y
)
relative_val_size = 0.15 / 0.85
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=relative_val_size, random_state=SEED, stratify=y_temp
)
assert len(X_train) + len(X_val) + len(X_test) == len(X), 'splits must cover every row exactly once'
print('Splits shapes -> train:', X_train.shape, 'val:', X_val.shape, 'test:', X_test.shape)

# Identify column types. 'number' matches every numeric dtype rather than only
# int64/float64, so the selection does not depend on how the loader typed them.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
# ColumnTransformer drops unlisted columns by default; fail loudly instead.
assert set(numeric_features) | set(categorical_features) == set(X.columns), \
    'some feature columns are neither numeric nor categorical and would be dropped'
print('Numeric features:', numeric_features)
print('Categorical features count:', len(categorical_features))

# Preprocessor: scale numeric, one-hot categorical
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
])

# Fit on the training split only so no validation/test statistics leak in.
print('Fitting preprocessor on training data...')
preprocessor.fit(X_train)

# Keras expects float32 inputs and targets.
X_train_p = preprocessor.transform(X_train).astype(np.float32)
X_val_p   = preprocessor.transform(X_val).astype(np.float32)
X_test_p  = preprocessor.transform(X_test).astype(np.float32)

y_train = np.asarray(y_train, dtype=np.float32)
y_val   = np.asarray(y_val, dtype=np.float32)
y_test  = np.asarray(y_test, dtype=np.float32)

print('Preprocessed shapes ->', X_train_p.shape, X_val_p.shape, X_test_p.shape)

# Save the fitted preprocessor next to the experiment outputs
os.makedirs('adult_results', exist_ok=True)
joblib.dump(preprocessor, 'adult_results/preprocessor.joblib')
print('Preprocessor saved to adult_results/preprocessor.joblib')


Loading Adult dataset from OpenML (this requires internet)...
Raw shape: (48842, 15)
After dropna shape: (45222, 15)
Splits shapes -> train: (31654, 14) val: (6784, 14) test: (6784, 14)
Numeric features: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical features count: 8
Fitting preprocessor on training data...
Preprocessed shapes -> (31654, 104) (6784, 104) (6784, 104)
Preprocessor saved to adult_results/preprocessor.joblib


### Helper: model builder and training function (Keras).

In [None]:
# Shared helpers used by every experiment part

def build_mlp(input_dim, hidden_layers=(128, 64), l2=0.0, dropout=0.0):
    """Build an uncompiled fully connected binary classifier.

    Args:
        input_dim: Number of input features per sample.
        hidden_layers: Iterable with the width of each hidden ReLU layer, in
            order. The default is a tuple so the default value can never be
            mutated between calls.
        l2: L2 penalty coefficient applied to the kernel of every hidden
            layer. ``0`` attaches no regularizer at all.
        dropout: Dropout rate inserted after every hidden layer; values
            ``<= 0`` add no Dropout layer.

    Returns:
        A ``keras.Sequential`` model ending in a single sigmoid unit.
    """
    model = keras.Sequential()
    model.add(layers.Input(shape=(input_dim,)))
    for units in hidden_layers:
        # A zero coefficient contributes nothing to the loss, so skip the
        # regularizer object entirely in that case.
        penalty = regularizers.l2(l2) if l2 else None
        model.add(layers.Dense(units, activation='relu',
                               kernel_regularizer=penalty))
        if dropout > 0:
            model.add(layers.Dropout(dropout))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model

# Training function
def train_model(model, optimizer, X_train, y_train, X_val, y_val,
                X_test=None, y_test=None,
                batch_size=64, epochs=20, verbose=1, use_early_stopping=False):
    """Compile and fit ``model``, then report train/val/test accuracy.

    Args:
        model: Uncompiled Keras model; it is compiled here with binary
            cross-entropy loss and an accuracy metric.
        optimizer: Optimizer passed to ``model.compile``.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets, evaluated every epoch.
        X_test, y_test: Optional held-out split. Both must be given together;
            when omitted, the returned ``test_acc`` is ``None``.
        batch_size: Mini-batch size passed to ``model.fit``.
        epochs: Maximum number of epochs.
        verbose: Verbosity level forwarded to ``model.fit``.
        use_early_stopping: If True, stop once ``val_loss`` has not improved
            for 6 epochs and restore the best weights.

    Returns:
        ``(history, train_acc, val_acc, test_acc)``. Without early stopping
        the train/val accuracies are those of the last epoch. With early
        stopping they are those of the epoch with the lowest ``val_loss``,
        i.e. the epoch whose weights ``restore_best_weights`` brings back,
        so they describe the same model that ``test_acc`` is measured on.

    Raises:
        ValueError: If only one of ``X_test`` / ``y_test`` is provided.
    """
    if (X_test is None) != (y_test is None):
        raise ValueError('X_test and y_test must be provided together')

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    cbs = []
    if use_early_stopping:
        cbs.append(callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True))

    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks=cbs,
                        verbose=verbose)

    logs = history.history
    report_epoch = -1  # last epoch by default
    if use_early_stopping:
        # The final epochs of an early-stopped run belong to weights that were
        # discarded, so report the best epoch instead. min() returns the first
        # minimum, matching EarlyStopping's strict-improvement rule.
        val_losses = logs['val_loss']
        report_epoch = min(range(len(val_losses)), key=val_losses.__getitem__)

    train_acc = logs['accuracy'][report_epoch]
    val_acc   = logs['val_accuracy'][report_epoch]
    test_acc  = None
    if X_test is not None:
        # evaluate() returns [loss, accuracy] for the metrics compiled above.
        test_acc = model.evaluate(X_test, y_test, verbose=0)[1]

    return history, train_acc, val_acc, test_acc


# NOTE: the Part 1 optimizer experiment (settings, training loop, plots and CSV
# export) lives in the "Part 1 — Optimizers" cell below. An identical copy used
# to sit here, which trained every model twice and overwrote the same output
# files; this cell now only defines the shared helpers.


## Part 1 — Optimizers
Train the same model with SGD, SGD+momentum, and Adam. Plot curves and report train/val/test accuracies.

In [8]:
# build_mlp is defined once in the helper cell above ("Helper: model builder and
# training function"). The verbatim copy that used to be here was removed so a
# later definition cannot silently shadow the shared helper.

# train_model is defined once in the helper cell above ("Helper: model builder
# and training function"). The verbatim copy that used to be here was removed so
# a later definition cannot silently shadow the shared helper.


# Shared experiment configuration (input_dim and hidden are reused by later parts)
input_dim = X_train_p.shape[1]
hidden = [128,64]
epochs = 20
batch_size = 64

results = []      # one summary row per optimizer
histories = {}    # optimizer name -> Keras History object

# Candidate optimizers; each instance is used for exactly one model.
optimizers = {
    'SGD': keras.optimizers.SGD(learning_rate=0.01),
    'SGD_momentum': keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
    'Adam': keras.optimizers.Adam(learning_rate=0.001)
}

# Train one fresh, identically shaped network per optimizer
for name, opt in optimizers.items():
    print('\nTraining with', name)
    model = build_mlp(input_dim, hidden_layers=hidden, l2=0.0, dropout=0.0)
    hist, tr_acc, val_acc, test_acc = train_model(
        model, opt,
        X_train_p, y_train,
        X_val_p, y_val,
        X_test=X_test_p, y_test=y_test,
        batch_size=batch_size, epochs=epochs, verbose=1,
    )
    histories[name] = hist

    row = {
        'part': 'optimizers',
        'method': name,
        'train_acc': tr_acc,
        'val_acc': val_acc,
        'test_acc': test_acc,
    }
    results.append(row)

    # Accuracy curves go to disk; the figure is closed so it is not rendered inline
    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(hist.history['accuracy'], label='train acc')
    ax.plot(hist.history['val_accuracy'], label='val acc')
    ax.set_title(f'{name} — accuracy')
    ax.set_xlabel('epoch')
    ax.set_ylabel('accuracy')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(f'adult_results/optim_{name}.png')
    plt.close(fig)

# Persist the per-optimizer summary table
part1_table = pd.DataFrame(results)
part1_table.to_csv('adult_results/part1_optimizers_results.csv', index=False)
print('\nPart1 results saved to adult_results/part1_optimizers_results.csv')



Training with SGD
Epoch 1/20
[1m495/495[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7765 - loss: 0.4527 - val_accuracy: 0.8252 - val_loss: 0.3803
Epoch 2/20
[1m495/495[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8351 - loss: 0.3603 - val_accuracy: 0.8361 - val_loss: 0.3540
Epoch 3/20
[1m495/495[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8401 - loss: 0.3441 - val_accuracy: 0.8398 - val_loss: 0.3446
Epoch 4/20
[1m495/495[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8441 - loss: 0.3356 - val_accuracy: 0.8438 - val_loss: 0.3379
Epoch 5/20
[1m495/495[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8466 - loss: 0.3298 - val_accuracy: 0.8436 - val_loss: 0.3337
Epoch 6/20
[1m495/495[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8484 - loss: 0.3257 - val_accuracy: 0.8436 - val_loss: 0.3305
Epoch 7/2

## Part 2 — Batch Size
Train the model with different batch sizes and record results.

In [9]:
# Part 2 experiments (use Adam optimizer)

# Write next to the other outputs: Part 0 creates the relative 'adult_results'
# folder, whereas the absolute Colab path only exists when cwd is /content.
RESULTS_DIR = 'adult_results'
os.makedirs(RESULTS_DIR, exist_ok=True)

batch_sizes = [1, 32, 128, 1024]
results_bs = []
for bs in batch_sizes:
    print('\nTraining with batch size =', bs)
    model = build_mlp(input_dim, hidden_layers=hidden)
    # A fresh optimizer per model, and the held-out test split is passed so
    # that test_acc is actually measured instead of being recorded as None.
    hist, tr_acc, val_acc, test_acc = train_model(
        model, optimizer=keras.optimizers.Adam(learning_rate=0.001),
        X_train=X_train_p, y_train=y_train,
        X_val=X_val_p, y_val=y_val,
        X_test=X_test_p, y_test=y_test,
        batch_size=bs, epochs=12, verbose=1
    )
    results_bs.append({'part':'batch_size','batch_size':bs,'train_acc':tr_acc,'val_acc':val_acc,'test_acc':test_acc})

    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(hist.history['accuracy'], label='train acc')
    ax.plot(hist.history['val_accuracy'], label='val acc')
    ax.set_title(f'Batch size {bs} — accuracy')
    ax.set_xlabel('epoch'); ax.set_ylabel('accuracy')
    ax.legend(); ax.grid(True)
    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, f'bs_{bs}_acc.png'))
    plt.close(fig)

part2_csv = os.path.join(RESULTS_DIR, 'part2_batchsize_results.csv')
pd.DataFrame(results_bs).to_csv(part2_csv, index=False)
print('\nPart2 results saved to', part2_csv)


Training with batch size = 1
Epoch 1/12
[1m31654/31654[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 1ms/step - accuracy: 0.8459 - loss: 0.3313 - val_accuracy: 0.8499 - val_loss: 0.3299
Epoch 2/12
[1m31654/31654[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 2ms/step - accuracy: 0.8544 - loss: 0.3192 - val_accuracy: 0.8514 - val_loss: 0.3258
Epoch 3/12
[1m31654/31654[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 2ms/step - accuracy: 0.8578 - loss: 0.3174 - val_accuracy: 0.8520 - val_loss: 0.3203
Epoch 4/12
[1m31654/31654[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 2ms/step - accuracy: 0.8577 - loss: 0.3126 - val_accuracy: 0.8560 - val_loss: 0.3188
Epoch 5/12
[1m31654/31654[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3ms/step - accuracy: 0.8588 - loss: 0.3098 - val_accuracy: 0.8522 - val_loss: 0.3211
Epoch 6/12
[1m31654/31654[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 2ms/step - accuracy: 0.8594 - loss: 0.3085 - val_accu

## Part 3 — Overfitting & Regularization
Train a large model to provoke overfitting, then apply L2 and Dropout.

In [10]:
# Part3 experiments: Large model, L2, Dropout

# Write next to the other outputs: Part 0 creates the relative 'adult_results'
# folder, whereas the absolute Colab path only exists when cwd is /content.
RESULTS_DIR = 'adult_results'
os.makedirs(RESULTS_DIR, exist_ok=True)

large_hidden = [512,256,128]
results_reg = []
reg_models = {}      # setting name -> trained model
reg_histories = {}   # setting name -> Keras History

# (setting name, log label, plot title, L2 coefficient, dropout rate)
reg_settings = [
    ('large_no_reg',  'Large model (no regularization)', 'Large no reg',    0.0,  0.0),
    ('large_l2',      'Large model + L2',                'Large + L2',      1e-4, 0.0),
    ('large_dropout', 'Large model + Dropout',           'Large + Dropout', 0.0,  0.5),
]

# The three runs differ only in their regularization, so they share one loop.
for setting, label, title, l2_coef, dropout_rate in reg_settings:
    print('\n' + label)
    reg_model = build_mlp(input_dim, hidden_layers=large_hidden, l2=l2_coef, dropout=dropout_rate)
    # The held-out test split is passed so that test_acc is actually measured
    # instead of being recorded as None.
    reg_hist, reg_tr, reg_val, reg_test = train_model(
        reg_model, optimizer=keras.optimizers.Adam(1e-3),
        X_train=X_train_p, y_train=y_train,
        X_val=X_val_p, y_val=y_val,
        X_test=X_test_p, y_test=y_test,
        batch_size=128, epochs=25, verbose=1
    )
    reg_models[setting] = reg_model
    reg_histories[setting] = reg_hist
    results_reg.append({'setting':setting,'train_acc':reg_tr,'val_acc':reg_val,'test_acc':reg_test})

    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(reg_hist.history['accuracy'], label='train')
    ax.plot(reg_hist.history['val_accuracy'], label='val')
    ax.set_title(title)
    ax.set_xlabel('epoch'); ax.set_ylabel('accuracy')
    ax.legend(); ax.grid(True)
    fig.savefig(os.path.join(RESULTS_DIR, f'{setting}.png'))
    plt.close(fig)

part3_csv = os.path.join(RESULTS_DIR, 'part3_regularization_results.csv')
pd.DataFrame(results_reg).to_csv(part3_csv, index=False)
print('\nPart3 results saved to', part3_csv)


Large model (no regularization)
Epoch 1/25
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8414 - loss: 0.3358 - val_accuracy: 0.8523 - val_loss: 0.3188
Epoch 2/25
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8552 - loss: 0.3132 - val_accuracy: 0.8494 - val_loss: 0.3247
Epoch 3/25
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8584 - loss: 0.3079 - val_accuracy: 0.8488 - val_loss: 0.3173
Epoch 4/25
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8605 - loss: 0.3013 - val_accuracy: 0.8508 - val_loss: 0.3178
Epoch 5/25
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8635 - loss: 0.2965 - val_accuracy: 0.8522 - val_loss: 0.3172
Epoch 6/25
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8653 - loss: 0.2901 - val_accuracy: 0.8482 - val_loss: 0.

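## Part 4 — Early Stopping
Train the same model once for a long fixed number of epochs and once with early stopping on the validation loss, then compare the results.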

In [11]:

# Part4: Early stopping comparison

# Write next to the other outputs: Part 0 creates the relative 'adult_results'
# folder, whereas the absolute Colab path only exists when cwd is /content.
RESULTS_DIR = 'adult_results'
os.makedirs(RESULTS_DIR, exist_ok=True)

es_hidden = [256,128,64]  # same architecture for both runs

# Without early stopping (long)
print('\nLong training without early stopping')
model_long = build_mlp(input_dim, hidden_layers=es_hidden, l2=0.0, dropout=0.0)
# The held-out test split is passed so that test_acc is actually measured
# instead of being recorded as None.
hist_long, tr_long, val_long, test_long = train_model(
    model_long, optimizer=keras.optimizers.Adam(1e-3),
    X_train=X_train_p, y_train=y_train,
    X_val=X_val_p, y_val=y_val,
    X_test=X_test_p, y_test=y_test,
    batch_size=128, epochs=80, verbose=1
)

# With early stopping: the epoch budget is only an upper bound here
print('\nTraining with Early Stopping (monitor val_loss)')
model_es = build_mlp(input_dim, hidden_layers=es_hidden, l2=0.0, dropout=0.0)
hist_es, tr_es, val_es, test_es = train_model(
    model_es, optimizer=keras.optimizers.Adam(1e-3),
    X_train=X_train_p, y_train=y_train,
    X_val=X_val_p, y_val=y_val,
    X_test=X_test_p, y_test=y_test,
    batch_size=128, epochs=200, use_early_stopping=True, verbose=1
)

# One accuracy plot per run
for es_hist, es_title, es_file in [
    (hist_long, 'Long training no ES', 'long_no_es.png'),
    (hist_es,   'Early stopping',      'early_stopping.png'),
]:
    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(es_hist.history['accuracy'], label='train')
    ax.plot(es_hist.history['val_accuracy'], label='val')
    ax.set_title(es_title)
    ax.set_xlabel('epoch'); ax.set_ylabel('accuracy')
    ax.legend(); ax.grid(True)
    fig.savefig(os.path.join(RESULTS_DIR, es_file))
    plt.close(fig)

part4_csv = os.path.join(RESULTS_DIR, 'part4_earlystopping_results.csv')
pd.DataFrame([
    {'setting':'long_no_es','train_acc':tr_long,'val_acc':val_long,'test_acc':test_long},
    {'setting':'with_es','train_acc':tr_es,'val_acc':val_es,'test_acc':test_es},
]).to_csv(part4_csv, index=False)
print('\nPart4 results saved to', part4_csv)


Long training without early stopping
Epoch 1/80
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8389 - loss: 0.3403 - val_accuracy: 0.8516 - val_loss: 0.3188
Epoch 2/80
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8533 - loss: 0.3129 - val_accuracy: 0.8502 - val_loss: 0.3169
Epoch 3/80
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8575 - loss: 0.3071 - val_accuracy: 0.8513 - val_loss: 0.3159
Epoch 4/80
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8609 - loss: 0.3018 - val_accuracy: 0.8513 - val_loss: 0.3149
Epoch 5/80
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8623 - loss: 0.2985 - val_accuracy: 0.8529 - val_loss: 0.3142
Epoch 6/80
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8634 - loss: 0.2939 - val_accuracy: 0.8507 - val_los

## Part 5 — Reflection
Summarize the results and the recommended choices. The notebook also saves the result CSVs and accuracy plots (PNGs) in the `adult_results` folder (`/content/adult_results` when run on Colab).

In [12]:
# Final summary table (combine saved CSVs if exist)
import pandas as pd, os, glob


def combine_result_csvs(results_dir, combined_name='combined_results.csv'):
    """Concatenate every per-part results CSV in ``results_dir`` into one file.

    Each input row is tagged with a ``source_file`` column holding the name of
    the CSV it came from. The combined file itself is excluded from the inputs,
    so re-running the cell does not feed the previous summary back in and
    duplicate rows. Files that cannot be read or parsed are reported and
    skipped.

    Args:
        results_dir: Directory that holds the per-part ``*.csv`` files.
        combined_name: File name of the combined CSV written to ``results_dir``.

    Returns:
        Path of the combined CSV, or ``None`` when no readable input was found
        (in which case nothing is written).
    """
    combined_path = os.path.join(results_dir, combined_name)
    frames = []
    # sorted() keeps the row order stable across runs and file systems.
    for csv_path in sorted(glob.glob(os.path.join(results_dir, '*.csv'))):
        if os.path.basename(csv_path) == combined_name:
            continue
        try:
            part_df = pd.read_csv(csv_path)
        except (OSError, ValueError) as e:
            # pandas' empty-file and parser errors are ValueError subclasses.
            print('skip', csv_path, e)
            continue
        part_df['source_file'] = os.path.basename(csv_path)
        frames.append(part_df)

    if not frames:
        return None
    pd.concat(frames, ignore_index=True, sort=False).to_csv(combined_path, index=False)
    return combined_path


# Same relative folder the experiment cells write to.
RESULTS_DIR = 'adult_results'
combined_csv = combine_result_csvs(RESULTS_DIR)
if combined_csv:
    print('Combined results saved to', combined_csv)
else:
    print('No CSV results found yet. Run the experiment cells first.')

print(f'\nAll output files (plots + CSVs) will be in {RESULTS_DIR}. Download as a zip if needed.')

Combined results saved to /content/adult_results/combined_results.csv

All output files (plots + CSVs) will be in /content/adult_results. Download as a zip if needed.


Thank you — Abdelrahman Elsaeed