In [None]:
# ***** PYTORCH DOCUMENTATION - EASE TO IMPLEMENTATION, FIND METHODS/FUNCTIONS SUITABLE FOR TASK ******

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Load both splits directly from their raw GitHub URLs so the notebook does not
# depend on any local files. This cell needs network access every time it runs.
url_train = 'https://raw.githubusercontent.com/ElsieElsevier/BIAI/refs/heads/main/training_data.csv'
url_test = 'https://raw.githubusercontent.com/ElsieElsevier/BIAI/refs/heads/main/test_data.csv'

# Raw, unprocessed frames. Later cells read these module-level names
# (df_train / df_test) for exploration and as the input to main().
df_train = pd.read_csv(url_train)
df_test = pd.read_csv(url_test)

In [None]:
# DATA EXPLORATION
stars = '*************************************************'

# (lower-case split name, frame) pairs, in the order they are reported
splits = (('train', df_train), ('test', df_test))

# Column names of each dataset, followed by a single separator
for split_name, frame in splits:
    print(f"{split_name.capitalize()} dataset columns: {frame.columns.tolist()}")
print(stars)

# dtypes and non-null counts of each dataset
for heading, frame in (("Training Data Info:", df_train), ("Test Data Info:", df_test)):
    print(heading)
    frame.info()
    print(stars)

# Number of missing values per column
for split_name, frame in splits:
    print(f"Missing values in {split_name} dataset:")
    print(frame.isnull().sum())
    print(stars)

# Number of distinct values per column
for split_name, frame in splits:
    print(f"Unique values in {split_name} dataset:")
    for column in frame.columns:
        unique_count = frame[column].nunique()
        print(f"{column}: {unique_count} unique values")
    print(stars)

# The distinct values themselves, for selected columns of the training dataset
for column in ('workclass', 'occupation', 'relationship',
               'native_country', 'marital_status', 'income_bracket'):
    print(f"Unique values in '{column}' column of train dataset: {df_train[column].unique()}")
    print(stars)

Train dataset columns: ['Unnamed: 0', 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']
Test dataset columns: ['Unnamed: 0', 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']
*************************************************
Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      32561 non-null  int64 
 1   age             32561 non-null  int64 
 2   workclass       32561 non-null  object
 3   fnlwgt          32561 non-null  int64 
 4   education       32561 non-null  object
 5   education

In [None]:
# DATA PRE-PROCESSING PART 1

# Column groups used by process_features(). Kept at module level so the
# pre-processing code refers to a single list per column type.

# Categorical columns: process_features() strips whitespace from these,
# turns '?' into NaN and fills missing values with the most frequent value.
categorical_features = [
    'workclass',
    'marital_status',
    'occupation',
    'relationship',
    'race', 'gender',
    'native_country'
    ]

# Numeric columns: process_features() mean-imputes and standard-scales these.
numerical_features = [
    'age',
    #'fnlwgt', # left out on purpose: using fnlwgt actually hurts our predictions
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week'
    ]

def clean_train_data(df_train):
    """Return the training frame without its index, education and fnlwgt columns.

    The first column is removed by position (whatever it is called); 'education'
    and 'fnlwgt' are removed by name. A new frame is returned and the input is
    left unchanged.
    """
    index_column = df_train.columns[0]
    return (
        df_train
        .drop(columns=index_column)
        .drop(columns='education')
        .drop(columns='fnlwgt')  # Re-implement if needed for training
    )

def clean_test_data(df_test):
    """Return the test frame without the junk header row and unused columns.

    Rows whose 'age' value (viewed as text) contains 'Cross validator' are
    removed, then the first column (by position), 'education' and 'fnlwgt' are
    dropped. A new frame is returned; the surviving rows keep their original
    index labels.
    """
    # Flag the '|1x3 Cross validator' row that sits in the 'age' column
    is_junk_row = df_test['age'].astype(str).str.contains('Cross validator')
    kept_rows = df_test.loc[~is_junk_row]

    index_column = kept_rows.columns[0]
    return (
        kept_rows
        .drop(columns=index_column)
        .drop(columns='education')
        .drop(columns='fnlwgt')  # Re-implement if needed for training
    )

def process_features(df, scaler=None, train_columns=None):
    """Impute, scale and encode one cleaned dataset.

    The same function handles both splits; the two optional arguments tell it
    which one it is looking at.

    Args:
        df: Cleaned frame containing the columns listed in the module-level
            categorical_features / numerical_features lists plus
            'income_bracket'. The frame is copied, never modified in place.
        scaler: None for training data (a new StandardScaler is fitted), or the
            scaler fitted on the training data, which is then only applied.
        train_columns: None for training data, or the one-hot column index
            produced by the training call, used to align the test columns.

    Returns:
        (df, scaler, train_columns) when train_columns is None (training data),
        (df, scaler) otherwise (test data).
    """
    import warnings

    # Work on a private copy so the caller's frame is never mutated.
    df = df.copy()

    # 1. Remove whitespace and replace '?' with NaN in the categorical columns
    for feature in categorical_features:
        df[feature] = df[feature].str.strip().replace('?', np.nan)

    # 2. Clean the label: strip whitespace and remove periods.
    #    regex=False makes '.' a literal dot on every pandas version; as a
    #    regular expression it would match any character.
    df['income_bracket'] = (
        df['income_bracket'].str.strip().str.replace('.', '', regex=False)
    )

    # 3. Map the label to 0/1. The values were stripped above, so only the
    #    unpadded spellings can occur here.
    label_map = {
        '<=50K': 0,
        '>50K': 1
    }
    mapped_labels = df['income_bracket'].map(label_map)
    unmapped_count = int(mapped_labels.isna().sum())
    if unmapped_count:
        # Keep the original fallback (class 0) but no longer do it silently.
        warnings.warn(
            f"{unmapped_count} row(s) have a missing or unrecognised "
            "income_bracket and were labelled 0",
            stacklevel=2,
        )
    df['income_bracket'] = mapped_labels.fillna(0)

    # 4. Impute missing values: mode for categorical, mean for numeric.
    #    NOTE(review): the imputers are re-fitted on whichever frame is passed
    #    in, so test data is imputed from test statistics. Fitting on train
    #    only would require passing the fitted imputers in, which the current
    #    signature does not allow.
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    numerical_imputer = SimpleImputer(strategy='mean')
    df[categorical_features] = categorical_imputer.fit_transform(df[categorical_features])
    df[numerical_features] = numerical_imputer.fit_transform(df[numerical_features])

    # 5. Scale numeric features
    if scaler is None:
        # Training data: create and fit a new scaler
        scaler = StandardScaler()
        df[numerical_features] = scaler.fit_transform(df[numerical_features])
    else:
        # Test data: only apply the scaler fitted on the training data
        df[numerical_features] = scaler.transform(df[numerical_features])

    # 6. Encode gender as integers (the encoder is re-fitted on each frame)
    le_gender = LabelEncoder()
    df['gender'] = le_gender.fit_transform(df['gender'])

    # 7. One-hot encode the remaining categorical features
    onehot_features = [
        'workclass',
        'marital_status',
        'occupation',
        'relationship',
        'race',
        'native_country'
        ]

    if train_columns is None:
        # Training data: drop_first=True avoids multicollinearity, and the
        # resulting column index becomes the reference layout for test data.
        df_encoded = pd.get_dummies(df[onehot_features], prefix=onehot_features, drop_first=True)
        train_columns = df_encoded.columns

        # Replace the original categorical columns with the encoded ones
        df = pd.concat([df.drop(columns=onehot_features), df_encoded], axis=1)
        return df, scaler, train_columns

    # Test data: encode every level, then force the training layout
    df_encoded = pd.get_dummies(df[onehot_features], prefix=onehot_features)

    # Levels seen in training but absent here become all-zero columns
    for col in train_columns:
        if col not in df_encoded.columns:
            df_encoded[col] = 0
    # Selecting by train_columns drops extra levels and matches the column order
    df_encoded = df_encoded[train_columns]

    # Replace the original categorical columns with the encoded ones
    df = pd.concat([df.drop(columns=onehot_features), df_encoded], axis=1)
    return df, scaler

In [None]:
# DATA PRE-PROCESSING PART 2

def neural_network_prep(df_train, df_test):
    """Turn the raw train/test frames into float32 arrays for the network.

    Returns X_train, y_train, X_test, y_test, in that order. The raw frames are
    copied before cleaning, so they are not modified.
    """
    label_column = 'income_bracket'

    def _to_float32_arrays(frame):
        """Split a processed frame into (features, labels) float32 arrays."""
        features = frame.drop(label_column, axis=1).values
        labels = frame[label_column].values
        return features, labels

    # 1. Clean copies of both datasets
    cleaned_train = clean_train_data(df_train.copy())
    cleaned_test = clean_test_data(df_test.copy())

    # 2. Training pass returns three items: frame, fitted scaler, one-hot columns
    train_frame, scaler, train_columns = process_features(cleaned_train)

    # Test pass returns only two items, so just take the frame by index
    test_frame = process_features(
        cleaned_test, scaler=scaler, train_columns=train_columns
    )[0]

    # 3. Separate features and labels (the splits already exist, so no
    #    scikit-learn train/test split is needed)
    X_train, y_train = _to_float32_arrays(train_frame)
    X_test, y_test = _to_float32_arrays(test_frame)

    # 4. Convert to float32 for efficiency/speed
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
    y_train, y_test = y_train.astype(np.float32), y_test.astype(np.float32)

    # 5. Print a sanity report for each split
    for features, labels, split_name in (
        (X_train, y_train, "Training"),
        (X_test, y_test, "Test"),
    ):
        verify_data(features, labels, split_name)

    return X_train, y_train, X_test, y_test


def verify_data(X, y, name):
    """Print a short sanity report for one dataset split.

    Reports the shapes, dtypes, presence of NaN, feature value range and the
    distinct label values of the given arrays. Nothing is returned.
    """
    def _report_lines():
        # Lazy generator: each line is computed just before it is printed
        yield f"\n{name} Dataset Verification:"
        yield f"Feature shape: {X.shape}"
        yield f"Label shape: {y.shape}"
        yield f"Feature dtype: {X.dtype}"
        yield f"Label dtype: {y.dtype}"
        yield f"Any NaN in features: {np.isnan(X).any()}"
        yield f"Any NaN in labels: {np.isnan(y).any()}"
        yield f"Feature value range: [{X.min():.3f}, {X.max():.3f}]"
        yield f"Unique label values: {np.unique(y)}"

    for line in _report_lines():
        print(line)

In [None]:
# DATA MODELING USING PYTORCH

class IncomeDataset(Dataset):
    """Dataset of (feature row, label) pairs for income prediction.

    Features are stored as a float32 tensor; labels are stored as a float32
    column tensor of shape (n, 1).
    """

    def __init__(self, X, y):
        # Feature matrix as a float32 tensor
        self.X = torch.tensor(X, dtype=torch.float32)
        # Labels as floats, reshaped to one column per sample
        label_values = y.astype(float)
        self.y = torch.tensor(label_values, dtype=torch.float32).reshape(-1, 1)

    def __len__(self):
        # One sample per feature row
        return self.X.shape[0]

    def __getitem__(self, idx):
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# 2. Create a neural network class
class IncomeClassifier(nn.Module):
    """Feed-forward binary classifier: three hidden blocks and a sigmoid output.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. The output
    layer is a single unit followed by a sigmoid.
    """

    def __init__(self, input_size):
        # Initialise nn.Module so the layers assigned below get registered
        super().__init__()

        # (output width, dropout probability) of each hidden block, in order
        hidden_spec = [(128, 0.4), (64, 0.3), (32, 0.2)]

        layers = []
        in_features = input_size
        for out_features, drop_p in hidden_spec:
            layers += [
                nn.Linear(in_features, out_features),
                nn.BatchNorm1d(out_features),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ]
            in_features = out_features

        # Output layer: one unit, sigmoid because this is binary classification
        layers += [nn.Linear(in_features, 1), nn.Sigmoid()]

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through every layer in order and return the result."""
        return self.model(x)

In [None]:
# ***** PYTORCH DOCUMENTATION - EASE TO IMPLEMENTATION, FIND METHODS/FUNCTIONS SUITABLE FOR TASK ******

def train_model(model, train_loader, val_loader, criterion, optimizer, max_epochs=50, patience=10, min_delta=1e-5):
    """Train `model` with early stopping and leave it holding its best weights.

    Args:
        model: PyTorch model whose outputs are probabilities (they are
            thresholded at 0.5 for the accuracy figures).
        train_loader: Iterable of (inputs, labels) training batches; must
            support len().
        val_loader: Iterable of (inputs, labels) validation batches; must
            support len().
        criterion: Loss function called as criterion(outputs, labels).
        optimizer: Optimizer already bound to the model's parameters.
        max_epochs: Maximum number of epochs to train.
        patience: Number of consecutive epochs without improvement after
            which training stops.
        min_delta: Minimum decrease in validation loss that counts as an
            improvement.

    Returns:
        (train_losses, val_losses): per-epoch losses, each the mean of the
        per-batch losses of that epoch.

    On return the model is in eval mode and holds the weights of the epoch with
    the lowest validation loss, whether training stopped early or ran for
    max_epochs.
    """
    best_val_loss = float('inf')  # Any finite first loss counts as an improvement
    patience_counter = 0
    best_model_state = None
    train_losses = []
    val_losses = []

    for epoch in range(max_epochs):
        # ---- Training phase ----
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            outputs = model(inputs)  # Forward propagation
            loss = criterion(outputs, labels)
            optimizer.zero_grad()  # Clear gradients left over from the previous step
            loss.backward()  # Calculate gradients
            optimizer.step()  # Update the model from the gradients

            # Running accuracy and loss
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            total_loss += loss.item()

        epoch_loss = total_loss / len(train_loader)
        epoch_acc = 100 * correct / total
        train_losses.append(epoch_loss)

        # ---- Validation phase ----
        model.eval()  # Disables dropout and uses BatchNorm running statistics
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():  # No gradients needed for evaluation
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                predicted = (outputs > 0.5).float()
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_epoch_loss = val_loss / len(val_loader)
        val_epoch_acc = 100 * val_correct / val_total
        val_losses.append(val_epoch_loss)

        # Print epoch statistics
        print(f'Epoch [{epoch+1}/{max_epochs}]: '
              f'Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}%, '
              f'Val Loss: {val_epoch_loss:.4f}, Val Acc: {val_epoch_acc:.2f}%')

        # ---- Early stopping ----
        if val_epoch_loss < (best_val_loss - min_delta):
            # Improvement: remember it and restart the patience count
            best_val_loss = val_epoch_loss
            patience_counter = 0
            # state_dict() tensors share storage with the live parameters, so a
            # shallow dict copy would be overwritten by later optimizer steps.
            # Clone every tensor to get a real snapshot.
            best_model_state = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }
        else:
            # No improvement: count it, and stop once patience is used up
            patience_counter += 1
            if patience_counter >= patience:
                print(f'\nEarly stopping triggered after {epoch+1} epochs')
                break

    # Restore the best weights on every exit path. The snapshot is None only if
    # no epoch ever improved on infinity (e.g. NaN losses or max_epochs == 0).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return train_losses, val_losses



In [None]:
def main():
    '''Train the income classifier with a validation split and report test accuracy.

    Reads the module-level df_train / df_test frames, pre-processes them, holds
    out the last 20% of the training rows for validation, trains with early
    stopping and prints the accuracy on the test set. Returns nothing.
    '''
    # Pre-processed float32 arrays for both splits
    X_train, y_train, X_test, y_test = neural_network_prep(df_train, df_test)

    # Split training data into train and validation sets (80-20 split).
    # The validation part drives early stopping; the real test set is only
    # used for the final score. The split is a plain slice, so it relies on
    # the row order of the file rather than on a random shuffle.
    train_size = int(0.8 * len(X_train))
    X_train_split = X_train[:train_size]
    y_train_split = y_train[:train_size]
    X_val = X_train[train_size:]
    y_val = y_train[train_size:]

    # Wrap each split in the custom Dataset
    train_dataset = IncomeDataset(X_train_split, y_train_split)
    val_dataset = IncomeDataset(X_val, y_val)
    test_dataset = IncomeDataset(X_test, y_test)

    # Only the training loader is shuffled. Shuffling evaluation data does not
    # change the loss or accuracy, so it is switched off for validation/test.
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    input_size = X_train.shape[1]  # Number of feature columns
    model = IncomeClassifier(input_size)

    # Binary cross-entropy on the sigmoid output, optimised with Adam
    # (no momentum argument; for SGD one would pass 'momentum=0.9')
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Train the model with early stopping
    train_losses, val_losses = train_model(
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        max_epochs=50,  # Upper bound; early stopping usually ends training sooner
        patience=10,  # Stop after 10 epochs without improvement
        min_delta=1e-5  # Minimum change in validation loss to qualify as an improvement
    )

    # Evaluate the final model on the test set
    model.eval()  # Disables dropout and uses BatchNorm running statistics
    with torch.no_grad():  # No gradients needed for evaluation
        correct = 0
        total = 0
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()  # Threshold for binary classification
            total += len(labels)
            # Flatten both sides so the element-wise comparison lines up
            correct += (predicted.view(-1) == labels.view(-1)).sum().item()

        accuracy = 100 * correct / total
        print(f'\nFinal Test Accuracy: {accuracy:.2f}%')



# Entry-point guard: __name__ equals "__main__" when this code is run directly
# (a notebook cell or `python file.py`) but not when it is imported as a module,
# so importing these definitions elsewhere does not start a training run.
if __name__ == "__main__":
    main() # Call main function to run everything


Training Dataset Verification:
Feature shape: (32561, 81)
Label shape: (32561,)
Feature dtype: float32
Label dtype: float32
Any NaN in features: False
Any NaN in labels: False
Feature value range: [-3.530, 13.395]
Unique label values: [0. 1.]

Test Dataset Verification:
Feature shape: (16281, 81)
Label shape: (16281,)
Feature dtype: float32
Label dtype: float32
Any NaN in features: False
Any NaN in labels: False
Feature value range: [-3.530, 13.395]
Unique label values: [0. 1.]
Epoch [1/50]: Train Loss: 0.3517, Train Acc: 83.29%, Val Loss: 0.3152, Val Acc: 85.29%
Epoch [2/50]: Train Loss: 0.3361, Train Acc: 84.31%, Val Loss: 0.3150, Val Acc: 85.37%
Epoch [3/50]: Train Loss: 0.3298, Train Acc: 84.51%, Val Loss: 0.3276, Val Acc: 84.83%
Epoch [4/50]: Train Loss: 0.3292, Train Acc: 84.69%, Val Loss: 0.3125, Val Acc: 85.81%
Epoch [5/50]: Train Loss: 0.3263, Train Acc: 85.02%, Val Loss: 0.3173, Val Acc: 85.58%
Epoch [6/50]: Train Loss: 0.3231, Train Acc: 84.98%, Val Loss: 0.3158, Val Acc: 8