First, let's import the clean data:

In [None]:
import pandas as pd

# Load the training data; header=0 takes the column names from the first row of the CSV.
df = pd.read_csv('data_train.csv', header=0)

# Optional quick preview of the loaded frame:
# print(df.head())

Second, let's do some exploratory data analysis:

In [None]:
import matplotlib.pyplot as plt

# Pie chart of the number and proportion of negative and positive samples.
# The last column is used as the class label (assumed to hold 0/1, as the
# later cells compare df['income'] against 0 and 1).
count = df.iloc[:, -1].value_counts()

# value_counts() orders its result by frequency, not by class value, so the
# slice labels are looked up from the index rather than hard-coded in a fixed
# order that is only correct when class 0 happens to be the majority.
class_names = {0: 'Negative', 1: 'Positive'}
slice_labels = [class_names.get(value, str(value)) for value in count.index]

count.plot(kind='pie', labels=slice_labels, autopct='%1.1f%%', shadow=True)
plt.title('Class Distribution')
plt.show()

In [None]:
# Overlay the per-class histograms of every feature column; the 'income'
# column itself is skipped because it is the class label being compared.
feature_names = [name for name in df.columns if name != 'income']
for feature in feature_names:
    for class_value, class_label in ((0, 'Negative'), (1, 'Positive')):
        class_rows = df.loc[df['income'] == class_value, feature]
        plt.hist(class_rows, bins=20, alpha=0.5, label=class_label)
    plt.title(feature)
    plt.legend(loc='upper right')
    plt.xticks(rotation=90, fontsize=8)
    plt.show()


In [None]:
import seaborn as sns

# Calculate the correlation coefficients and plot them as a heat map.
# The frame may still contain string (object) columns at this point, because
# label encoding only happens in a later cell. DataFrame.corr() in
# pandas >= 2.0 raises on non-numeric data instead of silently dropping it,
# so the input is restricted to numeric/boolean columns first.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr()

sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

At the end of the exploratory data analysis, we are familiar with the basic characteristics of the data. Next, we can start training the model.
Before training the model, the string-typed variables must be encoded as numeric variables. This is done with the LabelEncoder class from the sklearn package.

In [None]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder instance, re-fitted for each column in turn.
encoder = LabelEncoder()

# Label-encode every object-typed (string) column of df in place.
text_columns = [name for name in df if df[name].dtype == 'object']
for col in text_columns:
    as_text = df[col].astype(str)
    df[col] = encoder.fit_transform(as_text)

print(df)

After the encoding is complete, the “fnlwgt” column needs to be feature-scaled so that it is of roughly the same order of magnitude as the other columns. The min-max scaling method is used here, and the scaled values are multiplied by 100.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the 'fnlwgt' column, then multiply by 100 so the values
# span 0-100 instead of 0-1.
scaler = MinMaxScaler()
fnlwgt_unit = scaler.fit_transform(df[['fnlwgt']])
df['fnlwgt'] = fnlwgt_unit * 100

df.head()

In [None]:
# Unshuffled hold-out split: the leading 80% of the rows form the training
# set and the trailing 20% form the validation set.
split_idx = int(0.8 * len(df))

df_train, df_valid = df.iloc[:split_idx], df.iloc[split_idx:]

for part in (df_train, df_valid):
    print(part.head())

After data pre-processing, a logistic regression model with L1 regularization is trained to find the best subset of features for the model:

In [None]:

from sklearn.linear_model import LogisticRegression

# Every column except the last is a feature; the last column is the target.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

# L1-penalised logistic regression: features whose coefficient ends up at
# exactly zero are treated as dropped by the model.
clf = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=0)
clf.fit(X_train, y_train)

# Collect the names of the columns that kept a non-zero coefficient.
best_features = [
    name
    for name, weight in zip(X_train.columns, clf.coef_[0])
    if weight != 0
]

print(best_features)


It can be seen that the best feature subset is identical to the original feature set, so the original data can be used directly to train the model.
Next, the test set is imported and the same data transformation and feature scaling are applied to it, in preparation for training and evaluating the model.

In [None]:
df_test = pd.read_csv('data_test.csv', header=0)

# Feature transformation: label-encode the string columns of the test set.
# NOTE(review): the encoder is re-fitted on the test data, so the integer codes
# only match the training codes when each column contains the same set of
# categories in both files - confirm, or keep one fitted encoder per column
# from the training step and reuse it here.
for col in df_test:
    if df_test[col].dtype == 'object':
        df_test[col] = encoder.fit_transform(df_test[col].astype(str))

# Feature scaling: reuse the min/max that `scaler` learned from the training
# data. Re-fitting on the test set would map it onto its own range and make
# 'fnlwgt' inconsistent with the values the models were trained on.
df_test['fnlwgt'] = scaler.transform(df_test[['fnlwgt']]) * 100

Training models using the keras framework:

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# For each split: all columns but the last are the features (X),
# and the last column is the label (y).
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_valid, y_valid = df_valid.iloc[:, :-1], df_valid.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

In [None]:
from tensorflow.keras.optimizers import Adam
# Fully connected binary classifier: two ReLU hidden layers (64 and 32 units)
# followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile model.
# Pass the configured Adam instance itself: the string 'adam' makes Keras
# build a separate default optimizer, so the learning rate set here would be
# silently ignored as soon as it is changed from the default.
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train model; the validation split is evaluated at the end of every epoch.
history = model.fit(X_train, y_train, epochs=20, batch_size=128, validation_data=(X_valid, y_valid))

# Evaluate model on the test set.
test_loss, test_acc = model.evaluate(X_test, y_test)

print('Test accuracy:', test_acc)

Evaluate models using precision, recall and F1 scores

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Round the predicted probabilities to hard 0/1 class labels.
y_pred = model.predict(X_test)
y_pred = np.rint(y_pred).astype(int)

# Precision, recall and F1 of the Keras model on the test set.
precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print(precision, recall, f1)

The Keras model training is complete. Now let's build the same model in the PyTorch framework and check whether the accuracy improves:

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Data type transformation: turn every split into a float32 PyTorch tensor.
#
# Each split is converted exactly once; feeding an existing tensor back into
# torch.tensor() triggers PyTorch's copy-construct warning. np.asarray accepts
# pandas objects, NumPy arrays and CPU tensors alike, so this cell can also be
# re-run after the variables have already been converted.
X_train, y_train, X_valid, y_valid, X_test, y_test = (
    torch.tensor(np.asarray(split), dtype=torch.float32)
    for split in (X_train, y_train, X_valid, y_valid, X_test, y_test)
)

In [None]:
# Define the neural network model
class Net(nn.Module):
    """Feed-forward binary classifier: in_features -> 64 -> 32 -> 1.

    Args:
        in_features: Number of input features per sample. When omitted, the
            number of columns of the module-level ``X_train`` is used, so a
            bare ``Net()`` behaves exactly as before.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Fall back to the width of the training matrix defined earlier.
            in_features = X_train.shape[1]
        self.fc1 = nn.Linear(in_features=in_features, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for batch ``x``: one value in [0, 1] per row."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.sigmoid(self.fc3(x))

# Loss function, model and optimizer for the PyTorch run.
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid outputs
net = Net()
# NOTE(review): only lr_decay is set, so Adagrad runs with its default learning
# rate - confirm that lr_decay (and not lr) was the intended argument.
optimizer = optim.Adagrad(net.parameters(), lr_decay=0.01)

In [None]:
# Train the model with mini-batches and validate after every epoch.
num_epochs = 10
batch_size = 128

for epoch in range(num_epochs):
    # The validation step below switches the network to eval mode, so
    # training mode has to be restored at the start of every epoch.
    net.train()
    running_loss = 0.0
    num_batches = 0
    for i in range(0, X_train.shape[0], batch_size):
        # X_train / y_train are already float32 tensors, so slicing is enough;
        # the labels get a trailing axis to match the (batch, 1) model output.
        inputs = X_train[i:i + batch_size]
        labels = y_train[i:i + batch_size].unsqueeze(1)

        # Forward propagation, loss computation and backward propagation
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss
        running_loss += loss.item()
        num_batches += 1

    # Mean of the per-batch losses. Dividing by the real number of batches
    # (rather than n_samples / batch_size, which is fractional) keeps the
    # average correct when the last batch is shorter than batch_size.
    train_loss = running_loss / max(num_batches, 1)

    # Validate on the validation dataset
    net.eval()
    with torch.no_grad():
        inputs = X_valid
        labels = y_valid.unsqueeze(1)
        outputs = net(inputs)
        val_loss = criterion(outputs, labels)
        # squeeze(1) drops only the output axis, so a single-row validation
        # set still yields 1-D arrays for the sklearn metrics.
        val_preds = outputs.round().squeeze(1).numpy()
        val_labels = labels.squeeze(1).numpy()
        val_accuracy = np.mean(val_preds == val_labels)
        val_precision = precision_score(val_labels, val_preds)
        val_recall = recall_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds)

    print(
        f'Epoch {epoch+1}, Training Loss: {train_loss:.4f}, '
        f'Validation Loss: {val_loss.item():.4f}, '
        f'Validation Accuracy: {val_accuracy:.4f}, '
        f'Validation Precision: {val_precision:.4f}, '
        f'Validation Recall: {val_recall:.4f}, '
        f'Validation F1-Score: {val_f1:.4f}'
    )



Calculate the precision, recall and F1-score as the indicators for model evaluation:

In [None]:
# Final evaluation of the PyTorch network on the test split.
net.eval()
with torch.no_grad():
    inputs = torch.Tensor(X_test)
    print(y_test.shape)
    # Add a trailing axis so the labels line up with the (n, 1) model output.
    labels = torch.Tensor(y_test)[:, None]
    outputs = net(inputs)
    print(labels.shape, outputs.shape)
    labels = labels.round()
    print(labels)
    test_loss = criterion(outputs, labels)
    # Hard 0/1 predictions and labels as flat NumPy arrays for the metrics.
    test_preds = torch.round(outputs).squeeze().numpy()
    test_labels = labels.squeeze().numpy()
    test_accuracy = (test_preds == test_labels).mean()
    test_precision, test_recall, test_f1 = (
        score(test_labels, test_preds)
        for score in (precision_score, recall_score, f1_score)
    )

print('Precision: {:.4f}'.format(test_precision))
print('Recall: {:.4f}'.format(test_recall))
print('F1-score: {:.4f}'.format(test_f1))


Comparing the Keras model and the PyTorch model, the Keras model is found to perform better, so the Keras model is exported for ensemble learning:

In [None]:
# Bare expression: evaluates the name only and has no effect on the save below.
model

# Write the Keras model fitted above to 'NeuralNetwork.h5' in the working directory.
model.save('NeuralNetwork.h5')

When you want to import the model, load_model() should be used:

In [None]:
from tensorflow import keras

# Reload the model from the file written by model.save() above.
loaded_model = keras.models.load_model('NeuralNetwork.h5')