In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
import torch.optim as optim
from sklearn.manifold import SpectralEmbedding
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss



In [2]:
train_data_df = pd.read_csv('/kaggle/input/gender-prediction-in-comp7404/train.csv')
train_labels_df = pd.read_csv('/kaggle/input/gender-prediction-in-comp7404/train_answers.csv')
test_data_df = pd.read_csv('/kaggle/input/gender-prediction-in-comp7404/test.csv')
# train_data['male'] = train_data[train_labels['writer'] == train_data['writer']]
# scaler = StandardScaler()
# train_data = scaler.fit_transform(train_data)
# test_data = scaler.transform(test_data)

# train_data = torch.tensor(train_data, dtype=torch.float32)
# train_labels = torch.tensor(train_labels.values, dtype=torch.float32)
# test_data = torch.tensor(test_data, dtype=torch.float32)

In [3]:
train_data = train_data_df.values
# training dataset
train_features = train_data[:, 4:]
the_labels = train_labels_df.values[:, 1]
total_labels = np.repeat(the_labels, 4)
# The label of train
train_labels = total_labels[:800]
# The label of test
test_labels = total_labels[800:]
test_data = test_data_df.values
# testing dataset
test_features = test_data[:, 4:]

In [4]:
variances = np.sum(train_features, axis=0)
non_zero_indices = np.where(variances > 0)[0]
len(non_zero_indices)
if len(non_zero_indices) > 4564:
    selected_indices = non_zero_indices[:4564]
else:
    selected_indices = non_zero_indices

In [5]:
train_features_reduce = train_features[:, selected_indices]
test_features_reduce = test_features[:, selected_indices]

scaler = StandardScaler()
train_features_scalered = scaler.fit_transform(train_features_reduce)
test_data_scalered = scaler.transform(test_features_reduce)
train_features_scalered.shape, test_data_scalered.shape

((800, 4564), (328, 4564))

In [6]:
X_train = torch.from_numpy(train_features_scalered).float()
X_test = torch.from_numpy(test_data_scalered).float()
y_train = torch.from_numpy(train_labels).float()
y_test = torch.from_numpy(test_labels).float()

In [7]:
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=256, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [39]:
class DBN(nn.Module):
    def __init__(self):
        super(DBN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(4564, 2000),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(2000, 500),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(500, 2000),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(2000, 15),
            nn.ReLU()
        )
        self.fc = nn.Linear(15, 1)
        self.cls = nn.Sigmoid()
        
    
    def forward(self, x):
        penultimate = self.layers(x)
        x = self.fc(penultimate)
        return self.cls(x), penultimate
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.BCELoss()
columns = [DBN().to(device) for _ in range(5)]
num_epoch = 100

In [40]:
for i, column in enumerate(columns):
    optimizer = optim.SGD(column.parameters(), lr=0.01, momentum=0.05)
    for epoch in range(num_epoch+1):
        column.train()
        for data, label in train_loader:
            data, label = data.to(device), label.to(device)
            optimizer.zero_grad()
            output,_ = column(data)
            loss = criterion(output.squeeze(), label)
            loss.backward()
            optimizer.step()
        if epoch == 100:
            print(f'Model {i+1}, Epoch {epoch}, Loss: {loss.item()}')

Model 1, Epoch 100, Loss: 0.5619354844093323
Model 2, Epoch 100, Loss: 0.6924870014190674
Model 3, Epoch 100, Loss: 0.6732969284057617
Model 4, Epoch 100, Loss: 0.6464829444885254
Model 5, Epoch 100, Loss: 0.6765446662902832


In [41]:
def evaluate_model(model, test_loader, device):
    predictions = []
    true_labels = []
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    total_loss = 0
    with torch.no_grad():  # No gradients required
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            outputs, _ = model(data)
            predicted_probabilities = outputs.squeeze().cpu().numpy()
            loss = criterion(outputs.squeeze(), target)
            total_loss += loss.item() * target.size(0)
            predictions.extend(predicted_probabilities)
            true_labels.extend(target.cpu().numpy())
            
#     predicted_labels = [1 if prob > 0.7 else 0 for prob in predictions]
#     accuracy = accuracy_score(true_labels, predicted_labels)
    average_loss = total_loss / 328
    return accuracy, average_loss

accuracies = []
multi_fea = []
for i, column in enumerate(columns):
    accuracy, average_loss = evaluate_model(column, test_loader, device)
    accuracies.append(accuracy)
#     multi_fea.append(features)
    print(f'Model {i+1} Average Cross Entropy Loss: {average_loss:.4f}')

Model 1 Average Cross Entropy Loss: 0.6380
Model 2 Average Cross Entropy Loss: 0.6983
Model 3 Average Cross Entropy Loss: 0.6617
Model 4 Average Cross Entropy Loss: 0.6766
Model 5 Average Cross Entropy Loss: 0.6987


In [43]:
def concatenate_features(models, data_loader, device):
    concatenated_features = []
    targets = []
    with torch.no_grad():
        for data, target in data_loader:
            data = data.to(device)
            # Extract and concatenate features from all models
            model_features = [model(data)[1].cpu().numpy() for model in models]
            concatenated_features.append(np.concatenate(model_features, axis=1))
            targets.append(target.cpu().numpy())
    return np.concatenate(concatenated_features), np.concatenate(targets)

# Concatenate training and test data features
train_features, train_targets = concatenate_features(columns, train_loader, device)
test_features, test_targets = concatenate_features(columns, test_loader, device)
combined_features = np.vstack((train_features, test_features))

# Apply spectral embedding to the combined set
embedding = SpectralEmbedding(n_components=25, affinity='nearest_neighbors', n_neighbors=20)
combined_embedded_features = embedding.fit_transform(combined_features)

# Split the embedded features back into training and test sets
train_embedded_features = combined_embedded_features[:len(train_features)]
test_embedded_features = combined_embedded_features[len(train_features):]

# Train a classifier using the embedded training features
classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_embedded_features, train_targets)

# Predict using the trained classifier on the embedded test features
# predictions_prob = classifier.predict_proba(test_embedded_features)
predictions = classifier.predict(test_embedded_features)
# log_loss(predictions, predictions_prob)
# predictions
# Calculate accuracy
accuracy = max(np.mean(predictions == test_targets), 1. - np.mean(predictions == test_targets))
print(f'Accuracy of the model on the test set using Spectral Embedding: {accuracy * 100:.2f}%')

Accuracy of the model on the test set using Spectral Embedding: 62.20%
