In [1]:
import pandas 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
def data_preprocessing(task_1a_dataframe):
    # Create a copy of the input dataframe
    encoded_dataframe = task_1a_dataframe.copy()

    # Initialize LabelEncoder for categorical columns
    label_encoder = LabelEncoder()

    # Encode each categorical column
    encoded_dataframe['Education'] = label_encoder.fit_transform(encoded_dataframe['Education'])
    encoded_dataframe['City'] = label_encoder.fit_transform(encoded_dataframe['City'])
    encoded_dataframe['Gender'] = label_encoder.fit_transform(encoded_dataframe['Gender'])
    encoded_dataframe['EverBenched'] = label_encoder.fit_transform(encoded_dataframe['EverBenched'])
    encoded_dataframe['PaymentTier'] = label_encoder.fit_transform(encoded_dataframe['PaymentTier'])
    encoded_dataframe['ExperienceInCurrentDomain'] = label_encoder.fit_transform(encoded_dataframe['ExperienceInCurrentDomain'])
    encoded_dataframe['LeaveOrNot'] = label_encoder.fit_transform(encoded_dataframe['LeaveOrNot'])

    # Return the encoded dataframe
    return encoded_dataframe


In [3]:
def identify_features_and_targets(encoded_dataframe):
    # Define the features (excluding 'LeaveOrNot' column)
    features = encoded_dataframe.drop(columns=['LeaveOrNot'])

    # Define the target label as 'LeaveOrNot'
    target = encoded_dataframe['LeaveOrNot']

    # Create a list with features and target
    features_and_targets = [features, target]

    return features_and_targets

In [4]:
def load_as_tensors(features_and_targets):
    # Extract features and target from the input list
    features, target = features_and_targets

    # Convert features and target to PyTorch tensors
    X_train_tensor = torch.tensor(features.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(target.values, dtype=torch.float32)

    # Split the data into training and validation sets (80% training, 20% validation)
    split_ratio = 0.8
    split_index = int(len(X_train_tensor) * split_ratio)

    X_train, X_test = X_train_tensor[:split_index], X_train_tensor[split_index:]
    y_train, y_test = y_train_tensor[:split_index], y_train_tensor[split_index:]

    # Create PyTorch datasets and data loaders for training and validation
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    # Return the tensors and iterable dataset for training
    tensors_and_iterable_training_data = [X_train, X_test, y_train, y_test, train_loader]

    return tensors_and_iterable_training_data

In [11]:

import torch.optim as optim
class Salary_Predictor(nn.Module):
    def __init__(self):
        super(Salary_Predictor, self).__init__()
        self.fc1 = nn.Linear(8, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

def model_loss_function():
    return nn.BCEWithLogitsLoss()

def model_optimizer(model):
    return optim.Adam(model.parameters(), lr=0.001)

def model_number_of_epochs():
    return 50


In [6]:
def training_function(model, number_of_epochs, tensors_and_iterable_training_data, loss_function, optimizer):


    X_train, _, y_train, _ , train_loader = tensors_and_iterable_training_data

    for epoch in range(number_of_epochs):
        model.train()  # Set the model to training mode
        total_loss = 0

        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = loss_function(outputs, batch_y.unsqueeze(1))  # BCEWithLogitsLoss expects 2D target
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch + 1}/{number_of_epochs}], Loss: {avg_loss:.4f}")

    return model



In [7]:

def validation_function(trained_model, tensors_and_iterable_training_data):

    _, X_test, _, y_test, _ = tensors_and_iterable_training_data

    trained_model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = trained_model(X_test)
        predicted_labels = (torch.sigmoid(outputs) > 0.5).float()
        correct_predictions = (predicted_labels == y_test.unsqueeze(1)).sum().item()
        total_samples = len(y_test)
        model_accuracy = correct_predictions / total_samples

    print(f"Validation Accuracy: {model_accuracy * 100:.2f}%")

    return model_accuracy

In [15]:
	task_1a_dataframe = pandas.read_csv('task_1a_dataset.csv')

	# data preprocessing and obtaining encoded data
	encoded_dataframe = data_preprocessing(task_1a_dataframe)

	# selecting required features and targets
	features_and_targets = identify_features_and_targets(encoded_dataframe)

	# obtaining training and validation data tensors and the iterable
	# training data object
	tensors_and_iterable_training_data = load_as_tensors(features_and_targets)
	
	# model is an instance of the class that defines the architecture of the model
	model = Salary_Predictor()

	# obtaining loss function, optimizer and the number of training epochs
	loss_function = model_loss_function()
	optimizer = model_optimizer(model)
	number_of_epochs = model_number_of_epochs()

	# training the model
	trained_model = training_function(model, number_of_epochs, tensors_and_iterable_training_data, 
					loss_function, optimizer)

	# validating and obtaining accuracy
	model_accuracy = validation_function(trained_model,tensors_and_iterable_training_data)
	print(f"Accuracy on the test set = {model_accuracy}")


Epoch [1/50], Loss: 1.3607
Epoch [2/50], Loss: 0.6923
Epoch [3/50], Loss: 0.6960
Epoch [4/50], Loss: 0.7633
Epoch [5/50], Loss: 0.7078
Epoch [6/50], Loss: 0.6873
Epoch [7/50], Loss: 0.7234
Epoch [8/50], Loss: 0.7717
Epoch [9/50], Loss: 0.7399
Epoch [10/50], Loss: 0.6556
Epoch [11/50], Loss: 0.7391
Epoch [12/50], Loss: 0.7321
Epoch [13/50], Loss: 0.6541
Epoch [14/50], Loss: 0.6994
Epoch [15/50], Loss: 0.6676
Epoch [16/50], Loss: 0.7044
Epoch [17/50], Loss: 0.6579
Epoch [18/50], Loss: 0.7184
Epoch [19/50], Loss: 0.7101
Epoch [20/50], Loss: 0.6784
Epoch [21/50], Loss: 0.6460
Epoch [22/50], Loss: 0.6470
Epoch [23/50], Loss: 0.6451
Epoch [24/50], Loss: 0.7791
Epoch [25/50], Loss: 0.7964
Epoch [26/50], Loss: 0.6785
Epoch [27/50], Loss: 0.6632
Epoch [28/50], Loss: 0.6425
Epoch [29/50], Loss: 0.7968
Epoch [30/50], Loss: 0.7360
Epoch [31/50], Loss: 0.7293
Epoch [32/50], Loss: 0.7590
Epoch [33/50], Loss: 0.6550
Epoch [34/50], Loss: 0.6470
Epoch [35/50], Loss: 0.7109
Epoch [36/50], Loss: 0.7708
E