In [2]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
# Both the tokenizer and the encoder weights come from the same pretrained checkpoint.
# NOTE(review): in the cells visible here only `tokenizer` is used afterwards;
# the sole reference to `bert_model` is commented out inside CustomModel.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
# Load the training table. The preparation cell below addresses its columns by
# position: column 0 is skipped, columns 1-18 are features, column 19 is the target.
df = pd.read_csv('final_dataset.csv')

# **Preparation of Training data**
Separation, tokenizing, and appending combined features

In [5]:
# Build one flat feature list per row of `df`, plus the matching target label.
#
# Columns are addressed by position through `.iloc`: plain `row[i]` with an
# integer key on a label-indexed Series relies on a deprecated positional
# fallback in pandas 2.x.
X = []
Y = []
for index, row in df.iterrows():
    text_features = []
    numerical_features = []
    for i in range(1, 19):  # Skipping the first element (employee number)
        value = row.iloc[i]
        if isinstance(value, str):  # Text data
            text_features.append(value)
        else:  # Anything that is not a string is treated as numerical data
            numerical_features.append(value)

    # Join all text cells of the row and convert them to BERT vocabulary ids
    # (token ids, not embeddings). No truncation is applied here.
    text_input = " ".join(text_features)
    tokenized_text = tokenizer.encode(text_input, add_special_tokens=True)
    # Token ids come first, the numeric values are appended after them;
    # padding to a common length happens in a later cell.
    combined_features = tokenized_text + numerical_features

    X.append(combined_features)

    # Add target
    Y.append(row.iloc[19])  # Assuming 20th element is the target (0/1)


Token indices sequence length is longer than the specified maximum sequence length for this model (814 > 512). Running this sequence through the model will result in indexing errors


In [6]:
# Right-pad every feature list with zeros so all rows share the length of the longest one.
row_lengths = [len(features) for features in X]
max_len = max(row_lengths)

padded_texts = []
for features in X:
    n_missing = max_len - len(features)
    padded_texts.append(features + [0] * n_missing)

# Display the common row length.
max_len

2842

In [7]:
# Rebind X to the zero-padded rows so every row has the same length
# before the tensor conversion below.
X = padded_texts

# **Data conversion to tensors**

In [8]:
# Disabled alternative input (X_pca is not defined in the visible cells):
#X = torch.tensor(X_pca, dtype=torch.float32)
# Features (token ids and numeric values together) become a float32 tensor,
# the input dtype of the linear layers; targets become int64 class indices
# for CrossEntropyLoss.
X = torch.tensor(X, dtype=torch.float32)
Y = torch.tensor(Y, dtype=torch.long)

# **Splitting into training and validation data**

In [16]:
# Hold out 15% of the rows for validation; the fixed random_state makes the
# split reproducible across re-runs.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.15, random_state=42)

In [10]:
# Disabled scratch code (running this cell has no effect):
# X_train = X_train.long()
# X_val = X_val.long()
# X.shape

# **Neural Network**

In [11]:
class CustomModel(nn.Module):
    """Feed-forward classifier over flat, fixed-width feature vectors.

    The network is three ReLU-activated linear layers
    (``input_dim`` -> 1024 -> 64 -> 32) followed by a linear output layer
    that produces one raw logit per class.

    The pretrained BERT encoder is not part of this model: inputs are passed
    straight to the linear layers.

    Args:
        input_dim: Width of each input row. Defaults to 2842, the padded
            feature length produced earlier in this notebook, so
            ``CustomModel()`` builds the same network as before.
        num_classes: Number of output logits. Defaults to 2 (binary
            classification).
    """

    def __init__(self, input_dim=2842, num_classes=2):
        super().__init__()
        # Attribute names and layer order are unchanged, so state_dict keys
        # (num_branch.0.weight, ..., final_layer.bias) stay the same.
        self.num_branch = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 64),            # hidden layers
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        self.final_layer = nn.Linear(32, num_classes)  # Output layer: one logit per class

    def forward(self, input_ids):
        """Return raw (unnormalized) logits for a batch of feature rows.

        Args:
            input_ids: Float tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor with the same leading dimensions and ``num_classes`` logits
            in the last dimension.
        """
        num_output = self.num_branch(input_ids)
        logits = self.final_layer(num_output)
        return logits

In [12]:
# Instantiate the classifier with its default layer sizes
# (the first layer expects 2842 input features).
model = CustomModel()

In [17]:
# Step size used by the Adam optimizer.
LEARNING_RATE = 1e-5

# Loss function: cross-entropy over the two output logits.
criterion = nn.CrossEntropyLoss()
# Adam updates every trainable parameter of `model`.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [14]:
# Sanity check: (validation rows, feature width).
X_val.shape

torch.Size([187, 2842])

# **Initiating the training loop for model**

In [15]:
# Number of optimization steps; each step uses the whole training set as one batch.
NUM_EPOCHS = 100

model.train()
for epoch in tqdm(range(NUM_EPOCHS)):
    # Forward pass and loss on the full training tensor.
    logits = model(X_train)
    loss = criterion(logits, Y_train)

    # Clear stale gradients, backpropagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


100%|██████████| 100/100 [00:47<00:00,  2.09it/s]


# **Computing precision, recall, and F1 score**

In [142]:
# Score the held-out validation rows with class-weighted precision, recall and F1.
model.eval()
with torch.no_grad():
    logits_val = model(X_val)
# The predicted class is the index of the larger logit in each row.
preds_val = logits_val.argmax(dim=1)

y_true = Y_val.numpy()
y_pred = preds_val.numpy()
precision, recall, f1 = (
    metric(y_true, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(precision)
print(recall)
print(f"F1 score: {f1}")

0.5079110790205354
0.49732620320855614
F1 score: 0.501754519989814


# **Test Dataset**

In [17]:
# Load the test table; its columns 1-18 are read by position in the
# preparation cell below and column 0 is later used as the candidate id.
test_df = pd.read_excel('final_test_dataset.xlsx')

# **Preparation of test data**

In [18]:
# Build one flat feature list per row of `test_df`, using the same layout as
# the training data (token ids first, numeric values after).
#
# Columns are addressed by position through `.iloc`: plain `row[i]` with an
# integer key on a label-indexed Series relies on a deprecated positional
# fallback in pandas 2.x, and would look up a *label* if the Excel header
# happened to contain integer column names.
X_test = []

for index, row in test_df.iterrows():
    text_features = []
    numerical_features = []
    for i in range(1, 19):  # Skipping the first element (employee number)
        value = row.iloc[i]
        if isinstance(value, str):  # Text data
            text_features.append(value)
        else:  # Anything that is not a string is treated as numerical data
            numerical_features.append(value)

    # Join all text cells of the row and convert them to BERT vocabulary ids
    # (token ids, not embeddings). No truncation is applied here.
    text_input = " ".join(text_features)
    tokenized_text = tokenizer.encode(text_input, add_special_tokens=True)
    # Combine text and numerical features before padding
    combined_features = tokenized_text + numerical_features

    X_test.append(combined_features)



In [19]:
# Zero-pad the test rows to the width the model was trained on (`max_len`).
# A row longer than `max_len` cannot be padded (a negative repeat count yields
# an empty list), which would otherwise surface later as an obscure
# ragged-tensor or shape-mismatch error, so fail early with a clear message.
too_long = [row_idx for row_idx, x in enumerate(X_test) if len(x) > max_len]
if too_long:
    raise ValueError(
        f"{len(too_long)} test row(s) are longer than the training feature length "
        f"{max_len} (first offending row index: {too_long[0]}); "
        "they cannot be padded to the model's input width."
    )

padded_texts = [x + [0]*(max_len-len(x)) for x in X_test]
X_test = padded_texts
# Same float32 dtype as the training features.
X_test = torch.tensor(X_test, dtype=torch.float32)


In [20]:
# Sanity check: (test rows, feature width); the width must match the model's input size.
X_test.shape

torch.Size([187, 2842])

# **Using model to make predictions**

In [21]:
# Switch to evaluation mode and run the test rows through the network.
# NOTE: `logits_val` / `preds_val` are reused here for the *test* outputs and
# overwrite the validation results computed earlier.
model.eval()
with torch.no_grad():  # no parameter updates during inference, so gradients are not tracked
    logits_val = model(X_test)
# Predicted class = index of the largest logit in each row.
preds_val = logits_val.argmax(dim=1)

In [22]:
# Sanity check: one predicted label per test row.
preds_val.shape

torch.Size([187])

# **Associating predicted performance labels with the Candidate IDs**

In [23]:

# Pair each test prediction (preds_val) with the identifier stored in the
# first column of test_df, then write the result to disk.
# Extract the first column of test_df
first_column_values = test_df.iloc[:, 0]

# Convert preds_val to a list
preds_list = preds_val.tolist()

# Create a new DataFrame with two columns
new_df = pd.DataFrame({'CandidateID': first_column_values, 'Performance': preds_list})

# Save the new DataFrame to a CSV file, without the DataFrame index column
new_df.to_csv('result.csv', index=False)
