In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('UpdatedResumeDataSet.csv')
df = df.dropna()
df['label'] = LabelEncoder().fit_transform(df['Category'])

In [3]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Resume'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(texts, max_length=256):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)

In [5]:
class ResumeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = ResumeDataset(train_encodings, train_labels)
val_dataset = ResumeDataset(val_encodings, val_labels)

In [6]:
num_labels = len(set(df['label']))
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch', 
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)

In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.784,2.582379,0.279793
2,1.4088,1.140242,0.943005
3,0.7569,0.66258,0.984456


TrainOutput(global_step=291, training_loss=1.9566444396153349, metrics={'train_runtime': 298.4002, 'train_samples_per_second': 7.731, 'train_steps_per_second': 0.975, 'total_flos': 303561277161984.0, 'train_loss': 1.9566444396153349, 'epoch': 3.0})

In [10]:
eval_result = trainer.evaluate()
print(f"Validation Accuracy: {eval_result['eval_accuracy']:.4f}")

preds = trainer.predict(val_dataset)
y_true = val_labels
y_pred = np.argmax(preds.predictions, axis=1)
print(classification_report(y_true, y_pred))

Validation Accuracy: 0.9845
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         7
           2       0.75      0.60      0.67         5
           3       1.00      1.00      1.00         8
           4       1.00      1.00      1.00         6
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         8
           7       1.00      1.00      1.00         7
           8       1.00      0.91      0.95        11
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00         6
          12       1.00      1.00      1.00         9
          13       1.00      1.00      1.00         8
          14       1.00      1.00      1.00         6
          15       0.94      1.00      0.97        17
          16       0.89      1.00      0.94         8

In [11]:
model.save_pretrained("resume_bert_model")
tokenizer.save_pretrained("resume_bert_model")

('resume_bert_model\\tokenizer_config.json',
 'resume_bert_model\\special_tokens_map.json',
 'resume_bert_model\\vocab.txt',
 'resume_bert_model\\added_tokens.json')