In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification


In [2]:
# Sample resume: the single demonstration input used by the cells below.
# It contains a name line, a contact line, and a bulleted "Skills" section.
# The triple-quoted layout means the string starts and ends with a newline.
resume_text = """
Johnathan Michael Smith
Location: New York, NY | Phone: (212) 555-7890 | Email: johnathan.smith@example.com
Skills:
• Programming Languages: Java, Python, C++, JavaScript
• Web Development: HTML, CSS, React, Node.js
• Databases: PostgreSQL, MongoDB, MySQL
• Tools & Technologies: Docker, Kubernetes, Git, Jenkins
• Cloud Platforms: AWS, Azure, Google Cloud
• Machine Learning: TensorFlow, PyTorch, scikit-learn
• Other: Agile methodologies, Test-Driven Development, RESTful API design
"""

In [3]:
# Tokenizer: load the vocabulary that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Encode the raw resume string as a batch of one sequence of PyTorch tensors.
# `is_split_into_words` is intentionally NOT set: resume_text is one un-split
# string, not a list of pre-tokenized words, so the flag does not apply here.
tokens = tokenizer(resume_text, return_tensors="pt", truncation=True, padding=True)


In [8]:
# Placeholder labels (demonstration only): one tag per position of the encoded
# sequence, and every position is tagged 1. In the intended scheme 0 means
# non-skill and 1 means skill; real tags need to be manually annotated.
tags = [1 for _ in tokens.input_ids[0]]

In [17]:
def get_actual_tags(resume_text, skill_indices=None):
    """Return a 0/1 tag per token of ``resume_text`` (1 = skill, 0 = non-skill).

    The text is split with the notebook-level ``tokenizer.tokenize`` and every
    token position covered by a span in ``skill_indices`` is marked 1.

    Args:
        resume_text: Raw text to tokenize.
        skill_indices: Iterable of inclusive ``(start, end)`` token-index spans
            to mark as skills. When omitted, the placeholder spans
            ``[(5, 8), (10, 13)]`` are used; these are examples, not real
            annotations, and should be replaced with annotated spans.

    Returns:
        A list of ints, one per token, containing only 0 and 1.
    """
    tokens = tokenizer.tokenize(resume_text)
    actual_tags = [0] * len(tokens)

    if skill_indices is None:
        # Example indices where skills are located (this should be based on your annotations)
        skill_indices = [(5, 8), (10, 13)]  # Example: tokens 5-8 and 10-13 are skills

    last_index = len(actual_tags) - 1
    for start, end in skill_indices:
        # Clamp each span to the token range so that short (or empty) texts do
        # not raise IndexError and negative starts do not wrap around.
        for i in range(max(start, 0), min(end, last_index) + 1):
            actual_tags[i] = 1

    return actual_tags

In [18]:
# Get actual tags for the test resume: a 0/1 list with one entry per token.
# The positions marked 1 come from the placeholder spans defined in
# get_actual_tags, not from real annotations of this resume.
actual_tags = get_actual_tags(resume_text)

In [19]:
# Display the full tag list (one 0/1 entry per token returned by tokenizer.tokenize).
actual_tags

[0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:
# Turn the flat tag list into a tensor with a leading batch axis of size 1,
# i.e. shape (1, len(tags)).
tags_tensor = torch.tensor(tags).unsqueeze(0)

In [None]:
# Model: 'bert-base-uncased' with a token-classification head configured for
# two labels.
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
# Data collator: batches token-classification examples using this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Training arguments: hyperparameters handed to the Trainer below.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm the keyword against the installed version.
training_args = TrainingArguments(
    output_dir='./results',  # directory the run writes its outputs to
    evaluation_strategy="epoch",  # run evaluation once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# Trainer
class ResumeTokenDataset(torch.utils.data.Dataset):
    """Map-style dataset over already-encoded examples (one feature dict each)."""

    def __init__(self, examples):
        self._examples = list(examples)

    def __len__(self):
        return len(self._examples)

    def __getitem__(self, index):
        return self._examples[index]


# `tokens` is a batched encoding (each field holds one row for the single
# resume), which is not a dataset of examples and carries no labels. Trainer
# needs items that are per-example feature dicts including `labels`, so take
# row 0 of every field and attach the tag list as the labels.
encoded_resume = {name: values[0].tolist() for name, values in tokens.items()}
encoded_resume["labels"] = list(tags)
resume_dataset = ResumeTokenDataset([encoded_resume])

# NOTE: the same single example is used for training and evaluation, so the
# evaluation numbers are a smoke test only, not a measure of generalization.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=resume_dataset,
    eval_dataset=resume_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Training
trainer.train()

In [20]:
# Shorter sample containing only the name line and the contact line.
# NOTE: this rebinds `resume_text`, replacing the full sample resume defined
# earlier in the notebook for every cell executed after this one.
resume_text = """
Johnathan Michael Smith
Location: New York, NY | Phone: (212) 555-7890 | Email: johnathan.smith@example.com
"""
# Loads the 'bert-base-uncased' tokenizer again (rebinds `tokenizer`).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize the resume
# NOTE: `tokens` now holds the result of tokenizer.tokenize(...) (token strings),
# not the tensor encoding that tokenizer(...) produced earlier in the notebook.
tokens = tokenizer.tokenize(resume_text)

In [21]:
# Display the token strings produced for the shortened resume text.
tokens

['john',
 '##athan',
 'michael',
 'smith',
 'location',
 ':',
 'new',
 'york',
 ',',
 'ny',
 '|',
 'phone',
 ':',
 '(',
 '212',
 ')',
 '555',
 '-',
 '78',
 '##90',
 '|',
 'email',
 ':',
 'john',
 '##athan',
 '.',
 'smith',
 '@',
 'example',
 '.',
 'com']

In [None]:
# Label vocabulary for BIO-style tagging: "B-" marks the first token of an
# entity, "I-" a continuation token, and "O" a token outside any entity.
# A label's position in this list is its index.
# NOTE(review): there is no "I-EMAIL" label, although the email address above
# is split into several tokens — confirm whether one should be added.
tags_list = [
    "B-NAME",
    "I-NAME",
    "B-PHONE",
    "I-PHONE",
    "B-EMAIL",
    "B-SKILL",
    "I-SKILL",  # the comma matters: without it Python fuses this with "O" into "I-SKILLO"
    "O",
]