In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_trf
!pip install spacy_transformers
!pip install datasets
!pip install transformers
!pip install pandas
!pip install numpy

Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd, re
from bs4 import BeautifulSoup

# 1) Read the raw export; the resume text lives in the "Resume" column.
df = pd.read_csv("resume_dataset.csv")

# 2) Keep the first copy of every distinct resume, then discard rows whose
#    resume text is missing.
df = df.drop_duplicates(subset=["Resume"]).dropna(subset=["Resume"])

# 3) Cleaning function
def clean_text(text):
    """Normalise one raw resume cell into plain ASCII text.

    Processing order:
      1. If the cell is the textual repr of a bytes object (``"b'...'"``),
         parse it back to bytes and decode it as UTF-8.
      2. Strip HTML markup with BeautifulSoup.
      3. Collapse every run of whitespace into a single space.
      4. Remove non-ASCII characters, then everything that is not a letter,
         digit, whitespace or one of ``. , ; : -``.

    Args:
        text: Raw cell value. Strings and bytes are cleaned; any other type
            (e.g. a float NaN) yields an empty string.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Local import: this cell's import line only brings in pandas, re and bs4.
    import ast

    if isinstance(text, str) and text.startswith(("b'", 'b"')):
        try:
            # literal_eval parses literals only, so a crafted CSV cell cannot
            # run arbitrary code the way it could with eval().
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Looks like a bytes literal but is not a valid one: best effort,
            # clean the raw string as-is.
            parsed = text
        if isinstance(parsed, (bytes, bytearray)):
            text = parsed

    if isinstance(text, (bytes, bytearray)):
        text = bytes(text).decode("utf-8", errors="ignore")
    elif not isinstance(text, str):
        return ""

    # strip HTML, extra whitespace & non-ASCII
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'[^a-zA-Z0-9.,;:\s\-]', '', text)
    return text.strip()

# 4) Clean every resume, then keep only those with more than 30
#    whitespace-separated words.
df["Resume"] = df["Resume"].apply(clean_text)
df = df.loc[df["Resume"].str.split().str.len() > 30]

# 5) Write the cleaned frame out for the annotation step (no index column).
df.to_csv("dbC.csv", index=False)
print("Cleaned resumes saved to dbC.csv")


✅ Cleaned resumes saved to dbC.csv


In [None]:
import os, ast, warnings, pandas as pd, spacy
from google.colab import files
warnings.filterwarnings("ignore", category=FutureWarning)

# 1) Load cleaned CSV
# CSV is the file written by the cleaning cell; everything below reads from it.
CSV = "dbC.csv"
if not os.path.isfile(CSV):
    # Fallback when the file is not in the working directory: ask the user to
    # upload it through the Colab file widget.
    # NOTE(review): read_csv below still opens "dbC.csv", so the uploaded file
    # presumably has to carry exactly that name — confirm.
    print("Upload dbC.csv")
    _ = files.upload()
print(" Using", CSV)
df = pd.read_csv(CSV)

# 2) Decode any lingering byte‑literals
def decode_bytes_literal(s):
    """Turn the textual repr of a bytes object back into text.

    A value such as ``"b'hello'"`` is parsed with ``ast.literal_eval`` and the
    resulting bytes are decoded as UTF-8, silently dropping undecodable bytes.

    Args:
        s: Any value. Only strings starting with ``b'`` or ``b"`` are touched.

    Returns:
        The decoded string when ``s`` is a valid bytes literal; otherwise
        ``s`` unchanged (including non-string inputs and malformed literals).
    """
    if not (isinstance(s, str) and s.startswith(("b'", 'b"'))):
        return s
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best effort: a malformed literal is left as-is. Only the errors
        # literal_eval is documented to raise are caught, so Ctrl-C and
        # genuine bugs are no longer swallowed.
        return s
    if isinstance(parsed, (bytes, bytearray)):
        return parsed.decode("utf-8", errors="ignore")
    return s

# Resume texts in row order, with any leftover b'...' reprs decoded, plus the
# matching row labels that later become the ResumeID column.
texts = list(map(decode_bytes_literal, df["Resume"].astype(str)))
ids   = df.index.tolist()

# 3) Load spaCy small NER model
nlp = spacy.load("en_core_web_sm")
# Raise the pipeline's length limit just above the longest resume so that no
# document is rejected for being too long.
nlp.max_length = max(map(len, texts)) + 100

# 4) Helper: convert doc → list of (token, BIO‑tag)
def doc_to_bio(doc):
    """Flatten a spaCy ``Doc`` into ``(token_text, BIO_tag)`` pairs.

    The tag is read from the per-token entity annotations that spaCy stores
    on each token (``ent_iob_`` / ``ent_type_``) instead of re-scanning
    ``doc.ents`` for every token, which made the previous version
    O(tokens x entities) on long resumes.

    Args:
        doc: An annotated spaCy ``Doc`` (any iterable of tokens exposing
            ``text``, ``ent_iob_`` and ``ent_type_`` works).

    Returns:
        A list with one ``(text, tag)`` tuple per token, where ``tag`` is
        ``"B-<LABEL>"`` for the first token of an entity, ``"I-<LABEL>"`` for
        the following tokens of that entity and ``"O"`` otherwise.
    """
    bio = []
    for token in doc:
        iob = token.ent_iob_
        # ent_iob_ is "B" or "I" inside an entity; "O" or "" mean no entity.
        if iob in ("B", "I"):
            tag = f"{iob}-{token.ent_type_}"
        else:
            tag = "O"
        bio.append((token.text, tag))
    return bio

# 5) Annotate in parallel
# Switch off every pipeline component except the named-entity recogniser.
nlp.disable_pipes(*[p for p in nlp.pipe_names if p != "ner"])

# Stream the resumes through spaCy (batches of 32, two worker processes) and
# emit one [ResumeID, Token, Tag] row per token.
docs = nlp.pipe(texts, batch_size=32, n_process=2)
rows = [
    [idx, tok, tg]
    for idx, doc in zip(ids, docs)
    for tok, tg in doc_to_bio(doc)
]

out = pd.DataFrame(rows, columns=["ResumeID","Token","Tag"])
out.to_csv("resume_bio_annotated.csv", index=False)
print("BIO tags saved to resume_bio_annotated.csv")

# 6) Hand the annotated file to the browser for download
files.download("resume_bio_annotated.csv")


✅ Using dbC.csv
✅ BIO tags saved to resume_bio_annotated.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from collections import defaultdict

# Load annotated BIO dataset.
# keep_default_na=False stops pandas from turning literal tokens such as
# "NA", "null" or "nan" into float NaN, which would otherwise end up inside
# the token lists.
df = pd.read_csv("resume_bio_annotated.csv", keep_default_na=False)
df["Token"] = df["Token"].astype(str)

# Create label mappings (sorted so the ids are stable between runs)
unique_labels = sorted(df["Tag"].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Map tags to IDs once
df["TagID"] = df["Tag"].map(label2id)

# Group tokens and tags by ResumeID: one list of tokens and one parallel list
# of label ids per resume.
token_lists = []
label_lists = []

for _, group in df.groupby("ResumeID"):
    token_lists.append(group["Token"].tolist())
    label_lists.append(group["TagID"].tolist())


In [None]:
import pandas as pd
from collections import defaultdict
from transformers import BertTokenizerFast

# Load the dataset.
# keep_default_na=False: without it pandas reads real resume tokens such as
# "NA", "null" or "nan" as missing values, and the clean-up below would then
# delete them from the middle of a sentence.
df = pd.read_csv("resume_bio_annotated.csv", keep_default_na=False)
df["Token"] = df["Token"].astype(str)

# Drop rows whose Token or Tag field is genuinely empty
df = df[(df["Token"] != "") & (df["Tag"] != "")]

# Create label mappings (sorted so the ids are stable between runs)
unique_labels = sorted(df["Tag"].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Group tokens and tags by ResumeID, preserving file order.
# Zipping the columns avoids the per-row Series construction of iterrows().
resumes = defaultdict(list)
for resume_id, token, tag in zip(df["ResumeID"], df["Token"], df["Tag"]):
    resumes[resume_id].append((token, label2id[tag]))

# Split into token and label lists (one entry per resume)
token_lists = []
label_lists = []

for _, token_label_pairs in resumes.items():
    tokens, labels = zip(*token_label_pairs)
    token_lists.append(list(tokens))
    label_lists.append(list(labels))

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Tokenize and align labels
def tokenize_and_align_labels(token_lists, label_lists):
    """Encode pre-split resumes and expand word labels to word-piece labels.

    Each resume is padded/truncated to 128 positions. Every word-piece takes
    the label id of the word it came from; positions that map to no word
    (special tokens and padding) are labelled -100.

    Args:
        token_lists: One list of word strings per resume.
        label_lists: One list of label ids per resume, parallel to
            ``token_lists``.

    Returns:
        The tokenizer's encoding object with an extra ``"labels"`` entry
        holding one list of label ids per resume.
    """
    batch = tokenizer(
        token_lists,
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True,
        max_length=128
    )

    aligned = []
    for row, word_labels in enumerate(label_lists):
        aligned.append([
            -100 if word_idx is None else word_labels[word_idx]
            for word_idx in batch.word_ids(batch_index=row)
        ])

    batch["labels"] = aligned
    return batch

# Encode every resume and attach the aligned label ids
encodings = tokenize_and_align_labels(token_lists, label_lists)

# Sanity check: show the first ten word-pieces of the first resume next to
# their label ids (-100 marks a position that carries no label)
print("Encoding successful! Example input tokens and labels:")
print(tokenizer.convert_ids_to_tokens(encodings['input_ids'][0])[:10])
print(encodings['labels'][0][:10])


Encoding successful! Example input tokens and labels:
['[CLS]', 'John', 'H', '.', 'Smith', ',', 'P', '.', 'H', '.']
[-100, 13, 29, 29, 29, 34, 4, 4, 4, 4]


In [None]:
import torch

class ResumeDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict-like collection of encoded resumes.

    ``encodings`` maps field names (it must include ``"input_ids"``) to
    per-example sequences. Item ``i`` is a dict holding, for every field,
    the ``i``-th entry converted to a tensor.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the input_ids field.
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

dataset = ResumeDataset(encodings)

# 80/20 train/validation split. The explicitly seeded generator makes the
# split identical on every run, so validation losses stay comparable between
# notebook executions.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)


In [None]:
from transformers import BertForTokenClassification

# Pretrained BERT encoder with a token-classification head sized to our tag
# set; the label maps are stored in the model config so predictions can be
# turned back into tag names later.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
# Use CPU since no CUDA GPU is available
model = model.to("cpu")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import Trainer, TrainingArguments

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before upgrading.
training_args = TrainingArguments(
    output_dir="./bert-resume-ner",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    push_to_hub=False,
    weight_decay=0.01,
    # Supported way to turn off external loggers (W&B etc.); the
    # WANDB_DISABLED environment variable is deprecated.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    # Not wired up yet:
    # compute_metrics=compute_metrics
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.7599,0.68403
2,0.5222,0.617499
3,0.4245,0.639748
4,0.3937,0.648917


Epoch,Training Loss,Validation Loss
1,0.7599,0.68403
2,0.5222,0.617499
3,0.4245,0.639748
4,0.3937,0.648917


TrainOutput(global_step=436, training_loss=0.5915235702050935, metrics={'train_runtime': 136.9323, 'train_samples_per_second': 25.414, 'train_steps_per_second': 3.184, 'total_flos': 227396001945600.0, 'train_loss': 0.5915235702050935, 'epoch': 4.0})

In [None]:

# Run the trainer's evaluation (on the eval_dataset it was built with) and
# show the resulting metrics dict.
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.6174992322921753, 'eval_runtime': 1.6077, 'eval_samples_per_second': 135.596, 'eval_steps_per_second': 17.416, 'epoch': 4.0}


In [None]:
import torch

def extract_entities(text):
    """Run the token-classification model on ``text`` and collect entities.

    The text is split on whitespace and encoded word by word. Decoding is
    word-level: the label predicted for the first word-piece of a word
    decides for the whole word, and the remaining pieces of that word are
    glued back on without a space.

    Args:
        text: Raw input string.

    Returns:
        A list of ``(entity_type, entity_text)`` tuples in reading order.
    """
    encoded = tokenizer(
        text.split(),
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    # Word index of every position; None marks special tokens and padding.
    word_ids = encoded.word_ids(batch_index=0)

    # Send the inputs to wherever the model lives instead of hard-coding
    # "cuda", which fails on CPU-only machines.
    encoded = encoded.to(next(model.parameters()).device)

    model.eval()  # make sure dropout is off for inference
    with torch.no_grad():
        outputs = model(**encoded)

    predictions = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    token_list = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    id2label = model.config.id2label

    entities = []
    current = ""
    ent_type = ""
    prev_word = None

    for token, label_id, word_id in zip(token_list, predictions, word_ids):
        piece = token[2:] if token.startswith("##") else token

        if word_id is not None and word_id == prev_word:
            # Later piece of a word already handled: it belongs to the open
            # entity if there is one, and is ignored otherwise.
            if current:
                current += piece
            continue
        prev_word = word_id

        # Special tokens never belong to an entity.
        label = "O" if word_id is None else id2label[int(label_id)]

        if label.startswith("B-"):
            if current:
                entities.append((ent_type, current))
            current = piece
            ent_type = label[2:]
        elif label.startswith("I-") and ent_type:
            current += " " + piece
        else:
            if current:
                entities.append((ent_type, current))
                current = ""
                ent_type = ""

    if current:
        entities.append((ent_type, current))

    return entities


In [None]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification

# Make sure you're on CPU
device = torch.device("cpu")

# Tokenizer matching the checkpoint the model was fine-tuned from
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Reuse the fine-tuned model from the training cell. Reloading
# "bert-base-cased" here would replace it with an untrained classification
# head and produce meaningless entity predictions.
model = model.to(device)
model.eval()  # inference mode: disables dropout

def extract_entities(text):
    """Tag ``text`` with the token-classification model and collect entities.

    Decoding is word-level: the label of a word's first word-piece decides,
    and "##" continuation pieces are glued to the open entity without a
    space (so "II" + "##T" becomes "IIT", not "II T").

    Args:
        text: Raw input string.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in reading order.
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, is_split_into_words=False)
    # Follow the model's actual device so inputs and weights always agree.
    model_device = next(model.parameters()).device
    tokens = {key: val.to(model_device) for key, val in tokens.items()}

    model.eval()  # make sure dropout is off for inference
    with torch.no_grad():
        outputs = model(**tokens)

    predicted_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()
    input_ids = tokens["input_ids"][0].tolist()
    labels = [model.config.id2label[idx] for idx in predicted_ids]
    words = tokenizer.convert_ids_to_tokens(input_ids)
    special_tokens = set(tokenizer.all_special_tokens)

    entities = []
    current_entity = ""
    current_label = None

    for word, label in zip(words, labels):
        if word.startswith("##"):
            # Continuation piece: part of the open entity if there is one,
            # otherwise part of a non-entity word and skipped.
            if current_entity:
                current_entity += word[2:]
            continue
        if word in special_tokens:
            continue

        if label.startswith("B-"):
            if current_entity:
                entities.append((current_entity.strip(), current_label))
            current_entity = word
            current_label = label[2:]
        elif label.startswith("I-") and current_label == label[2:]:
            current_entity += " " + word
        else:
            if current_entity:
                entities.append((current_entity.strip(), current_label))
                current_entity = ""
                current_label = None

    # Catch any remaining entity
    if current_entity:
        entities.append((current_entity.strip(), current_label))

    return entities

# Smoke test: run the extractor on a short hand-written sentence and print
# the (entity_text, entity_type) pairs it returns.
text = "I am skilled in Python and machine learning. I worked at Microsoft and studied at IIT Delhi."
entities = extract_entities(text)
print(" Extracted Entities:", entities)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📌 Extracted Entities: [('I', 'PERCENT'), ('machine', 'DATE'), ('learning', 'PRODUCT'), ('.', 'CARDINAL'), ('at', 'FAC'), ('at', 'FAC'), ('II', 'PERCENT'), ('Delhi', 'FAC')]


In [None]:
# Persist the trainer's model and the tokenizer side by side in one directory
# so both can be reloaded from the same path later.
trainer.save_model("final_bert_resume_ner")
tokenizer.save_pretrained("final_bert_resume_ner")


('final_bert_resume_ner/tokenizer_config.json',
 'final_bert_resume_ner/special_tokens_map.json',
 'final_bert_resume_ner/vocab.txt',
 'final_bert_resume_ner/added_tokens.json',
 'final_bert_resume_ner/tokenizer.json')

In [None]:
from transformers import BertTokenizerFast, BertForTokenClassification
import torch

# Directory written by the save cell; tokenizer and model are both read from it.
model_path = "final_bert_resume_ner"

# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Load model on CPU
model = BertForTokenClassification.from_pretrained(model_path).to("cpu")
# Switch to evaluation mode; as the last expression of the cell this also
# displays the model's module tree.
model.eval()


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
from collections import Counter

def score_match(cv_text, jd_text):
    """Score how well a CV covers the entity types found in a job description.

    Both texts go through ``extract_entities``; the second element of every
    returned entity (lower-cased) is counted, and the score is the share of
    the job description's counts that the CV also has.

    Args:
        cv_text: Resume text.
        jd_text: Job-description text.

    Returns:
        ``(score, cv_entities, jd_entities)`` where ``score`` is a percentage
        rounded to two decimals (0.0 when the job description yields nothing).
    """
    cv_entities = extract_entities(cv_text)
    jd_entities = extract_entities(jd_text)

    # Multiset of entity types on each side.
    cv_counts = Counter(ent[1].lower() for ent in cv_entities)
    jd_counts = Counter(ent[1].lower() for ent in jd_entities)

    total = sum(jd_counts.values())
    if total > 0:
        # Counter & Counter keeps the minimum count per key.
        shared = sum((cv_counts & jd_counts).values())
        match_score = shared / total
    else:
        match_score = 0.0

    return round(match_score * 100, 2), cv_entities, jd_entities


In [None]:
import re

# ------------------------------------------------------------------
# 1) ‑‑‑‑‑ simple “knowledge base” of skills / keywords
# ------------------------------------------------------------------
# Lower-case keywords/phrases the matcher looks for. Overlapping entries
# (e.g. "spring" and "spring boot") are separate keywords and each counts
# on its own.
SKILL_KEYWORDS = [
    # technologies
    "java", "spring boot", "spring",
    # architecture / role
    "microservices", "microservices architecture",
    "backend", "backend development",
    # education
    "m.tech", "computer science",
    # organisations
    "amazon", "nit trichy",
]

# ------------------------------------------------------------------
# 2) ‑‑‑‑‑ helper: extract the keywords that actually appear
# ------------------------------------------------------------------
def extract_entities(text: str, keywords=None) -> set[str]:
    """Return every keyword that occurs in ``text`` as a whole word/phrase.

    Matching is case-insensitive. A keyword only matches when it is not
    directly preceded or followed by a word character, so "java" does not
    match inside "javascript". Lookarounds are used instead of ``\\b`` so
    that keywords beginning or ending with punctuation ("c++", "c#",
    ".net") can match as well.

    Note: this definition shadows the model-based ``extract_entities``
    defined earlier in the notebook.

    Args:
        text: Text to scan.
        keywords: Iterable of keywords/phrases; defaults to ``SKILL_KEYWORDS``.

    Returns:
        The set of matched keywords, lower-cased.
    """
    if keywords is None:
        keywords = SKILL_KEYWORDS

    text_low = text.lower()
    found = set()
    for kw in keywords:
        kw_low = kw.lower()
        if not kw_low:
            # An empty keyword would "match" almost any text; ignore it.
            continue
        if re.search(rf"(?<!\w){re.escape(kw_low)}(?!\w)", text_low):
            found.add(kw_low)
    return found

# ------------------------------------------------------------------
# 3) ‑‑‑‑‑ main scorer
# ------------------------------------------------------------------
def score_match(cv_text: str, jd_text: str):
    """Percentage of the job description's keywords that the CV also contains.

    Args:
        cv_text: Resume text.
        jd_text: Job-description text.

    Returns:
        ``(score, cv_entities, jd_entities)``: the score is rounded to two
        decimals and is 0.0 when no keyword is found in the job description;
        the other two items are the keyword sets found in each text.
    """
    cv_entities = extract_entities(cv_text)
    jd_entities = extract_entities(jd_text)

    # Nothing to match against: report zero rather than dividing by zero.
    if len(jd_entities) == 0:
        return 0.0, cv_entities, jd_entities

    shared = cv_entities.intersection(jd_entities)
    coverage = len(shared) / len(jd_entities)
    return round(coverage * 100, 2), cv_entities, jd_entities

# ------------------------------------------------------------------
# 4) ‑‑‑‑‑ demo / test
# ------------------------------------------------------------------
# Demo: score one hand-written CV against one hand-written job description.
# Inside a notebook kernel __name__ is "__main__", so this runs when the cell
# is executed.
if __name__ == "__main__":
    cv_text = (
        "Software Engineer with 3+ years of experience in backend development "
        "using Java and Spring Boot. Previously at Amazon. Holds an M.Tech in "
        "Computer Science from NIT Trichy."
    )

    jd_text = (
        "We are hiring a backend developer with strong knowledge of Java, Spring Boot, "
        "and microservices architecture. Required qualification: M.Tech in Computer Science."
    )

    # score is a percentage; the two sets are the keywords found in each text.
    score, cv_ents, jd_ents = score_match(cv_text, jd_text)

    print("✅ Match Score:", score, "%")
    print("📄 CV Entities:", cv_ents)
    print("📝 JD Entities:", jd_ents)


✅ Match Score: 75.0 %
📄 CV Entities: {'m.tech', 'backend', 'amazon', 'spring', 'computer science', 'nit trichy', 'spring boot', 'backend development', 'java'}
📝 JD Entities: {'m.tech', 'backend', 'spring', 'computer science', 'microservices architecture', 'spring boot', 'microservices', 'java'}


In [None]:
# Negative example: a CV and a job description from unrelated fields, to show
# what the scorer returns when the CV contains none of the keywords found in
# the job description.
cv_text = """English Literature graduate with experience in editing, content writing, and publishing. Worked with Penguin Books and freelance blogs. Holds an M.A. in English."""
jd_text = """Hiring a frontend developer skilled in React, JavaScript, and responsive design. Bachelor's degree in Computer Science or related field is required."""

score, cv_ents, jd_ents = score_match(cv_text, jd_text)

print("✅ Match Score:", score, "%")
print("📄 CV Entities:", cv_ents)
print("📝 JD Entities:", jd_ents)


✅ Match Score: 0.0 %
📄 CV Entities: set()
📝 JD Entities: {'computer science'}
