In [2]:
# Step 1: Install all needed libraries
!pip install transformers pandas torch scikit-learn sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
# Step 2: Upload the dataset
from google.colab import files

uploaded = files.upload()  # Upload UpdatedResumeDataSet.csv


Saving UpdatedResumeDataSet.csv to UpdatedResumeDataSet (1).csv


In [4]:
# Step 3: Import libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import json


In [5]:
# Step 4: Load your dataset
df = pd.read_csv('UpdatedResumeDataSet.csv', encoding='latin1')

# Keep necessary columns
df = df[['Resume', 'Category']]
df.head()


Unnamed: 0,Resume,Category
0,Skills * Programming Languages: Python (pandas...,Data Science
1,Education Details \r\nMay 2013 to May 2017 B.E...,Data Science
2,"Areas of Interest Deep Learning, Control Syste...",Data Science
3,Skills Ã¢ÂÂ¢ R Ã¢ÂÂ¢ Python Ã¢ÂÂ¢ SAP HANA ...,Data Science
4,"Education Details \r\n MCA YMCAUST, Faridab...",Data Science


In [6]:
# Check how many unique job categories you have
unique_categories = df['Category'].unique()

# Print all unique categories
print("Unique Job Categories:")
for idx, category in enumerate(unique_categories):
    print(f"{idx}: {category}")

# Also print how many categories
print(f"\nTotal number of categories: {len(unique_categories)}")


Unique Job Categories:
0: Data Science
1: HR
2: Advocate
3: Arts
4: Web Designing
5: Mechanical Engineer
6: Sales
7: Health and fitness
8: Civil Engineer
9: Java Developer
10: Business Analyst
11: SAP Developer
12: Automation Testing
13: Electrical Engineering
14: Operations Manager
15: Python Developer
16: DevOps Engineer
17: Network Security Engineer
18: PMO
19: Database
20: Hadoop
21: ETL Developer
22: DotNet Developer
23: Blockchain
24: Testing

Total number of categories: 25


In [7]:
# Step 5: Encode Labels
labels = df['Category'].unique()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

df['Label'] = df['Category'].map(label2id)
df.head()


Unnamed: 0,Resume,Category,Label
0,Skills * Programming Languages: Python (pandas...,Data Science,0
1,Education Details \r\nMay 2013 to May 2017 B.E...,Data Science,0
2,"Areas of Interest Deep Learning, Control Syste...",Data Science,0
3,Skills Ã¢ÂÂ¢ R Ã¢ÂÂ¢ Python Ã¢ÂÂ¢ SAP HANA ...,Data Science,0
4,"Education Details \r\n MCA YMCAUST, Faridab...",Data Science,0


In [8]:
# Step 6: Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Resume'].tolist(),
    df['Label'].tolist(),
    test_size=0.2,
    random_state=42
)


In [9]:
# Step 7: Tokenize resumes
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Step 8: Create Dataset class
class ResumeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

train_dataset = ResumeDataset(train_encodings, train_labels)
val_dataset = ResumeDataset(val_encodings, val_labels)


In [11]:
# Step 9: Load pre-trained BERT
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(labels))


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
!pip install --upgrade transformers




In [11]:
!pip uninstall -y transformers
!pip install transformers==4.30.0


Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Collecting transformers==4.30.0
  Using cached transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.0)
  Using cached tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.30.0-py3-none-any.whl (7.2 MB)
Using cached tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformer

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,         # 👈 Train for 1 epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10
)



In [13]:
# Step 11: Setup Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    accuracy = (preds == labels).mean()
    return {'accuracy': accuracy}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [14]:
# Step 12: Start Training
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mashrithakotagiri1710[0m ([33mashrithakotagiri1710-university-of-north-florida[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,3.3059
20,3.236
30,3.0913
40,2.9328
50,2.7721
60,2.5875
70,2.3155
80,2.2017
90,1.9423
100,1.8354


TrainOutput(global_step=291, training_loss=1.339921170391168, metrics={'train_runtime': 243.7761, 'train_samples_per_second': 9.464, 'train_steps_per_second': 1.194, 'total_flos': 607122554323968.0, 'train_loss': 1.339921170391168, 'epoch': 3.0})

In [15]:
# Step 13: Save Model and Tokenizer
model.save_pretrained('./resume_classifier_model')
tokenizer.save_pretrained('./resume_classifier_model')

# Save Label Mappings
import json
with open('./resume_classifier_model/label2id.json', 'w') as f:
    json.dump(label2id, f)

with open('./resume_classifier_model/id2label.json', 'w') as f:
    json.dump(id2label, f)

print("✅ Model, Tokenizer, and Label mappings saved successfully!")


✅ Model, Tokenizer, and Label mappings saved successfully!


In [16]:
# Step 14: Zip and Download
import shutil
from google.colab import files

shutil.make_archive('resume_classifier_model', 'zip', './resume_classifier_model')
files.download('resume_classifier_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json

# Load fine-tuned model
model = BertForSequenceClassification.from_pretrained('./resume_classifier_model')
tokenizer = BertTokenizer.from_pretrained('./resume_classifier_model')

# Load label mappings
with open('./resume_classifier_model/id2label.json', 'r') as f:
    id2label = json.load(f)


In [18]:
# Function to predict category for a new resume
def predict_resume(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return id2label[str(prediction)]


In [19]:
# Example: Predict on a sample resume
sample_resume = """
Experienced java Developer with expertise in java, springboot.
Worked on large-scale projects involving java based projects .
"""

predicted_category = predict_resume(sample_resume)
print(f"Predicted Job Category: {predicted_category}")


Predicted Job Category: Java Developer


In [20]:
from sentence_transformers import SentenceTransformer

# Load pre-trained sentence embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]