In [1]:
!pip install transformers datasets torch scikit-learn pandas



In [3]:
import pandas as pd

# Location of the labelled resume CSV; it is expected to provide at least a
# `category` column (used below) alongside the resume text.
file_path = '/content/resume_classifier_dataset.csv'
df = pd.read_csv(file_path)

# Show the first rows, then how many rows each category has.
for summary in (df.head(), df['category'].value_counts()):
    print(summary)

                                         resume_text            category
0  Professional with 8 years of experience in Web...     Web Development
1  Professional with 8 years of experience in Mob...  Mobile Development
2  Professional with 2 years of experience in UI/...        UI/UX Design
3  Professional with 8 years of experience in Clo...   Cloud Engineering
4  Professional with 3 years of experience in Mob...  Mobile Development
category
Web Development       52
DevOps                47
UI/UX Design          44
Cloud Engineering     43
Cybersecurity         40
Data Science          39
Mobile Development    35
Name: count, dtype: int64


In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the category names, then store the resulting integer id
# of every row in a new `label` column.
label_encoder = LabelEncoder().fit(df["category"])
df["label"] = label_encoder.transform(df["category"])

# One output class per distinct category seen by the encoder.
num_labels = len(label_encoder.classes_)
print(f"Number of classes: {num_labels}")
print(label_encoder.classes_)

Number of classes: 7
['Cloud Engineering' 'Cybersecurity' 'Data Science' 'DevOps'
 'Mobile Development' 'UI/UX Design' 'Web Development']


In [5]:
# Human-readable lookup: category name -> integer class id, in encoder order.
label_mapping = {
    category: class_id
    for class_id, category in zip(range(num_labels), label_encoder.classes_)
}
print(label_mapping)

{'Cloud Engineering': 0, 'Cybersecurity': 1, 'Data Science': 2, 'DevOps': 3, 'Mobile Development': 4, 'UI/UX Design': 5, 'Web Development': 6}


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the encoded label,
# with a fixed random_state. Each part gets a fresh 0..n-1 index.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df["label"],
    )
)

In [7]:
from datasets import Dataset

# Wrap both pandas splits as Hugging Face `Dataset` objects.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the classifier.
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model sized to the number of resume categories.
# The load report printed below lists the classifier parameters as MISSING,
# i.e. newly initialised, so they must be trained on this task.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [9]:
def tokenize_function(example):
    """Tokenize the ``resume_text`` field of ``example``.

    Uses the module-level ``tokenizer`` with truncation to at most 128 tokens
    and ``padding=True``.

    Args:
        example: Mapping that contains a ``"resume_text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    # NOTE(review): padding=True pads to the longest sequence in the call, so
    # when this runs on one example at a time it presumably adds no padding.
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 128}
    return tokenizer(example["resume_text"], **tokenizer_options)

# Tokenize every row, then rename `label` -> `labels` on both splits.
train_dataset = train_dataset.map(tokenize_function).rename_column("label", "labels")
val_dataset = val_dataset.map(tokenize_function).rename_column("label", "labels")

# Expose only these three columns, as torch tensors.
model_columns = ["input_ids", "attention_mask", "labels"]
for split in (train_dataset, val_dataset):
    split.set_format("torch", columns=model_columns)

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [13]:
from transformers import Trainer, TrainingArguments, IntervalStrategy

# Training configuration: 3 epochs, batches of 4 for both training and
# evaluation, evaluation at the end of every epoch, and checkpoint saving
# switched off (so load_best_model_at_end stays False).
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy=IntervalStrategy.EPOCH,  # this argument was previously named evaluation_strategy
    save_strategy=IntervalStrategy.NO,
    logging_steps=50,
    load_best_model_at_end=False,
)

# Bind the model, its arguments and both dataset splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
print("Training arguments and Trainer successfully initialized.")

Training arguments and Trainer successfully initialized.


In [15]:
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer; it pads the examples of each batch so
# they can be stacked into a single tensor.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Rebuild the Trainer with the collator attached; this replaces any `trainer`
# created earlier with the same model, arguments and datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune; the per-epoch training/validation loss table is printed below.
trainer.train()
print("Model training completed.")

Epoch,Training Loss,Validation Loss
1,1.28323,0.199765
2,0.164669,0.032494
3,0.041417,0.022972


Model training completed.


In [16]:
# Run a final evaluation pass with the trainer and print the metrics dict it
# returns (loss and runtime statistics; no accuracy metric is configured).
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

Evaluation Results: {'eval_loss': 0.022972384467720985, 'eval_runtime': 0.3901, 'eval_samples_per_second': 153.821, 'eval_steps_per_second': 38.455, 'epoch': 3.0}


In [17]:
# Write the fine-tuned model and its tokenizer into the same directory so the
# pair can later be loaded from a single path.
model.save_pretrained(save_directory="resume_classifier_model")
tokenizer.save_pretrained(save_directory="resume_classifier_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('resume_classifier_model/tokenizer_config.json',
 'resume_classifier_model/tokenizer.json')

In [19]:
from transformers import pipeline
import pickle

# Persist the fitted label encoder so that class ids can be mapped back to
# category names when only the saved artifacts are available.
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

# Build an inference pipeline from the directory the model and tokenizer were
# saved to.
classifier = pipeline(
    "text-classification",
    model="resume_classifier_model",
    tokenizer="resume_classifier_model"
)

# Reload the encoder from disk, as an inference-only session would.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook or another trusted source.
with open("label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

text = """
Experienced Python developer with Machine Learning,
TensorFlow, and data analysis experience.
"""

# Truncate the same way the training data was tokenized (max_length=128).
# Without truncation a long resume can exceed the model's maximum sequence
# length and fail at inference time.
prediction = classifier(text, truncation=True, max_length=128)[0]

# Resolve the predicted label string to its integer class id through the
# model config instead of parsing a "LABEL_<n>" string, so the lookup keeps
# working if the config carries descriptive label names.
predicted_label = int(classifier.model.config.label2id[prediction["label"]])
predicted_category = label_encoder.inverse_transform([predicted_label])[0]

print("Predicted Category:", predicted_category)


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

Predicted Category: Data Science
