<a href="https://colab.research.google.com/github/COdiwuor/Job-Title-Embedding-Model-with-Synthetic-Data/blob/main/Job_Title_Embedding_Model_with_Synthetic_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, accuracy_score
import random
from typing import List, Tuple, Dict
import json

In [None]:
class JobTitleDataset(Dataset):
    """Map-style dataset pairing job-title strings with integer labels.

    Each item is tokenized on access, padded/truncated to ``max_length``,
    and returned as a dict of tensors ready for default batching.

    Args:
        titles: Job-title strings.
        labels: One integer class label per title (same order as ``titles``).
        tokenizer: Callable invoked as
            ``tokenizer(title, truncation=..., max_length=..., padding=...,
            return_tensors='pt')`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Sequence length every title is padded/truncated to.
    """

    def __init__(self, titles: List[str], labels: List[int], tokenizer, max_length: int = 64):
        self.titles = titles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        """Return the tokenized title and label at position ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (leading batch
            dimension removed) and ``'labels'`` (0-d ``torch.long`` tensor).
        """
        title = self.titles[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            title,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # Drop only the leading batch dimension. A bare ``squeeze()`` would
        # also remove the sequence dimension whenever it has size 1
        # (e.g. ``max_length=1``), yielding 0-d tensors.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
class JobTitleCoLAModel(nn.Module):
    """Two-class job-title classifier: pretrained encoder + dropout + linear head.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)``; the
    head maps the encoder's ``hidden_size`` features to 2 logits.
    """

    def __init__(self, model_name: str = "google/bert_uncased_L-2_H-128_A-2"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a tokenized batch.

        The second element of the encoder output is used as the sequence
        representation (presumably the pooled output for BERT-style
        encoders — confirm for other architectures), passed through dropout
        and then the linear head.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sentence_repr = self.dropout(encoder_out[1])
        return self.classifier(sentence_repr)

In [8]:
class JobTitleValidator:
    """Train and apply a binary (acceptable / unacceptable) job-title classifier.

    Loads a tokenizer and a ``JobTitleCoLAModel`` for ``model_name`` and moves
    the model to CUDA when available, otherwise to the CPU.
    """

    def __init__(self, model_name: str = "google/bert_uncased_L-2_H-128_A-2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = JobTitleCoLAModel(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def generate_synthetic_data(self, n_samples: int = 1000) -> Tuple[List[str], List[int]]:
        """Generate shuffled synthetic job titles with acceptability labels.

        Valid titles (label 1) are ``[level] department role`` where the level
        is included with probability 0.7. Invalid titles (label 0) are drawn
        from a fixed set of malformed patterns. Randomness comes from the
        global ``random`` module, so seed it beforehand for reproducibility.

        Args:
            n_samples: Total number of titles to generate. ``n_samples // 2``
                are valid and the remainder invalid; values <= 0 yield two
                empty lists.

        Returns:
            ``(titles, labels)`` as two lists of equal length ``n_samples``.
        """
        # Basic job components
        roles = ["engineer", "manager", "analyst", "developer", "specialist"]
        departments = ["software", "data", "product", "sales", "marketing"]
        levels = ["senior", "junior", "lead", "principal", "associate"]

        n_samples = max(n_samples, 0)
        n_valid = n_samples // 2
        # Give the odd sample (if any) to the invalid class so the total
        # always equals ``n_samples``.
        n_invalid = n_samples - n_valid

        # Generate valid titles
        valid_titles = []
        for _ in range(n_valid):
            components = []
            if random.random() < 0.7:
                components.append(random.choice(levels))
            components.append(random.choice(departments))
            components.append(random.choice(roles))
            valid_titles.append(" ".join(components))

        # Generate invalid titles
        invalid_patterns = [
            lambda: f"{random.choice(roles)} {random.choice(roles)}",  # Double roles
            lambda: f"{random.choice(levels)} {random.choice(levels)} {random.choice(roles)}",  # Double levels
            lambda: random.choice(roles).upper(),  # All caps
            lambda: f"{random.choice(departments)}!!!{random.choice(roles)}",  # Invalid characters
            lambda: f"{random.choice(roles)}{random.choice(departments)}"  # No spaces
        ]

        invalid_titles = []
        for _ in range(n_invalid):
            pattern = random.choice(invalid_patterns)
            invalid_titles.append(pattern())

        titles = valid_titles + invalid_titles
        labels = [1] * len(valid_titles) + [0] * len(invalid_titles)

        # ``zip(*[])`` cannot be unpacked into two names, so return early.
        if not titles:
            return [], []

        # Shuffle titles and labels together
        combined = list(zip(titles, labels))
        random.shuffle(combined)
        titles, labels = zip(*combined)

        return list(titles), list(labels)

    def prepare_dataloader(self, titles: List[str], labels: List[int], batch_size: int = 32,
                           shuffle: bool = True) -> DataLoader:
        """Create a DataLoader over ``titles``/``labels`` for training or evaluation.

        Args:
            titles: Job-title strings.
            labels: One integer label per title.
            batch_size: Number of samples per batch.
            shuffle: Whether to reshuffle every epoch. Defaults to ``True``
                (the historical behaviour); pass ``False`` for evaluation data.

        Returns:
            A ``DataLoader`` wrapping a ``JobTitleDataset`` built with this
            validator's tokenizer.
        """
        dataset = JobTitleDataset(titles, labels, self.tokenizer)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    def train(self, train_dataloader: DataLoader, eval_dataloader: DataLoader = None,
              epochs: int = 3, learning_rate: float = 2e-5):
        """Fine-tune the model with cross-entropy loss and a linear LR decay.

        Prints the average training loss after every epoch and, when
        ``eval_dataloader`` is given, the metrics returned by ``evaluate``.

        Args:
            train_dataloader: Batches with ``'input_ids'``,
                ``'attention_mask'`` and ``'labels'`` tensors.
            eval_dataloader: Optional evaluation batches of the same layout.
            epochs: Number of passes over ``train_dataloader``.
            learning_rate: Initial learning rate for AdamW.
        """
        # torch.optim.AdamW replaces the deprecated transformers.AdamW.
        # NOTE(review): eps / weight_decay are set to what are believed to be
        # the old transformers defaults (1e-6, 0.0) — confirm.
        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
        )
        num_batches = len(train_dataloader)
        total_steps = num_batches * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
        # Stateless loss module: build once instead of once per batch.
        criterion = nn.CrossEntropyLoss()

        for epoch in range(epochs):
            # evaluate() switches the model to eval mode, so training mode
            # must be re-enabled at the start of every epoch; otherwise
            # dropout stays disabled from the second epoch onwards.
            self.model.train()
            total_loss = 0.0

            for batch in train_dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                optimizer.zero_grad()
                outputs = self.model(input_ids, attention_mask)
                loss = criterion(outputs, labels)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                total_loss += loss.item()

            # Avoid ZeroDivisionError on an empty training loader.
            avg_loss = total_loss / num_batches if num_batches else float('nan')
            print(f"Epoch {epoch+1} - Average loss: {avg_loss:.4f}")

            # Explicit None check: a DataLoader's truthiness is its length.
            if eval_dataloader is not None:
                metrics = self.evaluate(eval_dataloader)
                print(f"Evaluation metrics: {metrics}")

    def evaluate(self, eval_dataloader: DataLoader) -> Dict[str, float]:
        """Compute classification metrics over ``eval_dataloader``.

        Puts the model in eval mode (and leaves it there) and runs inference
        without gradient tracking.

        Returns:
            Dict with ``'matthews_correlation'`` and ``'accuracy'``.
        """
        self.model.eval()
        predictions = []
        true_labels = []

        with torch.no_grad():
            for batch in eval_dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels']

                outputs = self.model(input_ids, attention_mask)
                predictions.extend(torch.argmax(outputs, dim=1).cpu().numpy())
                true_labels.extend(labels.numpy())

        return {
            'matthews_correlation': matthews_corrcoef(true_labels, predictions),
            'accuracy': accuracy_score(true_labels, predictions)
        }

    def predict(self, title: str) -> Dict[str, float]:
        """Predict the acceptability of a single job title.

        Args:
            title: Job-title string to classify.

        Returns:
            Dict with ``'acceptable'`` (``True`` when class 1 has the highest
            probability) and ``'confidence'`` (softmax probability of the
            predicted class).
        """
        self.model.eval()
        encoding = self.tokenizer(
            title,
            truncation=True,
            max_length=64,
            padding='max_length',
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask)
            probabilities = torch.softmax(outputs, dim=1)

        # Reduce over the class dimension explicitly rather than over the
        # flattened tensor; row 0 is the single title passed in.
        confidence, predicted_class = probabilities.max(dim=1)

        return {
            'acceptable': bool(predicted_class[0].item() == 1),
            'confidence': float(confidence[0].item())
        }

def main(seed: int = 42):
    """Run the end-to-end demo: generate data, train, evaluate, and predict.

    Args:
        seed: Seed applied to Python's ``random``, NumPy and PyTorch before
            anything random happens, and used for the train/eval split.
            Without it only the split was seeded, so data generation, weight
            initialisation and batch shuffling differed on every run.
    """
    # Seed first: building the validator initialises the classifier head
    # from the torch RNG, and data generation uses the ``random`` module.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Initialize validator
    validator = JobTitleValidator()

    # Generate synthetic data
    titles, labels = validator.generate_synthetic_data(n_samples=2000)

    # Split data
    train_titles, eval_titles, train_labels, eval_labels = train_test_split(
        titles, labels, test_size=0.2, random_state=seed
    )

    # Create dataloaders
    train_dataloader = validator.prepare_dataloader(train_titles, train_labels)
    eval_dataloader = validator.prepare_dataloader(eval_titles, eval_labels)

    # Train model
    validator.train(train_dataloader, eval_dataloader)

    # Test examples: a mix of well-formed titles, malformed strings, and
    # titles built from words absent from the synthetic vocabulary.
    test_titles = [
        "Senior Software Engineer",
        "Data Scientist",
        "MANAGER!!!DEVELOPER",
        "LeadLeadAnalyst",
        "Principal Product Manager",
        "Cloud Computing Engineer",
        "Computer Support Specialist",
        "Information Technology Analyst",
        "Information Security Specialist",
        "coder",
        "programmer"
    ]

    print("\nTesting job titles:")
    for title in test_titles:
        result = validator.predict(title)
        print(f"\nTitle: {title}")
        print(f"Acceptable: {result['acceptable']}")
        print(f"Confidence: {result['confidence']:.3f}")


# Run the demo only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()



Epoch 1 - Average loss: 0.6723
Evaluation metrics: {'matthews_correlation': 0.6094587573677411, 'accuracy': 0.79}
Epoch 2 - Average loss: 0.5188
Evaluation metrics: {'matthews_correlation': 0.9704316945960851, 'accuracy': 0.985}
Epoch 3 - Average loss: 0.4266
Evaluation metrics: {'matthews_correlation': 0.9950121889614717, 'accuracy': 0.9975}

Testing job titles:

Title: Senior Software Engineer
Acceptable: True
Confidence: 0.710

Title: Data Scientist
Acceptable: True
Confidence: 0.701

Title: MANAGER!!!DEVELOPER
Acceptable: False
Confidence: 0.744

Title: LeadLeadAnalyst
Acceptable: False
Confidence: 0.724

Title: Principal Product Manager
Acceptable: True
Confidence: 0.683

Title: Cloud Computing Engineer
Acceptable: True
Confidence: 0.633

Title: Computer Support Specialist
Acceptable: True
Confidence: 0.654

Title: Information Technology Analyst
Acceptable: True
Confidence: 0.679

Title: Information Security Specialist
Acceptable: True
Confidence: 0.722

Title: coder
Acceptable: F