# Baseline Models Training

**Project:** HEARTS Adaptation - Gender Bias Detection  
**SDG Alignment:** SDG 5 (Gender Equality) & SDG 8 (Decent Work and Economic Growth)  
**Task:** Binary classification (Biased vs. Non-Biased job descriptions)

This notebook trains three baseline models for comparison with Alvert-v2, DistilBERT and BERT:
1. **LR - TF-IDF**: Logistic Regression with TF-IDF features
2. **DistilRoBERTa-Bias**: Fine-tuned DistilRoBERTa model for gender bias detection
3. **LR - Embeddings**: Logistic Regression using embeddings from pre-trained transformer model

**Note:** Evaluation of these models is performed in `03_Model_Evaluation.ipynb`

In [21]:
# Import required libraries
import pandas as pd
import numpy as np
import os
import pickle
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, precision_recall_fscore_support, 
    balanced_accuracy_score, confusion_matrix, roc_curve, auc,
    precision_recall_curve, roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns

# Set up paths
current_dir = Path.cwd()
if current_dir.name == 'notebooks':
    project_root = current_dir.parent
else:
    project_root = current_dir

data_dir = project_root / 'data'
models_dir = project_root / 'models'
results_dir = project_root / 'results'

print(f"Project root: {project_root}")
print(f"Data directory: {data_dir}")
print(f"Models directory: {models_dir}")
print(f"Results directory: {results_dir}")

Project root: D:\Coursework\Project Replication\HEARTS-Gender-Bias-Job-Descriptions
Data directory: D:\Coursework\Project Replication\HEARTS-Gender-Bias-Job-Descriptions\data
Models directory: D:\Coursework\Project Replication\HEARTS-Gender-Bias-Job-Descriptions\models
Results directory: D:\Coursework\Project Replication\HEARTS-Gender-Bias-Job-Descriptions\results


## Load Preprocessed Data

In [12]:
# Load preprocessed data
def load_preprocessed_data(data_dir=None):
    """Load preprocessed train, val, and test data"""
    if data_dir is None:
        data_dir = project_root / 'data'
    
    # Load from splits directory
    if isinstance(data_dir, Path):
        splits_dir = data_dir / 'splits'
        train_path = splits_dir / 'train.csv'
        val_path = splits_dir / 'val.csv'
        test_path = splits_dir / 'test.csv'
    else:
        splits_dir = os.path.join(data_dir, 'splits')
        train_path = os.path.join(splits_dir, 'train.csv')
        val_path = os.path.join(splits_dir, 'val.csv')
        test_path = os.path.join(splits_dir, 'test.csv')
    
    # Check if files exist
    missing_files = []
    if not os.path.exists(str(train_path)):
        missing_files.append(str(train_path))
    if not os.path.exists(str(val_path)):
        missing_files.append(str(val_path))
    if not os.path.exists(str(test_path)):
        missing_files.append(str(test_path))
    
    if missing_files:
        raise FileNotFoundError(
            f"Preprocessed data not found. Please run 01_Data_Loading_Preprocessing.ipynb first.\n"
            f"Missing files: {missing_files}"
        )
    
    # Load data
    train_data = pd.read_csv(str(train_path))
    val_data = pd.read_csv(str(val_path))
    test_data = pd.read_csv(str(test_path))
    
    print(f"   Loaded train split: {len(train_data)} examples")
    print(f"   Loaded val split: {len(val_data)} examples")
    print(f"   Loaded test data: {len(test_data)} examples")
    
    return train_data, val_data, test_data

# Load data
train_data, val_data, test_data = load_preprocessed_data(data_dir)

   Loaded train split: 14065 examples
   Loaded val split: 3517 examples
   Loaded test data: 4396 examples


## Train Baseline Model 1: LR - TF-IDF

In [13]:
# Training configuration
BASELINE_CONFIG = {
    'max_features': 10000,  # Limit TF-IDF features to manage memory
    'ngram_range': (1, 2),  # Unigrams and bigrams
    'max_iter': 1000,  # Maximum iterations for logistic regression
    'random_state': 42,
    'C': 1.0  # Regularization strength
}

print("=" * 60)
print("BASELINE MODEL CONFIGURATION")
print("=" * 60)
for key, value in BASELINE_CONFIG.items():
    print(f"{key}: {value}")
print("=" * 60)

BASELINE MODEL CONFIGURATION
max_features: 10000
ngram_range: (1, 2)
max_iter: 1000
random_state: 42
C: 1.0


In [14]:
# Combine train and val for training (baseline models typically use all available training data)
print("=" * 60)
print("TRAINING BASELINE MODEL")
print("=" * 60)

# Combine train and validation sets for training
train_val_data = pd.concat([train_data, val_data], ignore_index=True)
print(f"Combined training data: {len(train_val_data)} examples")

# Extract text and labels
X_train = train_val_data['text'].values
y_train = train_val_data['label'].values

X_test = test_data['text'].values
y_test = test_data['label'].values

print(f"Training samples: {len(X_train):,}")
print(f"Test samples: {len(X_test):,}")

# Initialize TF-IDF vectorizer
print("\nFitting TF-IDF vectorizer...")
vectorizer = TfidfVectorizer(
    max_features=BASELINE_CONFIG['max_features'],
    ngram_range=BASELINE_CONFIG['ngram_range'],
    stop_words='english',  # Remove common English stop words
    lowercase=True,
    min_df=2,  # Minimum document frequency
    max_df=0.95  # Maximum document frequency (remove very common words)
)

# Fit and transform training data
X_train_tfidf = vectorizer.fit_transform(X_train)
print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

# Transform test data
X_test_tfidf = vectorizer.transform(X_test)
print(f"Test TF-IDF matrix shape: {X_test_tfidf.shape}")

# Train Logistic Regression model
print("\nTraining Logistic Regression model...")
lr_model = LogisticRegression(
    max_iter=BASELINE_CONFIG['max_iter'],
    random_state=BASELINE_CONFIG['random_state'],
    C=BASELINE_CONFIG['C'],
    solver='liblinear',  # Good for small datasets
    class_weight='balanced'  # Handle class imbalance
)

lr_model.fit(X_train_tfidf, y_train)
print("   Model training completed!")

TRAINING BASELINE MODEL
Combined training data: 17582 examples
Training samples: 17,582
Test samples: 4,396

Fitting TF-IDF vectorizer...
TF-IDF matrix shape: (17582, 10000)
Test TF-IDF matrix shape: (4396, 10000)

Training Logistic Regression model...
   Model training completed!


## Save Model

In [15]:
# Save model and vectorizer
baseline_model_dir = models_dir / 'job_descriptions' / 'baseline_lr_tfidf'
os.makedirs(baseline_model_dir, exist_ok=True)

# Save model
with open(baseline_model_dir / 'lr_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

# Save vectorizer
with open(baseline_model_dir / 'tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print(f"   Model saved to: {baseline_model_dir}")
print(f"   - lr_model.pkl")
print(f"   - tfidf_vectorizer.pkl")

   Model saved to: D:\Coursework\Project Replication\HEARTS-Gender-Bias-Job-Descriptions\models\job_descriptions\baseline_lr_tfidf
   - lr_model.pkl
   - tfidf_vectorizer.pkl


## Train Baseline Model 2: DistilRoBERTa-Bias

This section trains a DistilRoBERTa model fine-tuned for gender bias detection.


In [16]:
# Import transformers libraries for DistilRoBERTa
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import torch
from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score

# Training configuration for DistilRoBERTa
DISTILROBERTA_CONFIG = {
    'model_path': 'distilbert/distilroberta-base',
    'batch_size': 8,
    'epochs': 3,
    'learning_rate': 2e-5,
    'gradient_accumulation_steps': 4,
    'seed': 42
}

print("=" * 60)
print("DISTILROBERTA-BIAS CONFIGURATION")
print("=" * 60)
for key, value in DISTILROBERTA_CONFIG.items():
    print(f"{key}: {value}")
print("=" * 60)


DISTILROBERTA-BIAS CONFIGURATION
model_path: distilbert/distilroberta-base
batch_size: 8
epochs: 3
learning_rate: 2e-05
gradient_accumulation_steps: 4
seed: 42


In [17]:
# Set random seeds
np.random.seed(DISTILROBERTA_CONFIG['seed'])
torch.manual_seed(DISTILROBERTA_CONFIG['seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(DISTILROBERTA_CONFIG['seed'])

# Combine train and val for training
train_val_data = pd.concat([train_data, val_data], ignore_index=True)
print(f"Combined training data: {len(train_val_data)} examples")

# Load model and tokenizer
print(f"\nLoading DistilRoBERTa model: {DISTILROBERTA_CONFIG['model_path']}")
model = AutoModelForSequenceClassification.from_pretrained(
    DISTILROBERTA_CONFIG['model_path'],
    num_labels=2,
    ignore_mismatched_sizes=True
)
tokenizer = AutoTokenizer.from_pretrained(DISTILROBERTA_CONFIG['model_path'])

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=512
    )

# Tokenize datasets
print("\nTokenizing datasets...")
tokenized_train = Dataset.from_pandas(train_val_data).map(
    tokenize_function,
    batched=True
).map(lambda examples: {'labels': examples['label']})

tokenized_val = Dataset.from_pandas(val_data).map(
    tokenize_function,
    batched=True
).map(lambda examples: {'labels': examples['label']})

print(f"Tokenized training samples: {len(tokenized_train)}")
print(f"Tokenized validation samples: {len(tokenized_val)}")

# Metrics computation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro'
    )
    balanced_acc = balanced_accuracy_score(labels, predictions)
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "balanced_accuracy": balanced_acc
    }

# Set up output directory
distilroberta_model_dir = models_dir / 'job_descriptions' / 'baseline_distilroberta_bias'
os.makedirs(distilroberta_model_dir, exist_ok=True)
output_dir_str = str(distilroberta_model_dir)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir_str,
    num_train_epochs=DISTILROBERTA_CONFIG['epochs'],
    evaluation_strategy="epoch",
    learning_rate=DISTILROBERTA_CONFIG['learning_rate'],
    per_device_train_batch_size=DISTILROBERTA_CONFIG['batch_size'],
    per_device_eval_batch_size=DISTILROBERTA_CONFIG['batch_size'],
    gradient_accumulation_steps=DISTILROBERTA_CONFIG['gradient_accumulation_steps'],
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=1,
    logging_dir=os.path.join(output_dir_str, 'logs'),
    logging_steps=100,
    report_to="none",
    seed=DISTILROBERTA_CONFIG['seed'],
    fp16=torch.cuda.is_available(),
    lr_scheduler_type="constant",
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics
)

# Train model
print("\n" + "=" * 60)
print("TRAINING DISTILROBERTA-BIAS")
print("=" * 60)
trainer.train()

# Save model
print("\nSaving DistilRoBERTa model...")
trainer.save_model(output_dir_str)
tokenizer.save_pretrained(output_dir_str)
print(f"âœ… DistilRoBERTa model saved to: {distilroberta_model_dir}")


Combined training data: 17582 examples

Loading DistilRoBERTa model: distilbert/distilroberta-base


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


Tokenizing datasets...


Map:   0%|          | 0/17582 [00:00<?, ? examples/s]

Map:   0%|          | 0/17582 [00:00<?, ? examples/s]

Map:   0%|          | 0/3517 [00:00<?, ? examples/s]

Map:   0%|          | 0/3517 [00:00<?, ? examples/s]

Tokenized training samples: 17582
Tokenized validation samples: 3517


  trainer = Trainer(



TRAINING DISTILROBERTA-BIAS


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Balanced Accuracy
0,0.6685,0.6541,0.644301,0.608854,0.578996,0.608854
2,0.5802,0.538206,0.722587,0.722619,0.722601,0.722619



Saving DistilRoBERTa model...
âœ… DistilRoBERTa model saved to: D:\Coursework\Project Replication\HEARTS-Gender-Bias-Job-Descriptions\models\job_descriptions\baseline_distilroberta_bias


## Train Baseline Model 3: LR - Embeddings

This section trains a Logistic Regression model using embeddings from a pre-trained transformer model.


In [18]:
# Configuration for LR - Embeddings
LR_EMBEDDINGS_CONFIG = {
    'embedding_model': 'distilbert/distilroberta-base',  # Model to extract embeddings from
    'batch_size': 32,  # Batch size for embedding extraction
    'max_iter': 1000,  # Maximum iterations for logistic regression
    'random_state': 42,
    'C': 1.0  # Regularization strength
}

print("=" * 60)
print("LR - EMBEDDINGS CONFIGURATION")
print("=" * 60)
for key, value in LR_EMBEDDINGS_CONFIG.items():
    print(f"{key}: {value}")
print("=" * 60)


LR - EMBEDDINGS CONFIGURATION
embedding_model: distilbert/distilroberta-base
batch_size: 32
max_iter: 1000
random_state: 42
C: 1.0


In [19]:
# Set random seeds
np.random.seed(LR_EMBEDDINGS_CONFIG['random_state'])
torch.manual_seed(LR_EMBEDDINGS_CONFIG['random_state'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(LR_EMBEDDINGS_CONFIG['random_state'])

# Combine train and val for training
train_val_data = pd.concat([train_data, val_data], ignore_index=True)
print(f"Combined training data: {len(train_val_data)} examples")

# Load model for embeddings (without classification head)
from transformers import AutoModel
print(f"\nLoading embedding model: {LR_EMBEDDINGS_CONFIG['embedding_model']}")
embedding_model = AutoModel.from_pretrained(LR_EMBEDDINGS_CONFIG['embedding_model'])
embedding_tokenizer = AutoTokenizer.from_pretrained(LR_EMBEDDINGS_CONFIG['embedding_model'])

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_model.to(device)
embedding_model.eval()

print(f"Using device: {device}")

# Function to extract embeddings
def extract_embeddings(texts, model, tokenizer, device, batch_size=32):
    """Extract embeddings from texts using the model"""
    embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        
        # Tokenize
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        
        # Move to device
        encoded = {k: v.to(device) for k, v in encoded.items()}
        
        # Get embeddings (use [CLS] token embedding)
        with torch.no_grad():
            outputs = model(**encoded)
            # Use mean pooling of all token embeddings
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        
        embeddings.append(batch_embeddings)
    
    return np.vstack(embeddings)

# Extract embeddings for training data
print("\nExtracting embeddings for training data...")
X_train_texts = train_val_data['text'].values.tolist()
X_train_embeddings = extract_embeddings(
    X_train_texts, 
    embedding_model, 
    embedding_tokenizer, 
    device,
    batch_size=LR_EMBEDDINGS_CONFIG['batch_size']
)
y_train = train_val_data['label'].values

print(f"Training embeddings shape: {X_train_embeddings.shape}")

# Extract embeddings for test data
print("\nExtracting embeddings for test data...")
X_test_texts = test_data['text'].values.tolist()
X_test_embeddings = extract_embeddings(
    X_test_texts,
    embedding_model,
    embedding_tokenizer,
    device,
    batch_size=LR_EMBEDDINGS_CONFIG['batch_size']
)
y_test = test_data['label'].values

print(f"Test embeddings shape: {X_test_embeddings.shape}")

# Train Logistic Regression on embeddings
print("\nTraining Logistic Regression on embeddings...")
lr_embeddings_model = LogisticRegression(
    max_iter=LR_EMBEDDINGS_CONFIG['max_iter'],
    random_state=LR_EMBEDDINGS_CONFIG['random_state'],
    C=LR_EMBEDDINGS_CONFIG['C'],
    solver='liblinear',
    class_weight='balanced'
)

lr_embeddings_model.fit(X_train_embeddings, y_train)
print("   Model training completed!")


Combined training data: 17582 examples

Loading embedding model: distilbert/distilroberta-base
Using device: cuda

Extracting embeddings for training data...
Training embeddings shape: (17582, 768)

Extracting embeddings for test data...
Test embeddings shape: (4396, 768)

Training Logistic Regression on embeddings...
   Model training completed!


## Save LR - Embeddings Model


In [20]:
# Save LR - Embeddings model
baseline_lr_embeddings_dir = models_dir / 'job_descriptions' / 'baseline_lr_embeddings'
os.makedirs(baseline_lr_embeddings_dir, exist_ok=True)

# Save model
with open(baseline_lr_embeddings_dir / 'lr_model.pkl', 'wb') as f:
    pickle.dump(lr_embeddings_model, f)

# Save embedding model name (for evaluation)
with open(baseline_lr_embeddings_dir / 'embedding_model_name.txt', 'w') as f:
    f.write(LR_EMBEDDINGS_CONFIG['embedding_model'])

print(f"   LR - Embeddings model saved to: {baseline_lr_embeddings_dir}")
print(f"   - lr_model.pkl")
print(f"   - embedding_model_name.txt")
print(f"\nNote: Evaluation will be performed in 03_Model_Evaluation.ipynb")


   LR - Embeddings model saved to: D:\Coursework\Project Replication\HEARTS-Gender-Bias-Job-Descriptions\models\job_descriptions\baseline_lr_embeddings
   - lr_model.pkl
   - embedding_model_name.txt

Note: Evaluation will be performed in 03_Model_Evaluation.ipynb
