# Generation of job descriptions with tranformer models

### Checking for GPU availability to run the computations


In [1]:
from datasets import Dataset
import numpy as np
import os
import pandas as pd
import sys
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, pipeline

  from .autonotebook import tqdm as notebook_tqdm
2025-05-10 01:51:28.357835: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 01:51:28.563067: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746834688.635543  210453 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746834688.656145  210453 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746834688.828193  210453 computation_placer.cc:177] computation placer already r

### Setting root path to project path

In [6]:
project_root = os.path.abspath(
    os.path.join(os.getcwd(), '../../../..')
)
if project_root not in sys.path:
    sys.path.append(project_root)

os.environ["TOKENIZERS_PARALLELISM"] = "true"
LOAD_FINE_TUNED = True

### Imports

In [7]:
import torch
print(f"GPU available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

GPU available: True
Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3060 Ti


### Load fine-tuned models

In [8]:
fine_tuned_model_path = os.path.join(project_root, 'models', 'src', 'generation', 'transformers', 'fine_tuned_model_gpt2')
if not os.path.exists(fine_tuned_model_path):
    raise FileNotFoundError(f"Model directory not found: {fine_tuned_model_path}. Please ensure the path is correct.")

fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
fine_tuned_model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_path)

### Model selection

In [4]:
# model_name = "gpt2"  # Or choose another model like "gpt2", "t5-small", etc.
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set pad token if it doesn't exist (common for GPT-2 models)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

print(f"Loaded model: {model_name}")
print(f"Tokenizer pad token: {tokenizer.pad_token}")

Loaded model: gpt2
Tokenizer pad token: <|endoftext|>


### Data preparation

In [10]:
# --- Configuration ---
parquet_file_path = os.path.join(project_root, "data", "processed", "cleaned_postings_modeling.parquet")
text_column = "description" # Your column name
block_size = 1024 # Max sequence length for the model
test_size = 0.05 
random_seed = 42

model = fine_tuned_model
tokenizer = fine_tuned_tokenizer

# --- Load Dataframe from Parquet ---
try:
    df = pd.read_parquet(parquet_file_path)

    print(f"Loaded DataFrame from {parquet_file_path}. Shape: {df.shape}")
    print(df.head()) # Optional
    print(df.info()) # Optional
    # filter out any rows with a description shorter than 100 characters
    df = df[df[text_column].str.len() > 100]
    print(f"Filtered DataFrame. New shape: {df.shape}")
    print (df.info())
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in the DataFrame.")
except FileNotFoundError:
    print(f"Error: Parquet file not found at {parquet_file_path}")
    df = None
except Exception as e:
    print(f"Error loading Parquet file: {e}")
    df = None

# --- Convert Pandas DataFrame to Hugging Face Dataset ---
if df is not None:
    # Convert the DataFrame to a single Dataset object first
    full_dataset = Dataset.from_pandas(df)
    print(f"\nConverted DataFrame to Dataset. Size: {len(full_dataset)}")

    # --- Split the Dataset ---
    # Use train_test_split on the Dataset object
    split_datasets = full_dataset.train_test_split(test_size=test_size, seed=random_seed)

    # Rename the default 'test' split to 'validation' if preferred for Trainer, or keep as 'test'
    # Trainer uses 'eval_dataset', so 'validation' or 'test' are common keys. Let's use 'test'.
    # split_datasets['validation'] = split_datasets.pop('test') # Optional rename

    print("\nSplit dataset into training and testing sets:")
    print(split_datasets)

    # Assign to raw_datasets (which is now a DatasetDict with 'train' and 'test')
    raw_datasets = split_datasets

else:
    print("\nSkipping dataset conversion and splitting due to loading error.")
    raw_datasets = None

# --- Tokenization Function (remains the same) ---
def tokenize_function(examples):
    # Tokenize the text
    tokenized_output = tokenizer(examples[text_column], truncation=True, padding="max_length", max_length=block_size)
    # For Causal LM, labels are usually the same as inputs
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output

# --- Apply Tokenization ---
if raw_datasets:
    # Apply tokenization to both splits ('train' and 'test')
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names # Remove original columns from both splits
    )
    # Set format for PyTorch
    tokenized_datasets.set_format("torch")

    print("\nTokenized dataset structure:")
    print(tokenized_datasets)

    print("\nTokenized training dataset sample:")
    if len(tokenized_datasets["train"]) > 0:
         print(tokenized_datasets["train"][0])
    else:
        print("Tokenized training dataset is empty.")

    # Assign the splits
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["test"] # Use the 'test' split for evaluation

else:
    print("\nSkipping tokenization due to dataset loading/conversion error.")
    train_dataset = None
    eval_dataset = None

if (LOAD_FINE_TUNED):
    print("\nSetting train_dataset to None to skip training and only perform evaluation.")
    train_dataset = None 

Loaded DataFrame from /home/gabriel/dev/SCIA/NLP_Linkedin_offers/data/processed/cleaned_postings_modeling.parquet. Shape: (122124, 4)
                company_name  \
0      Corcoran Sawyer Smith   
1     The National Exemplar    
2     Abrams Fensterman, LLP   
3  Downtown Raleigh Alliance   
4                 Raw Cereal   

                                               title  \
0                              Marketing Coordinator   
1                        Assitant Restaurant Manager   
2  Senior Elder Law / Trusts and Estates Associat...   
3           Economic Development and Planning Intern   
4                                           Producer   

                                         description           location  
0  Job description A leading real estate firm in ...      Princeton, NJ  
1  The National Exemplar is accepting application...     Cincinnati, OH  
2  Senior Associate Attorney Elder Law Trusts and...  New Hyde Park, NY  
3  Job summary The Economic Development 

Map: 100%|██████████| 115952/115952 [00:50<00:00, 2277.76 examples/s]
Map: 100%|██████████| 6103/6103 [00:02<00:00, 2492.28 examples/s]


Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 115952
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6103
    })
})

Tokenized training dataset sample:
{'input_ids': tensor([   36,  1731,   486,  ...,  4645,  1085, 22901]), 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]), 'labels': tensor([   36,  1731,   486,  ...,  4645,  1085, 22901])}

Setting train_dataset to None to skip training and only perform evaluation.





### Setup for the training of the model

In [11]:
output_directory = os.path.join(project_root, "models", "src", "generation", "transformers")
use_fp16 = torch.cuda.is_available() # Enable FP16 only if GPU is available

if (LOAD_FINE_TUNED):
    model = fine_tuned_model
    tokenizer = fine_tuned_tokenizer

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

training_args = TrainingArguments(
    torch_compile=True,
    torch_compile_backend="inductor",
    torch_compile_mode="default", 
    torch_empty_cache_steps=4,
    output_dir=output_directory,
    num_train_epochs=1,  # Start with 1 epoch for testing
    per_device_train_batch_size=2,  # Adjust based on GPU memory
    per_device_eval_batch_size=4, # Batch size for evaluation (can often be larger)
    gradient_accumulation_steps=4, # Increase effective batch size
    learning_rate=5e-5, # Learning rate
    fp16=use_fp16, # Enable mixed precision training if GPU available
    logging_dir=f"{output_directory}/logs",
    logging_strategy="steps", # Log metrics periodically
    logging_steps=100,        # Log every 100 steps
    eval_strategy="steps", # Evaluate periodically
    eval_steps=500,              # Evaluate every 500 steps
    save_strategy="steps",       # Save checkpoints periodically
    save_steps=500,              # Save every 500 steps
    load_best_model_at_end=True, # Load the best model found during evaluation at the end
    metric_for_best_model="loss", # Use evaluation loss to determine the best model (lower is better)
    greater_is_better=False,     # Lower loss is better
    save_total_limit=1,          # Keep only the last 2 checkpoints + the best one
    report_to="none",          # Disable external reporting (like wandb) for now
    weight_decay=0.01,         # Regularization
    dataloader_pin_memory=True, # Pin memory for faster data transfer to GPU
    dataloader_num_workers=12, # Number of workers for data loading, leverage multiple CPU cores for faster data loading to GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if train_dataset else None,
    eval_dataset=eval_dataset if eval_dataset else None, # Pass the evaluation dataset
    tokenizer=tokenizer,
    # data_collator=data_collator # Usually not needed for CausalLM unless custom padding
)

print("TrainingArguments and Trainer initialized.")
if eval_dataset:
    print(f"Evaluation dataset size: {len(eval_dataset)}")
print(f"FP16 enabled: {use_fp16}")
print(f"Evaluation strategy: {training_args.eval_strategy}")

  trainer = Trainer(


TrainingArguments and Trainer initialized.
Evaluation dataset size: 6103
FP16 enabled: True
Evaluation strategy: IntervalStrategy.STEPS


### Fine-tuning the model


In [8]:
if train_dataset:
    print("Starting training...")
    try:
        train_result = trainer.train()
        print("Training finished.")
        # You can print some metrics from train_result if needed
        metrics = train_result.metrics
        print(f"Train Output Metrics: {metrics}")
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
    except Exception as e:
        print(f"An error occurred during training: {e}")
else:
    print("Skipping training because the dataset was not loaded properly.")


Starting training...


Step,Training Loss,Validation Loss,Model Preparation Time
500,2.1587,2.055996,0.0045
1000,2.1796,2.050528,0.0045
1500,2.1294,2.046983,0.0045
2000,2.0446,2.04459,0.0045
2500,2.1764,2.042842,0.0045
3000,2.127,2.041235,0.0045
3500,2.0899,2.039516,0.0045
4000,2.1143,2.037973,0.0045
4500,2.1232,2.036798,0.0045
5000,2.093,2.035658,0.0045


W0508 11:43:40.051000 277504 torch/_inductor/utils.py:1250] [2/1_1] Not enough SMs to use max_autotune_gemm mode
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Training finished.
Train Output Metrics: {'train_runtime': 11423.8779, 'train_samples_per_second': 10.15, 'train_steps_per_second': 1.269, 'total_flos': 6.0965702074368e+16, 'train_loss': 2.123645397684764, 'epoch': 1.0}
***** train metrics *****
  epoch                    =        1.0
  total_flos               = 56778734GF
  train_loss               =     2.1236
  train_runtime            = 3:10:23.87
  train_samples_per_second =      10.15
  train_steps_per_second   =      1.269


In [9]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

### Perform Inference with Pipeline and store predictions for test dataset
Using the fine-tuned model to generate text samples based on prompts from the evaluation set.

In [17]:
ground_truth = []
predictions = []
inputs = []

device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'cuda' if device == 0 else 'cpu'}")

# Check if trainer and necessary data are available
if 'trainer' in globals() and hasattr(trainer, 'model') and trainer.model and \
   'raw_datasets' in globals() and raw_datasets and 'test' in raw_datasets and \
   'text_column' in globals():

    print(f"Initializing text-generation pipeline with model: {trainer.model.config._name_or_path if hasattr(trainer.model, 'config') else 'N/A'}")
    
    # Use the model and tokenizer from the trainer
    # These should be the fine-tuned model and its corresponding tokenizer
    generator = pipeline(
        "text-generation",
        model=trainer.model,
        tokenizer=trainer.tokenizer,
        device=device,
        do_sample=True,
        # temperature=0,
        top_k=10,
        # top_p=0.95,
    )

    print("Pipeline initialized.")

    # Get a few examples from the raw test dataset to use as prompts
    num_samples_to_generate = 10
    if len(raw_datasets['test']) >= num_samples_to_generate:
        sample_indices = range(num_samples_to_generate)
    else:
        sample_indices = range(len(raw_datasets['test']))
        print(f"Warning: Requested {num_samples_to_generate} samples, but only {len(raw_datasets['test'])} available in test set.")

    for i in sample_indices:
        original_text = raw_datasets['test'][i][text_column]
        
        # Create a prompt (e.g., first 50 words of the original text)
        prompt_max_words = 50
        prompt_text = " ".join(original_text.split()[:prompt_max_words])
        
        print(f"\n--- Sample {i+1} ---")
        print(f"Prompt: \"{prompt_text}...\"")
        
        try:
            # Generate text
            # max_new_tokens specifies how many tokens to generate after the prompt
            # max_length would be prompt_length + max_new_tokens
            generated_sequences = generator(
                prompt_text,
                max_new_tokens=500,  # Generate 100 new tokens
                num_return_sequences=1,
                pad_token_id=generator.tokenizer.eos_token_id # Important for open-ended generation
            )
            
            generated_text = generated_sequences[0]['generated_text']
            print(f"Generated Text: \"{generated_text}\"")
            # Append to predictions and ground truth
            predictions.append(generated_text)
            ground_truth.append(original_text)
            inputs.append(prompt_text)
            print(f"Appended sample {i+1} to golden dataset.")
        except Exception as e:
            print(f"Error during generation for sample {i+1}: {e}")
            import traceback
            traceback.print_exc()

    # Clean up to free GPU memory if a GPU was used
    if device == 0:
        del generator
        # torch.cuda.empty_cache()
        print("\nCleaned up generator and emptied CUDA cache.")
        
elif 'trainer' not in globals() or not hasattr(trainer, 'model') or not trainer.model:
    print("Skipping pipeline inference: 'trainer' or 'trainer.model' is not available.")
elif 'raw_datasets' not in globals() or not raw_datasets or 'test' not in raw_datasets:
    print("Skipping pipeline inference: 'raw_datasets' or 'raw_datasets['test']' is not available.")
elif 'text_column' not in globals():
    print("Skipping pipeline inference: 'text_column' variable is not defined.")
else:
    print("Skipping pipeline inference due to missing components. Ensure trainer, data, and text_column are set up.")

golden_dataset = pd.DataFrame({
    'prediction': predictions,
    'ground_truth': ground_truth,
    'input': inputs
})
model_name = 'gpt2' # TO DELETE
sanitized_model_name = model_name.replace('/', '_')
golden_dataset_file_path = os.path.join(output_directory, f"small_golden_dataset_{sanitized_model_name}.csv")
golden_dataset.to_csv(golden_dataset_file_path, index=False)
print(f"\nGolden dataset saved to: {golden_dataset_file_path}")


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0


Using device: cuda
Initializing text-generation pipeline with model: /home/gabriel/dev/SCIA/NLP_Linkedin_offers/models/src/generation/transformers/fine_tuned_model_gpt2
Pipeline initialized.

--- Sample 1 ---
Prompt: "As a Publix Pharmacy Associate, you ll see how very satisfying it is to work for a company that is passionately devoted to its customers, to its associates, and to the wellness of the communities it serves. What sets our pharmacy departments apart is a genuine patient centric environment with..."
Generated Text: "As a Publix Pharmacy Associate, you ll see how very satisfying it is to work for a company that is passionately devoted to its customers, to its associates, and to the wellness of the communities it serves. What sets our pharmacy departments apart is a genuine patient centric environment with a wide variety of products and experiences available, as well as an emphasis on quality control and patient satisfaction, and that ensures that everyone feels safe and well.

### Save the fine-tuned model

In [11]:
final_model_path = f"{output_directory}/fine_tuned_model_{sanitized_model_name}"

if train_dataset: # Only save if training actually happened
    print(f"Saving final model and tokenizer to {final_model_path}...")
    try:
        trainer.save_model(final_model_path)
        tokenizer.save_pretrained(final_model_path)
        print("Model and tokenizer saved successfully.")
    except Exception as e:
        print(f"Error saving model/tokenizer: {e}")
else:
    print("Skipping final model saving as training did not run.")

Saving final model and tokenizer to /home/gabriel/dev/SCIA/NLP_Linkedin_offers/models/src/generation/transformers/fine_tuned_model_gpt2...
Model and tokenizer saved successfully.


### Evaluate a loaded model

In [11]:
if eval_dataset and trainer:
    print("\nStarting evaluation on the test set...")
    try:
        eval_results = trainer.evaluate(eval_dataset=eval_dataset)
        print(f"Evaluation results: {eval_results}")
        trainer.log_metrics("eval", eval_results)
        trainer.save_metrics("eval", eval_results)
    except Exception as e:
        print(f"Error during evaluation: {e}")
else:
    print("Skipping evaluation as eval_dataset or trainer is not available.")


Starting evaluation on the test set...


W0509 16:09:36.977000 2997 torch/_inductor/utils.py:1250] [2/0_1] Not enough SMs to use max_autotune_gemm mode
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Evaluation results: {'eval_loss': 2.020193576812744, 'eval_model_preparation_time': 0.9129, 'eval_runtime': 152.3496, 'eval_samples_per_second': 40.059, 'eval_steps_per_second': 10.016}
***** eval metrics *****
  eval_loss                   =     2.0202
  eval_model_preparation_time =     0.9129
  eval_runtime                = 0:02:32.34
  eval_samples_per_second     =     40.059
  eval_steps_per_second       =     10.016
