# Dataset

In [20]:
import json

In [21]:
filename = "onboarding_task_dataset_v3.jsonl"

In [22]:
data = []
with open(filename, 'r', encoding='utf-8') as f:
  for line in f:
    data.append(json.loads(line.strip()))


In [23]:
from datasets import Dataset

dataset = Dataset.from_list(data)

train_test_data = dataset.train_test_split(test_size=63, seed=42)
train_val_data = train_test_data["train"].train_test_split(test_size=63, seed=42)

# Access the train, validation, and test datasets
train_dataset = train_val_data["train"]
eval_dataset = train_val_data["test"]
test_dataset = train_test_data["test"]

# Print the number of examples in each set
print(f"Train dataset: {len(train_dataset)} examples")
print(f"Validation dataset: {len(eval_dataset)} examples")
print(f"Test dataset: {len(test_dataset)} examples")



Train dataset: 577 examples
Validation dataset: 63 examples
Test dataset: 63 examples


In [24]:
test_dataset[13]["text"]

'<|system|>\nYou are an intelligent assistant dedicated to extracting management levels and job titles from user queries. Before doing so, you must understand what a functional area is.<|end|>\n<|user|>\n\nDefinition of a Functional Area:\n- A functional area is a department or group of personnel tasked with a specific organizational function. These include departments like finance, marketing, engineering, etc.\n\nDefinition of Management Level:\n- A management level refers to a hierarchical position within an organization without a specific functional area. It encompasses broader titles that may include roles across different functional areas.\n- Management levels include: "Board of Directors," "CSuite and President," "Executive and Sr. VP," "General Manager," "VP," "Director," "Manager," "Senior (Individual Contributor)," "Mid (Individual Contributor)," and "Junior."\n\nDefinition of a Job Title:\n- A job title refers to a specific employment position combined with a functional area.

In [25]:
system_message = """You are an intelligent assistant dedicated to extracting management levels and job titles from user queries. Before doing so, you must understand what a functional area is."""

In [26]:
user_message = """Definition of a Functional Area:\n- A functional area is a department or group of personnel tasked with a specific organizational function. These include departments like finance, marketing, engineering, etc.\n\nDefinition of Management Level:\n- A management level refers to a hierarchical position within an organization without a specific functional area. It encompasses broader titles that may include roles across different functional areas.\n- Management levels include: "Board of Directors," "CSuite and President," "Executive and Sr. VP," "General Manager," "VP," "Director," "Manager," "Senior (Individual Contributor)," "Mid (Individual Contributor)," and "Junior."\n\nDefinition of a Job Title:\n- A job title refers to a specific employment position combined with a functional area.\n- Examples include \'VP of Engineering\' (functional area: Engineering) and \'Director of Finance\' (functional area: Finance).\n\nInstructions:\n1. Management Levels: Only return management levels that match the predefined set: ["Board of Directors," "CSuite and President," "Executive and Sr. VP," "General Manager," "VP," "Director," "Manager," "Senior (Individual Contributor)," "Mid (Individual Contributor)," "Junior"].\n2. Job Titles: Normalize the job title after extracting it from the text. For example, convert "ceo" to "Chief Executive Officer" and include both the full title and its abbreviation if mentioned in the query, e.g., "VP of Engineering" and "Vice President of Engineering."\n3. Response Format: Your response must be a dictionary with two keys: "management_levels" and "titles". Each key should have a list of management levels and titles respectively.\n4. If a keyword is classified as title, don\'t include it in the management levels and vice versa. e.g if "VP of Engineering" is classified as title then don\'t include "VP" in management levels.\n\nExample:\nQuery: "Provide a list of CFOs and VPs working in the technology sector"\nOutput: {"management_levels": ["VP"], "titles": ["Chief Financial Officer", "CFO"]}\n\nGuidelines:\n- "Leaders" implies CSuite/President level\n- Titles without functional areas (e.g., "Founder", "Managing Director") go in titles list\n- Use knowledge base for educated guesses about classifications\n\nQuery: "VPs of development or executives in real estate?"\nOutput:"""

In [27]:
prefix = """{\'management_level\':"""

In [28]:
PROMPT_TEMPLATE_INFERENCE = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>{{system_message}}<|eot_id|>
<|begin_of_text|><|start_header_id|>user<|end_header_id|>{{user_message}}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>{{prefix}}"""
PROMPT_TEMPLATE_FINETUNING = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>{{system}}<|eot_id|>
    <|start_header_id|>user<|end_header_id|>{{user}}<|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>{{assistant}}<|eot_id|>"""


In [29]:
from jinja2 import Template

In [30]:
prompt = Template(PROMPT_TEMPLATE).render({"system_message": system_message, "user_message": user_message, "prefix" : prefix})

# Loading Libraries

In [31]:
base_model = "/home/raza/Downloads/meta-llama-3-2-3B-Instruct"
# peft_model_name = 

In [42]:
from transformers import BitsAndBytesConfig, AutoProcessor, AutoModelForCausalLM
import torch
from peft import PeftModel

# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

model_kwargs = dict(
    attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
    torch_dtype="auto",
    use_cache=False, # set to False as we're going to use gradient checkpointing
    device_map=device_map,
    quantization_config=quantization_config,
)
# Assuming you're using a text generation model like GPT
processor = AutoProcessor.from_pretrained(base_model)

# Set the padding token to be the same as eos_token if it's not defined
if processor.pad_token is None:
    processor.pad_token = processor.eos_token  # or you can set a custom token if needed

processor.padding_side = 'right'



In [36]:

model = AutoModelForCausalLM.from_pretrained(
          base_model,
          trust_remote_code=True,
          attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
          torch_dtype="auto",
          use_cache=False, # set to False as we're going to use gradient checkpointing
          device_map=device_map,
          quantization_config=quantization_config,
)

# peft_model = PeftModel.from_pretrained(
#     model,
#     peft_model_name
# )

# peft_model.eval()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [41]:
print(prompt.strip())

<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are an intelligent assistant dedicated to extracting management levels and job titles from user queries. Before doing so, you must understand what a functional area is.<|eot_id|>
<|begin_of_text|><|start_header_id|>user<|end_header_id|>Definition of a Functional Area:
- A functional area is a department or group of personnel tasked with a specific organizational function. These include departments like finance, marketing, engineering, etc.

Definition of Management Level:
- A management level refers to a hierarchical position within an organization without a specific functional area. It encompasses broader titles that may include roles across different functional areas.
- Management levels include: "Board of Directors," "CSuite and President," "Executive and Sr. VP," "General Manager," "VP," "Director," "Manager," "Senior (Individual Contributor)," "Mid (Individual Contributor)," and "Junior."

Definition of a Job Title:
- 

In [44]:
def inference_function(model, max_new_tokens, input_text: str):
    """
    Simple inference function to generate text based on an input prompt.

    Args:
    - input_text (str): The input text to be processed by the model.
    - max_length (int): The maximum length of the generated text.

    Returns:
    - str: The generated text.
    """
    # Process the input text
    inputs = processor(input_text, return_tensors="pt", padding=True).to(model.device)
    # Generate output from the model
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # Enable sampling for more diverse outputs
        temperature=0.1,
        top_p=0.9
    )

    # Decode the output tokens into text
    generated_text = processor.decode(output[0], skip_special_tokens=True)  # Get the first sequence

    return generated_text

# Example usage
input_text = "Donald Trump is "
generated_output = inference_function(model=model, input_text=prompt.strip(), max_new_tokens=8)
print(f"Generated Output: {generated_output}")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Output: systemYou are an intelligent assistant dedicated to extracting management levels and job titles from user queries. Before doing so, you must understand what a functional area is.
userDefinition of a Functional Area:
- A functional area is a department or group of personnel tasked with a specific organizational function. These include departments like finance, marketing, engineering, etc.

Definition of Management Level:
- A management level refers to a hierarchical position within an organization without a specific functional area. It encompasses broader titles that may include roles across different functional areas.
- Management levels include: "Board of Directors," "CSuite and President," "Executive and Sr. VP," "General Manager," "VP," "Director," "Manager," "Senior (Individual Contributor)," "Mid (Individual Contributor)," and "Junior."

Definition of a Job Title:
- A job title refers to a specific employment position combined with a functional area.
- Examples i

# Util Functions

In [None]:
import json

def parse_string(input_string):
    # Define the markers and unwanted "Output:" part
    marker = "<|assistant|>"
    output_label = "Output:"
    end_marker = "<|end|>"  # Marker that may appear at the end of the string
    
    # Find the index of the main marker
    marker_index = input_string.find(marker)

    # If the marker is found, split the string accordingly
    if marker_index != -1:
        prompt = input_string[:marker_index].strip() + marker + """{ "management_level" : """
        ground_truth = input_string[marker_index + len(marker):].strip()
        
        # Remove the "Output:" label if it's present at the beginning of the ground truth
        if ground_truth.startswith(output_label):
            ground_truth = ground_truth[len(output_label):].strip()
        
        # Remove the "<|end|>" token or other similar tokens from the ground truth
        ground_truth = ground_truth.replace(end_marker, "").strip()

        # Convert single quotes to double quotes for valid JSON parsing
        ground_truth = ground_truth.replace("'", "\"")

        # Try parsing the ground truth into a dictionary
        try:
            ground_truth_dict = json.loads(ground_truth)  # Assumes ground truth is a valid JSON string
        except json.JSONDecodeError:
            ground_truth_dict = None  # If it's not valid JSON, return None
            
        return {
            "prompt": prompt,
            "ground_truth": ground_truth_dict
        }
    else:
        # If the marker is not found, return the entire input as prompt, and None as ground_truth
        return {
            "prompt": input_string.strip(),
            "ground_truth": None
        }


parsed_result = parse_string(test_dataset[13]["text"])

print(json.dumps(parsed_result, indent=2))


{
  "prompt": "<|system|>\nYou are an intelligent assistant dedicated to extracting management levels and job titles from user queries. Before doing so, you must understand what a functional area is.<|end|>\n<|user|>\n\nDefinition of a Functional Area:\n- A functional area is a department or group of personnel tasked with a specific organizational function. These include departments like finance, marketing, engineering, etc.\n\nDefinition of Management Level:\n- A management level refers to a hierarchical position within an organization without a specific functional area. It encompasses broader titles that may include roles across different functional areas.\n- Management levels include: \"Board of Directors,\" \"CSuite and President,\" \"Executive and Sr. VP,\" \"General Manager,\" \"VP,\" \"Director,\" \"Manager,\" \"Senior (Individual Contributor),\" \"Mid (Individual Contributor),\" and \"Junior.\"\n\nDefinition of a Job Title:\n- A job title refers to a specific employment positio

In [None]:
import re

def extract_query(input_str):
    # Define the pattern to capture the last "Query:" and corresponding "Output:"
    pattern = r'Use knowledge base for educated guesses about classifications\s*Query:\s*(.*?)\s*Output:'
    
    # Search for the pattern
    match = re.search(pattern, input_str, re.DOTALL)
    
    if match:
        # Extract the query text (the first group corresponds to the query part)
        query = match.group(1).strip()
        return eval(query)
    else:
        return None
    
extract_query(test_dataset[13]["text"])

'VPs of development or executives in real estate?'

In [None]:
import json
import re

def response_parser(response_str):
    # Define a regular expression pattern to capture all JSON objects in the string
    json_pattern = r'(\{.*?\})'
    
    # Find all matches of the JSON objects in the string
    matches = re.findall(json_pattern, response_str.strip(), re.DOTALL)
    
    if matches:
        # Get the last match, which should be the most recent JSON object
        last_json_str = matches[-1]
        
        try:
            # Parse the last matched JSON string
            parsed_response = json.loads(last_json_str)
            return parsed_response
        except json.JSONDecodeError:
            print("Error: The last JSON in the response string is not valid.")
            return None
    else:
        print("Error: No valid JSON found in the response.")
        return None


In [None]:
import pandas as pd
from IPython.display import display

def display_comparison(comparison_list):
    # Helper function to safely get key from dictionary or return a default value
    def safe_get(entry, field, subfield=None, default="N/A"):
        if entry.get(field) is None:  # Check if the field itself is None
            return default
        if subfield:  # If we need to access a subfield (e.g., 'label', 'confidence')
            return entry.get(field, {}).get(subfield, default)
        return entry.get(field, default)
    
    # Extract relevant information from Query, Ground Truth, Prediction, and Exact Match
    query_data = [entry.get('Query', 'N/A') for entry in comparison_list]
    
    # Extract 'Ground Truth' data, handling missing keys or NoneType
    gt_data = [
        f"{safe_get(entry, 'Ground Truth')}"
        for entry in comparison_list
    ]
    
    # Extract 'Prediction' data, handling missing keys or NoneType
    pred_data = [
        f"{safe_get(entry, 'Prediction')}"
        for entry in comparison_list
    ]
    
    # Extract 'Exact Match' information (assuming it's a boolean)
    exact_match_data = [
        "Yes" if entry.get('Exact Match', False) else "No"  # Default to False if 'Exact Match' is missing or None
        for entry in comparison_list
    ]
    
    # Create a DataFrame with the new information
    df = pd.DataFrame({
        'Query': query_data,
        'Ground Truth (GT)': gt_data,
        'Prediction (Pred)': pred_data,
        'Exact Match': exact_match_data
    })
    
    # Display the DataFrame as a neat table
    display(df)
    return df


# Generating Results

In [None]:
from transformers import BitsAndBytesConfig, AutoProcessor, AutoModelForCausalLM
import torch
from peft import PeftModel

# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

model_kwargs = dict(
    attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
    torch_dtype="auto",
    use_cache=False, # set to False as we're going to use gradient checkpointing
    device_map=device_map,
    quantization_config=quantization_config,
)

processor = AutoProcessor.from_pretrained(base_model)
processor.padding_side = 'right'

In [None]:

model = AutoModelForCausalLM.from_pretrained(
          base_model,
          trust_remote_code=True,
          attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
          torch_dtype="auto",
          use_cache=False, # set to False as we're going to use gradient checkpointing
          device_map=device_map,
          quantization_config=quantization_config,
)

peft_model = PeftModel.from_pretrained(
    model,
    peft_model_name
)

peft_model.eval()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3FlashAttention2(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_m