In [1]:
import sys
import os

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Root directory of the fine-tuning build project checkout.
# Set the BUILD_PROJECT_PATH environment variable to point at your own clone;
# the literal path below is only used as a fallback when that variable is unset.
build_project_path = os.environ.get(
    'BUILD_PROJECT_PATH',
    '/Users/amruthaj/Documents/GitHub/fine-tuning-build-project',
)

In [4]:
# Directory holding the project's synthetic-data helpers, and its data subfolder.
synthetic_data_path = os.path.join(build_project_path, 'synthetic_data')
data_path = os.path.join(build_project_path, 'synthetic_data', 'data')

In [5]:
# Put the synthetic_data directory on the import path exactly once, so that
# re-running this cell does not keep appending duplicate entries.
if sys.path.count(synthetic_data_path) == 0:
    sys.path.append(synthetic_data_path)

In [6]:
from llm_requests import generate_prompt

# Load the seed titles table from the project's data directory.
seed_titles_df = pd.read_csv(os.path.join(data_path, 'seed_titles.csv'))

# Distinct seed titles, in order of first appearance.
seed_titles = seed_titles_df['seed_title'].unique()

# Draw 5 *distinct* titles for the example prompt. np.random.choice samples
# with replacement by default, which could repeat a title in the prompt.
# With replace=False this raises ValueError if fewer than 5 unique titles exist.
example_seed_titles = np.random.choice(seed_titles, 5, replace=False)

# Build the prompt for the sampled titles.
example_prompt = generate_prompt(example_seed_titles)

  from tqdm.autonotebook import tqdm, trange


<h3>Experiment with different models and see how they compare!</h3>

In [14]:
# Model used for all generation requests below; swap in another chat/instruct
# model to compare results.
# model_name = 'Meta-Llama-3.1-405B-Instruct'
# NOTE: 'E5-Mistral-7B-Instruct' is a text-embedding model (E5 family), not a
# chat model, so it is not a suitable choice for the generation calls here.
model_name = 'Meta-Llama-3.3-70B-Instruct'

In [15]:
from llm_requests import get_client

# Create the client object that is handed to the llm_requests request helpers.
# NOTE(review): presumably picks up credentials/endpoint configuration inside
# llm_requests — confirm before running on a new machine.
client = get_client()

In [16]:
from llm_requests import async_make_api_call

# Send the single example prompt to the selected model (top-level await works
# because this runs inside a notebook cell).
# NOTE(review): perturbation_std=0.1 is presumably a randomisation scale applied
# to the request's sampling settings — confirm in llm_requests.
example_response = await async_make_api_call(client, model_name, example_prompt, perturbation_std=0.1)

In [10]:
from gpt_parsing import parse_gpt_response

# Parse the text of the first completion choice into a sequence that is
# iterated alongside the sampled seed titles.
# NOTE(review): the two 5s are presumably the number of seed titles in the
# prompt and the number of variations expected per title — confirm against
# parse_gpt_response's signature.
parsed_output = parse_gpt_response(example_response.choices[0].message.content, 5, 5)

In [11]:
# Print every sampled seed title followed by its numbered variations.
for title, variations in zip(example_seed_titles, parsed_output):
    print(f'Variations of: {title}:')
    print('-------------------')
    # Number the variations from 1 for readability.
    for position, text in enumerate(variations, start=1):
        print(f'{position}: {text}')

    print('\n\n')

Variations of: Inside Sales Engineer:
-------------------
1: Inside Sales Engineer - Industrial Automation (Base + Commission)
2: Technical Sales Engineer (Inside Sales) - Growing Company
3: Inside Sales Application Engineer - Mechanical Products
4: Sales Engineer (Inside) - Building Relationships with Key Accounts
5: Senior Inside Sales Engineer - Solutions-Focused Role (Remote Option)



Variations of: Patient Access Coordinator:
-------------------
1: Patient Access Coordinator - Admitting Department (Full-Time)
2: Patient Access Representative - ER Registration (Night Shift)
3: Access Center Coordinator - Patient Scheduling (Call Center)
4: Patient Access Coordinator - Pre-Registration and Scheduling
5: Admitting Coordinator/Patient Access - Part-Time, Variable Shifts



Variations of: Aquaculture Director:
-------------------
1: Aquaculture Director - Freshwater Fish Farm (International)
2: Director of Aquaculture - Shrimp Farming Operation
3: Aquaculture Farm Manager/Director - S

In [12]:
from llm_requests import async_main_stubborn

# Run the generation over every unique seed title. The progress output for this
# call counts 1169 steps, so expect it to take a while.
# NOTE(review): results are presumably persisted to output_path (a later cell
# reloads this same pickle file) — confirm in llm_requests.
output_dict_path = os.path.join(data_path, 'jitter_responses.pkl')
response_dict = await async_main_stubborn(
    all_query_titles=seed_titles,
    client=client,
    model_name=model_name,
    output_path=output_dict_path,
    chunk_size=5,  # NOTE(review): presumably titles per request, matching the 5-title example prompt — confirm
    num_examples_per_title=5,
    giveup_after=1,  # NOTE(review): presumably a retry/attempt limit — confirm its exact meaning
)

  0%|          | 0/1169 [00:00<?, ?it/s]

In [13]:
import pickle

# Same file path that is passed as output_path to async_main_stubborn; it is
# rebuilt here so this cell does not depend on that cell's variable.
output_dict_path = os.path.join(data_path, 'jitter_responses.pkl')

# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# load a jitter_responses.pkl that you generated yourself.
with open(output_dict_path, 'rb') as f:
    response_dict = pickle.load(f)

In [14]:
# Flatten {seed_title: [jittered_title, ...]} into one row per jittered title,
# then attach the remaining seed_titles_df columns for each seed title.
# how='left' keeps every jittered row even when its seed title has no match.
jitter_df = pd.DataFrame(
    [
        (jittered, seed)
        for seed, variants in response_dict.items()
        for jittered in variants
    ],
    columns=['jittered_title', 'seed_title'],
).merge(seed_titles_df, on='seed_title', how='left')

# Spot-check a few random rows.
jitter_df.sample(5)

Unnamed: 0,jittered_title,seed_title,onet_code,onet_name
14763,Elementary Kindergarten Teacher - Title One Pr...,Title One Kindergarten Teacher,25-2012.00,"Kindergarten Teachers, Except Special Education"
7655,Lab Technician (Lab Tech) - Microbiology - 2nd...,Lab Technician (Lab Tech),19-4099.01,Quality Control Analysts
15120,Network Security Analyst - Incident Response &...,Network Security Analyst,15-1212.00,Information Security Analysts
21464,Certified Sports Official - Multiple Sports - ...,Sports Official,27-2023.00,"Umpires, Referees, and Other Sports Officials"
118,"Cocktail Server Wanted - Tips, Fun Work Enviro...",Cocktail Server,35-3031.00,Waiters and Waitresses


In [15]:
# Write the merged jittered-title table next to the other project data files;
# index=False leaves the DataFrame's row index out of the CSV.
jitter_df.to_csv(os.path.join(data_path, 'jittered_titles.csv'), index=False)