In [9]:
import sys
import os

In [10]:
import pandas as pd
import numpy as np

In [11]:
# Root of the project checkout. Prefer the BUILD_PROJECT_PATH environment variable so
# the notebook runs on any machine without editing this cell; when it is unset, fall
# back to the path that was previously hard coded here.
build_project_path = os.environ.get(
    'BUILD_PROJECT_PATH',
    '/Users/amruthaj/Documents/GitHub/fine-tuning-build-project',
)

In [12]:
# Folder holding the synthetic-data helper modules, and the data folder nested inside it.
synthetic_data_path = os.path.join(build_project_path, 'synthetic_data')
data_path = os.path.join(build_project_path, 'synthetic_data', 'data')

In [13]:
# Make the helper modules under synthetic_data importable. The membership check keeps
# re-running this cell from adding the same entry to sys.path more than once.
already_on_sys_path = synthetic_data_path in sys.path
if not already_on_sys_path:
    sys.path.append(synthetic_data_path)

In [14]:
from llm_requests import generate_prompt

# Load the seed job titles; the CSV must provide a 'seed_title' column.
seed_titles_df = pd.read_csv(os.path.join(data_path, 'seed_titles.csv'))

# Distinct titles only; this array is later passed as the full list of titles to query.
seed_titles = seed_titles_df['seed_title'].unique()

# Draw 5 demo titles from a seeded generator so "Restart & Run All" rebuilds the same
# example prompt every time. Sample without replacement so a title cannot appear twice
# in one prompt; only when fewer than 5 distinct titles exist do we allow repeats, which
# keeps the sample size at 5 as the parsing cell further down expects.
example_rng = np.random.default_rng(0)
example_seed_titles = example_rng.choice(
    seed_titles,
    size=5,
    replace=len(seed_titles) < 5,
)

example_prompt = generate_prompt(example_seed_titles)

<h3>Experiment with different models and see how they compare!</h3>

In [15]:
# Model identifier passed to the request helpers in the cells below. To compare models,
# comment out the active line and uncomment one of the alternatives.
model_name = 'Meta-Llama-3.3-70B-Instruct'
# model_name = 'Meta-Llama-3.1-405B-Instruct'
# model_name = "E5-Mistral-7B-Instruct"

In [16]:
from llm_requests import get_client

# Client object handed to every request helper in the cells below.
# NOTE(review): how get_client() finds credentials/endpoint is not visible here — check llm_requests.
client = get_client()

In [17]:
from llm_requests import async_make_api_call

# Send the single example prompt to the selected model. The top-level `await` relies on
# the notebook kernel running cells inside an event loop. The next cell reads the reply
# text from example_response.choices[0].message.content.
# NOTE(review): perturbation_std presumably randomizes sampling settings — confirm in llm_requests.
example_response = await async_make_api_call(client, model_name, example_prompt, perturbation_std=0.1)

In [18]:
from gpt_parsing import parse_gpt_response

# Take the text of the first completion choice and hand it to the parser.
# NOTE(review): the two 5s presumably are the number of seed titles in the prompt and the
# number of variations expected per title — confirm against parse_gpt_response.
raw_completion_text = example_response.choices[0].message.content
parsed_output = parse_gpt_response(raw_completion_text, 5, 5)

In [19]:
# Print every sampled seed title followed by a numbered list of its parsed variations.
# zip() pairs titles with parsed groups by position and stops at the shorter of the two.
for seed_title, title_variations in zip(example_seed_titles, parsed_output):
    print(f'Variations of: {seed_title}:')
    print('-------------------')
    for i, variation in enumerate(title_variations):
        print(f'{i+1}: {variation}')

    # Vertical gap between one title's listing and the next.
    print('\n\n')

Variations of: Solar System Installer:
-------------------
1: Renewable Energy Solar System Installer - Residential & Commercial
2: Solar Panel Installation Technician - Travel Opportunities
3: Lead Solar Installer - Crew Management Experience Required
4: Solar System Installer (NABCEP Certification a Plus) - Full-Time
5: PV Solar System Installer - Electrical Experience Preferred



Variations of: Spray Technician:
-------------------
1: Industrial Spray Technician - 2nd Shift - Competitive Pay & Benefits
2: Spray Technician - Coatings Specialist - Automotive Industry
3: Aerospace Spray Technician - Secret Clearance Required
4: Spray Technician Wanted - Paint & Coatings Experience Necessary
5: Experienced Spray Technician - Custom Fabrication Shop



Variations of: Geology Professor:
-------------------
1: Tenure Track Geology Professor - Sedimentary Petrology Specialty
2: Geology Professor (Part-Time) - Teach Introductory Courses
3: Associate Professor of Geology - Structural Geology

In [21]:
from llm_requests import async_main_stubborn

# Path given to async_main_stubborn as output_path; a later cell unpickles this same
# path to obtain the response dict.
output_dict_path = os.path.join(data_path, 'jitter_responses.pkl')
# Request variations for every distinct seed title with the selected model.
# NOTE(review): chunk_size presumably is titles per request, num_examples_per_title the
# variations requested per title, and giveup_after a retry limit — confirm in llm_requests.
response_dict = await async_main_stubborn(
    all_query_titles=seed_titles,
    client=client,
    model_name=model_name,
    output_path=output_dict_path,
    chunk_size=5,
    num_examples_per_title=5,
    giveup_after=1,
)

  0%|          | 0/1169 [00:00<?, ?it/s]

In [33]:
import pickle

# Rebuild the path in this cell so it can run without re-executing the generation cell.
output_dict_path = os.path.join(data_path, 'jitter_responses.pkl')

# NOTE: unpickling can execute arbitrary code — only load a file produced by this project.
with open(output_dict_path, mode='rb') as responses_file:
    response_dict = pickle.load(responses_file)

In [34]:
# Flatten the {seed_title: iterable of jittered titles} mapping into one
# (jittered_title, seed_title) row per variation.
jitter_rows = [
    (jittered_title, seed_title)
    for seed_title, jittered_titles in response_dict.items()
    for jittered_title in jittered_titles
]
jitter_df = pd.DataFrame(jitter_rows, columns=['jittered_title', 'seed_title'])

# Left-join the remaining seed_titles_df columns onto each row; a variation whose seed
# title has no match in seed_titles_df is kept, with missing values in those columns.
jitter_df = jitter_df.merge(seed_titles_df, on='seed_title', how='left')

# Spot-check a few random rows.
jitter_df.sample(5)

Unnamed: 0,jittered_title,seed_title,onet_code,onet_name
107,Bindery Worker Wanted - Finishing Department (...,Bindery Worker,51-5113.00,Print Binding and Finishing Workers
12062,Industrial Energy Engineer - Renewable Energy ...,Industrial Energy Engineer,17-2199.03,"Energy Engineers, Except Wind and Solar"
14775,Senior Aerial Photogrammetrist - UAS (Drone) E...,Aerial Photogrammetrist,17-1021.00,Cartographers and Photogrammetrists
9159,Bilingual Typist - Translation Project (Short-...,Typist,43-9022.00,Word Processors and Typists
15654,"OR Sterile Technician - Travel Opportunity, 13...",Sterile Technician,31-9093.00,Medical Equipment Preparers


In [15]:
# Write the merged table next to the other data files, without the row index.
jittered_titles_csv = os.path.join(data_path, 'jittered_titles.csv')
jitter_df.to_csv(jittered_titles_csv, index=False)