In [8]:
from transformers import AutoTokenizer, LlamaForCausalLM
import torch

In [9]:
# Switch torch.multiprocessing to the 'spawn' start method; force=True applies it
# even if a start method was already chosen earlier in this kernel session.
torch.multiprocessing.set_start_method('spawn', force=True)

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [10]:
# Checkpoint identifier handed to both the tokenizer and the model `from_pretrained` calls.
model_name_or_path = "meta-llama/Llama-3.1-8B-Instruct"

In [12]:
# A tokenizer holds no tensors, so there is nothing to place on a device:
# `device_map` is a model-loading argument and is intentionally not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Reuse the end-of-sequence token for padding so that padded tokenization and
# `generate(pad_token_id=...)` have a pad token to work with.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [15]:
# Load the checkpoint in bfloat16 and place it on the selected device.
model = LlamaForCausalLM.from_pretrained(model_name_or_path,
                                         device_map=device,
                                         torch_dtype=torch.bfloat16)

# The system prompt asks for 1-3 skills of 1-2 words each, i.e. a handful of tokens.
# No minimum length is set: forcing hundreds of new tokens suppresses end-of-sequence,
# makes the model keep writing past the answer, and multiplies the cost of every row.
# The cap below leaves generous headroom for a short comma-separated list.
model.generation_config.max_new_tokens = 64
# Sampling is kept on, so repeated runs may produce different skill lists.
model.generation_config.do_sample = True
model.generation_config.num_return_sequences = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
# System instructions that precede every job description sent to the model.
# The 8-space indentation on the second and third lines is part of the prompt
# text itself and is reproduced exactly.
_SKILL_EXTRACTION_INSTRUCTIONS = (
    "You are a skill extraction model.\n"
    "        Extract 1-3 relevant HARD skills in a simple comma-separated list with each skill being 1-2 words.\n"
    '        Do NOT extract soft skills (e.g., "communication," "scheduling," etc. are examples of soft skills and not hard skills)'
)

# Conversation prefix shared by all requests. Despite the name, it currently
# holds only the system message and no worked user/assistant examples.
few_shot_prompt = [{"role": "system", "content": _SKILL_EXTRACTION_INSTRUCTIONS}]

In [18]:
import pandas as pd

# Load the job listings from a CSV file; the path is relative to the notebook's
# working directory. (Presumably an Indeed scrape for New England, going by the
# filename - confirm the data provenance.)
df = pd.read_csv('new_england_indeed_jobs.csv')

In [20]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,min_amount,max_amount,job_level,job_function,listing_type,experience_range,company_rating,company_reviews_count,vacancy_count,work_from_home_type
count,2506.0,2506.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,22628.794493,34262.878292,,,,,,,,
std,41083.46914,64265.609103,,,,,,,,
min,6.0,11.0,,,,,,,,
25%,18.0,21.0,,,,,,,,
50%,22.0,30.0,,,,,,,,
75%,44000.0,60662.0,,,,,,,,
max,290000.0,450000.0,,,,,,,,


In [21]:
# Preview the first five rows to check what the loaded columns contain.
df.head()

Unnamed: 0,id,site,job_url,job_url_direct,title,company,location,date_posted,job_type,salary_source,...,company_addresses,company_num_employees,company_revenue,company_description,skills,experience_range,company_rating,company_reviews_count,vacancy_count,work_from_home_type
0,in-9f0c48dc5551762a,indeed,https://www.indeed.com/viewjob?jk=9f0c48dc5551...,http://www.indeed.com/job/-home-caregiver-9f0c...,In - Home Caregiver,Coastal Care Solutions,"Portland, ME, US",2025-11-08,parttime,,...,,,,,"patient care, first aid, elderly care",,,,,
1,in-16f94c9ca0e56be7,indeed,https://www.indeed.com/viewjob?jk=16f94c9ca0e5...,http://www.indeed.com/job/infanttoddlerprescho...,Infant/Toddler/Preschool Teacher,Pearlite Montessori Home,"South Portland, ME, US",2025-11-08,fulltime,,...,,,,,"Montessori, early childhood, classroom management",,,,,
2,in-948094920b9c3dd6,indeed,https://www.indeed.com/viewjob?jk=948094920b9c...,https://www.careersatmainehealth.org/jobs/1701...,Cardiac Sonographer - Cardiopulmonary,MaineHealth,"Biddeford, ME, US",2025-11-08,fulltime,,...,"110 Free Street\r\nPortland, ME 04102","10,000+",$25M to $100M (USD),The MaineHealth system offers outstanding oppo...,"ultrasound, echocardiography, cardiac imaging",,,,,
3,in-7e5c00a5634552e0,indeed,https://www.indeed.com/viewjob?jk=7e5c00a56345...,https://retailcareers.staples.com//job/-/-/495...,Retail Sales Associate,Staples,"Auburn, ME, US",2025-11-08,parttime,,...,"500 Staples Drive, Framingham, MA 01702","10,000+",more than $10B (USD),"Staples is a world-class retail, online and de...","sales, POS",,,,,
4,in-148df71f298ae14d,indeed,https://www.indeed.com/viewjob?jk=148df71f298a...,http://www.indeed.com/job/strength-trainer-gro...,Strength Trainer (Group Classes),Mome Studios,"South Portland, ME, US",2025-11-08,parttime,direct_data,...,,,,,"strength training,group instruction",,,,,


In [22]:
# List every column name; `description` is the field read by the skill extraction below.
df.columns

Index(['id', 'site', 'job_url', 'job_url_direct', 'title', 'company',
       'location', 'date_posted', 'job_type', 'salary_source', 'interval',
       'min_amount', 'max_amount', 'currency', 'is_remote', 'job_level',
       'job_function', 'listing_type', 'emails', 'description',
       'company_industry', 'company_url', 'company_logo', 'company_url_direct',
       'company_addresses', 'company_num_employees', 'company_revenue',
       'company_description', 'skills', 'experience_range', 'company_rating',
       'company_reviews_count', 'vacancy_count', 'work_from_home_type'],
      dtype='object')

In [45]:
def extract_skills(row, verbose: bool = False) -> str:
    """Ask the language model for the hard skills named in one job posting.

    The posting's description is appended as a user message to the shared
    ``few_shot_prompt`` conversation, the model generates a reply, and only the
    newly generated text (without the prompt) is decoded and returned.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``'description'`` entry.
        verbose: When True, also print the generated text. Off by default so that
            applying the function over a whole frame does not flood the output
            or break up the progress bar.

    Returns:
        The generated text with surrounding whitespace removed, or an empty
        string when the description is missing, not a string, or blank.
    """
    desc = row['description']
    # pandas represents a missing description as NaN (a float); such values, and
    # blank strings, cannot be meaningfully prompted, so skip the model call.
    if not isinstance(desc, str) or not desc.strip():
        return ""

    prompt = few_shot_prompt + [{"role": "user", "content": desc}]
    # return_dict=True yields the attention mask alongside the input ids. The pad
    # token is the EOS token, so generate() cannot infer the mask on its own.
    encoded = tokenizer.apply_chat_template(
        prompt,
        add_generation_prompt=True,
        return_tensors='pt',
        return_dict=True,
        padding=True,
    ).to(device)
    prompt_length = encoded["input_ids"].shape[1]

    output_tokens = model.generate(
        **encoded,
        generation_config=model.generation_config,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns prompt + continuation; keep only the continuation.
    new_tokens = output_tokens[:, prompt_length:]
    output_decoded = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()

    if verbose:
        print(output_decoded + '\n\n')
    return output_decoded


In [None]:
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects so the long-running apply
# below shows a progress bar.
tqdm.pandas()

# Run the model once per row (axis=1). This overwrites the `skills` column that
# was already present in the loaded CSV, and the results live only in memory
# until the frame is saved.
df['skills'] = df.progress_apply(extract_skills, axis=1)

  0%|          | 0/5400 [00:00<?, ?it/s]