In [1]:
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch 
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Login to Hugging Face to use LLaMA model

from huggingface_hub import login
login()

In [3]:
df = pd.read_csv("linkedin_jobs.csv") 
text_columns = [c for c in
    ["Description","Responsibilities","QualificationsRequired","QualificationsPreferred","Requirements"]
    if c in df.columns]

In [4]:
# Set GPU and percision 
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high") 

# Use the LLama-3.1-8B-Instruct Model locally
Model_ID = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(Model_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    Model_ID,
    dtype=torch.bfloat16,   
    device_map="auto"
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.58s/it]


In [5]:
def make_prompt(field_name, field_text):
    system = (
        "You extract technical computer science skills from job postings. "
        "Return only the skills line; no explanations."
    )
    user = f"""Extract technical CS skills.

Rules:
- Include technical skills and years of experience.
- Deduplicate terms: keep multi-word phrases intact.
- Output EXACTLY one line: comma-separated list. 

Field: {field_name}
Text:
<<<
{field_text}
>>>

Answer:"""
    return [{"role": "system", "content": system},
            {"role": "user", "content": user}]


In [6]:
def _parse_skills_line(text: str):
    line = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")
    line = line.replace(";", ",")
    parts = [p.strip(" .\t") for p in line.split(",") if p.strip()]
    out, seen = [], set()
    for p in parts:
        k = p.lower()
        if k and k != "none" and k not in seen:
            seen.add(k); out.append(p)
    return out

In [7]:
def extract_cs_terms_batched(df, text_columns, batch_size,
                             max_input_tokens=2048, max_new_tokens=100):

    # Extract the information of dataframes into huggingface standard prompt format
    pairs = []
    for i, row in df.iterrows():
        for col in text_columns:
            val = row.get(col, "")
            val = "" if pd.isna(val) else str(val)
            if val.strip() and val.lower() != "nan":
                messages = make_prompt(col, val)           
                prompt = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                pairs.append((i, prompt))

    # Parallel Processing
    row_to_skills = defaultdict(set)
    for k in tqdm(range(0, len(pairs), batch_size), desc="Extracting CS skills (batched)"):
        batch = pairs[k:k+batch_size]
        idxs, prompts = zip(*batch)

        inputs = tokenizer(
            list(prompts),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_tokens
        ).to(model.device)

        with torch.inference_mode():
            gen = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,  
                use_cache = False,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )

        texts = tokenizer.batch_decode(gen, skip_special_tokens=True)
        for row_idx, text in zip(idxs, texts):
            for s in _parse_skills_line(text):
                row_to_skills[row_idx].add(s)

    # Store the skills in one column with corresponding rows
    return pd.Series(
        [", ".join(sorted(row_to_skills.get(i, set()), key=str.lower)) for i in df.index],
        index=df.index,
    )

In [None]:
df["CS_Terms"] = extract_cs_terms_batched(df, text_columns, batch_size = 4)

Extracting CS skills (batched):   0%|          | 0/604 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Extracting CS skills (batched):  12%|█▏        | 75/604 [30:37<2:29:08, 16.92s/it]

In [None]:
df.to_csv("Linkedin_Job_Requirements_LLama.csv", index=False)