In [None]:
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from unsloth import FastLanguageModel

In [None]:
# Login to Hugging Face to use LLaMA model.
# The access token is taken from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is unset, None is
# passed and login() asks for the token interactively instead.

import os

from huggingface_hub import login
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Read the job postings table from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="linkedin_jobs.csv")
# Names of the free-text columns that are sent to the model for skill extraction.
# Each name must be its own string: curly quotes previously fused all five names
# into a single list element, so no real column was ever looked up.
text_columns = [
    "Description",
    "Responsibilities",
    "QualificationsRequired",
    "QualificationsPreferred",
    "Requirements",
]

In [None]:
# Load the Llama 3 8B Instruct checkpoint together with its tokenizer via Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    load_in_4bit=True,  # request 4-bit loading of the weights
    dtype="bfloat16",
    device_map="auto",
)

In [3]:
# Prompt template for skill extraction

def make_prompt(field_name, field_text):
    """Build the skill-extraction prompt for a single job-posting field.

    Args:
        field_name: Label of the field, inserted after "Field: ".
        field_text: Content of the field, inserted after "Text: ".

    Returns:
        The prompt string. It begins and ends with a newline and asks for a
        comma-separated list of skills.
    """
    prompt_lines = [
        "",  # leading newline
        "You are an AI assistant that extracts **technical computer science skills** from job postings.",
        "",
        f"Field: {field_name}",
        f"Text: {field_text}",
        "",
        "Answer ONLY with a concise comma-separated list of skills (e.g., Python, TensorFlow, SQL).",
        "",  # trailing newline
    ]
    return "\n".join(prompt_lines)

In [None]:
def process_column(row_dict):
    """Extract skills from the configured text columns of one row.

    Every column listed in the module-level ``text_columns`` is prompted
    separately (so each prompt only carries one field), and the per-column
    answers are merged into one de-duplicated result.

    Args:
        row_dict: Mapping of column name to cell value for a single row.

    Returns:
        The unique extracted skills, sorted and joined with ", ". An empty
        string when no column produced any skill.
    """
    skills_all = []
    for col in text_columns:
        field_text = str(row_dict.get(col, "")).strip()
        # str() renders a missing value (NaN) as "nan"; treat that like empty text.
        if not field_text or field_text.lower() == "nan":
            continue

        # Apply prompt template to specific task
        prompt = make_prompt(col, field_text)

        # Model inference, we could change max_new_tokens and do_sample for more diverse results
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)

        # The generated sequence starts with the prompt tokens. Decode only what
        # comes after them, otherwise an empty generation would make the prompt's
        # own last line (with its example skills) look like the model's answer.
        prompt_len = inputs["input_ids"].shape[1]
        answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # The skill list is expected on the last non-empty line of the answer.
        answer_lines = [line.strip() for line in answer.splitlines() if line.strip()]
        if not answer_lines:
            continue
        skills_all.extend(s.strip() for s in answer_lines[-1].split(",") if s.strip())

    return ", ".join(sorted(set(skills_all)))


# Run process_column over every row using joblib's threading backend;
# n_jobs can be tuned to what the device can handle.
# NOTE(review): every worker thread calls the same `model` object — confirm that
# concurrent generate() calls are safe and actually faster on this setup.
results = Parallel(n_jobs=10, backend="threading")(
    map(
        delayed(process_column),
        tqdm(df.to_dict(orient="records"), desc="Extracting CS skills"),
    )
)

In [None]:
# Store the per-row skill strings in a new column, then write the table to CSV
# without the index column.
df["Extracted_Skills"] = results
df.to_csv(path_or_buf="Linkedin_Job_Requirements_LLama.csv", index=False)