In [1]:
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch 
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Login to Hugging Face to use LLaMA model

from huggingface_hub import login
login()

In [4]:
df = pd.read_csv("linkedin_jobs.csv") 
text_columns = [c for c in
    ["Description","Responsibilities","QualificationsRequired","QualificationsPreferred","Requirements"]
    if c in df.columns]

In [5]:
# Use the LLama-3.1-8B-Instruct Model locally
Model_ID = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(Model_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    Model_ID,
    dtype=torch.float16,   
    device_map="auto"
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:42<00:00, 10.70s/it]


In [8]:
def make_prompt(row, text_columns):
    system_message = (
        "You are an AI assistant that extracts only computer science and technical skills "
        "from job descriptions. Respond with a concise, comma-separated list of skills. "
        "Do not explain or repeat the input."
    )
    
    combined_text = ""
    for col in text_columns:
        content = str(row.get(col, "")).strip()
        if content and content.lower() != "nan":
            combined_text += f"\n{col}:\n{content}\n"
    
    prompt = f"{system_message}\n\n{combined_text}\nExtracted Skills:"
    return prompt

In [9]:
model.device

device(type='cuda', index=0)

In [23]:
def generate_skills_batched(df, text_columns, batch_size, max_new_tokens=100):
    all_skills = []

    for i in tqdm(range(0, len(df), batch_size), desc="Batch generating"):
        batch_df = df.iloc[i:i + batch_size]
        
        # Make one prompt per row (not per column)
        prompts = [make_prompt(row, text_columns) for _, row in batch_df.iterrows()]
        
        # Tokenize all prompts as a batch
        encodings = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=2048)
        input_ids = encodings["input_ids"].to(model.device)
        attention_mask = encodings["attention_mask"].to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id
            )

        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for output in decoded_outputs:
            last_line = output.strip().split("\n")[-1]
            skills = [s.strip() for s in last_line.split(",") if s.strip()]
            all_skills.append(", ".join(sorted(set(skills))))

    return all_skills

In [24]:
df["CS_Terms"] = generate_skills_batched(df, text_columns, batch_size = 16)

Batch generating: 100%|██████████| 31/31 [10:25<00:00, 20.18s/it]


In [25]:
df

Unnamed: 0,Title,Company,Location,Link,Description,Responsibilities,QualificationsRequired,QualificationsPreferred,Requirements,Salary,EmploymentType,AboutTheCompany,WorkplaceType,Applicants,Posted_date,CS_Terms
0,Machine Learning Intern,Hireshire,,https://www.linkedin.com/jobs/view/4295567445/...,About HireShire\n\nHireShire is a modern staff...,[],[],['Knowledge of BI tools (Power BI / Tableau / ...,['Pursuing (or recently completed) B.Tech/BE/M...,$18/hour,Internship,"Hireshire\n5,119 followers\nFollow\nStaffing a...",Remote,,,"AWS, Azure, BI tools, Classification, Clusteri..."
1,"Fully Remote, Entry - Level Data Entry Job",Dolan Mental Health,"Florida, United States",https://www.linkedin.com/jobs/view/4298234161/...,Exciting Career Opportunity - Join Our Team!\n...,[],['High school diploma or equivalent (college d...,[],[],$35-40/hr,Full Time,"Dolan Mental Health\n11,402 followers\nFollow\...",Remote,26 applicants,2025/09/09,Here is the list
2,IT Intern,Oxy,"The Woodlands, TX",https://www.linkedin.com/jobs/view/4295526435/...,Oxy is an international energy company with as...,[],['Pursuing a Bachelor’s or Master’s degree in ...,[],[],$16/hr,Intern,"Oxy\n661,874 followers\nFollow\nOil and Gas 10...",Remote,Over 100 people clicked apply,2025/09/10,"Business Systems, Computer Engineering, Comput..."
3,IT Intern,Xcel Energy,"Denver, CO",https://www.linkedin.com/jobs/view/4297654935/...,Are you looking for an exciting job where you ...,[],"['3.0 GPA (out of a 4.0 scale) or higher', 'Co...",[],[],$27.20 per hour,Full Time,"Xcel Energy\n136,660 followers\nFollow\nUtilit...",Remote,26 people clicked apply,2025/09/09,Note: The extracted skills are based on the pr...
4,"Computer, Computational & Stat Sciences Underg...",Los Alamos National Laboratory,"Los Alamos, NM",https://www.linkedin.com/jobs/view/4204748299/...,What You Will Do\n\nCome join the brightest mi...,"[""Come join the brightest minds at the most in...",[],[],[],$99.5K/yr,Full Time,"Los Alamos National Laboratory\n168,852 follow...",Remote,Over 100 people clicked apply,2025/09/06,"""on technical experience, ', '4 references who..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,Analytics Specialist,Joni and Friends,"Agoura Hills, CA",https://www.linkedin.com/jobs/view/4296699675/...,Duties:\n\nUnder the supervision of the Data B...,['Under the supervision of the Data Batching T...,['Must possess a vibrant personal relationship...,[],[],$22.00 per hour,Full Time,"Joni and Friends\n7,510 followers\nFollow\nNon...",Hybrid,4 applicants,,"', '2+ years of experience in data entry, 'A t..."
479,Senior Fullstack Developer (Java/Python/React)...,Synergy Interactive,"Irvine, CA",https://www.linkedin.com/jobs/view/4297573443/...,We are looking for a highly skilled Senior Ful...,[],['Bachelor’s or Master’s degree in Computer Sc...,[],[],$60/hr,Intern,"Synergy Interactive\n525,439 followers\nFollow...",Hybrid,Over 100 applicants,2025/09/09,"AWS, Agile, Authentication, Authorization, CI/..."
480,Technical Support Analyst,Winter Park Recruiting,"Orlando, FL",https://www.linkedin.com/jobs/view/4294491217/...,Winter Park Recruiting is a leading recruitmen...,[],"[""performing teams. We believe in personalized...",[],[],$60K/yr,Intern,"Winter Park Recruiting\n4,085 followers\nFollo...",Remote,Over 100 applicants,2025/09/06,"administrative, analytical, auditing, business..."
481,Enterprise Account Executive,RevPilots,,https://www.linkedin.com/jobs/view/4297244796/...,(This is for a RevPilots' client)\n\nAccount E...,[],"['Required:', ""Bachelor's degree in Business (...","[""Master's degree in a technical or business d...",[],"Salary: $150,000",Full Time,"RevPilots\n25,058 followers\nFollow\nTechnolog...",Remote,,,"AI, AI sales acceleration platforms, AI sales ..."


In [26]:
df.to_csv("Linkedin_Job_Requirements_LLama.csv", index=False)