In [12]:
import os
import re
import pdfplumber
import pandas as pd
import numpy as np
import openai
import json
from dotenv import load_dotenv
import time
from datasets import load_dataset
from tqdm import tqdm

In [4]:
# Fetch the "train" split of the job-descriptions dataset from the Hugging Face Hub
# and display its summary (feature names and row count).
jd_data = load_dataset(
    path="jacob-hugging-face/job-descriptions",
    split="train",
)
jd_data

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 853/853 [00:00<00:00, 12009.55 examples/s]


Dataset({
    features: ['company_name', 'job_description', 'position_title', 'description_length', 'model_response'],
    num_rows: 853
})

In [5]:
# Materialise the loaded dataset as a pandas DataFrame and preview the first rows.
jd_df = pd.DataFrame(jd_data)
jd_df.head()

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."


In [6]:
# (rows, columns) of the job-description frame.
jd_df.shape

(853, 5)

In [7]:
# Summary statistics of the dataset-provided `description_length` column.
jd_df.description_length.describe()

count      853.000000
mean      3335.600234
std       2094.794724
min         14.000000
25%       1961.000000
50%       3078.000000
75%       4404.000000
max      23924.000000
Name: description_length, dtype: float64

In [8]:
_DEFAULT_OPENAI_CLIENT = None  # created on first use, then shared by every default-client call


def _default_openai_client():
    """Return the shared ``openai.OpenAI`` client, creating it on first use."""
    global _DEFAULT_OPENAI_CLIENT
    if _DEFAULT_OPENAI_CLIENT is None:
        _DEFAULT_OPENAI_CLIENT = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    return _DEFAULT_OPENAI_CLIENT


def process_with_chatgpt(text, model="gpt-4", client=None):
    """Ask a chat model to restructure a job description as a JSON string.

    Parameters
    ----------
    text : str
        Job-description text, inserted verbatim into the prompt.
    model : str, optional
        Chat-completion model name. Defaults to ``"gpt-4"``.
    client : optional
        Object exposing ``chat.completions.create(model=..., messages=...)``.
        When omitted, a shared ``openai.OpenAI`` client is built once from the
        ``OPENAI_API_KEY`` environment variable and reused on later calls.

    Returns
    -------
    str
        The message content of the first choice, returned unparsed. The prompt
        requests strict JSON, but nothing here validates it; callers must
        decode and validate the result themselves.
    """
    prompt = f"""
    Convert the following job description into a JSON object with exactly three keys: "work_experience", "education", and "skills". Each key should have its corresponding information as a value. Ensure the response is strictly JSON with no comments or additional text outside the JSON object.
    
    Job description:
    {text}
    """
    # Reuse one client across calls instead of constructing a new one per row.
    if client is None:
        client = _default_openai_client()

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content


In [9]:
# Try the prompt on the job description at index label 0 and show the raw model reply
# (this issues one API request).
process_with_chatgpt(jd_df['job_description'][0])

'{\n  "work_experience": "Bachelors degree or equivalent practical experience; years of experience in SaaS or productivity tools business; experience managing enterprise accounts with sales cycles.  Years of experience building strategic business partnerships with enterprise customers; ability to work through and with a reseller ecosystem to scale the business; ability to plan, pitch, and execute a territory business strategy; ability to build relationships and to deliver results in a cross-functional/matrixed environment. Responsible for maintenance and expansion of Google Workspace business growth across the region with customers. Experience in establishing partnerships, increasing account and territory business growth, and constructing successful strategies at account and territory level.",\n  "education": "Bachelor\'s Degree or equivalent practical experience.",\n  "skills": "Ability to identify cross-promoting and up-promoting opportunities within the existing account base. Excell

In [13]:
# Register tqdm's pandas integration so `progress_apply` is available on DataFrames.
tqdm.pandas()
def extract_parts(row):
    """Extract work experience, education and skills from one job-description row.

    Sends ``row["job_description"]`` to ``process_with_chatgpt`` and decodes the
    reply as JSON.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a ``"job_description"`` entry.

    Returns
    -------
    pandas.Series
        Always indexed by exactly ``["work_experience", "education", "skills"]``
        in that order. Keys missing from the reply are NaN and extra keys are
        dropped. If the request fails, the reply is not valid JSON, or the
        decoded value is not a JSON object, every entry is ``None`` and a
        ``RuntimeWarning`` describing the failure is emitted.
    """
    import warnings  # local import keeps this cell self-contained

    expected_keys = ["work_experience", "education", "skills"]
    try:
        result = process_with_chatgpt(row["job_description"])
        parsed_result = json.loads(result)
        if not isinstance(parsed_result, dict):
            raise ValueError(
                f"expected a JSON object, got {type(parsed_result).__name__}"
            )
    except Exception as exc:
        # Best-effort: keep processing the remaining rows, but report what went
        # wrong instead of discarding the error.
        warnings.warn(
            f"extract_parts failed for row {getattr(row, 'name', None)!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return pd.Series({key: None for key in expected_keys})

    # Fix the labels and their order so every row yields the same layout,
    # whatever keys (or key order) the model actually returned.
    return pd.Series(parsed_result).reindex(expected_keys)


# Run the extraction on every row; this issues one model request per row, and the
# recorded run over 853 rows took about 3.5 hours.
part_columns = ["work_experience", "education", "skills"]
extracted_parts = jd_df.progress_apply(extract_parts, axis=1)

# Assigning a DataFrame to a list of column labels pairs columns by position, not
# by label, so align the result to the target labels first. Labels absent from
# the result become NaN and unexpected extra labels are dropped.
jd_df[part_columns] = extracted_parts.reindex(columns=part_columns)


100%|██████████| 853/853 [3:28:21<00:00, 14.66s/it]    


In [14]:
# Count missing values per column.
jd_df.isnull().sum()

company_name           0
job_description        0
position_title         0
description_length     0
model_response         0
work_experience       61
education             54
skills                53
dtype: int64

In [15]:
# Add a `<column>_word_count` column for each extracted field and print its
# summary statistics. Missing entries are treated as empty text (zero words).
columns_to_analyze = ["work_experience", "education", "skills"]


def _count_words(entry):
    """Return the number of whitespace-separated tokens in ``str(entry)``."""
    return len(str(entry).split())


for col in columns_to_analyze:
    count_col = col + '_word_count'
    jd_df[count_col] = jd_df[col].fillna('').apply(_count_words)

    print(f"Statistics for '{col}':")
    print(jd_df[count_col].describe())
    print("\n")


Statistics for 'work_experience':
count    853.000000
mean      24.717468
std       29.101952
min        0.000000
25%        6.000000
50%       14.000000
75%       33.000000
max      277.000000
Name: work_experience_word_count, dtype: float64


Statistics for 'education':
count    853.000000
mean      60.144197
std       41.920270
min        0.000000
25%       29.000000
50%       55.000000
75%       83.000000
max      234.000000
Name: education_word_count, dtype: float64


Statistics for 'skills':
count    853.000000
mean     116.349355
std       72.736000
min        0.000000
25%       71.000000
50%      109.000000
75%      154.000000
max      591.000000
Name: skills_word_count, dtype: float64




In [16]:

# Flag every row that has a missing value in any column, then count the flags.
has_missing = jd_df.isna().any(axis="columns")
total_nan_rows = has_missing.sum()

print(f"Total rows with at least one NaN value: {total_nan_rows}")


Total rows with at least one NaN value: 61


In [17]:
# Write the frame to CSV without the index column. Create the target directory
# first so the write cannot fail on a missing folder.
output_path = '../data/jd_data/jd.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
jd_df.to_csv(output_path, index=False)