In [1]:
# make the necessary imports
import pandas as pd
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv
import openai

In [2]:
# load the csv file
df = pd.read_csv('data/Engineer_20230826.csv')
df.head()

Unnamed: 0,RequisitionID,OrigJobTitle,JobTitle,JobDescription
0,,Licensed Stationary Engineer,ENGINEER (all other),Licensed Stationary Engineer \n\n Froedtert So...
1,224907.0,Guidance Navigation and Control (GN&C) Enginee...,ENGINEER (all other),**The Boeing Company** is in search of a **L...
2,331804.0,"Propulsion Engineer - Associate, Mid-Level and...",ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
3,336462.0,Senior Process Controls Engineer,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
4,338951.0,RF/Microwave Engineer (Level 2 or 3),ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."


In [3]:
# view the pre-cleansed text snippet
print(df['JobDescription'][1])

**The Boeing Company**  is in search of a  **Lead Guidance Navigation and Control (GN&C)**   **Engineer**  to join Boeing in the St. Louis area.  **Boeing Defense and Space Systems**  is growing in the area of aircraft, missile, weapons and autonomous system development and integration.\n\nSelected candidates will work a wide range of programs that will provide state of the art  **Guidance Navigation and Control (GN&C)**  performance analysis and simulation solutions to our customers. They will have visibility across the Boeing enterprise into modern  **GN&C**  and simulation development methods.\n\n**_Come join the Boeing Team!_**\n\n**Position Responsibilities:**\n\n+ Define Guidance, Navigation and Control (GN&C), mission or trajectory requirements and ensure requirements traceability and quality from the system level to component level\n\n+ Design, analyze and guide others in the development of Guidance, Navigation and Control (GN&C) systems, simulations, models & tools\n\n+ Lead t

In [3]:
# function to clean the text so the html/css/js is removed
def clean_html(text):   
    # use BeautifulSoup to remove HTML tags
    cleaned_text = BeautifulSoup(text, 'html.parser').get_text()
    
    # replace '\n', '\\n', and other similar escape sequences with actual newlines
    # cleaned_text = cleaned_text.replace('\\n', '\n')
    
    # strip leading/trailing whitespace or replace multiple newlines with single ones
    cleaned_text = cleaned_text.strip()
    
    return cleaned_text

df['JobDescription'] = df['JobDescription'].apply(clean_html)

  cleaned_text = BeautifulSoup(text, 'html.parser').get_text()


In [5]:
# view a snippet of cleansed job description
print(df['JobDescription'][1])

**The Boeing Company**  is in search of a  **Lead Guidance Navigation and Control (GN&C)**   **Engineer**  to join Boeing in the St. Louis area.  **Boeing Defense and Space Systems**  is growing in the area of aircraft, missile, weapons and autonomous system development and integration.\n\nSelected candidates will work a wide range of programs that will provide state of the art  **Guidance Navigation and Control (GN&C)**  performance analysis and simulation solutions to our customers. They will have visibility across the Boeing enterprise into modern  **GN&C**  and simulation development methods.\n\n**_Come join the Boeing Team!_**\n\n**Position Responsibilities:**\n\n+ Define Guidance, Navigation and Control (GN&C), mission or trajectory requirements and ensure requirements traceability and quality from the system level to component level\n\n+ Design, analyze and guide others in the development of Guidance, Navigation and Control (GN&C) systems, simulations, models & tools\n\n+ Lead t

In [6]:
df.head()

Unnamed: 0,RequisitionID,OrigJobTitle,JobTitle,JobDescription
0,,Licensed Stationary Engineer,ENGINEER (all other),Licensed Stationary Engineer \n\n Froedtert So...
1,224907.0,Guidance Navigation and Control (GN&C) Enginee...,ENGINEER (all other),**The Boeing Company** is in search of a **L...
2,331804.0,"Propulsion Engineer - Associate, Mid-Level and...",ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
3,336462.0,Senior Process Controls Engineer,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
4,338951.0,RF/Microwave Engineer (Level 2 or 3),ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."


In [4]:
# select 500 random job postings
df_sampled = df.sample(n=500, random_state=42) # random state is for reproducibility
print(df_sampled.shape)

(500, 4)


The API configuration and data collection section. We only run this once and store the output in a csv file. 

In [None]:
# if first time, uncomment line below and run it
# pip install openai

In [5]:
# Load environment variables from the .env file
load_dotenv()

# Get the API key from environment variable
api_key = os.getenv('OPENAI_API_KEY')

client = openai.OpenAI(api_key=api_key)

def call_openai_api(prompt): 
    try: 
        chat_completion = client.chat.completions.create( 
            messages=[ { "role": "user", "content": "follow the format and DON't other weird delimiters." + prompt, } ], 
            model="gpt-4o-mini", # stream = True 
        )
            
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"

In [7]:
call_openai_api('Hello. Return "how can i help you" if this works.')

'how can i help you'

In [None]:
# Function to send each job posting to GPT-4o mini and categorize it
def categorize_job_posting(job_posting):
    # Build the prompt
    prompt = f"""
    Break up the following job description into the following categories: Marketing, Description, Requirements, Legal.
    For each category, return an array containing the sentences from the job posting corresponding to the category. Return this in JSON formatting

    Job Posting:
    "{job_posting}"

    The format should be as follows:

    **Marketing:** [Any sentence that promotes the job or company]
    **Description:** [Any sentence that explains the job role, company, or purpose of the role]
    **Requirements:** [Any sentence that outlines qualifications or skills needed for the role]
    **Legal:** [Any sentence that includes legal or compliance information such as equal opportunity statements, privacy, or legal disclaimers]
    """

    # Call the GPT-4o mini API with the prompt
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Use the specific GPT-4o mini model name
        messages=[
            {"role": "system", "content": "Follow the format and DON'T use other weird delimiters."},
            {"role": "user", "content": prompt}
        ],
        #max_tokens=1000  # Adjust tokens based on job posting length
    )

    # Extract the response text
    categorized_text = response['choices'][0]['message']['content']
    return categorized_text

# Process all 500 job postings and store the results
df_sampled['Categorized_Posting'] = df_sampled['JobDescription'].apply(categorize_job_posting)

# Save the results to a data frame as JASON objects
df_sampled

In [None]:
df_sampled['Categorized_Posting'][9666]

Now that we have our data file of 500 random postings, we can start training the classifier

In [14]:
# Load the api output CSV file
df_output = pd.read_csv('data/output_file.csv')

# Check the first few rows to ensure the data looks good
print(df_output.head())

                                      JobDescription  \
0  SURVIVABILITY ENGINEER 2\n\nLocation:\n\nNewpo...   
1  Requisition Number: 15838 \n\nRequired Travel:...   
2  **Secure our Nation, Ignite your Future**\n\n*...   
3  Description:\n\nWe are looking for an Instrume...   
4  RH2 is currently seeking an entry-level Staff ...   

                                 Categorized_Posting  
0  ```json\n{\n  "Marketing": [\n    "Want to be ...  
1  ```json\n{\n    "Marketing": [\n        "HII b...  
2  ```json\n{\n    "Marketing": [\n        "Secur...  
3  ```json\n{\n    "Marketing": [\n        "At Ac...  
4  ```json\n{\n  "Marketing": [\n    "Our team is...  


Now we will preprocess the text data to prepare it for training. We will use TD-IDF to convert the text into numerical representations by transforms the sentences into vectors of important words.

This step involves:

Tokenization: Splitting sentences into individual words.

Cleaning: Removing unnecessary characters, symbols, and stopwords.

Vectorization: Converting text into numerical features for the model.

In [16]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl (11.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl (23.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
[?25hD

In [18]:
# we realize that we only have two columns, with categorized_posting having a json structure that we need to convert
print(df_output.columns)

Index(['JobDescription', 'Categorized_Posting'], dtype='object')


In [22]:
import json

# Convert the 'Categorized_Posting' column from a JSON-like string to a Python dictionary
df_output['Categorized_Posting'] = df_output['Categorized_Posting'].apply(lambda x: json.loads(x))

# Now, to access the 'Description' category:
df_output['Description'] = df_output['Categorized_Posting'].apply(lambda x: x.get('Description', []))

# inspect the first few rows to ensure it works correctly
print(df_output['Description'].head())

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer, max_features to reduce computational cost and potential overfitting, stop_words to ignore common words like 'and',etc.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the data into a TF-IDF matrix
X = vectorizer.fit_transform(df_sampled['JobDescription'])
y = df_output['Description'] # our label/target


KeyError: 'Description'

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train Naive Bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test set
y_pred = nb_clf.predict(X_test)

# Evaluate the predictions
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))

In [None]:
import joblib

# Save the model
joblib.dump(clf, 'job_classifier_model.pkl')

# Load the model later
loaded_clf = joblib.load('job_classifier_model.pkl')