In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import joblib


In [4]:
# Read the raw job postings CSV from the sibling ../data directory
data_path = os.path.join("..", "data", "postings.csv")
postings_df = pd.read_csv(filepath_or_buffer=data_path)
postings_df.head(n=5)  # Preview the first five rows of the loaded frame


Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,1715990000000.0,,,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,1715450000000.0,,,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,1715870000000.0,,,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,1715488000000.0,,,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,1716044000000.0,,,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY


In [5]:
# Text normalisation helper applied to every free-text column
def preprocess_text(text):
    """Return a lower-cased copy of ``text`` with newlines turned into spaces.

    Anything that is not a ``str`` (for example the NaN pandas uses for a
    missing cell) is mapped to the empty string.
    """
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    # Splitting on '\n' and re-joining with ' ' swaps each newline for one space
    return ' '.join(lowered.split('\n'))


In [6]:
# Text columns that are cleaned here and later merged into one document per posting
features = [
    'title',
    'description',
    'location',
    'skills_desc',
    'company_name',
    'posting_domain',
]

# Fail fast if the loaded frame lacks any of the expected columns
missing_features = [name for name in features if name not in postings_df.columns]
if len(missing_features) > 0:
    raise ValueError("Missing columns: " + ', '.join(missing_features))

# Normalise each text column element by element (lower-case, newlines -> spaces,
# non-string values -> ''); the columns are overwritten in place on postings_df
for feature in features:
    postings_df[feature] = postings_df[feature].map(preprocess_text)

# Preview the cleaned frame
postings_df.head(n=5)


Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type
0,921716,corcoran sawyer smith,marketing coordinator,job descriptiona leading real estate firm in n...,20.0,HOURLY,"princeton, nj",2774458.0,20.0,,...,1715990000000.0,,,requirements: we are seeking a college or gr...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY
1,1829192,,mental health therapist/counselor,"at aspen therapy and wellness , we are committ...",50.0,HOURLY,"fort collins, co",,1.0,,...,1715450000000.0,,,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY
2,10998357,the national exemplar,assitant restaurant manager,the national exemplar is accepting application...,65000.0,YEARLY,"cincinnati, oh",64896719.0,8.0,,...,1715870000000.0,,,we are currently accepting resumes for foh - a...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY
3,23221523,"abrams fensterman, llp",senior elder law / trusts and estates associat...,senior associate attorney - elder law / trusts...,175000.0,YEARLY,"new hyde park, ny",766262.0,16.0,,...,1715488000000.0,,,this position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY
4,35982263,,service technician,looking for hvac service tech with experience ...,80000.0,YEARLY,"burlington, ia",,3.0,,...,1716044000000.0,,,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY


In [7]:
# Combine text features into one space-separated document per posting.
# Every column in `features` has already been passed through preprocess_text,
# which returns a str for every value, so the columns can be concatenated with a
# single vectorised Series.str.cat call instead of a Python-level join per row
# (DataFrame.apply with axis=1). For all-string columns the resulting text is
# identical to ' '.join(...) over the row.
# The fillna('')/astype(str) guards only matter if this cell is run before the
# cleaning cell: a missing value then contributes an empty token rather than
# turning the whole combined string into NaN.
postings_df['combined_features'] = (
    postings_df[features[0]]
    .fillna('')
    .astype(str)
    .str.cat(postings_df[features[1:]].fillna('').astype(str), sep=' ')
)

# Create TF-IDF vectors.
# max_features=5000 caps the vocabulary at the 5000 most frequent terms and
# stop_words='english' removes scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
# fit_transform learns the vocabulary/IDF weights and returns the document-term matrix
tfidf_matrix = tfidf.fit_transform(postings_df['combined_features'])


In [8]:
# Persist the fitted vectorizer, the TF-IDF matrix and the cleaned postings table
preprocessed_data_path = os.path.join("..", "data", "processed_data")
os.makedirs(name=preprocessed_data_path, exist_ok=True)  # no error if it already exists

# Fitted vectorizer and TF-IDF matrix are serialised with joblib
joblib.dump(
    value=tfidf,
    filename=os.path.join(preprocessed_data_path, "tfidf_vectorizer.pkl"),
)
joblib.dump(
    value=tfidf_matrix,
    filename=os.path.join(preprocessed_data_path, "tfidf_matrix.pkl"),
)

# Cleaned postings (including combined_features) go to CSV without the index column
postings_df.to_csv(
    path_or_buf=os.path.join(preprocessed_data_path, "processed_postings.csv"),
    index=False,
)

print("Preprocessing completed and data saved.")


Preprocessing completed and data saved.
