In [1]:
import pandas as pd

# Load the cleaned dataset
final_df = pd.read_csv('cleaned_data.csv')

# Verify the loaded data
print(final_df.head())

     job_id            company_name  \
0    921716   Corcoran Sawyer Smith   
1    921716   Corcoran Sawyer Smith   
2  10998357  The National Exemplar    
3  10998357  The National Exemplar    
4  23221523  Abrams Fensterman, LLP   

                                               title  \
0                              Marketing Coordinator   
1                              Marketing Coordinator   
2                        Assitant Restaurant Manager   
3                        Assitant Restaurant Manager   
4  Senior Elder Law / Trusts and Estates Associat...   

                                       description_x pay_period  \
0  Job descriptionA leading real estate firm in N...     HOURLY   
1  Job descriptionA leading real estate firm in N...     HOURLY   
2  The National Exemplar is accepting application...     YEARLY   
3  The National Exemplar is accepting application...     YEARLY   
4  Senior Associate Attorney - Elder Law / Trusts...     YEARLY   

            location  com

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF for job title
tfidf_vectorizer_title = TfidfVectorizer(max_features=100, stop_words='english')
title_tfidf = tfidf_vectorizer_title.fit_transform(final_df['title'])

# TF-IDF for skills description
tfidf_vectorizer_skills = TfidfVectorizer(max_features=500, stop_words='english')
skills_tfidf = tfidf_vectorizer_skills.fit_transform(final_df['skills_desc'])

In [65]:
from sklearn.preprocessing import OneHotEncoder

# Modify OneHotEncoder to handle unknown categories
location_ohe = OneHotEncoder(sparse_output=True, drop='first', handle_unknown='ignore')
work_type_ohe = OneHotEncoder(sparse_output=True, drop='first', handle_unknown='ignore')
experience_level_ohe = OneHotEncoder(sparse_output=True, drop='first', handle_unknown='ignore')




In [66]:
# Fit encoders on the respective columns
location_encoded = location_ohe.fit_transform(final_df[['location']])
work_type_encoded = work_type_ohe.fit_transform(final_df[['formatted_work_type']])
experience_level_encoded = experience_level_ohe.fit_transform(final_df[['formatted_experience_level']])

In [67]:
numerical_features = final_df[['views', 'applies']].values

In [69]:
# Check for missing values
print(final_df[['title', 'skills_desc', 'location', 'formatted_work_type', 'formatted_experience_level', 'views', 'applies']].isnull().sum())

# If there are missing values, consider dropping those rows
final_df = final_df.dropna(subset=['title', 'skills_desc', 'location', 'formatted_work_type', 'formatted_experience_level', 'views', 'applies'])


title                         0
skills_desc                   0
location                      0
formatted_work_type           0
formatted_experience_level    0
views                         0
applies                       0
dtype: int64


In [70]:
# Recreate the feature matrices after ensuring consistent data
title_tfidf = tfidf_vectorizer_title.fit_transform(final_df['title'])
skills_tfidf = tfidf_vectorizer_skills.fit_transform(final_df['skills_desc'])
location_encoded = location_ohe.fit_transform(final_df[['location']])
work_type_encoded = work_type_ohe.fit_transform(final_df[['formatted_work_type']])
experience_level_encoded = experience_level_ohe.fit_transform(final_df[['formatted_experience_level']])
numerical_features = final_df[['views', 'applies']].values

In [71]:
from scipy.sparse import hstack

combined_features = hstack([
    title_tfidf,
    skills_tfidf,
    location_encoded,
    work_type_encoded,
    experience_level_encoded,
    numerical_features  # Ensure this has the correct number of rows
])

# Check the shape to confirm
print("Shape of combined features:", combined_features.shape)

Shape of combined features: (204944, 9074)


In [72]:
from sklearn.model_selection import train_test_split

# Define the target variable
y = final_df['avg_salary']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=42)


In [73]:
from sklearn.ensemble import RandomForestRegressor 
#193mins
# Initialize and train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)


In [74]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [75]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 7367.036837962624
Mean Squared Error: 305586574.675915
R-squared: 0.5258124434431457


In [79]:
import pickle

# Save the trained Random Forest Regressor model
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the encoders and vectorizers
with open('location_ohe.pkl', 'wb') as loc_file:
    pickle.dump(location_ohe, loc_file)

with open('work_type_ohe.pkl', 'wb') as work_type_file:
    pickle.dump(work_type_ohe, work_type_file)

with open('experience_level_ohe.pkl', 'wb') as exp_level_file:
    pickle.dump(experience_level_ohe, exp_level_file)

with open('tfidf_vectorizer_title.pkl', 'wb') as title_vectorizer_file:
    pickle.dump(tfidf_vectorizer_title, title_vectorizer_file)

with open('tfidf_vectorizer_skills.pkl', 'wb') as skills_vectorizer_file:
    pickle.dump(tfidf_vectorizer_skills, skills_vectorizer_file)
