In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [2]:
# Load the raw AI-jobs CSV; the path is relative to the notebook's working directory.
raw_jobs_csv = "../data/raw/ai_jobs.csv"
df = pd.read_csv(raw_jobs_csv)

In [3]:
# Build the regression target as the midpoint of the salary range, then discard
# the two range bounds (the target is computed directly from them) together
# with the `job_id` and `city` columns.
# NOTE: `df` is rebound here, so this cell cannot be re-run on its own — the
# load cell has to be re-run first to bring the salary columns back.
columns_to_remove = ["salary_min_usd", "salary_max_usd", "job_id", "city"]
df = df.assign(
    avg_salary=(df["salary_max_usd"] + df["salary_min_usd"]) / 2
).drop(columns=columns_to_remove)

In [4]:
# Separate the target from the predictors.
target_name = "avg_salary"
y = df[target_name]
X = df.drop(columns=[target_name])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Load the pickled preprocessing transformer from the artifacts directory.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# artifacts that come from a trusted source.
preprocessor_path = "../artifacts/preprocessor.pkl"
preprocessor = joblib.load(preprocessor_path)


In [6]:
# Baseline model: Ridge regression on top of the loaded preprocessing step.
ridge_steps = [
    ("preprocessor", preprocessor),
    ("model", Ridge(alpha=1.0)),
]
ridge_pipeline = Pipeline(steps=ridge_steps)
ridge_pipeline.fit(X_train, y_train)

# Score on the held-out split; RMSE is the square root of the MSE.
y_pred = ridge_pipeline.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred)
mae_ridge = mean_absolute_error(y_test, y_pred)
rmse_ridge = np.sqrt(mse_ridge)

mae_ridge, rmse_ridge


(3521.9696149886327, 4221.237859114905)

In [7]:
# Random-forest regressor evaluated on the same train/test split as the Ridge
# baseline.
forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    min_samples_leaf=50,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,        # use every available core
)

# A Pipeline stores the step objects it is given and fits them in place.
# Passing the shared `preprocessor` instance here would therefore refit the
# very transformer that already lives inside `ridge_pipeline`. `clone` gives
# this pipeline its own unfitted copy with identical parameters, so fitting
# the forest cannot alter any other pipeline built on `preprocessor`.
rf_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", forest),
])

rf_pipeline.fit(X_train, y_train)

# Score on the held-out split; RMSE is the square root of the MSE.
y_pred = rf_pipeline.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred))

mae_rf, rmse_rf


(3523.864472216113, 4225.0003580285)

In [8]:
# Side-by-side comparison of the held-out errors, one row per model.
model_scores = [
    ("Ridge", mae_ridge, rmse_ridge),
    ("Random Forest", mae_rf, rmse_rf),
]
pd.DataFrame(model_scores, columns=["Model", "MAE", "RMSE"])


Unnamed: 0,Model,MAE,RMSE
0,Ridge,3521.969615,4221.237859
1,Random Forest,3523.864472,4225.000358


In [9]:

# Persist the fitted Ridge pipeline (preprocessing step + model) as a single
# artifact so it can be reloaded without refitting.
ridge_model_path = "../models/ridge_salary_pipeline.pkl"
joblib.dump(ridge_pipeline, ridge_model_path)


['../models/ridge_salary_pipeline.pkl']

In [None]:
# Feature importances of the fitted random forest, highest first.
#
# The feature names are read from the preprocessing step stored inside
# `rf_pipeline` — the transformer that actually produced the forest's input
# matrix — rather than from the module-level `preprocessor`. This keeps the
# names aligned with `feature_importances_` and does not depend on the global
# object having been fitted as a side effect of another cell.
fitted_rf_preprocessor = rf_pipeline.named_steps["preprocessor"]
fitted_forest = rf_pipeline.named_steps["model"]

feature_names = fitted_rf_preprocessor.get_feature_names_out()
importances = fitted_forest.feature_importances_

feature_importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances,
}).sort_values(by="Importance", ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
2,ord__experience_level,0.517889
0,num__min_experience_years,0.481574
1,num__posted_year,0.000115
3,ord__company_size,5.9e-05
26,nom__remote_type_Remote,2.2e-05
24,nom__remote_type_Hybrid,2.2e-05
12,nom__company_type_Startup,2.1e-05
11,nom__company_type_Research Lab,2.1e-05
10,nom__company_type_MNC,2.1e-05
25,nom__remote_type_Onsite,2.1e-05


: 