In [17]:
import pandas as pd
import numpy as np


In [18]:
# Load the raw AI-jobs dataset and preview its first seven rows.
raw_csv_path = "../data/raw/ai_jobs.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=7)

Unnamed: 0,job_id,job_title,company_type,industry,country,city,remote_type,experience_level,min_experience_years,salary_min_usd,salary_max_usd,employment_type,posted_year,company_size
0,0IFD0TVBDIVU,MLOps Engineer,Research Lab,Education,Australia,Remote,Remote,Entry,0,56873,72223,Full-time,2023,Large
1,ZMF8MDD4V30T,Data Analyst,Startup,Education,Germany,Remote,Remote,Entry,0,54803,85599,Full-time,2024,Medium
2,CX1945NQ4FMY,MLOps Engineer,Research Lab,Tech,Canada,Remote,Remote,Senior,5,149980,175806,Full-time,2021,Large
3,QJ7YHL1C32OC,Applied Scientist,Research Lab,Healthcare,Australia,Remote,Remote,Entry,0,53483,86477,Full-time,2023,Medium
4,F0T0PVN9ER14,Machine Learning Engineer,Research Lab,Finance,Australia,Sydney,Hybrid,Mid,2,102977,127298,Full-time,2023,Large
5,9T84AZYTJXEP,Data Analyst,Startup,Healthcare,USA,Seattle,Hybrid,Senior,5,148201,179967,Full-time,2021,Large
6,SHNF877VREN9,AI Researcher,Startup,Tech,USA,Austin,Onsite,Senior,5,143946,173591,Full-time,2020,Small


In [19]:
# Target variable: midpoint of the advertised salary range.
df['avg_salary'] = df['salary_max_usd'].add(df['salary_min_usd']).div(2)

In [20]:
# Remove the raw salary bounds (already summarised by avg_salary) together
# with job_id and city, none of which are used as features later on.
dropped_columns = ['salary_min_usd', 'salary_max_usd', 'job_id', 'city']
df = df.drop(dropped_columns, axis="columns")

In [21]:
# Show which columns remain in the frame after the drop.
df.columns

Index(['job_title', 'company_type', 'industry', 'country', 'remote_type',
       'experience_level', 'min_experience_years', 'employment_type',
       'posted_year', 'company_size', 'avg_salary'],
      dtype='object')

In [22]:
# Count missing values per column.
df.isnull().sum()

job_title               0
company_type            0
industry                0
country                 0
remote_type             0
experience_level        0
min_experience_years    0
employment_type         0
posted_year             0
company_size            0
avg_salary              0
dtype: int64

In [23]:
# Separate the target (avg_salary) from the predictor columns.
target_column = "avg_salary"
y = df[target_column]
X = df.drop(columns=[target_column])


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [26]:
# Feature groups, one per preprocessing strategy.

# Numeric columns.
numeric_features = ["min_experience_years", "posted_year"]

# Categorical columns whose levels are given an explicit order when encoded.
ordinal_features = ["experience_level", "company_size"]

# Categorical columns treated as unordered.
nominal_features = [
    "job_title", "company_type", "industry",
    "country", "remote_type", "employment_type",
]


Pipelines: one preprocessing pipeline per feature group (numeric, ordinal, nominal)

In [27]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(numeric_steps)


In [28]:
# Ordinal columns: fill missing values with the most frequent level, then map
# each level to its position in the explicit order listed below.
# Levels that are not in these lists are encoded as -1 instead of raising at
# transform time, so the persisted preprocessor keeps working on new data --
# the same leniency the one-hot encoder gets from handle_unknown="ignore".
ordinal_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinal", OrdinalEncoder(
        categories=[
            ["Entry", "Mid", "Senior"],        # experience_level
            ["Small", "Medium", "Large"]       # company_size
        ],
        handle_unknown="use_encoded_value",
        unknown_value=-1,                      # sentinel for unseen levels
    ))
])


In [29]:
# Nominal columns: fill missing values with the most frequent category, then
# one-hot encode into a dense array; unknown categories are ignored rather
# than raising.
nominal_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
]
nominal_pipeline = Pipeline(nominal_steps)


In [30]:
# Route each feature group through its own pipeline.
column_routes = [
    ("num", numeric_pipeline, numeric_features),
    ("ord", ordinal_pipeline, ordinal_features),
    ("nom", nominal_pipeline, nominal_features),
]
preprocessor = ColumnTransformer(column_routes)


In [31]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split. fit_transform fits and transforms in a
# single pass instead of transforming X_train a second time after fitting.
X_train_ready = preprocessor.fit_transform(X_train)
X_test_ready = preprocessor.transform(X_test)


In [32]:
import os

import joblib

# Persist the fitted preprocessor to disk.
# The target directory is created first: without it the dump fails on a
# fresh checkout where ../artifacts does not exist yet.
artifact_path = "../artifacts/preprocessor1.pkl"
os.makedirs(os.path.dirname(artifact_path), exist_ok=True)
joblib.dump(preprocessor, artifact_path)


['../artifacts/preprocessor1.pkl']