In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
import pickle
import pandas as pd

In [15]:
data = pd.read_csv('../data/salary_data_pau_cleaned_small_modified.csv')
data = data.drop('Salary', axis=1)

In [16]:
data.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Job Category,Job Type
0,32.0,Male,Bachelor's,Software Engineer,5.0,Regular,Software & IT
1,28.0,Female,Master's,Data Analyst,3.0,Regular,Data & Analytics
2,36.0,Female,Bachelor's,Sales Associate,7.0,Regular,Sales & Marketing
3,29.0,Male,Bachelor's,Marketing Analyst,2.0,Regular,Data & Analytics
4,42.0,Female,Master's,Product Manager,12.0,Regular,Management


In [17]:
cat_cols = ["Gender", "Education Level", "Job Title", "Job Category", "Job Type"]
num_cols = ["Age", "Years of Experience"]
# Salary NO se va a escalar ni codificar

In [18]:
# 2. Crea una "mini-tubería" (pipeline) para las columnas categóricas:
#    - Primero OrdinalEncoder
#    - Luego StandardScaler
cat_pipeline = Pipeline([
    ("encoder", OrdinalEncoder()),
    ("scaler", StandardScaler())
])

# 3. Crea otra pipeline para columnas numéricas que SÍ quieres escalar
num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

# 4. Construye un único ColumnTransformer que:
#    - Aplique cat_pipeline a cat_cols
#    - Aplique num_pipeline a num_cols
#    - Pase el resto (Salary) sin tocar (remainder="passthrough")
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, cat_cols),
        ("num", num_pipeline, num_cols),
        # No mencionamos Salary; por ende, se envía directamente al output
    ],
    remainder="passthrough"
)

In [19]:
preprocessor.fit(data)

In [20]:
# Aplicar la transformación
processed_data = preprocessor.transform(data)

# Convertir a DataFrame para visualizar los resultados
processed_df = pd.DataFrame(processed_data)

In [21]:
processed_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.891266,-1.017782,1.286749,0.106257,1.016421,-0.205678,-0.509104
1,-1.102846,0.619203,-1.417929,0.106257,-1.493413,-0.73204,-0.840405
2,-1.102846,-1.017782,0.848152,0.106257,0.388963,0.320684,-0.177804
3,0.891266,-1.017782,-0.175239,0.106257,-1.493413,-0.60045,-1.006055
4,-1.102846,0.619203,0.263357,0.106257,-0.552225,1.110227,0.650448


In [22]:

# Exportar el pipeline para uso en inferencia
with open("../streamlit-app/models/preprocessing_pipeline.pkl", "wb") as f:
    pickle.dump(preprocessor, f)

print("Pipeline guardado exitosamente.")


Pipeline guardado exitosamente.


In [23]:
# Cargar el pipeline guardado
with open("../streamlit-app/models/preprocessing_pipeline.pkl", "rb") as f:
    pipeline = pickle.load(f)

X_new = [[32.0,"Male",	"Bachelor's",	"Software Engineer",	5.0,	"Regular",	"Software & IT"]]  # Nuevos datos
X_dataframe = pd.DataFrame(X_new, columns=["Age", "Gender", "Education Level", "Job Title", "Years of Experience", "Job Category", "Job Type"])
X_new_transformed = pipeline.transform(X_dataframe)
X_new_transformed


array([[ 0.89126558, -1.01778188,  1.28674872,  0.10625733,  1.01642123,
        -0.20567825, -0.50910405]])