# MODELO MACHINE LEARNING TRADICIONAL

In [60]:
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler

In [61]:
# Load the cleaned salary dataset from its relative CSV path into a DataFrame.
csv_path = "../../Data/salary_data_pau_cleaned_small.csv"
data = pd.read_csv(csv_path)

In [62]:
# Display the loaded DataFrame as the cell's output to inspect its columns and rows.
data

Unnamed: 0.1,Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Job Category,Job Type
0,0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,Regular,Software & IT
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,Regular,Data & Analytics
2,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,Regular,Sales & Marketing
3,5,29.0,Male,Bachelor's,Marketing Analyst,2.0,55000.0,Regular,Data & Analytics
4,6,42.0,Female,Master's,Product Manager,12.0,120000.0,Regular,Management
...,...,...,...,...,...,...,...,...,...
6517,6689,49.0,Female,PhD,of Marketing,20.0,200000.0,Director,Sales & Marketing
6518,6690,32.0,Male,High School,Sales Associate,3.0,50000.0,Regular,Sales & Marketing
6519,6691,30.0,Female,Bachelor's,Financial Manager,4.0,55000.0,Regular,Management
6520,6692,46.0,Male,Master's,Marketing Manager,14.0,140000.0,Regular,Management


In [63]:
# Build the feature frame and the target series.
# Besides the target, every "Unnamed: ..." column is removed: those are row
# indexes written out by earlier CSV saves (the preview shows both "Unnamed: 0"
# and "Unnamed: 0.1"), not real features.
index_artifact_columns = [col for col in data.columns if str(col).startswith("Unnamed")]
X = data.drop(columns=["Salary", *index_artifact_columns])
y = data["Salary"]

In [64]:
# Column groups, keyed by the kind of preprocessing each group receives.

# Categorical columns that are one-hot encoded.
one_hot_encoder_columns = [
    "Gender",
    "Education Level",
    "Job Category",
]

# Categorical columns that are integer-encoded with LabelEncoder.
label_encoder_columns = [
    "Job Title",
    "Job Type",
]

# Numeric columns that are rescaled.
numerical_columns = [
    "Age",
    "Years of Experience",
]

In [65]:
# Integer-encode every column in label_encoder_columns.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so each column's category-to-integer mapping stays available afterwards
# (classes_, inverse_transform, encoding new rows). One shared encoder would
# only remember the last column it was fitted on.
# After the loop `label_encoder` is the encoder fitted on the last column.
# NOTE(review): the ColumnTransformer in this notebook does not list these
# columns, so they appear to be dropped before reaching the regressor -
# confirm whether that is intended.
label_encoders = {}
for col in label_encoder_columns:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

In [66]:
# Feature preprocessing that runs inside the pipeline:
#   - "onehot": one-hot encode one_hot_encoder_columns. handle_unknown="ignore"
#     encodes a category that was not seen during fit as all zeros instead of
#     raising, so a CV fold or a later prediction with a new category value
#     does not crash.
#   - "num": rescale numerical_columns with MinMaxScaler.
# remainder="drop" is spelled out (it is also the default): every column not
# listed above is discarded.
# NOTE(review): that includes label_encoder_columns ("Job Title", "Job Type"),
# so those features never reach the regressor - confirm whether they were
# meant to be part of the model before changing this, since it alters the
# input columns the saved pipeline depends on.
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), one_hot_encoder_columns),
        ("num", MinMaxScaler(), numerical_columns),
    ],
    remainder="drop",
)

In [67]:
# Full estimator: preprocessing followed by a random forest regressor.
# The fixed random_state keeps the forest reproducible between runs.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42)),
]
model = Pipeline(steps=pipeline_steps)

In [68]:
# Hold out 20% of the rows for validation; the seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [69]:
# Hyper-parameter grid for the "regressor" step of the pipeline
# (3 x 3 = 9 candidate combinations).
param_grid = {
    "regressor__n_estimators": [50, 100, 200],
    "regressor__max_depth": [None, 10, 20],
}

# 5-fold cross-validated search scored by negative MSE (higher is better).
# n_jobs=-1 runs the candidate/fold fits in parallel on all available cores;
# the regressor's seed is fixed, so this is expected to change run time only.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [70]:
# Show the steps of the pipeline object that was handed to GridSearchCV.
# NOTE(review): this inspects `model`, not `grid_search.best_estimator_`; the
# output below lists the regressor with only random_state set, so the
# hyper-parameters selected by the search are not visible here.
model.steps

[('preprocessor',
  ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                   ['Gender', 'Education Level', 'Job Category']),
                                  ('num', MinMaxScaler(),
                                   ['Age', 'Years of Experience'])])),
 ('regressor', RandomForestRegressor(random_state=42))]

In [71]:
# Predict the validation targets with the best pipeline found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

In [72]:
# Validation metrics for the predictions: squared error, absolute error and R^2.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
r2 = r2_score(y_true=y_val, y_pred=y_pred)

# One line per metric, emitted with a single print call.
print(
    f"Error Cuadrático Medio (MSE): {mse}",
    f"Error absoluto Medio (MAE): {mae}",
    f"Coeficiente de Determinación (R^2): {r2}",
    sep="\n",
)

Error Cuadrático Medio (MSE): 176942808.43867522
Error absoluto Medio (MAE): 6884.654179903758
Coeficiente de Determinación (R^2): 0.9348350651438382


In [73]:
# Persist the best pipeline found by GridSearchCV (preprocessing + regressor)
# so it can be loaded later without retraining.
# joblib is imported in the notebook's import cell at the top.
model_path = "../../modelo_entrenado.pkl"
joblib.dump(grid_search.best_estimator_, model_path)

print("Modelo guardado como modelo_entrenado.pkl")

Modelo guardado como modelo_entrenado.pkl
