In [2]:
import pandas as pd

# Cargar los datasets
people = pd.read_csv('../data/people.csv')
salary = pd.read_csv('../data/salary.csv')
descriptions = pd.read_csv('../data/descriptions.csv')

# Mostrar cantidad de registros por dataset
print("People:", people.shape)
print("Salary:", salary.shape)
print("Descriptions:", descriptions.shape)


People: (375, 6)
Salary: (375, 2)
Descriptions: (375, 2)


In [3]:
print("People\n", people.columns)
print("\nSalary\n", salary.columns)
print("\nDescriptions\n", descriptions.columns)


People
 Index(['id', 'Age', 'Gender', 'Education Level', 'Job Title',
       'Years of Experience'],
      dtype='object')

Salary
 Index(['id', 'Salary'], dtype='object')

Descriptions
 Index(['id', 'Description'], dtype='object')


In [4]:
# Unir datasets
df = people.merge(salary, on='id').merge(descriptions, on='id')

# Verificar resultado
print(df.shape)
df.head()


(375, 8)


Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description
0,0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,I am a 32-year-old male working as a Software ...
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,I am a 28-year-old data analyst with a Master'...
2,2,45.0,Male,PhD,Senior Manager,15.0,150000.0,I am a 45-year-old Senior Manager with a PhD a...
3,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,I am a 36-year-old female Sales Associate with...
4,4,52.0,Male,Master's,Director,20.0,200000.0,I am a 52-year-old male with over two decades ...


In [5]:
# Tipos de datos y valores faltantes
print(df.info())

# Estadísticas generales
print("\n--- Estadísticas numéricas ---")
print(df.describe())

# Valores únicos por categoría
print("\n--- Valores únicos por columna categórica ---")
for col in ['Gender', 'Education Level', 'Job Title']:
    print(f"{col}: {df[col].unique()}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   375 non-null    int64  
 1   Age                  370 non-null    float64
 2   Gender               370 non-null    object 
 3   Education Level      370 non-null    object 
 4   Job Title            370 non-null    object 
 5   Years of Experience  373 non-null    float64
 6   Salary               373 non-null    float64
 7   Description          372 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 23.6+ KB
None

--- Estadísticas numéricas ---
               id         Age  Years of Experience         Salary
count  375.000000  370.000000           373.000000     373.000000
mean   187.000000   37.437838            10.030831  100577.345845
std    108.397417    7.080465             6.557007   48240.013482
min      0.000000   23.000000   

In [6]:
# Copiamos el dataframe original para no modificarlo directamente
df_clean = df.copy()

# Rellenar valores nulos en Education Level
df_clean['Education Level'] = df_clean['Education Level'].fillna('Unknown')

# Normalizar columnas de texto
for col in ['Gender', 'Education Level', 'Job Title']:
    df_clean[col] = df_clean[col].str.strip().str.lower()

# Mostrar valores únicos para revisión rápida
print("\nGéneros:", df_clean['Gender'].unique())
print("\nNiveles educativos:", df_clean['Education Level'].unique())
print("\nCantidad de cargos únicos:", df_clean['Job Title'].nunique())



Géneros: ['male' 'female' nan]

Niveles educativos: ["bachelor's" "master's" 'phd' 'unknown']

Cantidad de cargos únicos: 173


In [7]:
df_clean['Gender'] = df_clean['Gender'].fillna('unknown')
print(df_clean['Gender'].unique())


['male' 'female' 'unknown']


In [9]:
import sklearn
print(sklearn.__version__)


1.7.0


In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd

# Variables
numeric_features = ['Age', 'Years of Experience']
categorical_features = ['Gender', 'Education Level', 'Job Title']
target = 'Salary'

# Separar X e y
X = df_clean[numeric_features + categorical_features]
y = df_clean[target]

# Preprocesamiento: Escalar numéricas y codificar categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Transformar X
X_processed = pipeline.fit_transform(X)

# Mostrar dimensiones finales
print("Shape de X final:", X_processed.shape)
print("Shape de y:", y.shape)


Shape de X final: (375, 183)
Shape de y: (375,)


In [17]:
# IMPORTS
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# 1. Eliminar filas con cualquier NaN en X o y
df_model = df_clean[['Gender', 'Education Level', 'Job Title', 'Age', 'Years of Experience', 'Salary']].dropna()

# 2. Definir X e y
X = df_model.drop(columns=['Salary'])
y = df_model['Salary']

# 3. Definir columnas categóricas y numéricas
cat_features = ['Gender', 'Education Level', 'Job Title']
num_features = ['Age', 'Years of Experience']

# 4. Pipeline de preprocesamiento
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ('num', StandardScaler(), num_features)
])

# 5. Transformar X
X_processed = preprocessor.fit_transform(X)

# 6. Separar en train/test
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# 7. Entrenar modelo
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# 8. Predecir
y_pred = model.predict(X_test)

# 9. Evaluar resultados
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


Mean Squared Error (MSE): 334363453.4534535
R² Score: 0.8542856017578997


In [22]:
import sys
sys.path.append("../src")

from model_utils import train_model, evaluate_model


In [23]:
from sklearn.dummy import DummyRegressor

# Instanciar y entrenar el DummyRegressor
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)

# Predecir con Dummy
y_dummy_pred = dummy.predict(X_test)

# Evaluar Dummy
dummy_mse = mean_squared_error(y_test, y_dummy_pred)
dummy_r2 = r2_score(y_test, y_dummy_pred)

print("\n--- Baseline: Dummy Regressor ---")
print("Dummy Mean Squared Error (MSE):", dummy_mse)
print("Dummy R² Score:", dummy_r2)



--- Baseline: Dummy Regressor ---
Dummy Mean Squared Error (MSE): 2299618832.1646147
Dummy R² Score: -0.0021656698844798594


In [None]:
### Comparación con Baseline: DummyRegressor

Para validar el valor predictivo del modelo entrenado, se comparó contra un `DummyRegressor` que predice siempre el valor medio.

**Resultados:**

| Modelo           | MSE           | R² Score |
|------------------|---------------|----------|
| RandomForest     | 334,363,453   | 0.85     |
| DummyRegressor   | 2,299,618,832 | -0.002   |

Esto confirma que el modelo RandomForest tiene un desempeño significativamente mejor que un modelo trivial.



In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Definimos la grilla de hiperparámetros
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Instanciamos el modelo base
rf = RandomForestRegressor(random_state=42)

# Instanciamos GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# Entrenamos la búsqueda
grid_search.fit(X_train, y_train)

# Mejor modelo
best_model = grid_search.best_estimator_

# Evaluamos el mejor modelo
from sklearn.metrics import mean_squared_error, r2_score

y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print("Best Parameters:", grid_search.best_params_)
print("Best Model MSE:", mse_best)
print("Best Model R²:", r2_best)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Model MSE: 298749238.9973812
Best Model R²: 0.869806149158437


In [None]:
### 🧪 Comparación con RandomForest base

Se realizó una búsqueda de hiperparámetros utilizando `GridSearchCV`.  
El modelo con los mejores parámetros obtuvo:

- **Best Params**: `{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}`
- **MSE**: 298,749,238.9973812
- **R² Score**: 0.8690860149158437

Esto mejora respecto al modelo RandomForest base (sin optimización) que tenía un R² de **0.85**.  
**Podemos concluir que la optimización mejora ligeramente el desempeño predictivo.**



In [28]:
import os
import joblib

def save_model(model, path="models/best_model.pkl"):
    """Guarda el modelo entrenado en formato .pkl"""
    dir_path = os.path.dirname(path)
    if dir_path and not os.path.exists(dir_path):
        os.makedirs(dir_path)
    joblib.dump(model, path)

